City Division EDA
Objectives
The objectives of this analysis are:
- Characterize the expenditures of the City’s Divisions or Cost Centres by establishing the number and frequency of transactions, their typical amounts, dispersion, etc.
- Identify meaningful groups of Divisions or Cost Centres with similar expense behaviour.
- Identify meaningful anomalies in the data that may require closer examination.
%load_ext lab_black
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config Completer.use_jedi = False
import io
import requests
import functools
from pathlib import Path
import klib
import stumpy
import numpy as np
import pandas as pd
import zipfile as zp
import matplotlib as mpl
import matplotlib.pyplot as plt
print(f"{np.__version__=}")
print(f"{pd.__version__=}")
print(f"{klib.__version__=}")
print(f"{mpl.__version__=}")
print(f"{stumpy.__version__=}")
from string_grouper import group_similar_strings
data_dir = Path("data")
data_dir.absolute()
meta_data_req = (
"https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show",
{"id": "ebc3f9c2-2f80-4405-bf4f-5fb309581485"},
)
meta_payload = requests.get(*meta_data_req).json()
meta_payload["result"]
resources_list = [
resource.get("url")
for resource in meta_payload.get("result", {}).get("resources", [])
]
resources_dict = {resource.split("/")[-1]: resource for resource in resources_list}
resources_dict.keys()
pd.read_excel(resources_dict.get("pcard_expenditures_readme.xls")).dropna()
expenditures_dir = data_dir / "expenditures"
r = requests.get(resources_dict.get("expenditures.zip"))
z = zp.ZipFile(io.BytesIO(r.content))
z.extractall(expenditures_dir)
list(expenditures_dir.glob("*"))
kb_conv = 1_024
mb_conv = kb_conv ** 2
gb_conv = kb_conv ** 3
# Path.stat().st_size on a directory reports only the directory entry itself,
# not its contents; sum the extracted files instead.
bytes_size = sum(f.stat().st_size for f in expenditures_dir.glob("*"))
f"{bytes_size / mb_conv:,.3} MB"
dfs = [pd.read_excel(xls) for xls in expenditures_dir.glob("*.xlsx")]
aggr_df = pd.concat(dfs, ignore_index=True)
pd.concat(
[aggr_df.head(1).T, aggr_df.sample(2).T, aggr_df.tail(1).T, aggr_df.dtypes], axis=1
)
Clean and standardize column names
aggr_df = klib.clean_column_names(aggr_df)
aggr_df.columns
Plot missing values in the aggregated data
klib.missingval_plot(aggr_df)
Remove rows with missing divisions
clean_df = aggr_df[aggr_df["division"].notna()]
clean_df.division.isnull().sum(), aggr_df.division.isnull().sum(), aggr_df.shape[0]
klib.missingval_plot(clean_df)
Merge similar division names
divisions_list = list(clean_df.division.unique())
divisions_list
divisions_series = pd.Series(divisions_list)
map_df = group_similar_strings(divisions_series)
map_df["division"] = divisions_series
print(f"divisions raw: {len(map_df.division.unique())}")
print(f"divisions combined: {len(map_df.group_rep.unique())}")
def get_group_rep(map_df: pd.DataFrame, division: str) -> str:
return map_df[map_df.division == division].head(1).group_rep.values[0]
clean_up_division = functools.partial(get_group_rep, map_df)
clean_df = clean_df.reset_index()
clean_df["division"] = clean_df.division.apply(clean_up_division)
clean_df.division.unique()
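Note that get_group_rep filters map_df once per row; an equivalent but faster approach (a sketch, assuming one representative per raw name, as group_similar_strings produces) builds the lookup once and uses Series.map:
# Sketch: one-pass lookup table instead of per-row DataFrame filtering.
division_to_rep = dict(zip(map_df.division, map_df.group_rep))
clean_df["division"] = clean_df.division.map(division_to_rep)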
Convert columns to more memory-efficient dtypes
clean_df = klib.convert_datatypes(clean_df)
clean_df.dtypes
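To check that the conversion pays off, compare memory footprints; a rough sketch (aggr_df is pre-conversion, though it still contains the rows with missing divisions, so the saving is slightly overstated):
# Sketch: rough before/after memory comparison for the dtype conversion.
before_mb = aggr_df.memory_usage(deep=True).sum() / mb_conv
after_mb = clean_df.memory_usage(deep=True).sum() / mb_conv
print(f"memory: {before_mb:,.1f} MB -> {after_mb:,.1f} MB")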
Keep a cleaned subset of columns and rows (dropping duplicates while minimizing information loss)
clean_df = klib.data_cleaning(clean_df)
clean_df.dtypes
klib.corr_plot(clean_df)
for div, grp in clean_df.groupby("division"):
print(div)
klib.dist_plot(grp.drop("index", axis=1))
plt.show()
for div, grp in clean_df.groupby(
[clean_df.division, clean_df.transaction_date.dt.year]
):
print(div)
klib.dist_plot(grp.drop("index", axis=1))
plt.show()
cad_df = clean_df[clean_df["original_currency"] == "CAD"]
toronto_cad_df = cad_df[cad_df["division"] == "311 TORONTO"]
# stumpy requires a float64 series; klib may have downcast the column
matrix_profile = stumpy.stump(
    toronto_cad_df["original_amount"].astype(np.float64), m=5
)
motif_idx = np.argsort(matrix_profile[:, 0])[0]
print(f"The inspecting the series motif")
pd.concat(
[
toronto_cad_df.iloc[motif_idx - 2],
toronto_cad_df.iloc[motif_idx - 1],
toronto_cad_df.iloc[motif_idx],
toronto_cad_df.iloc[motif_idx + 1],
toronto_cad_df.iloc[motif_idx + 2],
],
axis=1,
)
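The same matrix profile also surfaces anomalies (the third objective): the discord, i.e. the m=5 window least similar to every other window, sits at the largest profile value. A minimal sketch:
# Sketch: the discord (most anomalous subsequence) has the largest
# matrix-profile value, mirroring the motif lookup above.
discord_idx = np.argsort(matrix_profile[:, 0])[-1]
toronto_cad_df.iloc[discord_idx : discord_idx + 5]  # the anomalous m=5 window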
correct_arc_curve, regime_locations = stumpy.fluss(
matrix_profile[:, 1], L=5, n_regimes=2, excl_factor=1
)
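fluss returns the corrected arc curve and the detected regime boundaries; plotting them makes the segmentation easy to eyeball (a sketch):
# Sketch: visualize the corrected arc curve with the detected regime boundary.
fig, ax = plt.subplots()
ax.plot(correct_arc_curve)
for loc in regime_locations:
    ax.axvline(loc, color="red", linestyle="--")
ax.set_title("FLUSS corrected arc curve (311 TORONTO, CAD)")
plt.show()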