Objectives

The objectives of this analysis are:

  • Characterize the expenditures of the City’s Divisions or Cost Centres by establishing the number and frequency of transactions, their typical amounts, dispersion, etc.
  • Identify meaningful groups of Divisions or Cost Centres that behave similarly in terms of their expense behaviour.
  • Identify meaningful anomalies in the data that may require closer examination.

Notebook Setup

%load_ext lab_black
%load_ext autoreload
%autoreload 2
%matplotlib inline

%config Completer.use_jedi = False

Libraries

import io
import requests
import functools

from pathlib import Path
import klib
import stumpy

import numpy as np
import pandas as pd

import zipfile as zp

import matplotlib as mpl
import matplotlib.pyplot as plt

print(f"{np.__version__=}")
print(f"{pd.__version__=}")
print(f"{klib.__version__=}")
print(f"{mpl.__version__=}")
print(f"{stumpy.__version__=}")
np.__version__='1.20.3'
pd.__version__='1.2.3'
klib.__version__='0.1.5'
mpl.__version__='3.3.4'
stumpy.__version__='1.8.0'
from string_grouper import group_similar_strings

Global Variables

data_dir = Path("data")

data_dir.absolute()
WindowsPath('C:/Users/kslad/Documents/Projects/kslader8-thoughts/_notebooks/data')

Data

Loading

meta_data_req = (
    "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show",
    {"id": "ebc3f9c2-2f80-4405-bf4f-5fb309581485"},
)
meta_payload = requests.get(*meta_data_req).json()
meta_payload["result"]
{'license_title': 'Open Government Licence – Toronto',
 'owner_unit': None,
 'relationships_as_object': [],
 'topics': 'City government,Finance',
 'owner_email': 'pcard@toronto.ca',
 'excerpt': 'The dataset contains details of all purchases made by City staff members using City-issued credit cards also referred to as PCards or purchasing cards.   ',
 'private': False,
 'owner_division': 'Accounting Services',
 'num_tags': 7,
 'id': 'ebc3f9c2-2f80-4405-bf4f-5fb309581485',
 'metadata_created': '2019-07-23T17:52:47.150105',
 'refresh_rate': 'Monthly',
 'title': 'PCard Expenditures',
 'license_url': 'https://open.toronto.ca/open-data-license/',
 'state': 'active',
 'information_url': 'http://www.toronto.ca/finance/index.htm',
 'license_id': 'open-government-licence-toronto',
 'type': 'dataset',
 'resources': [{'cache_last_updated': None,
   'package_id': 'ebc3f9c2-2f80-4405-bf4f-5fb309581485',
   'datastore_active': False,
   'id': 'c3a307f6-09c1-4db4-b7de-c35a53344a65',
   'size': 26624,
   'format': 'XLS',
   'state': 'active',
   'hash': '',
   'description': '',
   'is_preview': False,
   'last_modified': '2019-07-24T20:25:05.559452',
   'url_type': 'upload',
   'mimetype': 'application/vnd.ms-excel',
   'cache_url': None,
   'extract_job': None,
   'name': 'readme-file',
   'created': '2019-07-23T17:52:47.544377',
   'url': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/ebc3f9c2-2f80-4405-bf4f-5fb309581485/resource/c3a307f6-09c1-4db4-b7de-c35a53344a65/download/pcard_expenditures_readme.xls',
   'mimetype_inner': None,
   'position': 0,
   'revision_id': '2b64cfe2-0a96-4243-9eff-f27237c602c2',
   'resource_type': None},
  {'cache_last_updated': None,
   'package_id': 'ebc3f9c2-2f80-4405-bf4f-5fb309581485',
   'datastore_active': False,
   'id': '6b394ed0-1de6-403b-a907-a3608509e300',
   'size': 46902535,
   'format': 'ZIP',
   'state': 'active',
   'hash': '',
   'description': '',
   'is_preview': False,
   'last_modified': '2019-09-03T19:44:00',
   'url_type': 'upload',
   'mimetype': 'application/zip',
   'cache_url': None,
   'extract_job': 'Airflow: upload_remote_files',
   'name': 'pcard-expenses',
   'created': '2020-12-07T21:47:55.929457',
   'url': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/ebc3f9c2-2f80-4405-bf4f-5fb309581485/resource/6b394ed0-1de6-403b-a907-a3608509e300/download/expenditures.zip',
   'mimetype_inner': None,
   'position': 1,
   'revision_id': '84e77a3a-7692-4696-8297-6d029c4a033a',
   'resource_type': None}],
 'limitations': None,
 'num_resources': 2,
 'collection_method': None,
 'tags': [{'vocabulary_id': None,
   'state': 'active',
   'display_name': 'council spending',
   'id': '8050749a-8ee9-4858-a640-50ef88ce9a73',
   'name': 'council spending'},
  {'vocabulary_id': None,
   'state': 'active',
   'display_name': 'pcard',
   'id': 'a8554c91-fbf8-4faf-b774-ab2ff619d448',
   'name': 'pcard'}],
 'is_retired': False,
 'groups': [],
 'creator_user_id': '150d5301-86ec-44a3-a070-50f2cea839c9',
 'dataset_category': 'Document',
 'relationships_as_subject': [],
 'name': 'pcard-expenditures',
 'metadata_modified': '2020-12-07T21:47:57.302587',
 'isopen': False,
 'url': None,
 'notes': '\n\nSee README file (PCard Expenditures Readme.xls)\n\nCouncil PCard expenses are not included in this report. Councillor expenses are disclosed separately through the quarterly Councillor Office Expense Reports.\n\nPCard expenses for the Offices of the Auditor General, Integrity Commissioner, Lobbyist Registrar and Ombudsman are not included as part of this report. PCard expenses for these offices are posted separately on the websites of the Offices of the Auditor General, Integrity Commissioner, Lobbyist Registrar and Ombudsman.\n',
 'owner_org': '95a064ae-77e8-4ef0-a4e3-4e2d43e1f066',
 'last_refreshed': '2019-09-03T19:44:00.000000',
 'image_url': None,
 'formats': 'XLS,ZIP',
 'owner_section': None,
 'organization': {'description': '',
  'created': '2019-07-23T15:51:35.236542',
  'title': 'City of Toronto',
  'name': 'city-of-toronto',
  'is_organization': True,
  'state': 'active',
  'image_url': '',
  'revision_id': '62817a72-3dc5-450a-8cc4-fef2c51660da',
  'type': 'organization',
  'id': '95a064ae-77e8-4ef0-a4e3-4e2d43e1f066',
  'approval_status': 'approved'},
 'revision_id': '2b64cfe2-0a96-4243-9eff-f27237c602c2',
 'civic_issues': 'Fiscal responsibility'}
resources_list = [
    resource.get("url")
    for resource in meta_payload.get("result", {}).get("resources", {})
]

resources_dict = {resource.split("""/""")[-1]: resource for resource in resources_list}
resources_dict.keys()
dict_keys(['pcard_expenditures_readme.xls', 'expenditures.zip'])
pd.read_excel(resources_dict.get("pcard_expenditures_readme.xls")).dropna()
PCARD EXPENDITURES Unnamed: 1
1 Field Description
2 DIVISION Name of City of Toronto division
3 BATCH-TRANSACTION ID Unique identifier assigned by credit card company
4 TRANSACTION DATE Date of purchase
5 CARD POSTING DATE Date purchase posted by credit card company
6 MERCHANT NAME Name of vendor where purchase was made
7 TRANSACTION AMOUNT Amount of purchase in Canadian currency (inclu...
8 TRANSACTION CURRENCY Currency of country
9 ORIGINAL AMOUNT Amount of Purchase in the currency of purchase...
10 ORIGINAL CURRENCY Currency in which purchase was made
11 G/L ACCOUNT Description of the type of expense made and to...
12 G/L ACCOUNT DESCRIPTION Long text description of the expense
13 COST CENTRE/WBS ELEMENT/ORDER Represents the Organization Unit or the "where...
14 COST CENTRE/WBS ELEMENT/ORDER DESCRIPTION Description of the Organization Unit or the "w...
15 MERCHANT TYPE (MCC) A four-digit number assigned by MasterCard and...
16 MERCHANT TYPE DESCRIPTION Description of the MCC assigned to the merchant
17 PURPOSE Free-form text describing purchase by the card...
expenditures_dir = data_dir / "expenditures"

r = requests.get(resources_dict.get("expenditures.zip"))
z = zp.ZipFile(io.BytesIO(r.content))
z.extractall(expenditures_dir)
list(expenditures_dir.glob("*"))
[WindowsPath('data/expenditures/PCard Expenses_201706.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_201707.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_201708.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_201709.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_201710.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_2017_11_Final.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_2017_12_Final.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_2018_01_Final_revised.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_2018_02_Final.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_2018_03_Final.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_2018_04_Final.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_2018_05_Final.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_2018_06_Final.xlsx'),
 WindowsPath('data/expenditures/PCard Expenses_2018_07.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2018_08.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2018_09.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2018_10.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2018_11.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2018_12.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2019_01.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2019_02.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2019_03.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2019_04.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2019_05.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2019_06.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCard Expenses_2019_07.FINAL.XLSX'),
 WindowsPath('data/expenditures/PCardExpenses_201101_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201102_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201103_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201104_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201105_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201106_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201107_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201108_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201109_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201110_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201111_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201112_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201201_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201202_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201203_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201204_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201205_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201206_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201207_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201208_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201209_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201210_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201211_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201212_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201301_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201302_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201303_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201304_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201305_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201306_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201307_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201308_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201309_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201310_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201311_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201312_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201401_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201402_final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201403_final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201404_final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201405_final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201406_final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201407_final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201408_final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201409_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201410_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201411_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201412_Final.xls'),
 WindowsPath('data/expenditures/PCardExpenses_201501_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201502_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201503_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201504_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201505_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201506_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201507_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201508_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201509_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201510_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201511_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201512_Final.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201601.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201602.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201603.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201604.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201605.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201606.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201607.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201608.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201609.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201610.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201611.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201612.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201701.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201702.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201703.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201704.xlsx'),
 WindowsPath('data/expenditures/PCardExpenses_201705.xlsx')]
kb_conv = 1_024
mb_conv = kb_conv ** 2
gb_nv = kb_conv ** 3

bytes_size = expenditures_dir.stat().st_size

f"{bytes_size / mb_conv:,.3} MB"
'0.0469 MB'
dfs = [pd.read_excel(xls) for xls in expenditures_dir.glob("*.xlsx")]

aggr_df = pd.concat(dfs, ignore_index=True)
pd.concat(
    [aggr_df.head(1).T, aggr_df.sample(2).T, aggr_df.tail(1).T, aggr_df.dtypes], axis=1
)
0 296462 104882 297995 0
Division PUBLIC HEALTH PARKS, FORESTRY & RECREATION NaN NaN object
Batch-Transaction ID 4608-1 NaN NaN NaN object
Transaction Date 2017-06-15 00:00:00 2017-05-17 00:00:00 NaT NaT datetime64[ns]
Card Posting Dt 2017-06-16 00:00:00 2017-05-18 00:00:00 NaT NaT datetime64[ns]
Merchant Name PAYPAL *OBC2012 COMFORT VACCUM SERVICE NaN NaN object
Transaction Amt. 50.0 721.52 NaN NaN float64
Trx Currency CAD CAD NaN NaN object
Original Amount 50.0 721.52 642.96 20729.48 float64
Original Currency CAD CAD CAD USD object
G/L Account 4760 2120 NaN NaN object
G/L Account Description MEMBERSHIP FEES PARTS - MACHINERY & EQUIPMENT NaN NaN object
Cost Centre /WBS Element / Order PH3071 NaN NaN NaN object
Cost Centre /WBS Element / Order Description MATERNAL INFANT HEALTH PROGRAM SUPPORT NaN NaN NaN object
Merchant Type 8641.0 5722.0 NaN NaN float64
Merchant Type Description Associations - Civic, Social, and Frater Household Appliance Stores NaN NaN object
Purpose MEMBERSHIP FEE FLOOR MACHINE REPAIRS NaN NaN object
Unnamed: 16 NaN NaN NaN NaN float64
Batch Transaction ID NaN 4565-171 NaN NaN object
Cost Center / WBS Element / Order NaN NaN NaN NaN object
Cost Center / WBS Element / Order Description NaN NaN NaN NaN object
Exp Type Desc NaN NaN NaN NaN object
Division NaN NaN NaN NaN object
Cost Center / WBLS Element / Order Description NaN NaN NaN NaN object
Cost Center / WBS Element / Order # NaN NaN NaN NaN object
Cost Center / WBS Element / Order # Description NaN NaN NaN NaN object
Card Posting Date NaT NaT NaT NaT datetime64[ns]
Transaction Amount NaN NaN 642.96 NaN float64
Transaction Currency NaN NaN CAD NaN object
Merchant Type (MCC) NaN NaN NaN NaN float64
Trx.Currency NaN NaN NaN NaN object
Cost Centre / WBS Element / Order No NaN NaN NaN NaN object
Cost Centre / WBS Element / Order No. Description NaN WALLACE EMERSON-OPERATIONS NaN NaN object
Cost Centre / WBS Element / Order No. NaN P07728 NaN NaN object
Tr Currency NaN NaN NaN NaN object
Cost Centre / WBS element / Order NaN NaN NaN NaN object
Cost Centre / WBS element / Order Description NaN NaN NaN NaN object
Cost Centre / WBS Element / Order No. NaN NaN NaN NaN object
Cost Centre / WBS Element / Order No. Decription NaN NaN NaN NaN object
Cost Centre / WBS Element / Order NaN NaN NaN NaN object
Cost Centre / WBS Element / Order Description NaN NaN NaN NaN object
Unnamed: 17 NaN NaN NaN NaN float64
Cost Centre / WBS Element / Order NaN NaN NaN NaN object
Unnamed: 18 NaN NaN NaN NaN float64
Unnamed: 19 NaN NaN NaN NaN float64
Divison NaN NaN NaN NaN object
Cost Centre/ WBS Element / Order NaN NaN NaN NaN object
Cost Centre/ WBS Element / Order Description NaN NaN NaN NaN object

Basic Preprocessing

clean and standardizes column names

aggr_df = klib.clean_column_names(aggr_df)
aggr_df.columns
Duplicate column names detected! Columns with index [17, 21, 29, 32, 34, 35, 36, 38, 39, 41, 45, 46] and names ['batch_transaction_id', 'division', 'trx_currency', 'cost_centre_wbs_element_order_no', 'cost_centre_wbs_element_order', 'cost_centre_wbs_element_order_description', 'cost_centre_wbs_element_order_no', 'cost_centre_wbs_element_order', 'cost_centre_wbs_element_order_description', 'cost_centre_wbs_element_order', 'cost_centre_wbs_element_order', 'cost_centre_wbs_element_order_description']) have been renamed to ['batch_transaction_id_17', 'division_21', 'trx_currency_29', 'cost_centre_wbs_element_order_no_32', 'cost_centre_wbs_element_order_34', 'cost_centre_wbs_element_order_description_35', 'cost_centre_wbs_element_order_no_36', 'cost_centre_wbs_element_order_38', 'cost_centre_wbs_element_order_description_39', 'cost_centre_wbs_element_order_41', 'cost_centre_wbs_element_order_45', 'cost_centre_wbs_element_order_description_46'].
Long column names detected (>25 characters). Consider renaming the following columns ['cost_centre_wbs_element_order', 'cost_centre_wbs_element_order_description', 'cost_center_wbs_element_order', 'cost_center_wbs_element_order_description', 'cost_center_wbls_element_order_description', 'cost_center_wbs_element_order_hash', 'cost_center_wbs_element_order_hash_description', 'cost_centre_wbs_element_order_no', 'cost_centre_wbs_element_order_no_description', 'cost_centre_wbs_element_order_no_32', 'cost_centre_wbs_element_order_34', 'cost_centre_wbs_element_order_description_35', 'cost_centre_wbs_element_order_no_36', 'cost_centre_wbs_element_order_no_decription', 'cost_centre_wbs_element_order_38', 'cost_centre_wbs_element_order_description_39', 'cost_centre_wbs_element_order_41', 'cost_centre_wbs_element_order_45', 'cost_centre_wbs_element_order_description_46'].
C:\tools\Anaconda3\envs\blog\lib\site-packages\klib\clean.py:76: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.
  data.columns.str.replace("\n", "_")
Index(['division', 'batch_transaction_id', 'transaction_date',
       'card_posting_dt', 'merchant_name', 'transaction_amt', 'trx_currency',
       'original_amount', 'original_currency', 'g_l_account',
       'g_l_account_description', 'cost_centre_wbs_element_order',
       'cost_centre_wbs_element_order_description', 'merchant_type',
       'merchant_type_description', 'purpose', 'unnamed_16',
       'batch_transaction_id_17', 'cost_center_wbs_element_order',
       'cost_center_wbs_element_order_description', 'exp_type_desc',
       'division_21', 'cost_center_wbls_element_order_description',
       'cost_center_wbs_element_order_hash',
       'cost_center_wbs_element_order_hash_description', 'card_posting_date',
       'transaction_amount', 'transaction_currency', 'merchant_type_mcc',
       'trx_currency_29', 'cost_centre_wbs_element_order_no',
       'cost_centre_wbs_element_order_no_description',
       'cost_centre_wbs_element_order_no_32', 'tr_currency',
       'cost_centre_wbs_element_order_34',
       'cost_centre_wbs_element_order_description_35',
       'cost_centre_wbs_element_order_no_36',
       'cost_centre_wbs_element_order_no_decription',
       'cost_centre_wbs_element_order_38',
       'cost_centre_wbs_element_order_description_39', 'unnamed_17',
       'cost_centre_wbs_element_order_41', 'unnamed_18', 'unnamed_19',
       'divison', 'cost_centre_wbs_element_order_45',
       'cost_centre_wbs_element_order_description_46'],
      dtype='object')

Missing Values

aggregate data missing values plot

klib.missingval_plot(aggr_df)
GridSpec(6, 6)

remove nan divisions

clean_df = aggr_df[aggr_df["division"].notna()]
clean_df.division.isnull().sum(), aggr_df.division.isnull().sum(), aggr_df.shape[0]
(0, 57788, 297996)
klib.missingval_plot(clean_df)
GridSpec(6, 6)

merge similar divisions

divisions_list = list(clean_df.division.unique())
divisions_list
['PUBLIC HEALTH',
 'ECONOMIC DEVELOPMENT & CULTURE',
 'PARKS, FORESTRY & RECREATION',
 'TRANSPORTATION SERVICES ',
 'EMPLOYMENT & SOCIAL SERVICES',
 'TORONTO WATER',
 'LONG TERM CARE HOMES & SERVICES',
 'TORONTO PARAMEDIC SERVICES',
 'MUNICIPAL LICENSING & STANDARDS',
 'FIRE SERVICES',
 "CITY CLERK'S OFFICE",
 'STRATEGIC COMMUNICATIONS ',
 'FACILITIES MANAGEMENT DIVISON',
 'LEGAL SERVICES',
 'HUMAN RESOURCES ',
 'SHELTER, SUPPORT & HOUSING ADMINISTRATION',
 'SOLID WASTE MANAGEMENT',
 'ACCOUNTING SERVICES ',
 'CITY PLANNING',
 "CHILDREN'S SERVICES ",
 'ENVIRONMENT & ENERGY OFFICE',
 'SOCIAL DEVELOPMENT, FINANCE & ADMINISTRATION ',
 'INFORMATION & TECHNOLOGY ',
 'CORPORATE FINANCE',
 'FLEET SERVICES',
 'ENGINEERING & CONSTRUCTION SERVICES',
 'FINANCE & ADMINISTRATION',
 'STRATEGIC & CORPORATE POLICY',
 'REVENUE SERVICES ',
 'POLICY, PLANNING, FINANCE & ADMINISTRATION',
 'TORONTO BUILDING ',
 'PURCHASING & MATERIALS MANAGEMENT ',
 '311 TORONTO',
 'OFFICE OF EMERGENCY MANAGEMENT ',
 'FINANCIAL PLANNING',
 'PENSION, PAYROLL & EMPLOYEE BENEFITS ',
 'AFFORDABLE HOUSING OFFICE',
 'DEPUTY CITY MGR & CFO',
 'CORPORATE CONTRACTS',
 'COURT SERVICES ',
 'CORPORATE SECURITY',
 'CFO',
 'DEPUTY CITY MGR INTERNAL SERVICES',
 'REAL ESTATE SERVICES',
 'TREASURER',
 'EXECUTIVE MANAGEMENT',
 'CITY MANAGER',
 'FACILITIES MANAGEMENT',
 'ENVIRONMENT & ENERGY',
 'STRATEGIC COMMUNICATIONS',
 'TRANSPORTATION',
 'CHILDRENS SERVICES',
 'REVENUE SERVICES',
 'CITY CLERKS OFFICE',
 'AFFORDABLE HOUSING',
 'ACCOUNTING SERVICES',
 'OFFICE OF EMERGENCY MANAGEMENT',
 'INFORMATION & TECHNOLOGY',
 'PENSION, PAYROLL & EMPLOYEE BENEFITS',
 'PURCHASING & MATERIALS MANAGEMENT',
 'TORONTO BUILDING',
 'LONG-TERM CARE HOMES & SERVICES',
 'OFFICE OF THE DCM - INTERNAL SERVICES CLUSTER',
 'SHELTER, SUPPORT & HOUSING ADMINSTRATION',
 'SOCIAL DEVELOPMENT, FINANICE & ADMINISTRATION',
 'COURT SERVICES',
 'POLICY, PLANNING, FINANICE & ADMINISTRATION',
 'EXECUTIVE MGT',
 'TORONTO EMPLOYMENT & SOCIAL SERVICES',
 'CHIEF INNOVATION OFFICE',
 'OFFICE OF THE TREASURER',
 'CHIEF FINANCIAL OFFICER',
 'OFFICE OF THE CONTROLER',
 'OFFICE OF THE DCM - CORPORATE  SERVICES',
 'CHIEF TRANSFORMATION OFFICE',
 'FINANCE AND ADMIN',
 'LONG TERM CARE HOMES',
 'SHELTER, SUPPORT and HOUSING ADMINISTRATION',
 'EMERGENCY MEDICAL SERVICES',
 'FACILITIES MANAGEMENT ',
 'MUNICIPAL LICENSING & STANDARDS ',
 'ENGINEERING AND CONSTRUCTION SERVICES',
 "CITY MANAGER'S OFFICE",
 'INTERNAL AUDIT']
divisions_series = pd.Series(divisions_list)

map_df = group_similar_strings(divisions_series)
map_df["division"] = divisions_series

print(f"divisions raw: {len(map_df.division.unique())}")
print(f"divisions combined: {len(map_df.group_rep.unique())}")
divisions raw: 84
divisions combined: 62
def get_group_rep(map_df: pd.DataFrame, division: str) -> str:
    return map_df[map_df.division == division].head(1).group_rep.values[0]


clean_up_division = functools.partial(get_group_rep, map_df)
clean_df = clean_df.reset_index()
clean_df["division"] = clean_df.division.apply(clean_up_division)
clean_df.division.unique()
array(['PUBLIC HEALTH', 'ECONOMIC DEVELOPMENT & CULTURE',
       'PARKS, FORESTRY & RECREATION', 'TRANSPORTATION SERVICES ',
       'EMPLOYMENT & SOCIAL SERVICES', 'TORONTO WATER',
       'LONG TERM CARE HOMES & SERVICES', 'TORONTO PARAMEDIC SERVICES',
       'MUNICIPAL LICENSING & STANDARDS', 'FIRE SERVICES',
       "CITY CLERK'S OFFICE", 'STRATEGIC COMMUNICATIONS ',
       'FACILITIES MANAGEMENT DIVISON', 'LEGAL SERVICES',
       'HUMAN RESOURCES ', 'SHELTER, SUPPORT & HOUSING ADMINISTRATION',
       'SOLID WASTE MANAGEMENT', 'ACCOUNTING SERVICES ', 'CITY PLANNING',
       "CHILDREN'S SERVICES ", 'ENVIRONMENT & ENERGY OFFICE',
       'SOCIAL DEVELOPMENT, FINANCE & ADMINISTRATION ',
       'INFORMATION & TECHNOLOGY ', 'CORPORATE FINANCE', 'FLEET SERVICES',
       'ENGINEERING & CONSTRUCTION SERVICES', 'FINANCE & ADMINISTRATION',
       'STRATEGIC & CORPORATE POLICY', 'REVENUE SERVICES ',
       'POLICY, PLANNING, FINANCE & ADMINISTRATION', 'TORONTO BUILDING ',
       'PURCHASING & MATERIALS MANAGEMENT ', '311 TORONTO',
       'OFFICE OF EMERGENCY MANAGEMENT ', 'FINANCIAL PLANNING',
       'PENSION, PAYROLL & EMPLOYEE BENEFITS ',
       'AFFORDABLE HOUSING OFFICE', 'DEPUTY CITY MGR & CFO',
       'CORPORATE CONTRACTS', 'COURT SERVICES ', 'CORPORATE SECURITY',
       'CFO', 'DEPUTY CITY MGR INTERNAL SERVICES', 'REAL ESTATE SERVICES',
       'TREASURER', 'EXECUTIVE MANAGEMENT', 'CITY MANAGER',
       'FACILITIES MANAGEMENT', 'CHILDRENS SERVICES',
       'CITY CLERKS OFFICE',
       'OFFICE OF THE DCM - INTERNAL SERVICES CLUSTER', 'EXECUTIVE MGT',
       'CHIEF INNOVATION OFFICE', 'OFFICE OF THE TREASURER',
       'CHIEF FINANCIAL OFFICER', 'OFFICE OF THE CONTROLER',
       'OFFICE OF THE DCM - CORPORATE  SERVICES',
       'CHIEF TRANSFORMATION OFFICE', 'FINANCE AND ADMIN',
       'EMERGENCY MEDICAL SERVICES', "CITY MANAGER'S OFFICE",
       'INTERNAL AUDIT'], dtype=object)

convert to more efficient dtypes

clean_df = klib.convert_datatypes(clean_df)
clean_df.dtypes
index                                                      int32
division                                                category
batch_transaction_id                                      string
transaction_date                                  datetime64[ns]
card_posting_dt                                   datetime64[ns]
merchant_name                                             string
transaction_amt                                          float32
trx_currency                                            category
original_amount                                          float32
original_currency                                       category
g_l_account                                             category
g_l_account_description                                 category
cost_centre_wbs_element_order                           category
cost_centre_wbs_element_order_description               category
merchant_type                                            float32
merchant_type_description                               category
purpose                                                   string
unnamed_16                                               float32
batch_transaction_id_17                                   string
cost_center_wbs_element_order                           category
cost_center_wbs_element_order_description               category
exp_type_desc                                           category
division_21                                             category
cost_center_wbls_element_order_description              category
cost_center_wbs_element_order_hash                      category
cost_center_wbs_element_order_hash_description          category
card_posting_date                                 datetime64[ns]
transaction_amount                                       float32
transaction_currency                                    category
merchant_type_mcc                                        float32
trx_currency_29                                         category
cost_centre_wbs_element_order_no                        category
cost_centre_wbs_element_order_no_description            category
cost_centre_wbs_element_order_no_32                     category
tr_currency                                             category
cost_centre_wbs_element_order_34                        category
cost_centre_wbs_element_order_description_35            category
cost_centre_wbs_element_order_no_36                     category
cost_centre_wbs_element_order_no_decription             category
cost_centre_wbs_element_order_38                        category
cost_centre_wbs_element_order_description_39            category
unnamed_17                                               float32
cost_centre_wbs_element_order_41                        category
unnamed_18                                               float32
unnamed_19                                               float32
divison                                                 category
cost_centre_wbs_element_order_45                        category
cost_centre_wbs_element_order_description_46            category
dtype: object

pool subset of cols (by minimizing duplicates and loss of information)

clean_df = klib.data_cleaning(clean_df)
clean_df.dtypes
C:\tools\Anaconda3\envs\blog\lib\site-packages\klib\clean.py:76: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.
  data.columns.str.replace("\n", "_")
Long column names detected (>25 characters). Consider renaming the following columns ['cost_center_wbs_element_order', 'cost_center_wbs_element_order_description', 'cost_centre_wbs_element_order_no_description', 'cost_centre_wbs_element_order_no_32', 'cost_centre_wbs_element_order_description_39', 'cost_centre_wbs_element_order_41'].
Shape of cleaned data: (240208, 27)Remaining NAs: 2487521

Changes:
Dropped rows: 0
     of which 0 duplicates. (Rows: [])
Dropped columns: 21
     of which 0 single valued.     Columns: []
Dropped missing values: 4962027
Reduced memory by at least: 0.21 MB (-0.59%)

index                                                    int32
division                                              category
batch_transaction_id                                    string
transaction_date                                datetime64[ns]
card_posting_dt                                 datetime64[ns]
merchant_name                                           string
transaction_amt                                        Float32
trx_currency                                          category
original_amount                                        Float32
original_currency                                     category
g_l_account                                           category
g_l_account_description                               category
merchant_type                                          Float32
merchant_type_description                             category
purpose                                                 string
batch_transaction_id_17                                 string
cost_center_wbs_element_order                         category
cost_center_wbs_element_order_description             category
card_posting_date                               datetime64[ns]
transaction_amount                                     Float32
transaction_currency                                  category
merchant_type_mcc                                      Float32
trx_currency_29                                       category
cost_centre_wbs_element_order_no_description          category
cost_centre_wbs_element_order_no_32                   category
cost_centre_wbs_element_order_description_39          category
cost_centre_wbs_element_order_41                      category
dtype: object

Global Plots

klib.corr_plot(clean_df)
<AxesSubplot:title={'center':'Feature-correlation (pearson)'}>

EDA | Division

for div, grp in clean_df.groupby("division"):
    print(div)
    klib.dist_plot(grp.drop("index", axis=1))
    plt.show()
311 TORONTO
ACCOUNTING SERVICES 
AFFORDABLE HOUSING OFFICE
CFO
CHIEF FINANCIAL OFFICER
CHIEF INNOVATION OFFICE
CHIEF TRANSFORMATION OFFICE
No columns with numeric data were detected.
CHILDREN'S SERVICES 
CHILDRENS SERVICES
CITY CLERK'S OFFICE
CITY CLERKS OFFICE
CITY MANAGER
CITY MANAGER'S OFFICE
No columns with numeric data were detected.
CITY PLANNING
CORPORATE CONTRACTS
CORPORATE FINANCE
CORPORATE SECURITY
COURT SERVICES 
DEPUTY CITY MGR & CFO
DEPUTY CITY MGR INTERNAL SERVICES
ECONOMIC DEVELOPMENT & CULTURE
Large dataset detected, using 10000 random samples for the plots. Summary statistics are still based on the entire dataset.
EMERGENCY MEDICAL SERVICES
EMPLOYMENT & SOCIAL SERVICES
ENGINEERING & CONSTRUCTION SERVICES
ENVIRONMENT & ENERGY OFFICE
EXECUTIVE MANAGEMENT
EXECUTIVE MGT
FACILITIES MANAGEMENT
FACILITIES MANAGEMENT DIVISON
Large dataset detected, using 10000 random samples for the plots. Summary statistics are still based on the entire dataset.
FINANCE & ADMINISTRATION
FINANCE AND ADMIN
No columns with numeric data were detected.
FINANCIAL PLANNING
FIRE SERVICES
FLEET SERVICES
HUMAN RESOURCES 
INFORMATION & TECHNOLOGY 
INTERNAL AUDIT
No columns with numeric data were detected.
LEGAL SERVICES
LONG TERM CARE HOMES & SERVICES
MUNICIPAL LICENSING & STANDARDS
OFFICE OF EMERGENCY MANAGEMENT 
OFFICE OF THE CONTROLER
OFFICE OF THE DCM - CORPORATE  SERVICES
OFFICE OF THE DCM - INTERNAL SERVICES CLUSTER
OFFICE OF THE TREASURER
PARKS, FORESTRY & RECREATION
Large dataset detected, using 10000 random samples for the plots. Summary statistics are still based on the entire dataset.
PENSION, PAYROLL & EMPLOYEE BENEFITS 
POLICY, PLANNING, FINANCE & ADMINISTRATION
PUBLIC HEALTH
PURCHASING & MATERIALS MANAGEMENT 
REAL ESTATE SERVICES
REVENUE SERVICES 
SHELTER, SUPPORT & HOUSING ADMINISTRATION
SOCIAL DEVELOPMENT, FINANCE & ADMINISTRATION 
SOLID WASTE MANAGEMENT
STRATEGIC & CORPORATE POLICY
STRATEGIC COMMUNICATIONS 
TORONTO BUILDING 
TORONTO PARAMEDIC SERVICES
TORONTO WATER
Large dataset detected, using 10000 random samples for the plots. Summary statistics are still based on the entire dataset.
TRANSPORTATION SERVICES 
TREASURER

EDA | Division + Transaction Year

for div, grp in clean_df.groupby(
    [clean_df.division, clean_df.transaction_date.dt.year]
):
    print(div)
    klib.dist_plot(grp.drop("index", axis=1))
    plt.show()
('311 TORONTO', 2017.0)
('311 TORONTO', 2018.0)
('311 TORONTO', 2019.0)
('ACCOUNTING SERVICES ', 2015.0)
('ACCOUNTING SERVICES ', 2016.0)
('ACCOUNTING SERVICES ', 2017.0)
('ACCOUNTING SERVICES ', 2018.0)
('ACCOUNTING SERVICES ', 2019.0)
('AFFORDABLE HOUSING OFFICE', 2015.0)
('AFFORDABLE HOUSING OFFICE', 2016.0)
('AFFORDABLE HOUSING OFFICE', 2017.0)
('AFFORDABLE HOUSING OFFICE', 2018.0)
('AFFORDABLE HOUSING OFFICE', 2019.0)
('CFO', 2017.0)
No columns with numeric data were detected.
('CFO', 2018.0)
No columns with numeric data were detected.
('CHIEF FINANCIAL OFFICER', 2018.0)
('CHIEF FINANCIAL OFFICER', 2019.0)
('CHIEF INNOVATION OFFICE', 2018.0)
('CHIEF TRANSFORMATION OFFICE', 2018.0)
No columns with numeric data were detected.
('CHIEF TRANSFORMATION OFFICE', 2019.0)
No columns with numeric data were detected.
("CHILDREN'S SERVICES ", 2015.0)
("CHILDREN'S SERVICES ", 2016.0)
("CHILDREN'S SERVICES ", 2017.0)
("CHILDREN'S SERVICES ", 2018.0)
('CHILDRENS SERVICES', 2018.0)
('CHILDRENS SERVICES', 2019.0)
("CITY CLERK'S OFFICE", 2015.0)
("CITY CLERK'S OFFICE", 2016.0)
("CITY CLERK'S OFFICE", 2017.0)
("CITY CLERK'S OFFICE", 2018.0)
('CITY CLERKS OFFICE', 2018.0)
('CITY CLERKS OFFICE', 2019.0)
('CITY MANAGER', 2016.0)
('CITY MANAGER', 2018.0)
("CITY MANAGER'S OFFICE", 2016.0)
No columns with numeric data were detected.
('CITY PLANNING', 2015.0)
('CITY PLANNING', 2016.0)
('CITY PLANNING', 2017.0)
('CITY PLANNING', 2018.0)
('CITY PLANNING', 2019.0)
('CORPORATE CONTRACTS', 2015.0)
('CORPORATE CONTRACTS', 2016.0)
('CORPORATE CONTRACTS', 2017.0)
('CORPORATE FINANCE', 2016.0)
No columns with numeric data were detected.
('CORPORATE FINANCE', 2017.0)
('CORPORATE FINANCE', 2018.0)
('CORPORATE FINANCE', 2019.0)
('CORPORATE SECURITY', 2017.0)
('CORPORATE SECURITY', 2018.0)
('COURT SERVICES ', 2015.0)
('COURT SERVICES ', 2017.0)
No columns with numeric data were detected.
('COURT SERVICES ', 2018.0)
('COURT SERVICES ', 2019.0)
('DEPUTY CITY MGR & CFO', 2015.0)
('DEPUTY CITY MGR & CFO', 2016.0)
('DEPUTY CITY MGR & CFO', 2017.0)
('DEPUTY CITY MGR INTERNAL SERVICES', 2017.0)
('DEPUTY CITY MGR INTERNAL SERVICES', 2018.0)
('ECONOMIC DEVELOPMENT & CULTURE', 2014.0)
('ECONOMIC DEVELOPMENT & CULTURE', 2015.0)
('ECONOMIC DEVELOPMENT & CULTURE', 2016.0)
('ECONOMIC DEVELOPMENT & CULTURE', 2017.0)
('ECONOMIC DEVELOPMENT & CULTURE', 2018.0)
('ECONOMIC DEVELOPMENT & CULTURE', 2019.0)
('EMERGENCY MEDICAL SERVICES', 2014.0)
No columns with numeric data were detected.
('EMERGENCY MEDICAL SERVICES', 2015.0)
('EMPLOYMENT & SOCIAL SERVICES', 2015.0)
('EMPLOYMENT & SOCIAL SERVICES', 2016.0)
('EMPLOYMENT & SOCIAL SERVICES', 2017.0)
('EMPLOYMENT & SOCIAL SERVICES', 2018.0)
('EMPLOYMENT & SOCIAL SERVICES', 2019.0)
('ENGINEERING & CONSTRUCTION SERVICES', 2015.0)
('ENGINEERING & CONSTRUCTION SERVICES', 2016.0)
('ENGINEERING & CONSTRUCTION SERVICES', 2017.0)
('ENVIRONMENT & ENERGY OFFICE', 2015.0)
('ENVIRONMENT & ENERGY OFFICE', 2016.0)
('ENVIRONMENT & ENERGY OFFICE', 2017.0)
('ENVIRONMENT & ENERGY OFFICE', 2018.0)
('ENVIRONMENT & ENERGY OFFICE', 2019.0)
('EXECUTIVE MANAGEMENT', 2018.0)
('EXECUTIVE MANAGEMENT', 2019.0)
('EXECUTIVE MGT', 2018.0)
('FACILITIES MANAGEMENT', 2014.0)
('FACILITIES MANAGEMENT', 2015.0)
('FACILITIES MANAGEMENT', 2018.0)
('FACILITIES MANAGEMENT', 2019.0)
('FACILITIES MANAGEMENT DIVISON', 2015.0)
('FACILITIES MANAGEMENT DIVISON', 2016.0)
('FACILITIES MANAGEMENT DIVISON', 2017.0)
('FACILITIES MANAGEMENT DIVISON', 2018.0)
('FINANCE & ADMINISTRATION', 2017.0)
('FINANCE & ADMINISTRATION', 2018.0)
No columns with numeric data were detected.
('FINANCE AND ADMIN', 2019.0)
No columns with numeric data were detected.
('FINANCIAL PLANNING', 2016.0)
No columns with numeric data were detected.
('FINANCIAL PLANNING', 2017.0)
('FINANCIAL PLANNING', 2018.0)
('FINANCIAL PLANNING', 2019.0)
No columns with numeric data were detected.
('FIRE SERVICES', 2014.0)
('FIRE SERVICES', 2015.0)
('FIRE SERVICES', 2016.0)
('FIRE SERVICES', 2017.0)
('FIRE SERVICES', 2018.0)
('FIRE SERVICES', 2019.0)
('FLEET SERVICES', 2015.0)
('FLEET SERVICES', 2016.0)
('FLEET SERVICES', 2017.0)
('FLEET SERVICES', 2018.0)
('FLEET SERVICES', 2019.0)
('HUMAN RESOURCES ', 2015.0)
No columns with numeric data were detected.
('HUMAN RESOURCES ', 2016.0)
('HUMAN RESOURCES ', 2017.0)
('INFORMATION & TECHNOLOGY ', 2015.0)
('INFORMATION & TECHNOLOGY ', 2016.0)
('INFORMATION & TECHNOLOGY ', 2017.0)
('INFORMATION & TECHNOLOGY ', 2018.0)
('INFORMATION & TECHNOLOGY ', 2019.0)
('INTERNAL AUDIT', 2016.0)
No columns with numeric data were detected.
('LEGAL SERVICES', 2015.0)
('LEGAL SERVICES', 2016.0)
('LEGAL SERVICES', 2017.0)
('LEGAL SERVICES', 2018.0)
('LEGAL SERVICES', 2019.0)
('LONG TERM CARE HOMES & SERVICES', 2014.0)
('LONG TERM CARE HOMES & SERVICES', 2015.0)
('LONG TERM CARE HOMES & SERVICES', 2016.0)
('LONG TERM CARE HOMES & SERVICES', 2017.0)
('LONG TERM CARE HOMES & SERVICES', 2018.0)
('LONG TERM CARE HOMES & SERVICES', 2019.0)
('MUNICIPAL LICENSING & STANDARDS', 2015.0)
('MUNICIPAL LICENSING & STANDARDS', 2016.0)
('MUNICIPAL LICENSING & STANDARDS', 2017.0)
('MUNICIPAL LICENSING & STANDARDS', 2018.0)
('MUNICIPAL LICENSING & STANDARDS', 2019.0)
('OFFICE OF EMERGENCY MANAGEMENT ', 2015.0)
No columns with numeric data were detected.
('OFFICE OF EMERGENCY MANAGEMENT ', 2016.0)
('OFFICE OF EMERGENCY MANAGEMENT ', 2017.0)
('OFFICE OF EMERGENCY MANAGEMENT ', 2018.0)
('OFFICE OF EMERGENCY MANAGEMENT ', 2019.0)
('OFFICE OF THE CONTROLER', 2018.0)
No columns with numeric data were detected.
('OFFICE OF THE CONTROLER', 2019.0)
('OFFICE OF THE DCM - CORPORATE  SERVICES', 2018.0)
('OFFICE OF THE DCM - CORPORATE  SERVICES', 2019.0)
('OFFICE OF THE DCM - INTERNAL SERVICES CLUSTER', 2018.0)
('OFFICE OF THE TREASURER', 2018.0)
('PARKS, FORESTRY & RECREATION', 2014.0)
('PARKS, FORESTRY & RECREATION', 2015.0)
Large dataset detected, using 10000 random samples for the plots. Summary statistics are still based on the entire dataset.
('PARKS, FORESTRY & RECREATION', 2016.0)
Large dataset detected, using 10000 random samples for the plots. Summary statistics are still based on the entire dataset.
('PARKS, FORESTRY & RECREATION', 2017.0)
Large dataset detected, using 10000 random samples for the plots. Summary statistics are still based on the entire dataset.
('PARKS, FORESTRY & RECREATION', 2018.0)
Large dataset detected, using 10000 random samples for the plots. Summary statistics are still based on the entire dataset.
('PARKS, FORESTRY & RECREATION', 2019.0)
Large dataset detected, using 10000 random samples for the plots. Summary statistics are still based on the entire dataset.
('PENSION, PAYROLL & EMPLOYEE BENEFITS ', 2015.0)
('PENSION, PAYROLL & EMPLOYEE BENEFITS ', 2016.0)
('PENSION, PAYROLL & EMPLOYEE BENEFITS ', 2017.0)
('PENSION, PAYROLL & EMPLOYEE BENEFITS ', 2018.0)
('PENSION, PAYROLL & EMPLOYEE BENEFITS ', 2019.0)
('POLICY, PLANNING, FINANCE & ADMINISTRATION', 2015.0)
('POLICY, PLANNING, FINANCE & ADMINISTRATION', 2016.0)
('POLICY, PLANNING, FINANCE & ADMINISTRATION', 2017.0)
('POLICY, PLANNING, FINANCE & ADMINISTRATION', 2018.0)
('POLICY, PLANNING, FINANCE & ADMINISTRATION', 2019.0)
('PUBLIC HEALTH', 2014.0)
No columns with numeric data were detected.
('PUBLIC HEALTH', 2015.0)
('PUBLIC HEALTH', 2016.0)
('PUBLIC HEALTH', 2017.0)
('PUBLIC HEALTH', 2018.0)
('PUBLIC HEALTH', 2019.0)
('PURCHASING & MATERIALS MANAGEMENT ', 2015.0)
('PURCHASING & MATERIALS MANAGEMENT ', 2016.0)
('PURCHASING & MATERIALS MANAGEMENT ', 2017.0)
('PURCHASING & MATERIALS MANAGEMENT ', 2018.0)
('PURCHASING & MATERIALS MANAGEMENT ', 2019.0)
('REAL ESTATE SERVICES', 2017.0)
('REAL ESTATE SERVICES', 2018.0)
('REAL ESTATE SERVICES', 2019.0)
('REVENUE SERVICES ', 2014.0)
No columns with numeric data were detected.
('REVENUE SERVICES ', 2015.0)
('REVENUE SERVICES ', 2016.0)
('REVENUE SERVICES ', 2017.0)
('REVENUE SERVICES ', 2018.0)
('REVENUE SERVICES ', 2019.0)
('SHELTER, SUPPORT & HOUSING ADMINISTRATION', 2014.0)
('SHELTER, SUPPORT & HOUSING ADMINISTRATION', 2015.0)
('SHELTER, SUPPORT & HOUSING ADMINISTRATION', 2016.0)
('SHELTER, SUPPORT & HOUSING ADMINISTRATION', 2017.0)
('SHELTER, SUPPORT & HOUSING ADMINISTRATION', 2018.0)
('SHELTER, SUPPORT & HOUSING ADMINISTRATION', 2019.0)
('SOCIAL DEVELOPMENT, FINANCE & ADMINISTRATION ', 2015.0)
('SOCIAL DEVELOPMENT, FINANCE & ADMINISTRATION ', 2016.0)
('SOCIAL DEVELOPMENT, FINANCE & ADMINISTRATION ', 2017.0)
('SOCIAL DEVELOPMENT, FINANCE & ADMINISTRATION ', 2018.0)
('SOCIAL DEVELOPMENT, FINANCE & ADMINISTRATION ', 2019.0)
('SOLID WASTE MANAGEMENT', 2014.0)
No columns with numeric data were detected.
('SOLID WASTE MANAGEMENT', 2015.0)
('SOLID WASTE MANAGEMENT', 2016.0)
('SOLID WASTE MANAGEMENT', 2017.0)
('SOLID WASTE MANAGEMENT', 2018.0)
('SOLID WASTE MANAGEMENT', 2019.0)
('STRATEGIC & CORPORATE POLICY', 2016.0)
('STRATEGIC & CORPORATE POLICY', 2017.0)
('STRATEGIC & CORPORATE POLICY', 2018.0)
('STRATEGIC COMMUNICATIONS ', 2015.0)
('STRATEGIC COMMUNICATIONS ', 2016.0)
('STRATEGIC COMMUNICATIONS ', 2017.0)
('STRATEGIC COMMUNICATIONS ', 2018.0)
('STRATEGIC COMMUNICATIONS ', 2019.0)
('TORONTO BUILDING ', 2015.0)
('TORONTO BUILDING ', 2016.0)
('TORONTO BUILDING ', 2017.0)
('TORONTO BUILDING ', 2018.0)
('TORONTO BUILDING ', 2019.0)
('TORONTO PARAMEDIC SERVICES', 2015.0)
('TORONTO PARAMEDIC SERVICES', 2016.0)
('TORONTO PARAMEDIC SERVICES', 2017.0)
('TORONTO PARAMEDIC SERVICES', 2018.0)
('TORONTO PARAMEDIC SERVICES', 2019.0)
('TORONTO WATER', 2014.0)
('TORONTO WATER', 2015.0)
('TORONTO WATER', 2016.0)
('TORONTO WATER', 2017.0)
('TORONTO WATER', 2018.0)
('TORONTO WATER', 2019.0)
('TRANSPORTATION SERVICES ', 2015.0)
('TRANSPORTATION SERVICES ', 2016.0)
('TRANSPORTATION SERVICES ', 2017.0)
('TRANSPORTATION SERVICES ', 2018.0)
('TRANSPORTATION SERVICES ', 2019.0)
('TREASURER', 2018.0)

Looks at the Motifs & Discords

cad_df = clean_df[clean_df["original_currency"] == "CAD"]
toronto_cad_df = cad_df[cad_df["division"] == "311 TORONTO"]
matrix_profile = stumpy.stump(toronto_cad_df["original_amount"], m=5)
motif_idx = np.argsort(matrix_profile[:, 0])[0]

print(f"The inspecting the series motif")
pd.concat(
    [
        toronto_cad_df.iloc[motif_idx - 2],
        toronto_cad_df.iloc[motif_idx - 1],
        toronto_cad_df.iloc[motif_idx],
        toronto_cad_df.iloc[motif_idx - 1],
        toronto_cad_df.iloc[motif_idx + 2],
    ],
    axis=1,
)
The inspecting the series motif
73246 73247 73248 73247 73250
index 93349 93350 93351 93350 93353
division 311 TORONTO 311 TORONTO 311 TORONTO 311 TORONTO 311 TORONTO
batch_transaction_id 5097-52 5104-15 5104-16 5104-15 5109-24
transaction_date 2018-10-11 00:00:00 2018-10-22 00:00:00 2018-10-22 00:00:00 2018-10-22 00:00:00 2018-10-29 00:00:00
card_posting_dt NaT NaT NaT NaT NaT
merchant_name WAL*MART CANADA INC PIZZA NOVA 017 PIZZA NOVA 99 PIZZA NOVA 017 TIM HORTONS 2387 QTH
transaction_amt NaN NaN NaN NaN NaN
trx_currency NaN NaN NaN NaN NaN
original_amount 40.610001 283.839996 59.189999 283.839996 41.939999
original_currency CAD CAD CAD CAD CAD
g_l_account 2099 4820 4820 4820 4820
g_l_account_description OTHER OFFICE MATERIAL &MINOR FURNISHING BUSINESS MEETING EXPENSES BUSINESS MEETING EXPENSES BUSINESS MEETING EXPENSES BUSINESS MEETING EXPENSES
cost_centre_wbs_element_order NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_description NaN NaN NaN NaN NaN
merchant_type NaN NaN NaN NaN NaN
merchant_type_description Discount Stores Quick Payment Service - Fast-Food Restau Eating Places, Restaurants Quick Payment Service - Fast-Food Restau Quick Payment Service - Fast-Food Restau
purpose STORAGE CABINET FOR VIEWING GALLERY MEETING ROOM TEN PARTY SIZE PIZZA FOR ELECTION DAY STAFF AP... 2 PIZZA FOR ELECTION DAY STAFF APPRECIATION OC... TEN PARTY SIZE PIZZA FOR ELECTION DAY STAFF AP... 50 COOKIES FOR WORKFORCE DEVELOPMENT MONTH TOU...
unnamed_16 NaN NaN NaN NaN NaN
batch_transaction_id_17 <NA> <NA> <NA> <NA> <NA>
cost_center_wbs_element_order TO9301 TO9301 TO9301 TO9301 TO9301
cost_center_wbs_element_order_description CONTACT CENTRE CONTACT CENTRE CONTACT CENTRE CONTACT CENTRE CONTACT CENTRE
exp_type_desc NaN NaN NaN NaN NaN
division_21 NaN NaN NaN NaN NaN
cost_center_wbls_element_order_description NaN NaN NaN NaN NaN
cost_center_wbs_element_order_hash NaN NaN NaN NaN NaN
cost_center_wbs_element_order_hash_description NaN NaN NaN NaN NaN
card_posting_date 2018-10-11 00:00:00 2018-10-22 00:00:00 2018-10-22 00:00:00 2018-10-22 00:00:00 2018-10-29 00:00:00
transaction_amount 40.610001 283.839996 59.189999 283.839996 41.939999
transaction_currency CAD CAD CAD CAD CAD
merchant_type_mcc 5310.0 5814.0 5812.0 5814.0 5814.0
trx_currency_29 NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_no NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_no_description NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_no_32 NaN NaN NaN NaN NaN
tr_currency NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_34 NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_description_35 NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_no_36 NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_no_decription NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_38 NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_description_39 NaN NaN NaN NaN NaN
unnamed_17 NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_41 NaN NaN NaN NaN NaN
unnamed_18 NaN NaN NaN NaN NaN
unnamed_19 NaN NaN NaN NaN NaN
divison NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_45 NaN NaN NaN NaN NaN
cost_centre_wbs_element_order_description_46 NaN NaN NaN NaN NaN
correct_arc_curve, regime_locations = stumpy.fluss(
    matrix_profile[:, 1], L=5, n_regimes=2, excl_factor=1
)