City Division EDA
Objectives
The objectives of this analysis are:
- Characterize the expenditures of the City’s Divisions or Cost Centres by establishing the number and frequency of transactions, their typical amounts, dispersion, etc.
- Identify meaningful groups of Divisions or Cost Centres with similar expense behaviour.
- Identify meaningful anomalies in the data that may require closer examination.
%load_ext lab_black
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config Completer.use_jedi = False
import io
import requests
import functools
from pathlib import Path
import klib
import stumpy
import numpy as np
import pandas as pd
import zipfile as zp
import matplotlib as mpl
import matplotlib.pyplot as plt
print(f"{np.__version__=}")
print(f"{pd.__version__=}")
print(f"{klib.__version__=}")
print(f"{mpl.__version__=}")
print(f"{stumpy.__version__=}")
from string_grouper import group_similar_strings
data_dir = Path("data")
data_dir.absolute()
meta_data_req = (
"https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show",
{"id": "ebc3f9c2-2f80-4405-bf4f-5fb309581485"},
)
meta_payload = requests.get(*meta_data_req).json()
meta_payload["result"]
resources_list = [
resource.get("url")
for resource in meta_payload.get("result", {}).get("resources", [])
]
resources_dict = {resource.split("/")[-1]: resource for resource in resources_list}
resources_dict.keys()
pd.read_excel(resources_dict.get("pcard_expenditures_readme.xls")).dropna()
expenditures_dir = data_dir / "expenditures"
r = requests.get(resources_dict.get("expenditures.zip"))
z = zp.ZipFile(io.BytesIO(r.content))
z.extractall(expenditures_dir)
list(expenditures_dir.glob("*"))
kb_conv = 1_024
mb_conv = kb_conv ** 2
gb_conv = kb_conv ** 3
# Path.stat().st_size on a directory reports only the directory entry itself,
# not its contents; sum the extracted files instead.
bytes_size = sum(f.stat().st_size for f in expenditures_dir.glob("*"))
f"{bytes_size / mb_conv:,.3} MB"
dfs = [pd.read_excel(xls) for xls in expenditures_dir.glob("*.xlsx")]
aggr_df = pd.concat(dfs, ignore_index=True)
pd.concat(
[aggr_df.head(1).T, aggr_df.sample(2).T, aggr_df.tail(1).T, aggr_df.dtypes], axis=1
)
Clean and standardize column names
aggr_df = klib.clean_column_names(aggr_df)
aggr_df.columns
Plot missing values in the aggregated data
klib.missingval_plot(aggr_df)
Remove rows with missing divisions
clean_df = aggr_df[aggr_df["division"].notna()]
clean_df.division.isnull().sum(), aggr_df.division.isnull().sum(), aggr_df.shape[0]
klib.missingval_plot(clean_df)
Merge similar division names
divisions_list = list(clean_df.division.unique())
divisions_list
divisions_series = pd.Series(divisions_list)
map_df = group_similar_strings(divisions_series)
map_df["division"] = divisions_series
print(f"divisions raw: {len(map_df.division.unique())}")
print(f"divisions combined: {len(map_df.group_rep.unique())}")
def get_group_rep(map_df: pd.DataFrame, division: str) -> str:
return map_df[map_df.division == division].head(1).group_rep.values[0]
clean_up_division = functools.partial(get_group_rep, map_df)
clean_df = clean_df.reset_index()
clean_df["division"] = clean_df.division.apply(clean_up_division)
clean_df.division.unique()
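Note that get_group_rep filters map_df once per row; an equivalent but faster approach (a sketch, assuming one representative per raw name, as group_similar_strings produces) builds the lookup once and uses Series.map:
# Sketch: one-pass lookup table instead of per-row DataFrame filtering.
division_to_rep = dict(zip(map_df.division, map_df.group_rep))
clean_df["division"] = clean_df.division.map(division_to_rep)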
Convert columns to more memory-efficient dtypes
clean_df = klib.convert_datatypes(clean_df)
clean_df.dtypes
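To check that the conversion pays off, compare memory footprints; a rough sketch (aggr_df is pre-conversion, though it still contains the rows with missing divisions, so the saving is slightly overstated):
# Sketch: rough before/after memory comparison for the dtype conversion.
before_mb = aggr_df.memory_usage(deep=True).sum() / mb_conv
after_mb = clean_df.memory_usage(deep=True).sum() / mb_conv
print(f"memory: {before_mb:,.1f} MB -> {after_mb:,.1f} MB")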
Keep a cleaned subset of columns and rows (dropping duplicates while minimizing information loss)
clean_df = klib.data_cleaning(clean_df)
clean_df.dtypes
klib.corr_plot(clean_df)
for div, grp in clean_df.groupby("division"):
print(div)
klib.dist_plot(grp.drop("index", axis=1))
plt.show()
for div, grp in clean_df.groupby(
[clean_df.division, clean_df.transaction_date.dt.year]
):
print(div)
klib.dist_plot(grp.drop("index", axis=1))
plt.show()
cad_df = clean_df[clean_df["original_currency"] == "CAD"]
toronto_cad_df = cad_df[cad_df["division"] == "311 TORONTO"]
# stumpy requires a float64 series; klib may have downcast the column
matrix_profile = stumpy.stump(
    toronto_cad_df["original_amount"].astype(np.float64), m=5
)
motif_idx = np.argsort(matrix_profile[:, 0])[0]
print(f"The inspecting the series motif")
pd.concat(
[
toronto_cad_df.iloc[motif_idx - 2],
toronto_cad_df.iloc[motif_idx - 1],
toronto_cad_df.iloc[motif_idx],
toronto_cad_df.iloc[motif_idx + 1],
toronto_cad_df.iloc[motif_idx + 2],
],
axis=1,
)
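The same matrix profile also surfaces anomalies (the third objective): the discord, i.e. the m=5 window least similar to every other window, sits at the largest profile value. A minimal sketch:
# Sketch: the discord (most anomalous subsequence) has the largest
# matrix-profile value, mirroring the motif lookup above.
discord_idx = np.argsort(matrix_profile[:, 0])[-1]
toronto_cad_df.iloc[discord_idx : discord_idx + 5]  # the anomalous m=5 window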
correct_arc_curve, regime_locations = stumpy.fluss(
matrix_profile[:, 1], L=5, n_regimes=2, excl_factor=1
)
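fluss returns the corrected arc curve and the detected regime boundaries; plotting them makes the segmentation easy to eyeball (a sketch):
# Sketch: visualize the corrected arc curve with the detected regime boundary.
fig, ax = plt.subplots()
ax.plot(correct_arc_curve)
for loc in regime_locations:
    ax.axvline(loc, color="red", linestyle="--")
ax.set_title("FLUSS corrected arc curve (311 TORONTO, CAD)")
plt.show()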