diff --git a/justclust/analysis/bolzano.py b/justclust/analysis/bolzano.py
index 2f5a2d0273a481b4562be80a3eabe15276799332..760895de0a17a5f4492a0b8b0f06a1e938eb2b6b 100644
--- a/justclust/analysis/bolzano.py
+++ b/justclust/analysis/bolzano.py
@@ -27,11 +27,6 @@ raw = read_data()
 selected = raw.loc[:, cols].dropna()
 selected.rename(columns=conv, inplace=True)
 
-# %%
-# Remove error observation
-mask = selected.index == 8888888
-selected = selected[~ mask]
-
 # %%
 # Explore clustering pre-processing
 
@@ -77,7 +72,7 @@ pre_key = (
     "s:Normalizer()|"
     "w:true|"
-    "d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
+    "d:DictionaryLearning(alpha=0.1, fit_algorithm='cd')"
     # "s:QuantileTransformer()|w:false|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
 )
 
@@ -102,6 +97,7 @@ if my_paths.clfile.exists():
     fi = pd.read_excel(my_paths.fimpfile, header=0, index_col=0)
 else:
     print(f"File: {my_paths.clfile} does not exist. Start computing the clusters")
+    print(f"Preprocessing: {pre_key}")
 
     geo_out = raw.loc[:, sez_cols]
     models = get_algclusters(preprocs=pre)
diff --git a/justclust/data/bolzano.py b/justclust/data/bolzano.py
index 61acd104b102a99abce3bda9bdf97cc9bb51b727..5f96851a60e0776431995a520c9442a43cc9b976 100644
--- a/justclust/data/bolzano.py
+++ b/justclust/data/bolzano.py
@@ -7,14 +7,22 @@ import justclust.paths.paths as paths
 my_paths = paths.define_paths(city = 'bolzano')
 
 # %%
+# columns selected for the output files
+sez_cols = [
+    "SEZ", "geometry",
+    "SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"
+    ]
+
+id_col = ['SEZ']
+
 #---- Air Quality ----
 
 # Define available files
-aq_scr_shp = my_paths.rawdata_dir / "Air quality" / "street_canyon_risk.shp"
+prisk_scr_shp = my_paths.rawdata_dir / "Air quality" / "street_canyon_risk.shp"
 
 # Define columns and other constant values
-aq_raw_cols = ["Area_r1", "Area_r2", "Area_r3", "Area_r4", "Area_r6", "Area_r9"]
-aq_nor_cols = [
+prisk_raw_cols = ["Area_r1", "Area_r2", "Area_r3", "Area_r4", "Area_r6", "Area_r9"]
+prisk_nor_cols = [
     "P_Area_r1",
     "P_Area_r2",
     "P_Area_r3",
@@ -39,9 +47,6 @@ ems = ["gasolio_em", "gpl_em", "ee_em", "metano_em", "legno_em", "tot_em", "c_ee
 co2_em_raw_cols = ["finale"]
 co2_em_nor_cols = ["finale/m²"]
 
-# columns selected for the columns of the outputs
-sez_cols = ["geometry", "SEZ", "SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"]
-
 #---- FFH ----
 
 # Define available files
@@ -116,13 +121,13 @@ socio_perc_cols = [
 
 cols_dict = {
     # Air quality
-    'air_q':{
-        "P_Area_r1": "area very-low P-risk [%]",
-        "P_Area_r2": "area low P-risk [%]",
-        "P_Area_r3": "area medium-low P-risk [%]",
-        "P_Area_r4": "area medium-high P-risk [%]",
-        "P_Area_r6": "area high P-risk [%]",
-        "P_Area_r9": "area very-high P-risk [%]"
+    'prisk':{
+        "P_Area_r1": "Area very-low P-risk [%]",
+        "P_Area_r2": "Area low P-risk [%]",
+        "P_Area_r3": "Area medium-low P-risk [%]",
+        "P_Area_r4": "Area medium-high P-risk [%]",
+        "P_Area_r6": "Area high P-risk [%]",
+        "P_Area_r9": "Area very-high P-risk [%]"
     },
     # Carbon
     'carbon':{
@@ -165,7 +170,7 @@ conv = {k:v for values in cols_dict.values() for k,v in values.items()}
 
 # define subset of columns according to categories
 selected_col = {
-    'col_aq': list(cols_dict.get('air_q').values()),
+    'col_prisk': list(cols_dict.get('prisk').values()),
     'col_carbon':list(cols_dict.get('carbon').values()),
     'col_char':\
         list(cols_dict.get('ffh').values())+\
@@ -181,7 +186,7 @@ selected_col = {
 
 
 def read_data():
-    aq_scr = gpd.read_file(aq_scr_shp)
+    prisk_scr = gpd.read_file(prisk_scr_shp)
     co2_ab = 
gpd.read_file(co2_ab_shp)
     co2_ab["SEZ"] = co2_ab["SEZ"].astype(int)
     co2_ab["kgCO2/m²"] = co2_ab["CO2"] / co2_ab.area
@@ -207,7 +212,7 @@ def read_data():
     raw = pd.concat(
         [
             co2_ab.loc[:, sez_cols],
-            aq_scr.loc[:, aq_raw_cols + aq_nor_cols],
+            prisk_scr.loc[:, prisk_raw_cols + prisk_nor_cols],
             co2_ab.loc[:, co2_raw_cols + co2_nor_cols],
             co2_em.loc[:, co2_em_raw_cols + co2_em_nor_cols + ems],
             ffh_pa.loc[:, ffh_raw_cols + ffh_nor_cols],
@@ -221,4 +226,9 @@ def read_data():
     )
     raw.index = co2_ab["SEZ"]
     raw.index.name = None
+
+    # Remove erroneous observation
+    mask = raw.index == 8888888
+    raw = raw[~mask]
+
     return raw
diff --git a/justclust/explore/preprocs.py b/justclust/explore/preprocs.py
index 044cf21c915ba468920b43f1ababf53a6c5c0796..7ae82c8c0965180efb040314710edcca8d733d68 100644
--- a/justclust/explore/preprocs.py
+++ b/justclust/explore/preprocs.py
@@ -32,7 +32,7 @@ def get_preprocs(selected=None, apply_weight=None):
         pre.StandardScaler(with_mean=True, with_std=True),
         pre.Normalizer(norm="l2"),
         pre.Normalizer(norm="l1"),
-        pre.Normalizer(norm="max"),
+        pre.Normalizer(norm="max"), # creates sparse matrices that lead to an error in FastICA https://github.com/scikit-learn/scikit-learn/issues/2089#issuecomment-1126183560
         pre.QuantileTransformer(
             n_quantiles=1000,
             output_distribution="uniform",
@@ -81,11 +81,11 @@ def get_preprocs(selected=None, apply_weight=None):
         dec.PCA(),
         dec.PCA(n_components="mle", svd_solver="full"),
         dec.PCA(svd_solver="randomized", whiten=True),
-        dec.FastICA(whiten="warn"),
-        dec.FastICA(whiten="unit-variance"),
-        dec.FactorAnalysis(max_iter=200),
-        dec.DictionaryLearning(fit_algorithm="lars", alpha=0.1, n_jobs=-1),
-        dec.DictionaryLearning(fit_algorithm="cd", alpha=0.1, n_jobs=-1),
+        dec.FastICA(whiten="warn", max_iter=500),
+        dec.FastICA(whiten="unit-variance", max_iter=500),
+        dec.FactorAnalysis(max_iter=500),
+        dec.DictionaryLearning(fit_algorithm="lars", alpha=0.1, n_jobs=None),
+        dec.DictionaryLearning(fit_algorithm="cd", alpha=0.1, n_jobs=None),
     ]
     preprocs = {
         f"s:{s}|w:{'true' if w else 'false'}|d:{d}": (s, w, d)
diff --git a/justclust/plots.py b/justclust/plots.py
index 94b9a9f0edac89f7ec86778d3120ccdaa2618c0c..8cfd2de13862b3cc6e37efb7c2b90670bbea3f47 100644
--- a/justclust/plots.py
+++ b/justclust/plots.py
@@ -311,7 +311,11 @@ def plot_profiles2(
         row = floor(c / ncols)
         col = floor(c - (row * ncols))
         print(f"cl: {cl}, row: {row}, col: {col}")
-        ax1 = axes[row][col]
+
+        if nrows > 1:
+            ax1 = axes[row][col]
+        else:
+            ax1 = axes[col]
 
         data = [clf[vname][cl] for vname in vnames]
 
@@ -530,7 +534,12 @@ def plot_stats(
     for c, cl in enumerate(cl_cat):
         irow = floor(c / ncols)
         icol = floor(c - (irow * ncols))
-        ax = axes[irow][icol]
+
+        if nrows > 1:
+            ax = axes[irow][icol]
+        else:
+            ax = axes[icol]
+
         light_bg = is_light(*cmap[cl], hsp_threshold=hsp_threshold)
         sdf = stdfs[cl]
         if idx_order is None:
diff --git a/justclust/quarto.py b/justclust/quarto.py
index bfda5e500129df7c14bdb9c7246cf00421cda130..b6aa7ef087c53c3325f7b17b74d33fb4217df9be 100644
--- a/justclust/quarto.py
+++ b/justclust/quarto.py
@@ -17,1438 +17,1444 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 from matplotlib.colors import LinearSegmentedColormap
 
-#---- get_dict_stat
-def get_dict_stat(
-    data,
-    selected_col,
-    var_cluster = 'cluster_lab',
-    exclude_Z = True
-    ):
-    data = data[['cluster_lab'] + selected_col].copy()
+#---- create_qmd_report ----
 
-    if exclude_Z:
-        data = data[data['cluster_lab'] != 'Z']
-    
-    selected_grouped = data.groupby(['cluster_lab'])
-    cluster_means = 
selected_grouped.mean()
-    cluster_medians = selected_grouped.median()
-    overall_mean = cluster_means.mean()
-    overall_median = cluster_medians.mean()
+def create_qmd_report(
+    city,
+    clname,
+    cdir: Path,
+    ):
+    graph_open = "{"
+    graph_close = "}"

-    res = {
-        'cluster_means':cluster_means.T,
-        'cluster_means_cent':cluster_means.T\
-            .sub(overall_mean, axis = 0)\
-            .divide(np.abs(overall_mean), axis = 0),
+    header =f"""---
+title: 'JUSTNature - {city}'
+format:
+  pdf:
+    toc: true
+    number-sections: true
+    colorlinks: true
+    keep-tex: false
+    fig-pos: 'H'
+  html:
+    theme: cosmo
+    toc: true
+    number-sections: true
+    embed-resources: false
+execute:
+  echo: false
+  warning: false
+---
+\pagebreak
+    """
+    settings =f"""
+```{graph_open}python{graph_close}
+import os
+import sys
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+import re
+import matplotlib.pyplot as plt

-    'cluster_medians':cluster_medians.T,
-    'cluster_medians_cent':cluster_medians.T\
-        .sub(overall_median, axis = 0)\
-        .divide(np.abs(overall_median), axis = 0)
-    }
+# Custom modules
+from justclust.data.{city} import cols, conv, read_data, selected_col
+import justclust.paths.paths as paths
+import justclust.quarto as qmd

-    return res
+plt.rcParams['figure.dpi'] = 600
+plt.rcParams['savefig.dpi'] = 600

-#---- summary_stat ----
+my_paths = paths.define_paths(city = '{city}')

-def summary_stat(data, round = 2):
-    data_summary = data.describe().T
+```
+    """
+    data_load = f"""
+```{graph_open}python{graph_close}

-    res = data_summary.round(decimals = round)
-    return res
+# Get selected data with the required cluster
+raw = read_data()
+selected = raw.loc[:, cols].dropna()
+selected.rename(columns=conv, inplace=True)

-#---- table_latex ----
+clname = '{clname}'

-def table_latex(
-    data,
-    caption = None,
-    label = None,
-    size = None,
-    position = 'H'
-    ):
-    
-    res = data.to_latex(
-        position= position,
-        caption = caption,
-        label = label
-    )
-    
-    if size is not None:
-        res = re.sub(
-            pattern = 'centering',
-            repl = 'centering\\n\\\\'+size,
-            string = res
-        )
-    
-    return res
+geo_out = gpd.read_file(my_paths.clfile)
+geo_out.set_index('SEZ', drop=False, verify_integrity=True, inplace=True)
+clsts = geo_out.loc[selected.index,[clname, 'geometry']]
+selected = clsts.join(selected, validate='1:1')

-#---- cluster_color_dict ----
+# Subset of columns
+selected_col_prisk = selected_col.get('col_prisk')
+selected_col_carbon = selected_col.get('col_carbon')
+selected_col_char = selected_col.get('col_char')
+selected_col_socio = selected_col.get('col_socio')

-def cluster_color_dict(cluster_labels):
-    n_clusters = len(cluster_labels)
+selected_col_all = []
+for x in [selected_col_prisk, selected_col_carbon, selected_col_char, selected_col_socio]:
+    if x is not None:
+        selected_col_all = selected_col_all + x

-    palette = list(sns.color_palette("colorblind", n_colors=max(8, n_clusters)))
-    gray_color = palette[7]
-    palette.remove(gray_color)
-    palette = palette[0: n_clusters-1] + [gray_color]

-    res = {
-        label:rgb for label, rgb in zip(cluster_labels.values(), palette)
-    }
-    return res
+# Clustering
+cluster_labels = qmd.get_cluster_labels(selected[clname])
+cluster_colors = qmd.cluster_color_dict(cluster_labels)
+selected['cluster_lab'] = selected[clname].map(cluster_labels)

-#---- get_cluster_lab ----
+data_cluster = selected[selected['cluster_lab'] != 'Z'][['cluster_lab'] + selected_col_all]
+list_clust = sorted(set(data_cluster['cluster_lab']))

-def get_cluster_labels(cluster_var) :
-    
-    unique_values = 
sorted(cluster_var.unique().astype(int))
+# Compute cluster statistics
+dict_stat = qmd.get_dict_stat(
+    data=selected,
+    selected_col=selected_col_all,
+    var_cluster = 'cluster_lab',
+    exclude_Z = True
+)

-    res = {
-        i:string.ascii_uppercase[i] if i >= 0 else 'Z' for i in unique_values
-    }
-    
-    # sort by value
-    res = dict(sorted(res.items(), key=lambda item: item[1]))
-    return res
+# Compute feature importance
+X_fi = data_cluster[selected_col_all]
+y_fi = data_cluster['cluster_lab']

-#---- get_clst_feature_importance ----
+data_fi = qmd.get_clst_feature_importance(X_fi, y_fi, max_depth=None)

-def get_clst_feature_importance(
-    data,
-    labels,
-    max_depth = 10
-    ):
-    
-    cl_cat = sorted(set(labels))
-    
-    res = pd.DataFrame()
+# Other info
+n_units_total = geo_out.shape[0]
+n_units = selected.shape[0]

-    clf = ens.RandomForestClassifier(
-        max_depth=max_depth,
-        n_estimators=500,
-        random_state=1,
-        n_jobs = -1
-    )
+# summary cluster frequency
+summary_cls_frequency = qmd.get_summary_clst_frequency(selected)

-    for cl in cl_cat:
+```
+    """

-        clf.fit(
-            data,
-            labels == cl)
-        fi = pd.DataFrame(
-            clf.feature_importances_,
-            index=pd.Series(data.columns, name="features"),
-            columns=[cl]
-        )
-        
-        res = pd.concat([res, fi], axis='columns')
+    descriptive_intro = f"""
+# Descriptive Statistics

-    return res
+```{graph_open}python{graph_close}
+#| output: asis
+print(
+    f'The urban area of {city} is formed by {graph_open}n_units_total{graph_close} territorial units.',
+    'However, some territorial units are not included in the analysis due to the presence of missing data.',
+    f'The analysis covers {graph_open}n_units{graph_close} territorial units ({graph_open}n_units/n_units_total*100:.2f{graph_close}% of the total; see @fig-territorial-units).')
+```

-#---- get_summary_clst_frequency ----
+```{graph_open}python{graph_close}
+#| fig-align: center
+#| fig-pos: 'H'
+#| label: fig-territorial-units
+#| fig-cap: 'Territorial units included or excluded from the analysis'

-def get_summary_clst_frequency(
-    data,
-    var_clust = 'cluster_lab'
-    ):
-    res = pd.DataFrame({
-        'Frequency':data[var_clust].value_counts(sort = False).sort_index()
-    })
-    res['Percent'] = res['Frequency'].div(res['Frequency'].sum())
-    return res
+qmd.plot_map_included(
+    geo_out=geo_out,
+    clname=clname,
+    figsize=(9,9)
+)
+```

-#---- is_light ----
+    """
+    descriptive_feature = f"""
+## Features

-def is_light(r, g, b, hsp_threshold=0.5):
-    # HSP (Highly Sensitive Poo) equation is from http://alienryderflex.com/hsp.html
-    hsp = np.sqrt(0.299 * (r * r) + 0.587 * (g * g) + 0.114 * (b * b))
-    return True if hsp > hsp_threshold else False
+In the next sections, descriptive statistics of the characteristics of the territorial units are presented.

-#---- plot_map_included ----
+### Pollution Risk

-def plot_map_included(
-    geo_out,
-    clname,
-    figsize = (12,12)
-    ):
-    
-    data_plot = geo_out[[clname, 'geometry']].copy()
-    data_plot['Inclusion'] = data_plot[clname].map(lambda x: 'Excluded' if np.isnan(x) else 'Included' )
+Values regarding pollution risk are presented in @fig-p-risk and summarized in Table \\ref{graph_open}tbl-p-risk{graph_close}. 
+```{graph_open}python{graph_close} +#| output: asis +#| warning: false - sns_colors = list(sns.color_palette()) - dict_colors = { - 'Included':sns_colors[0], - 'Excluded':sns_colors[1]} +print( + qmd.table_latex( + data = qmd.summary_stat(selected[selected_col_prisk]), + caption = 'Summary statistics pollution risk', + label = 'tbl-p-risk' + ) +) +``` +```{graph_open}python{graph_close} +#| fig-align: center +#| fig-pos: 'H' +#| label: fig-p-risk +#| fig-cap: 'Distribution pollution risk indicators' - fig, ax = plt.subplots(1, 1, figsize=figsize) - data_plot.plot( - ax=ax, alpha = .7, column = 'Inclusion', legend = True, categorical=True, - categories = list(dict_colors.keys()), - cmap = LinearSegmentedColormap.from_list("a", list(dict_colors.values()))) - data_plot.boundary.plot(ax=ax, color='white', linewidth = .35) - ax.axis('off') - plt.show() +qmd.plot_summary(selected[selected_col_prisk]) +``` -#---- plot_summary ---- +### Carbon -def plot_summary( - data, - bins = 25, - text_size = 6, - title_size = None, - color = None): - n_var = data.shape[1] +Values regarding carbon emission and absorption are presented in @fig-carbon and summarized in Table \\ref{graph_open}tbl-carbon{graph_close}. +```{graph_open}python{graph_close} +#| output: asis +#| warning: false - if n_var <= 3: - n_col = n_var - n_row = 1 - elif n_var == 4: - n_col = 2 - n_row = 2 - else: - n_col = 3 - n_row = ceil(n_var/3) - - - - fig, ax = plt.subplots(1, 1, figsize=(n_col*2.5, n_row*2)) - fig = pd.DataFrame(data).hist( - ax = ax, - layout = (n_row, n_col), - alpha = .7, - bins = bins) - - # Reduce font - if title_size is None: - title_size = text_size + 2 +print( + qmd.table_latex( + data = qmd.summary_stat(selected[selected_col_carbon]), + caption = 'Summary statistics carbon', + label = 'tbl-carbon' + ) +) - for x in fig.ravel(): - for item in ([x.xaxis.label, - x.yaxis.label] + - x.get_xticklabels() + - x.get_yticklabels()): - item.set_size(text_size) - x.title.set_size(title_size) +``` +```{graph_open}python{graph_close} +#| fig-align: center +#| fig-pos: 'H' +#| label: fig-carbon +#| fig-cap: 'Distribution carbon indicators' - if color is not None: - for rect in x.patches: - rect.set_color(color) - +qmd.plot_summary( + selected[selected_col_carbon], + title_size=7) +``` - plt.show() +### Unit Characteristics -#---- plot_heatmap_corr ---- +Values regarding other territorial unit characteristics are presented in @fig-unit and summarized in Table \\ref{graph_open}tbl-unit{graph_close}. 
+```{graph_open}python{graph_close} +#| output: asis +#| warning: false -def plot_heatmap_corr( - data_plot, - val_filter = 0.3, - figsize = (15, 15), - ): - fig, ax = plt.subplots(figsize=figsize) - data_cor = data_plot.corr() - - ax = sns.heatmap( - data_cor, - vmin = -1, - vmax = 1, - center = 0, - annot=True, - cmap=sns.color_palette("vlag", as_cmap=True), - ) - - for t in ax.texts: - if np.abs(float(t.get_text()))>=val_filter: - t.set_text(t.get_text()) #if the value is greater than 0.3 then I set the text - else: - t.set_text("") +print( + qmd.table_latex( + data = qmd.summary_stat(selected[selected_col_char]), + caption = 'Summary statistics unit characteristics', + label = 'tbl-unit', + size='small' + ) +) +``` +```{graph_open}python{graph_close} +#| fig-align: center +#| fig-pos: 'H' +#| label: fig-unit +#| fig-cap: 'Distribution unit characteristics indicators' - plt.show() +qmd.plot_summary(selected[selected_col_char]) +``` -#---- plot_map_all_cluster ---- +### Socio Demographic +Values regarding socio demographic characteristics are presented in @fig-socio and summarized in Table \\ref{graph_open}tbl-socio{graph_close}. -def plot_map_all_cluster( - data_plot, - cluster_colors, - figsize = (12, 12) - ): - fig, ax = plt.subplots(1, 1, figsize=figsize) - data_plot.plot( - column = 'cluster_lab', - cmap = LinearSegmentedColormap.from_list("a", list(cluster_colors.values())), - ax=ax, - categorical=True, - legend=True +```{graph_open}python{graph_close} +#| output: asis +#| warning: false + +print( + qmd.table_latex( + data = qmd.summary_stat(selected[selected_col_socio]), + caption = 'Summary statistics socio demographic', + label = 'tbl-socio' ) - data_plot.boundary.plot(ax=ax, color='white', linewidth = .25) - ax.axis('off') - plt.show() +) -#---- plot_clst_frequency ---- +``` +```{graph_open}python{graph_close} +#| fig-align: center +#| fig-pos: 'H' +#| label: fig-socio +#| fig-cap: 'Distribution socio demographic indicators' -def plot_clst_frequency( - data, - cluster_colors, - var_cluster = 'cluster_lab', - figsize = (7, 4) - ): - gs_kw = dict( - width_ratios=[1, 2, 1], - height_ratios=[1] - ) - - fig, ax = plt.subplots( - ncols = 3, - nrows = 1, - figsize=figsize, - constrained_layout=True, - gridspec_kw=gs_kw - ) - - ax[0].axis('off') - ax[2].axis('off') +qmd.plot_summary(selected[selected_col_socio]) +``` - sns.countplot( - ax = ax[1], - data=data, - x=var_cluster, - palette=cluster_colors, - order=cluster_colors) - ax[1].set( - ylabel="Count", - xlabel="Cluster", - ) - plt.show() + """ + descriptive_overview = f""" +## Overview -#---- plot_clst_boxplot_single ---- +The correlation between features is reported in @fig-corr. 
-def plot_clst_boxplot_single(
-    data_plot,
-    data_plot_means,
-    data_plot_median,
-    data_fi,
-    sel_var,
-    list_clust,
-    cluster_colors,
-    ax
-    ):
-    fig_sns = sns.boxplot(
-        x=sel_var,
-        y="cluster_lab",
-        data=data_plot,
-        palette=cluster_colors,
-        order= list_clust,
-        width=0.6,
-        ax = ax,
-    )
-    fig_sns.set(ylabel=None)
-    # set alpha according to feature importance
-    max_fi = max(data_fi.max())
-    min_fi = min(data_fi.min())
-    delta_fi = max_fi - min_fi
-    for i, patch in enumerate(fig_sns.patches):
-        alpha = .05 + .95 * (data_fi.loc[sel_var][i] - min_fi)/ delta_fi
-        r, g, b, a = patch.get_facecolor()
-        patch.set_facecolor((r, g, b, alpha))

```{graph_open}python{graph_close}
#| fig-align: center
#| fig-pos: 'H'
#| label: fig-corr
#| fig-cap: 'Correlation between features'

-    ax.plot(
-        data_plot_means.loc[sel_var,:],
-        data_plot_means.columns,
-        's',
-        color="k",
-        marker="*",
-        markeredgecolor="k"
-    )
-    
-    overall_mean = data_plot_means.loc[sel_var,:].mean()
-    ax.axvline(overall_mean, color='r', linestyle='--', alpha=0.5)
-    overall_median = data_plot_median.loc[sel_var,:].mean()
-    ax.axvline(overall_median, color='r', linestyle='-', alpha=0.5)

qmd.plot_heatmap_corr(
    data_plot = selected[selected_col_all],
    val_filter = 0.3,
    figsize = (15, 12),
)
```

\clearpage

    """

-    ax.xaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
-    max_x = max(data_plot[sel_var])
-    min_x = min(data_plot[sel_var])
-    delta = max_x - min_x
-    ax.set_xlim([min_x -.05*delta, max_x + .25*delta])

    cluster_intro = f"""
# Cluster Analysis

-    for i in range(0, len(list_clust)):
-        ax.text(max_x + .23*delta, i, f'{data_fi.loc[sel_var][i]*100:.1f}%',
-            horizontalalignment='right', verticalalignment = 'center',
-            size='small', color='black')

In this section, we provide all the details regarding the cluster analysis. In @sec-cluster-approach, the adopted clustering approach is described. In @sec-cluster-results, the results of the clustering are presented. Finally, in @sec-cluster-profiles, a profile with summary information is provided for each cluster.

-#---- plot_clst_boxplot_loop ----

## Cluster Approach {graph_open}#sec-cluster-approach{graph_close}

-def plot_clst_boxplot_loop(
-    data_plot,
-    data_plot_means,
-    data_plot_median,
-    data_fi,
-    list_vars,
-    list_clust,
-    cluster_colors,
-    figsize
-    ):
-    n_plots = len(list_vars)

To conduct the clustering analysis, we explored different approaches for the data preprocessing and the different clustering algorithm settings.

-    if n_plots == 1:
-        widths = [1, 1, 1]
-        ncols = 3
-        grid = [0, 1, 0]
-    elif n_plots in [2, 4]:
-        widths = [1, 2, 2, 1]
-        ncols = 4
-        grid = [0, 1, 1, 0, 0, 1, 1, 0]
-    else:
-        widths = [1, 1, 1]
-        ncols = 3
-        grid = [1] * n_plots

### Preprocessing

Before applying cluster algorithms, data must be preprocessed. This process includes the following (possible) steps:

-    nrows = ceil(n_plots / 3)

1. **Scaling the data**. Scaling the data is important for clustering because it helps to ensure that all variables in the data set are on the same scale. In this way, we avoid the unit of measure of different features affecting the clustering results. 
We considered the following scaler possibilities from `sklearn.preprocessing`:

    - `None`
    - `MinMaxScaler()`
    - `MaxAbsScaler()`
    - `StandardScaler()`
    - `Normalizer()` with different values of `norm = ['l1', 'l2', 'max']`
    - `QuantileTransformer()` considering both `'uniform'` and `'normal'` output distributions
    - `PowerTransformer()`
    - `RobustScaler()` with different quantile ranges `(0, 100)`, `(1, 99)`, `(2, 98)`, `(5, 95)`, `(10, 90)`, `(20, 80)`, and `(25, 75)`

1. **Weighted features**. Weighting features avoids categories with multiple indicators outweighing categories with fewer indicators. We considered the effect of both weighting and not weighting the features.

1. **Decomposing techniques**. Decomposing techniques are used to define a new set of features that could properly represent the underlying structure of the data and facilitate the clusterability. We considered the following decomposition possibilities from `sklearn.decomposition`:

    - `None`
    - `PCA()` with different possible settings
    - `FactorAnalysis()`
    - `FastICA()` with different possible settings
    - `DictionaryLearning()` with different possible settings

We evaluated all the combinations of the preprocessing steps. To select the preferred data preprocessing, we evaluated the *data clusterability* of the different procedures, that is, the tendency of data to form clusters after the application of the preprocessing. To quantify the data clusterability we used the following metric:

- **Hopkins statistic**. This metric measures the cluster tendency of a data set by comparing the observed data distribution with a random uniform distribution. 
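For illustration only, a minimal sketch of this statistic is given below (a hypothetical helper, not the implementation used by the pipeline); values close to 1 suggest clusterable structure, while values close to 0.5 suggest data indistinguishable from uniform noise:

```python
# Hypothetical sketch of the Hopkins statistic for a numeric matrix X;
# not the project's implementation.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def hopkins_statistic(X, n_samples=50, seed=1):
    X = np.asarray(X)
    rng = np.random.default_rng(seed)
    idx = rng.choice(X.shape[0], size=n_samples, replace=False)
    nn = NearestNeighbors(n_neighbors=2).fit(X)
    # distance of sampled real points to their nearest distinct neighbour
    w = nn.kneighbors(X[idx])[0][:, 1]
    # distance of uniform random points to their nearest real point
    U = rng.uniform(X.min(axis=0), X.max(axis=0), size=(n_samples, X.shape[1]))
    u = nn.kneighbors(U, n_neighbors=1)[0][:, 0]
    return u.sum() / (u.sum() + w.sum())
```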
-#---- plot_cluster_comp ----

The selected preprocessing method was:

-def plot_cluster_comp(
-    serie_plot,
-    ax,
-    cluster,
-    color,
-    serie_fi = None,
-    text_fi = True,
-    xmin=-2.5,
-    xmax=2.5
-    ):
-    fig_bar = ax.barh(serie_plot.index, serie_plot, label=cluster, color=color)

> Scaling: [TODO]
>
> Weighted features: [TODO]
>
> Decomposing: [TODO]

-    # set alpha according to feature importance
-    if serie_fi is not None:
-        max_fi = serie_fi.max()
-        min_fi = serie_fi.min()
-        delta_fi = max_fi - min_fi
-        for i, patch in enumerate(fig_bar.patches):
-            alpha = .19 + .8 * (serie_fi.iloc[i] - min_fi)/ delta_fi
-            r, g, b, a = patch.get_facecolor()
-            patch.set_facecolor((r, g, b, alpha))

### Clustering Algorithm

To compute the clusters, we used the `HDBSCAN` (Hierarchical Density-based Spatial Clustering of Applications with Noise) algorithm. HDBSCAN combines aspects of DBSCAN and hierarchical clustering. For an introduction to HDBSCAN, see the [official documentation](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html#:~:text=HDBSCAN%20is%20a%20clustering%20algorithm,in%20the%20stability%20of%20clusters.). Summarizing, the advantages of HDBSCAN are:

-    light_bg = is_light(*color, hsp_threshold=0.6)
-    ax.set(title=f"Cluster {cluster}")
-    if xmin is not None or xmax is not None:
-        ax.set_xlim(xmin, xmax)
-    for idx, val in enumerate(serie_plot):
-        if val < xmin:
-            ax.text(
-                xmin + 0.1 * abs(xmin),
-                idx,
-                f"{val:.2f}",
-                horizontalalignment="center",
-                verticalalignment="center",
-                size=7,
-                weight="semibold",
-                color="black" if light_bg else "white",
-            )
-        elif val > xmax:
-            ax.text(
-                xmax - 0.1 * xmax,
-                idx,
-                f"{val:.2f}",
-                horizontalalignment="center",
-                verticalalignment="center",
-                size=7,
-                weight="semibold",
-                color="black" if light_bg else "white",
-            )
-    
-    # add fi text
-    if serie_fi is not None and text_fi is True:
-        ax.text(xmax * 1.3, idx, f'{serie_fi.iloc[idx]*100:.1f}%',
-            horizontalalignment='right', verticalalignment = 'center',
-            size='small', color='black')

- Handles irregularly shaped and sized clusters.
- Robust to outliers.
- Does not require the number of clusters to be specified.

-#---- plot_cluster_comp_loop ----

We considered different parameter settings. In particular, we tried different values for the following parameters (one example combination is sketched after the list):

-def plot_cluster_comp_loop(
-    data_plot,
-    data_fi,
-    colors,
-    ncols = 3,
-    xmin=-2.5,
-    xmax=2.5,
-    xlabel=None,
-    ylabel=None
-    ):
-    
-    nvar, ncl = data_plot.shape
-    nrows = ceil(ncl / ncols)

- `min_cluster_size`, the minimum number of observations required to form a cluster. Values considered `[2, 3, 5, 7, 10, 12, 15]`.
- `min_samples`, the number of samples in a neighborhood for a point to be considered a core point. Values considered `[None, 1, 2, 3, 5]`.
- `metric`, the metric to use when calculating distance. Metrics considered: euclidean, manhattan, braycurtis, canberra, chebyshev, correlation, minkowski, and sqeuclidean.
- `cluster_selection_method`, the method used to select clusters from the condensed tree. Values considered `["eom", "leaf"]`. 
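For illustration, a single combination from these grids could be fitted and scored as sketched below (the parameter values shown are just one cell of the grid, and `X_pre` is an assumed name for the preprocessed feature matrix):

```python
# Hypothetical sketch, not the pipeline's exact evaluation loop.
import hdbscan
from sklearn.metrics import silhouette_score

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=5,
    min_samples=2,
    metric="euclidean",
    cluster_selection_method="eom",
)
labels = clusterer.fit_predict(X_pre)

covered = labels >= 0  # -1 marks outliers not assigned to any cluster
pct_covered = 100 * covered.mean()
sil = silhouette_score(X_pre[covered], labels[covered])
```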
-    fig, axes = plt.subplots(
-        nrows=nrows,
-        ncols=ncols,
-        sharex=False,
-        sharey=True,
-        figsize=(4 * 3, 4.5 * nrows),
-    )
-    for c, cl in enumerate(data_plot.columns):
-        irow = floor(c / ncols)
-        icol = floor(c - (irow * ncols))
-        ax = axes[irow][icol]
-        
-        plot_cluster_comp(
-            serie_plot = data_plot[cl],
-            serie_fi = data_fi[cl],
-            ax=ax,
-            cluster = cl,
-            color = colors[cl],
-            xmin=xmin,
-            xmax=xmax,
-        )
-        if icol == 0:
-            ax.set(ylabel=ylabel)
-        if irow == (nrows - 1):
-            ax.set(xlabel=xlabel)
-    plt.subplots_adjust(wspace=0.25)
-    
-    # removed unused plots in the grid
-    nplots = nrows * ncols
-    if nplots > ncl:
-        
-        for i in range(nplots - ncl, 0, -1):
-            axes[-1, -i].axis('off')

To evaluate the quality of the obtained clusters using all combinations of the different parameter settings, we considered the following metrics:

-    plt.show()

- `% covered by clusters`, percentage of observations that have been assigned to a cluster
- `silhouette`, measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.
- `davies bouldin`, the average similarity measure of each cluster with its most similar cluster, where similarity is the ratio of within-cluster distances to between-cluster distances. Thus, clusters which are farther apart and less dispersed will result in a better score. The minimum score is zero, with lower values indicating better clustering.
- `calinski harabasz`, the score is defined as the ratio of the sum of between-cluster dispersion and of within-cluster dispersion. Higher values indicate better clustering.

-#---- plot_cluster_comp_single ----
-def plot_cluster_comp_single(
-    dict_stat,
-    data_fi,
-    cluster_lab,
-    colors,
-    xmin=-2.5,
-    xmax=2.5,
-    xlabel=None,
-    ylabel=None,
-    figsize=(10, 5),
-    sorted = False
-    ):
-    data_plot_mean = dict_stat.get('cluster_means_cent')[cluster_lab].copy()
-    data_plot_median = dict_stat.get('cluster_medians_cent')[cluster_lab].copy()

The selected clusters were obtained using the following settings:

-    serie_fi = data_fi[cluster_lab].copy()
-    if sorted:
-        serie_fi = serie_fi.sort_values(ascending=True)
-        data_plot_mean = data_plot_mean.reindex(serie_fi.index)
-        data_plot_median = data_plot_median.reindex(serie_fi.index)
-    

> `min_cluster_size = [TODO]`
>
> `min_samples = [TODO]`
>
> `metric = [TODO]`
>
> `cluster_selection_method = [TODO]`

\clearpage

-    fig, axes = plt.subplots(
-        nrows=1,
-        ncols=2,
-        sharex=True,
-        sharey=True,
-        figsize=figsize,
-    )
-    plot_cluster_comp(
-        serie_plot=data_plot_mean,
-        serie_fi= serie_fi,
-        text_fi = False,
-        ax=axes[0],
-        cluster = cluster_lab,
-        color = colors,
-        xmin=xmin,
-        xmax=xmax,
-    )
-    axes[0].set(title='Mean')

    """
    cluster_results = f"""
## Cluster Results {graph_open}#sec-cluster-results{graph_close}

-    plot_cluster_comp(
-        serie_plot=data_plot_median,
-        serie_fi= serie_fi,
-        ax=axes[1],
-        cluster = cluster_lab,
-        color = colors,
-        xmin=xmin,
-        xmax=xmax,
-    )
-    axes[1].set(title='Median')
-    plt.show()

In this section, the results of the cluster analysis are presented. In @fig-cluster-map-all, the territorial units are colored according to their assigned cluster. Note that `Z` is used to indicate the *outliers* that were not assigned to any cluster and it should not be considered a cluster group per se. 
-#---- plot_map_single_cluster ----

```{graph_open}python{graph_close}
#| fig-align: center
#| fig-pos: 'H'
#| label: fig-cluster-map-all
#| fig-cap: 'Map cluster results'

-def plot_map_single_cluster(
-    data_plot,
-    mask,
-    cluster_lab,
-    cluster_colors,
-    figsize=(7,7)
-    ):
-    fig, ax = plt.subplots(1, 1, figsize=figsize)
-    data_plot.boundary.plot(ax=ax, color = '#bdbdbd', alpha = .4, linewidth = .7)
-    data_plot.loc[mask].plot(
-        column = 'cluster_lab',
-        color = cluster_colors[cluster_lab],
-        ax=ax,
-        categorical=True,
-        legend=True
-    )
-    ax.set(title=f'Map cluster {cluster_lab}')
-    ax.axis('off')
-    plt.show()

qmd.plot_map_all_cluster(
    data_plot=selected,
    cluster_colors=cluster_colors,
    figsize = (12, 12)
)
```

-#---- plot_boxplot_comp_cluster_single ----

The number of territorial units in each cluster is presented in Table \\ref{graph_open}tbl-clst-frequencies{graph_close} and @fig-cluster-cardinality.

-def plot_boxplot_comp_cluster_single(
-    ax,
-    data_cluster,
-    cluster_lab,
-    sel_var,
-    data_fi,
-    data_plot_means,
-    data_plot_median,
-    cluster_colors
-    ):
-    list_cluster = sorted(set(data_cluster['cluster_lab']))
-    list_cluster.remove(cluster_lab)
-    list_cluster = [cluster_lab] + list_cluster
-    list_cluster = list_cluster[::-1]

```{graph_open}python{graph_close}
#| output: asis
#| warning: false

-    positions = [.15 * x for x in list(range(0,len(list_cluster)-1))]
-    positions.append(positions[-1] +.5)
-    widths = [.1] * (len(list_cluster) - 1) + [.2]

summary_cls_frequency['Percent'] = summary_cls_frequency['Percent']\
    .map(lambda x: f'{graph_open}round(x*100, 2){graph_close}%')

-    # set alpha according to feature importance
-    max_fi = data_fi[cluster_lab].max()
-    min_fi = data_fi[cluster_lab].min()
-    delta_fi = max_fi - min_fi
-    alpha = .05 + .95 * (data_fi.loc[sel_var, cluster_lab] - min_fi)/ delta_fi

print(
    qmd.table_latex(
        data = summary_cls_frequency.T,
        caption = 'Cluster frequencies',
        label = 'tbl-clst-frequencies',
        size = 'small'
    )
)
```

-    for i, letter in enumerate(list_cluster):
-        mask_iter = data_cluster['cluster_lab'] == letter
-        color_alpha = (*cluster_colors[letter], alpha)
-        ax.boxplot(
-            data_cluster.loc[mask_iter,sel_var], positions=[positions[i]], vert=False, widths = widths[i],
-            patch_artist=True,
-            boxprops=dict(facecolor= color_alpha)
-        )
-        ax.plot(
-            data_plot_means.loc[sel_var,letter],
-            positions[i],
-            's', color="k", marker="*", markeredgecolor="k")

```{graph_open}python{graph_close}
#| fig-align: center
#| fig-pos: 'H'
#| out-width: 50%
#| label: fig-cluster-cardinality
#| fig-cap: 'Cluster cardinality'

-    # add overall mean and median
-    overall_mean = data_plot_means.loc[sel_var,:].mean()
-    ax.axvline(overall_mean, color='r', linestyle='--', alpha=0.5)
-    overall_median = data_plot_median.loc[sel_var,:].mean()
-    ax.axvline(overall_median, color='r', linestyle='-', alpha=0.5)

qmd.plot_clst_frequency(
    data = selected,
    cluster_colors = cluster_colors,
    var_cluster = 'cluster_lab',
    figsize = (10, 3.5)
)
```

-    # add fi
-    ax.text(
-        data_plot_means.loc[sel_var,cluster_lab], positions[-1] + .15,
-        f'fi: {data_fi.loc[sel_var, cluster_lab]*100:.1f}%',
-        horizontalalignment='center', verticalalignment = 'center',
-        size='small', color='black')

### Cluster Statistics

-    ax.set_yticks(positions, list_cluster)
-    ax.set_ylim(-.1, positions[-1] +.2 )
-    ax.set_xlabel(sel_var)

In this section, descriptive statistics of cluster characteristics are presented. 
For each variable, the distribution of values for each cluster is presented using box-plots and the feature importance is indicated as a percentage on the right for each cluster. This indicates the relevance that each variable has when classifying an observation as belonging to that specific cluster.

-#---- plot_boxplot_single_cluster_loop ----
-def plot_boxplot_comp_cluster_loop(
-    data_cluster,
-    cluster_lab,
-    list_vars,
-    data_fi,
-    data_plot_means,
-    data_plot_median,
-    cluster_colors,
-    figsize,
-    sorted = True
-    ):

- **Pollution Risk**

-    if sorted is True:
-        list_vars = list(data_fi.loc[list_vars]\
-            .sort_values(by = [cluster_lab], ascending=False).index)

```{graph_open}python{graph_close}
#| fig-align: center
#| fig-pos: 'H'
#| label: fig-clst-p-risk
#| fig-cap: 'Distribution pollution risk indicators per cluster (red solid vertical line - Avg median; red dashed vertical line - Avg mean)'
qmd.plot_clst_boxplot_loop(
    data_plot = data_cluster,
    data_plot_means = dict_stat.get('cluster_means'),
    data_plot_median = dict_stat.get('cluster_medians'),
    data_fi = data_fi,
    list_vars = selected_col_prisk,
    list_clust = list_clust,
    cluster_colors = cluster_colors,
    figsize = (10,6.5)
)
```

-    n_plots = len(list_vars)

- **Carbon**

-    if n_plots <= 3:
-        ncols = n_plots
-    elif n_plots == 4:
-        ncols = 2
-    else:
-        ncols = 3

```{graph_open}python{graph_close}
#| fig-align: center
#| fig-pos: 'H'
#| label: fig-clst-carbon
#| fig-cap: 'Distribution carbon indicators per cluster (red solid vertical line - Avg median; red dashed vertical line - Avg mean)'
qmd.plot_clst_boxplot_loop(
    data_plot = data_cluster,
    data_plot_means = dict_stat.get('cluster_means'),
    data_plot_median = dict_stat.get('cluster_medians'),
    data_fi = data_fi,
    list_vars = selected_col_carbon,
    list_clust = list_clust,
    cluster_colors = cluster_colors,
    figsize = (10, 3.5)
)
```

-    nrows = ceil(n_plots / ncols)

- **Unit Characteristics**

-    fig, axes = plt.subplots(
-        nrows=nrows,
-        ncols=ncols,
-        sharex=False,
-        sharey=False,
-        figsize=figsize
-    )
-    for i, sel_var in enumerate(list_vars):
-        irow = floor(i / ncols)
-        icol = floor(i - (irow * ncols))
-        
-        if nrows > 1:
-            ax = axes[irow][icol]
-        else:
-            ax = axes[icol]

```{graph_open}python{graph_close}
#| fig-align: center
#| fig-pos: 'H'
#| label: fig-clst-unit
#| fig-cap: 'Distribution unit characteristics indicators per cluster (red solid vertical line - Avg median; red dashed vertical line - Avg mean)'
qmd.plot_clst_boxplot_loop(
    data_plot = data_cluster,
    data_plot_means = dict_stat.get('cluster_means'),
    data_plot_median = dict_stat.get('cluster_medians'),
    data_fi = data_fi,
    list_vars = selected_col_char,
    list_clust = list_clust,
    cluster_colors = cluster_colors,
    figsize = (10,7)
)
```

-        plot_boxplot_comp_cluster_single(
-            ax = ax,
-            data_cluster = data_cluster,
-            cluster_lab = cluster_lab,
-            sel_var = sel_var,
-            data_fi = data_fi,
-            data_plot_means = data_plot_means,
-            data_plot_median = data_plot_median,
-            cluster_colors = cluster_colors
-        )
-    # removed unused plots in the grid
-    n_grid = nrows * ncols
-    if n_grid > n_plots:

- **Socio Demographic**

-        for i in range(n_grid - n_plots, 0, -1):
-            if nrows > 1:
-                axes[-1, -i].axis('off')
-            else:
-                axes[-i].axis('off')
-    plt.show()
-    

```{graph_open}python{graph_close}
#| fig-align: center
#| fig-pos: 'H'
#| label: fig-clst-socio
#| fig-cap: 'Distribution socio demographic indicators per cluster (red solid vertical line - Avg median; red 
dashed vertical line - Avg mean)'
qmd.plot_clst_boxplot_loop(
    data_plot = data_cluster,
    data_plot_means = dict_stat.get('cluster_means'),
    data_plot_median = dict_stat.get('cluster_medians'),
    data_fi = data_fi,
    list_vars = selected_col_socio,
    list_clust = list_clust,
    cluster_colors = cluster_colors,
    figsize = (10,7)
)
```

### Overview Clusters

To provide an overview of the cluster characteristics, we consider cluster mean and median values. In @fig-mean-heatmap, cluster mean values are reported for each variable and colored according to the difference from the average mean among clusters (`Avg mean`). In @fig-cluster-comp-mean, the cluster mean value is standardized with respect to the average mean among clusters for each variable. That is,

$$
\\frac{graph_open}\\bar{graph_open}x{graph_close}_i - \\bar{graph_open}x{graph_close}_{graph_open}Avg{graph_close}{graph_close}{graph_open}\\bar{graph_open}x{graph_close}_{graph_open}Avg{graph_close}{graph_close}
$$

where $\\bar{graph_open}x{graph_close}_i$ is the mean value of a given variable for cluster $i$ and $\\bar{graph_open}x{graph_close}_{graph_open}Avg{graph_close}$ is the average mean among clusters (i.e., $\\frac{graph_open}\\sum_i{graph_open}\\bar{graph_open}x{graph_close}_i{graph_close}{graph_close}{graph_open}\\#i{graph_close}$). 
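Restated as a pandas operation, this is the same centering computed by `get_dict_stat` further down in this diff (a sketch, assuming `cluster_means` holds variables as rows and clusters as columns):

```python
# Standardized cluster means, mirroring get_dict_stat:
overall_mean = cluster_means.mean(axis=1)   # Avg mean, per variable
cluster_means_cent = cluster_means\
    .sub(overall_mean, axis=0)\
    .div(overall_mean.abs(), axis=0)
```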
-``` - """ - data_load = f""" ```{graph_open}python{graph_close} +#| fig-align: center +#| fig-pos: 'H' +#| label: fig-mean-heatmap +#| fig-cap: 'Cluster feature mean values' +#| +data_means = dict_stat['cluster_means'] +data_means['Avg mean'] = data_means.mean(axis = 1) -# Get selected data with the required cluster -raw = read_data() -selected = raw.loc[:, cols].dropna() -selected.rename(columns=conv, inplace=True) - -# Remove error observation -mask = selected.index == 8888888 -selected = selected[~ mask] - -clname = '{clname}' - -geo_out = gpd.read_file(my_paths.clfile) -geo_out.set_index('SEZ', drop=False, verify_integrity=True, inplace=True) -clsts = geo_out.loc[selected.index,[clname, 'geometry']] -selected = clsts.join(selected, validate='1:1') - -# Subset fo columns -selected_col_aq = selected_col.get('col_aq') -selected_col_carbon = selected_col.get('col_carbon') -selected_col_char = selected_col.get('col_char') -selected_col_socio = selected_col.get('col_socio') - -selected_col_all = [] -for x in [selected_col_aq, selected_col_carbon, selected_col_char, selected_col_socio]: - if x is not None: - selected_col_all = selected_col_all + x - - -# Clustering -cluster_labels = qmd.get_cluster_labels(selected[clname]) -cluster_colors = qmd.cluster_color_dict(cluster_labels) -selected['cluster_lab'] = selected[clname].map(cluster_labels) - -data_cluster = selected[selected['cluster_lab'] != 'Z'][['cluster_lab'] + selected_col_all] -list_clust =sorted(set(data_cluster['cluster_lab'])) - -# Compute cluster statistics -dict_stat = qmd.get_dict_stat( - data=selected, - selected_col=selected_col_all, - var_cluster = 'cluster_lab', - exclude_Z = True +qmd.plot_clst_heatmap( + data = data_means, + col_avg= 'Avg mean', + fig_size= (10,6) ) - -# Compute feature importance -X_fi = data_cluster[selected_col_all] -y_fi = data_cluster['cluster_lab'] - -data_fi = qmd.get_clst_feature_importance(X_fi, y_fi, max_depth=None) - -# Orther info -n_units_total = geo_out.shape[0] -n_units = selected.shape[0] - -# summary cluster frequencyz -summary_cls_frequency = qmd.get_summary_clst_frequency(selected) - -``` - """ - - descriptive_intro = f""" -# Descriptive Statistics - -```{graph_open}python{graph_close} -#| output: asis -print( - f'The urban area of {city} is formed by {graph_open}n_units_total{graph_close} territorial units.', - 'However, some territorial units are not included in the analysis due to the presence of missing data.', - f'The analysis covers {graph_open}n_units{graph_close} territorial units ({graph_open}n_units/n_units_total*100:.2f{graph_close}% of the total; see @fig-territtorial-units).') ``` ```{graph_open}python{graph_close} +#| fig-width: 10 +#| fig-height: 12 #| fig-align: center #| fig-pos: 'H' -#| label: fig-territtorial-units -#| fig-cap: 'Territorial units included or excluded from the analysis' +#| label: fig-cluster-comp-mean +#| fig-cap: 'Cluster comparison Mean (the value is the standardized cluster mean compared to the Avg mean among clusters). Feature importance is reported on the right.' 
-qmd.plot_map_included( - geo_out=geo_out, - clname=clname, - figsize=(9,9) + +qmd.plot_cluster_comp_loop( + data_plot = dict_stat['cluster_means_cent'], + data_fi=data_fi, + colors = cluster_colors, + ncols = 3, + xmin=-2.5, + xmax=2.5, + xlabel="Value", + ylabel=None ) ``` - """ - descriptive_feature = f""" -## Features +In @fig-median-heatmap, cluster median values are reported for each variable and colored according to the difference with the average median among clusters (`Avg median`). In @fig-cluster-comp-median, the cluster median value is standardized with respect to the average median among clusters for each variable). That is, + +$$ +\\frac{graph_open}\\tilde{graph_open}x{graph_close}_i - \\tilde{graph_open}x{graph_close}_{graph_open}Avg{graph_close}{graph_close}{graph_open}\\tilde{graph_open}x{graph_close}_{graph_open}Avg{graph_close}{graph_close} +$$ -In the next sections, descriptive statistics of the territorial units charateristics are presented. +where $\\tilde{graph_open}x{graph_close}_i$ is the median value of a given variable for cluster $i$ and $\\tilde{graph_open}x{graph_close}_{graph_open}Avg{graph_close}$ is the average median among clusters (i.e, $\\frac{graph_open}\\sum_i{graph_open}\\tilde{graph_open}x{graph_close}_i{graph_close}{graph_close}{graph_open}\\#i{graph_close}$). -### Air Quality -Values regarding air quality are presented in @fig-air-quality and summarized in Table \ref{graph_open}tbl-air-quality{graph_close}. ```{graph_open}python{graph_close} -#| output: asis -#| warning: false +#| fig-align: center +#| fig-pos: 'H' +#| label: fig-median-heatmap +#| fig-cap: 'Cluster feature median values' +#| +data_medians = dict_stat['cluster_medians'] +data_medians['Avg median'] = data_medians.mean(axis = 1) -print( - qmd.table_latex( - data = qmd.summary_stat(selected[selected_col_aq]), - caption = 'Summary statistics air quality', - label = 'tbl-air-quality' - ) +qmd.plot_clst_heatmap( + data = data_medians, + col_avg= 'Avg median', + fig_size= (10,6) ) ``` + ```{graph_open}python{graph_close} +#| fig-width: 10 +#| fig-height: 12 #| fig-align: center #| fig-pos: 'H' -#| label: fig-air-quality -#| fig-cap: 'Distribution air quality' +#| label: fig-cluster-comp-median +#| fig-cap: 'Map cluster comparison Median (the value is the standardized cluster median compared to the Avg median among clusters). Feature importance is reported on the right.' -qmd.plot_summary(selected[selected_col_aq]) +data_plot = dict_stat.get('cluster_medians_cent') +qmd.plot_cluster_comp_loop( + data_plot = data_plot, + data_fi=data_fi, + colors = cluster_colors, + ncols = 3, + xmin=-2.5, + xmax=2.5, + xlabel="Value", + ylabel=None +) ``` -### Carbon - -Values regarding carbon emission and absorption are presented in @fig-carbon and summarized in Table \ref{graph_open}tbl-carbon{graph_close}. 
-```{graph_open}python{graph_close} -#| output: asis -#| warning: false +\clearpage -print( - qmd.table_latex( - data = qmd.summary_stat(selected[selected_col_carbon]), - caption = 'Summary statistics carbon', - label = 'tbl-carbon' - ) -) + """ + cluster_profiles = f""" +## Cluster Profiles {graph_open}#sec-cluster-profiles{graph_close} -``` ```{graph_open}python{graph_close} +#| output: asis #| fig-align: center #| fig-pos: 'H' -#| label: fig-carbon -#| fig-cap: 'Distribution carbon' +#| warning: false -qmd.plot_summary( - selected[selected_col_carbon], - title_size=7) -``` -### Unit Charateristics +for cluster_lab in cluster_labels.values(): + if(cluster_lab == 'Z'): continue -Values regarding other territorial unit charateristics are presented in @fig-unit and summarized in Table \ref{graph_open}tbl-unit{graph_close}. -```{graph_open}python{graph_close} -#| output: asis -#| warning: false + mask = selected['cluster_lab'] == cluster_lab -print( - qmd.table_latex( - data = qmd.summary_stat(selected[selected_col_char]), - caption = 'Summary statistics unit charateristics', - label = 'tbl-unit', - size='small' + graph_par_open = "{graph_open}" + graph_par_close = "{graph_close}" + print(f'\\subsubsection{graph_open}graph_par_open{graph_close}Cluster {graph_open}cluster_lab{graph_close} (n = {graph_open}selected.loc[mask].shape[0]{graph_close}){graph_open}graph_par_close{graph_close}') + + qmd.plot_map_single_cluster( + data_plot=selected, + mask = mask, + cluster_lab=cluster_lab, + cluster_colors=cluster_colors ) -) -``` -```{graph_open}python{graph_close} -#| fig-align: center -#| fig-pos: 'H' -#| label: fig-unit -#| fig-cap: 'Distribution unit charateristics' -qmd.plot_summary(selected[selected_col_char]) -``` + qmd.plot_cluster_comp_single( + dict_stat=dict_stat, + data_fi = data_fi, + cluster_lab=cluster_lab, + colors=cluster_colors[cluster_lab], + sorted = True + ) -### Socio Demographic -Values regarding socio demographic charateristics are presented in @fig-socio and summarized in Table \ref{graph_open}tbl-socio{graph_close}. + qmd.plot_boxplot_comp_cluster_loop( + data_cluster = data_cluster, + cluster_lab = cluster_lab, + list_vars = selected_col_all, + data_fi = data_fi, + data_plot_means = dict_stat['cluster_means'], + data_plot_median = dict_stat['cluster_medians'], + cluster_colors = cluster_colors, + figsize = (20,25) + ) + + print('\\clearpage') + +``` ```{graph_open}python{graph_close} #| output: asis +#| fig-align: center +#| fig-pos: 'H' #| warning: false +cluster_lab == 'Z' -print( - qmd.table_latex( - data = qmd.summary_stat(selected[selected_col_socio]), - caption = 'Summary statistics socio demographic', - label = 'tbl-socio' +mask = selected['cluster_lab'] == cluster_lab +print(f'\\subsubsection{graph_open}graph_par_open{graph_close}Cluster {graph_open}cluster_lab{graph_close} (n = {graph_open}selected.loc[mask].shape[0]{graph_close}){graph_open}graph_par_close{graph_close}') + +qmd.plot_map_single_cluster( + data_plot=selected, + mask = mask, + cluster_lab=cluster_lab, + cluster_colors=cluster_colors ) -) ``` ```{graph_open}python{graph_close} #| fig-align: center #| fig-pos: 'H' -#| label: fig-socio -#| fig-cap: 'Distribution socio demographic' +#| fig-width: 20 +#| fig-height: 25 +#| warning: false +qmd.plot_summary( + data = selected.loc[mask, selected_col_all], + color = cluster_colors[cluster_lab] +) -qmd.plot_summary(selected[selected_col_socio]) ``` - """ - descriptive_overview = f""" -## Overview - -The correlation between features is reportd in @fig-corr. 
+\clearpage - -```{graph_open}python{graph_close} -#| fig-align: center -#| fig-pos: 'H' -#| label: fig-corr -#| fig-cap: 'Corrrelation between features' - -qmd.plot_heatmap_corr( - data_plot = selected[selected_col_all], - val_filter = 0.3, - figsize = (15, 12), -) -``` - -\clearpage - """ - cluster_intro = f""" -# Cluster Analysis - -In this section, we provide all the detail regarding the cluster analysis. In @sec-cluster-approach, the adopted clustering approach is described. In @sec-cluster-results, the results of the clustering are presented. Finally, in @sec-cluster-profiles, a profile with summary information is provided for each cluster. - -## Cluster Approach {graph_open}#sec-cluster-approach{graph_close} + file = open(cdir / f'report-{city}-template.qmd', 'w') + + + file.write(header + settings + data_load) + file.write(descriptive_intro + descriptive_feature + descriptive_overview) + file.write(cluster_intro + cluster_results + cluster_profiles) + + file.close() -To conduct the clustering analysis, we explored different approaches for the data preprocessing and the differen clustering algorithm settings. +#---- get_dict_stat +def get_dict_stat( + data, + selected_col, + var_cluster = 'cluster_lab', + exclude_Z = True + ): + data = data[['cluster_lab'] + selected_col].copy() -### Preprocessing + if exclude_Z: + data = data[data['cluster_lab'] != 'Z'] + + selected_grouped = data.groupby(['cluster_lab']) + cluster_means = selected_grouped.mean() + cluster_medians = selected_grouped.median() + overall_mean = cluster_means.mean() + overall_median = cluster_medians.mean() -Before applying cluster algorithms, data must be preprocessed. This process includ the following (possible) steps: + res = { + 'cluster_means':cluster_means.T, + 'cluster_means_cent':cluster_means.T\ + .sub(overall_mean, axis = 0)\ + .divide(np.abs(overall_mean), axis = 0), -1. **Scaling the data**. Scaling the data is important for clustering because it helps to ensure that all variables in the data set are on the same scale. In this way we avoid that the unit of measure of different features affects the clusteriing results. We considered the following scaler possibilities from `sklearn.preprocessing`: + 'cluster_medians':cluster_medians.T, + 'cluster_medians_cent':cluster_medians.T\ + .sub(overall_median, axis = 0)\ + .divide(np.abs(overall_median), axis = 0) + } - - `None` - - `MinMaxScaler()` - - `MaxAbsScaler()` - - `StandardScaler()` - - `Normalizer()` with different values of `norm = ['l1', 'l2', 'max']` - - `QuantileTransformer()` conosiderig both `'uniform'` and` 'normal'` output distirbutions - - `PowerTransformer()` - - `RobustScaler()` with different quantile ranges `(0, 100)`, `(1, 99)`, `(2, 98)`, `(5, 95)`, `(10, 90)`, `(20, 80)`, and `(25, 75)` + return res -1. **Weighted features**. Weighting features allows to avoid that categories with multiple indicators overweigths other categories with less indicators. We considerd both the effect of weighting on not the features. +#---- summary_stat ---- -1. **Decomposing techniques**. Decomposing techniques are used to define a new set of features that could properly represent the underlying structure of the data and facilitate the clusterability. 
We considered the following decomposition possibilities from `sklearn.decomposition`: +def summary_stat(data, round = 2): + data_summary = data.describe().T - - `None` - - `PCA()` with different possible settings - - `FactorAnalysis()` - - `FastICA()` with different possible settings - - `DictionaryLearning()` with different possible settings + res = data_summary.round(decimals = round) + return res -We elauated all the combinations of the preprocessing steps. To select the preferred data preprocessing, we evaluated the *data clusterability* of the different procedures. That is the tendency of data to form clusters after the application of the preprocessing. To quantify the data clusterability we used the following metric: +#---- table_latex ---- -- **Hopkins statistic**. This metric measures the cluster tendency of a data set by comparing the observed data distribution with a random uniform distribution. +def table_latex( + data, + caption = None, + label = None, + size = None, + position = 'H' + ): + + res = data.to_latex( + position= position, + caption = caption, + label = label + ) + + if size is not None: + res = re.sub( + pattern = 'centering', + repl = 'centering\\n\\\\'+size, + string = res + ) + + return res -The selected preprocessing method was: +#---- cluster_color_dict ---- +def cluster_color_dict(cluster_labels): + n_clusters = len(cluster_labels) -> Scaling: [TODO] -> -> Weighted features: [TODO] -> -> Decomposing: [TODO] + palette = list(sns.color_palette("colorblind", n_colors=max(8, n_clusters))) + gray_color = palette[7] + palette.remove(gray_color) + palette = palette[0: n_clusters-1] + [gray_color] + res = { + label:rgb for label, rgb in zip(cluster_labels.values(), palette) + } + return res -### Clustering Algorithm +#---- get_cluster_lab ---- -To compute the clusters, we used the `HDBSCAN` (Hierarchical Density-based Spatial Clustering of Applications with Noise) algorithm. HDBSCAN combines aspects of DBSCAN and hierarchical clustering. For an introduction on HDBSCAN, see the [official documentation](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html#:~:text=HDBSCAN%20is%20a%20clustering%20algorithm,in%20the%20stability%20of%20clusters.). Summarizing, advantages of HDBSCAN are: +def get_cluster_labels(cluster_var) : + + unique_values = sorted(cluster_var.unique().astype(int)) -- Handles irregularly shaped and sized clusters. -- Robust to outliers. -- Does not require the number of clusters to be specified. + res = { + i:string.ascii_uppercase[i] if i >= 0 else 'Z' for i in unique_values + } + + # sort by value + res = dict(sorted(res.items(), key=lambda item: item[1])) + return res -We considered different parameter settings. In particular, we tried different values for +#---- get_clst_feature_importance ---- -- `min_cluster_size`, the minimum size of observations to be considered as a clusters. Values considered `[2, 3, 5, 7, 10, 12, 15]`. -- `min_samples`, the number of samples in a neighbourhood for a point to be considered a core point. Values considered `[None, 1, 2, 3, 5]`. -- `metric`, the metric to use when calculating distance. Metric considered: euclidean, manhattan, braycurtis, canberra, chebyshev, correlation, minkowski, and sqeuclidean. -- `cluster_selection_method`, the method used to select clusters from the condensed tree. Values considered `["eom", "leaf"]`. 
+def get_clst_feature_importance( + data, + labels, + max_depth = 10 + ): + + cl_cat = sorted(set(labels)) + + res = pd.DataFrame() -To evaluate the quality of the obtained clusters using all combinations of the different parameters settings, we considere the following metrics: + clf = ens.RandomForestClassifier( + max_depth=max_depth, + n_estimators=500, + random_state=1, + n_jobs = -1 + ) -- `% covered by clusters`, percentage of observatiosn that have been assigned to a cluster -- `silhouette`, measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation). The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. -- `davies bouldin`, the average similarity measure of each cluster with its most similar cluster, where similarity is the ratio of within-cluster distances to between-cluster distances. Thus, clusters which are farther apart and less dispersed will result in a better score. The minimum score is zero, with lower values indicating better clustering. -- `calinski harabasz`, the score is defined as ratio of the sum of between-cluster dispersion and of within-cluster dispersion. Higher values indicates better clustering. + for cl in cl_cat: + clf.fit( + data, + labels == cl) + fi = pd.DataFrame( + clf.feature_importances_, + index=pd.Series(data.columns, name="features"), + columns=[cl] + ) + + res = pd.concat([res, fi], axis='columns') -The selected clusters were obtaied using the following settings: + return res -> `min_cluster_size = [TODO]` -> -> `min_samples = [TODO]` -> -> `metric = [TODO] -> -> `cluster_selection_method = [TODO]` - -\clearpage +#---- get_summary_clst_frequency ---- - """ - cluster_results = f""" -## Cluster Results {graph_open}#sec-cluster-results{graph_close} +def get_summary_clst_frequency( + data, + var_clust = 'cluster_lab' + ): + res = pd.DataFrame({ + 'Frequency':data[var_clust].value_counts(sort = False).sort_index() + }) + res['Percent'] = res['Frequency'].div(res['Frequency'].sum()) + return res -In this section, the results of the cluster analysis are presented. In @fig-cluster-map-all, the territorial units are colored according to their assigned cluster. Note that `Z` is used to indicate the *outliers* that were not assigned to any cluster adn it should not be considered a cluster group per se. +#---- is_light ---- -```{graph_open}python{graph_close} -#| fig-align: center -#| fig-pos: 'H' -#| label: fig-cluster-map-all -#| fig-cap: 'Map cluster results' +def is_light(r, g, b, hsp_threshold=0.5): + # HSP (Highly Sensitive Poo) equation is from http://alienryderflex.com/hsp.html + hsp = np.sqrt(0.299 * (r * r) + 0.587 * (g * g) + 0.114 * (b * b)) + return True if hsp > hsp_threshold else False -qmd.plot_map_all_cluster( - data_plot=selected, - cluster_colors=cluster_colors, - figsize = (12, 12) -) -``` +#---- plot_map_included ---- -Number of territorial units for each cluster are presented in Table \ref{graph_open}tbl-clst-frequencies{graph_close} and @fig-cluster-cardinality. 
+def plot_map_included(
+ geo_out,
+ clname,
+ figsize = (12,12)
+ ):
+
+ data_plot = geo_out[[clname, 'geometry']].copy()
+ # units with no cluster assignment (NaN) are marked as excluded
+ data_plot['Inclusion'] = data_plot[clname].map(lambda x: 'Excluded' if np.isnan(x) else 'Included')

-```{graph_open}python{graph_close}
-#| output: asis
-#| warning: false

+ sns_colors = list(sns.color_palette())
+ dict_colors = {
+ 'Included':sns_colors[0],
+ 'Excluded':sns_colors[1]}

-summary_cls_frequency['Percent'] = summary_cls_frequency['Percent']\
- .map(lambda x: f'{graph_open}round(x*100, 2){graph_close}%')

+ fig, ax = plt.subplots(1, 1, figsize=figsize)
+ data_plot.plot(
+ ax=ax, alpha = .7, column = 'Inclusion', legend = True, categorical=True,
+ categories = list(dict_colors.keys()),
+ cmap = LinearSegmentedColormap.from_list("a", list(dict_colors.values())))
+ data_plot.boundary.plot(ax=ax, color='white', linewidth = .35)
+ ax.axis('off')
+ plt.show()

-print(
- qmd.table_latex(
- data = summary_cls_frequency.T,
- caption = 'Cluster frequencies',
- label = 'tbl-clst-frequencies',
- size = 'small'
- )
-)
-```

+#---- plot_summary ----

-```{graph_open}python{graph_close}
-#| fig-align: center
-#| fig-pos: 'H'
-#| out-width: 50%
-#| label: fig-cluster-cardinality
-#| fig-cap: 'Cluster cardinality'

+def plot_summary(
+ data,
+ bins = 25,
+ text_size = 6,
+ title_size = None,
+ color = None):
+ n_var = data.shape[1]

-qmd.plot_clst_frequency(
- data = selected,
- cluster_colors = cluster_colors,
- var_cluster = 'cluster_lab',
- figsize = (10, 3.5)
-)
-```

+ # choose a grid layout according to the number of variables
+ if n_var <= 3:
+ n_col = n_var
+ n_row = 1
+ elif n_var == 4:
+ n_col = 2
+ n_row = 2
+ else:
+ n_col = 3
+ n_row = ceil(n_var/3)
+
+ fig, ax = plt.subplots(1, 1, figsize=(n_col*2.5, n_row*2))
+ # pandas returns the array of axes hosting the histograms
+ axs = pd.DataFrame(data).hist(
+ ax = ax,
+ layout = (n_row, n_col),
+ alpha = .7,
+ bins = bins)
+
+ # Reduce font
+ if title_size is None:
+ title_size = text_size + 2

-### Cluster Statistics

+ for col, x in zip(data.columns, axs.ravel()):
+ for item in ([x.xaxis.label,
+ x.yaxis.label] +
+ x.get_xticklabels() +
+ x.get_yticklabels()):
+ item.set_size(text_size)

-In this section, descriptive statistics of the cluster characteristics are presented. For each variable, the distribution of values in each cluster is shown using box-plots, and the feature importance is reported as a percentage on the right for each cluster. The feature importance indicates how relevant each variable is when classifying an observation as belonging to that specific cluster.
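The per-cluster feature importances that annotate these box-plots come from `get_clst_feature_importance` above. A hedged usage sketch, where `selected` and `selected_col_all` are names borrowed from the removed template and used only as placeholders, not actual call sites:

```python
# Illustrative call of get_clst_feature_importance (defined earlier in
# this diff); variable names are assumptions taken from the template.
data_fi = get_clst_feature_importance(
    data=selected[selected_col_all],   # numeric features only
    labels=selected["cluster_lab"],    # one cluster letter per observation
)
# each column is one one-vs-rest forest whose importances sum to 1,
# so they can be read directly as the percentages shown in the plots
print((data_fi * 100).round(1))
```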
+ x.set_xlabel(col)
+ x.xaxis.label.set_size(title_size)
+ x.set_ylabel("Count")
+ x.set_title(None)
+ if color is not None:
+ for rect in x.patches:
+ rect.set_color(color)

-- **Air Quality**

+ plt.show()

-```{graph_open}python{graph_close}
-#| fig-align: center
-#| fig-pos: 'H'
-#| label: fig-clst-air-quality
-#| fig-cap: 'Distribution of air quality per cluster (red solid vertical line: Avg median; red dashed vertical line: Avg mean)'
-qmd.plot_clst_boxplot_loop(
- data_plot = data_cluster,
- data_plot_means = dict_stat.get('cluster_means'),
- data_plot_median = dict_stat.get('cluster_medians'),
- data_fi = data_fi,
- list_vars = selected_col_aq,
- list_clust = list_clust,
- cluster_colors = cluster_colors,
- figsize = (10,6.5)
-)
-```

+#---- plot_heatmap_corr ----

-- **Carbon**

+def plot_heatmap_corr(
+ data_plot,
+ val_filter = 0.3,
+ figsize = (15, 15),
+ ):
+ fig, ax = plt.subplots(figsize=figsize)
+ data_cor = data_plot.corr()
+
+ ax = sns.heatmap(
+ data_cor,
+ vmin = -1,
+ vmax = 1,
+ center = 0,
+ annot=True,
+ cmap=sns.color_palette("vlag", as_cmap=True),
+ )
+
+ # keep the annotation only where |correlation| >= val_filter
+ for t in ax.texts:
+ if np.abs(float(t.get_text()))>=val_filter:
+ t.set_text(t.get_text())
+ else:
+ t.set_text("")

-```{graph_open}python{graph_close}
-#| fig-align: center
-#| fig-pos: 'H'
-#| label: fig-clst-carbono
-#| fig-cap: 'Distribution of carbon per cluster (red solid vertical line: Avg median; red dashed vertical line: Avg mean)'
-qmd.plot_clst_boxplot_loop(
- data_plot = data_cluster,
- data_plot_means = dict_stat.get('cluster_means'),
- data_plot_median = dict_stat.get('cluster_medians'),
- data_fi = data_fi,
- list_vars = selected_col_carbon,
- list_clust = list_clust,
- cluster_colors = cluster_colors,
- figsize = (10, 3.5)
-)
-```

+ plt.show()

-- **Unit Characteristics**

+#---- plot_map_all_cluster ----

-```{graph_open}python{graph_close}
-#| fig-align: center
-#| fig-pos: 'H'
-#| label: fig-clst-unit
-#| fig-cap: 'Distribution of unit characteristics per cluster (red solid vertical line: Avg median; red dashed vertical line: Avg mean)'
-qmd.plot_clst_boxplot_loop(
- data_plot = data_cluster,
- data_plot_means = dict_stat.get('cluster_means'),
- data_plot_median = dict_stat.get('cluster_medians'),
- data_fi = data_fi,
- list_vars = selected_col_char,
- list_clust = list_clust,
- cluster_colors = cluster_colors,
- figsize = (10,7)
-)
-```

+def plot_map_all_cluster(
+ data_plot,
+ cluster_colors,
+ figsize = (12, 12)
+ ):
+ fig, ax = plt.subplots(1, 1, figsize=figsize)
+ data_plot.plot(
+ column = 'cluster_lab',
+ cmap = LinearSegmentedColormap.from_list("a", list(cluster_colors.values())),
+ ax=ax,
+ categorical=True,
+ legend=True
+ )
+ data_plot.boundary.plot(ax=ax, color='white', linewidth = .25)
+ ax.axis('off')
+ plt.show()

-- **Socio-Demographic**

+#---- plot_clst_frequency ----

-```{graph_open}python{graph_close}
-#| fig-align: center
-#| fig-pos: 'H'
-#| label: fig-clst-socio
-#| fig-cap: 'Distribution of socio-demographic variables per cluster (red solid vertical line: Avg median; red dashed vertical line: Avg mean)'
-qmd.plot_clst_boxplot_loop(
- data_plot = data_cluster,
- data_plot_means = dict_stat.get('cluster_means'),
- data_plot_median = dict_stat.get('cluster_medians'),
- data_fi = data_fi,
- list_vars = selected_col_socio,
- list_clust = list_clust,
- cluster_colors = cluster_colors,
- figsize = (10,7)
-)
-```

+def plot_clst_frequency(
+ data,
+ cluster_colors,
+ var_cluster = 'cluster_lab',
+ figsize = (7, 4)
+ ):
+ gs_kw = dict(
width_ratios=[1, 2, 1],
+ height_ratios=[1]
+ )
+
+ fig, ax = plt.subplots(
+ ncols = 3,
+ nrows = 1,
+ figsize=figsize,
+ constrained_layout=True,
+ gridspec_kw=gs_kw
+ )
+
+ # only the central axis is used; the empty side axes keep the bars narrow
+ ax[0].axis('off')
+ ax[2].axis('off')

-### Cluster Overview

+ sns.countplot(
+ ax = ax[1],
+ data=data,
+ x=var_cluster,
+ palette=cluster_colors,
+ order=cluster_colors)
+ ax[1].set(
+ ylabel="Count",
+ xlabel="Cluster",
+ )
+ plt.show()

-To provide an overview of the cluster characteristics, we consider the cluster mean and median values. In @fig-mean-heatmap, the cluster mean values are reported for each variable and colored according to the difference with the average mean among clusters (`Avg mean`). In @fig-cluster-comp-mean, the cluster mean value is standardized with respect to the average mean among clusters for each variable. That is,

+#---- plot_clst_boxplot_single ----

-$$
-\\frac{graph_open}\\bar{graph_open}x{graph_close}_i - \\bar{graph_open}x{graph_close}_{graph_open}Avg{graph_close}{graph_close}{graph_open}\\bar{graph_open}x{graph_close}_{graph_open}Avg{graph_close}{graph_close}
-$$

+def plot_clst_boxplot_single(
+ data_plot,
+ data_plot_means,
+ data_plot_median,
+ data_fi,
+ sel_var,
+ list_clust,
+ cluster_colors,
+ ax
+ ):
+ fig_sns = sns.boxplot(
+ x=sel_var,
+ y="cluster_lab",
+ data=data_plot,
+ palette=cluster_colors,
+ order= list_clust,
+ width=0.6,
+ ax = ax,
+ )
+ fig_sns.set(ylabel=None)

-where $\\bar{graph_open}x{graph_close}_i$ is the mean value of a given variable for cluster $i$ and $\\bar{graph_open}x{graph_close}_{graph_open}Avg{graph_close}$ is the average mean among clusters (i.e., $\\frac{graph_open}\\sum_i{graph_open}\\bar{graph_open}x{graph_close}_i{graph_close}{graph_close}{graph_open}\\#i{graph_close}$).

+ # set alpha according to feature importance
+ max_fi = max(data_fi.max())
+ min_fi = min(data_fi.min())
+ delta_fi = max_fi - min_fi

-```{graph_open}python{graph_close}
-#| fig-align: center
-#| fig-pos: 'H'
-#| label: fig-mean-heatmap
-#| fig-cap: 'Cluster feature mean values'
-#|
-data_means = dict_stat['cluster_means']
-data_means['Avg mean'] = data_means.mean(axis = 1)

+ for i, patch in enumerate(fig_sns.patches):
+ alpha = .05 + .95 * (data_fi.loc[sel_var][i] - min_fi)/ delta_fi
+ r, g, b, a = patch.get_facecolor()
+ patch.set_facecolor((r, g, b, alpha))

-qmd.plot_clst_heatmap(
- data = data_means,
- col_avg= 'Avg mean',
- fig_size= (10,6)
-)
-```

+ # stars: cluster means; red dashed line: Avg mean; red solid line: Avg median
+ ax.plot(
+ data_plot_means.loc[sel_var,:],
+ data_plot_means.columns,
+ 's',
+ color="k",
+ marker="*",
+ markeredgecolor="k"
+ )
+
+ overall_mean = data_plot_means.loc[sel_var,:].mean()
+ ax.axvline(overall_mean, color='r', linestyle='--', alpha=0.5)
+ overall_median = data_plot_median.loc[sel_var,:].mean()
+ ax.axvline(overall_median, color='r', linestyle='-', alpha=0.5)
+
+ ax.xaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
+ # leave room on the right for the feature-importance labels
+ max_x = max(data_plot[sel_var])
+ min_x = min(data_plot[sel_var])
+ delta = max_x - min_x
+ ax.set_xlim([min_x -.05*delta, max_x + .25*delta])

-```{graph_open}python{graph_close}
-#| fig-width: 10
-#| fig-height: 12
-#| fig-align: center
-#| fig-pos: 'H'
-#| label: fig-cluster-comp-mean
-#| fig-cap: 'Cluster comparison, mean (the value is the standardized cluster mean compared to the Avg mean among clusters). Feature importance is reported on the right.'
+ for i in range(0, len(list_clust)):
+ ax.text(max_x + .23*delta, i, f'{data_fi.loc[sel_var][i]*100:.1f}%',
+ horizontalalignment='right', verticalalignment = 'center',
+ size='small', color='black')

+#---- plot_clst_boxplot_loop ----

-qmd.plot_cluster_comp_loop(
- data_plot = dict_stat['cluster_means_cent'],
- data_fi=data_fi,
- colors = cluster_colors,
+def plot_clst_boxplot_loop(
+ data_plot,
+ data_plot_means,
+ data_plot_median,
+ data_fi,
+ list_vars,
+ list_clust,
+ cluster_colors,
+ figsize
+ ):
+ n_plots = len(list_vars)
+
+ # grid layout: side columns stay empty to center 1, 2 or 4 plots
+ if n_plots == 1:
+ widths = [1, 1, 1]
+ ncols = 3
+ grid = [0, 1, 0]
+ elif n_plots in [2, 4]:
+ widths = [1, 2, 2, 1]
+ ncols = 4
+ grid = [0, 1, 1, 0, 0, 1, 1, 0]
+ else:
+ widths = [1, 1, 1]
+ ncols = 3
+ grid = [1] * n_plots
+
+ nrows = ceil(n_plots / 3)
+
+ fig, axes = plt.subplots(
+ nrows=nrows,
+ ncols=ncols,
+ sharex=False,
+ sharey=False,
+ figsize=figsize,
+ constrained_layout=True,
+ gridspec_kw=dict(
+ width_ratios=widths,
+ height_ratios= [1] * nrows
+ )
+ )
+
+ i = 0
+ # pad the grid flags so every axis in the figure is covered
+ for ax, to_plot in zip(axes.flat, grid + [0] * 4):
+ if to_plot == 0:
+ # hide the unused slots in the grid
+ ax.axis('off')
+ continue
+
+ sel_var = list_vars[i]
+
+ plot_clst_boxplot_single(
+ data_plot = data_plot,
+ data_plot_means = data_plot_means,
+ data_plot_median = data_plot_median,
+ data_fi = data_fi.loc[list_vars],
+ sel_var = sel_var,
+ list_clust = list_clust,
+ cluster_colors = cluster_colors,
+ ax = ax
+ )
+
+ i += 1
+
+#---- plot_clst_heatmap ----
+
+def plot_clst_heatmap(
+ data,
+ col_avg,
+ fig_size = (10, 8)
+ ):
+ fig, ax = plt.subplots(figsize=fig_size)
+ # color by relative deviation from the cluster average, annotate raw values
+ data_plot = data.sub(data[col_avg], axis = 0)\
+ .div(np.abs(data[col_avg]), axis = 0)
+ data_text = data.round(2)
+
+ ax = sns.heatmap(
+ data_plot,
+ # vmin = -1,
+ # vmax = 1,
+ center = 0,
+ annot=data_text,
+ cmap=sns.color_palette("vlag", as_cmap=True),
+ cbar=False,
+ fmt='.5g'
+ )
+ ax.set(xlabel = None)
+ plt.show()
+
+#---- plot_cluster_comp ----
+
+def plot_cluster_comp(
+ serie_plot,
+ ax,
+ cluster,
+ color,
+ serie_fi = None,
+ text_fi = True,
+ xmin=-2.5,
+ xmax=2.5
+ ):
+
+ fig_bar = ax.barh(serie_plot.index, serie_plot, label=cluster, color=color)
+
+ # set alpha according to feature importance
+ if serie_fi is not None:
+ max_fi = serie_fi.max()
+ min_fi = serie_fi.min()
+ delta_fi = max_fi - min_fi
+
+ for i, patch in enumerate(fig_bar.patches):
+ alpha = .19 + .8 * (serie_fi.iloc[i] - min_fi)/ delta_fi
+ r, g, b, a = patch.get_facecolor()
+ patch.set_facecolor((r, g, b, alpha))
+
+ light_bg = is_light(*color, hsp_threshold=0.6)
+ ax.set(title=f"Cluster {cluster}")
+ if xmin is not None or xmax is not None:
+ ax.set_xlim(xmin, xmax)
+ # values that fall outside the axis limits are printed at the edge
+ for idx, val in enumerate(serie_plot):
+ if val < xmin:
+ ax.text(
+ xmin + 0.1 * abs(xmin),
+ idx,
+ f"{val:.2f}",
+ horizontalalignment="center",
+ verticalalignment="center",
+ size=7,
+ weight="semibold",
+ color="black" if light_bg else "white",
+ )
+ elif val > xmax:
+ ax.text(
+ xmax - 0.1 * xmax,
+ idx,
+ f"{val:.2f}",
+ horizontalalignment="center",
+ verticalalignment="center",
+ size=7,
+ weight="semibold",
+ color="black" if light_bg else "white",
+ )
+
+ # annotate the feature importance to the right of the bars
+ if serie_fi is not None and text_fi is True:
+ ax.text(xmax * 1.3, idx, f'{serie_fi.iloc[idx]*100:.1f}%',
+ horizontalalignment='right', verticalalignment = 'center',
+ size='small', color='black')
+
+#---- plot_cluster_comp_loop ----
+
+def plot_cluster_comp_loop(
+ data_plot,
+ data_fi,
+ colors,
 ncols = 3,
 xmin=-2.5,
 xmax=2.5,
- xlabel="Value",
+ xlabel=None,
 ylabel=None
-)
-```

+ ):
+
+ nvar, ncl = data_plot.shape
+ nrows = ceil(ncl / ncols)
+
+ fig, axes = plt.subplots(
+ nrows=nrows,
+ ncols=ncols,
+ sharex=False,
+ sharey=True,
+ figsize=(4 * 3, 4.5 * nrows),
+ )
+ for c, cl in enumerate(data_plot.columns):
+ irow = floor(c / ncols)
+ icol = floor(c - (irow * ncols))
+
+ # with a single row plt.subplots returns a 1-D array of axes
+ if nrows > 1:
+ ax = axes[irow][icol]
+ else:
+ ax = axes[icol]
+
+ plot_cluster_comp(
+ serie_plot = data_plot[cl],
+ serie_fi = data_fi[cl],
+ ax=ax,
+ cluster = cl,
+ color = colors[cl],
+ xmin=xmin,
+ xmax=xmax,
+ )
+ if icol == 0:
+ ax.set(ylabel=ylabel)
+ if irow == (nrows - 1):
+ ax.set(xlabel=xlabel)
+ plt.subplots_adjust(wspace=0.25)
+
+ # hide the unused slots in the grid
+ nplots = nrows * ncols
+ if nplots > ncl:
+ for i in range(nplots - ncl, 0, -1):
+ if nrows > 1:
+ axes[-1, -i].axis('off')
+ else:
+ axes[-i].axis('off')
+
+ plt.show()
+
+#---- plot_cluster_comp_single ----

-In @fig-median-heatmap, the cluster median values are reported for each variable and colored according to the difference with the average median among clusters (`Avg median`). In @fig-cluster-comp-median, the cluster median value is standardized with respect to the average median among clusters for each variable. That is,

+def plot_cluster_comp_single(
+ dict_stat,
+ data_fi,
+ cluster_lab,
+ colors,
+ xmin=-2.5,
+ xmax=2.5,
+ xlabel=None,
+ ylabel=None,
+ figsize=(10, 5),
+ sorted = False
+ ):
+ data_plot_mean = dict_stat.get('cluster_means_cent')[cluster_lab].copy()
+ data_plot_median = dict_stat.get('cluster_medians_cent')[cluster_lab].copy()
+
+ serie_fi = data_fi[cluster_lab].copy()
+ # optionally order the variables by feature importance
+ if sorted:
+ serie_fi = serie_fi.sort_values(ascending=True)
+ data_plot_mean = data_plot_mean.reindex(serie_fi.index)
+ data_plot_median = data_plot_median.reindex(serie_fi.index)
+
+ fig, axes = plt.subplots(
+ nrows=1,
+ ncols=2,
+ sharex=True,
+ sharey=True,
+ figsize=figsize,
+ )
+ plot_cluster_comp(
+ serie_plot=data_plot_mean,
+ serie_fi= serie_fi,
+ text_fi = False,
+ ax=axes[0],
+ cluster = cluster_lab,
+ color = colors,
+ xmin=xmin,
+ xmax=xmax,
+ )
+ axes[0].set(title='Mean')

-$$
-\\frac{graph_open}\\tilde{graph_open}x{graph_close}_i - \\tilde{graph_open}x{graph_close}_{graph_open}Avg{graph_close}{graph_close}{graph_open}\\tilde{graph_open}x{graph_close}_{graph_open}Avg{graph_close}{graph_close}
-$$

+ plot_cluster_comp(
+ serie_plot=data_plot_median,
+ serie_fi= serie_fi,
+ ax=axes[1],
+ cluster = cluster_lab,
+ color = colors,
+ xmin=xmin,
+ xmax=xmax,
+ )
+ axes[1].set(title='Median')
+ plt.show()

-where $\\tilde{graph_open}x{graph_close}_i$ is the median value of a given variable for cluster $i$ and $\\tilde{graph_open}x{graph_close}_{graph_open}Avg{graph_close}$ is the average median among clusters (i.e., $\\frac{graph_open}\\sum_i{graph_open}\\tilde{graph_open}x{graph_close}_i{graph_close}{graph_close}{graph_open}\\#i{graph_close}$).
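For reference, the centred statistics used in both comparison figures (`cluster_means_cent` and `cluster_medians_cent` in `dict_stat`) can be reproduced with a couple of pandas operations. A sketch, assuming each statistics table has variables as rows and one column per cluster:

```python
# Sketch of the standardization in the formulas above; `dict_stat` is
# assumed to hold DataFrames with variables as rows, clusters as columns.
def center_on_cluster_average(stat):
    avg = stat.mean(axis=1)  # Avg mean (or Avg median) per variable
    return stat.sub(avg, axis=0).div(avg, axis=0)

cluster_means_cent = center_on_cluster_average(dict_stat["cluster_means"])
cluster_medians_cent = center_on_cluster_average(dict_stat["cluster_medians"])
```

Note that `plot_clst_heatmap` divides by the absolute value of the average instead, so the sign of the deviation is preserved when the average is negative.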
+#---- plot_map_single_cluster ----
+def plot_map_single_cluster(
+ data_plot,
+ mask,
+ cluster_lab,
+ cluster_colors,
+ figsize=(7,7)
+ ):
+ fig, ax = plt.subplots(1, 1, figsize=figsize)
+ # gray boundaries for the whole study area, color only the selected cluster
+ data_plot.boundary.plot(ax=ax, color = '#bdbdbd', alpha = .4, linewidth = .7)
+ data_plot.loc[mask].plot(
+ column = 'cluster_lab',
+ color = cluster_colors[cluster_lab],
+ ax=ax,
+ categorical=True,
+ legend=True
+ )
+ ax.set(title=f'Map cluster {cluster_lab}')
+ ax.axis('off')
+ plt.show()

-```{graph_open}python{graph_close}
-#| fig-align: center
-#| fig-pos: 'H'
-#| label: fig-median-heatmap
-#| fig-cap: 'Cluster feature median values'
-#|
-data_medians = dict_stat['cluster_medians']
-data_medians['Avg median'] = data_medians.mean(axis = 1)

+#---- plot_boxplot_comp_cluster_single ----

-qmd.plot_clst_heatmap(
- data = data_medians,
- col_avg= 'Avg median',
- fig_size= (10,6)
-)
-```

+def plot_boxplot_comp_cluster_single(
+ ax,
+ data_cluster,
+ cluster_lab,
+ sel_var,
+ data_fi,
+ data_plot_means,
+ data_plot_median,
+ cluster_colors
+ ):
+ # the selected cluster goes last: it gets the larger box, offset from the rest
+ list_cluster = sorted(set(data_cluster['cluster_lab']))
+ list_cluster.remove(cluster_lab)
+ list_cluster = [cluster_lab] + list_cluster
+ list_cluster = list_cluster[::-1]

-```{graph_open}python{graph_close}
-#| fig-width: 10
-#| fig-height: 12
-#| fig-align: center
-#| fig-pos: 'H'
-#| label: fig-cluster-comp-median
-#| fig-cap: 'Cluster comparison, median (the value is the standardized cluster median compared to the Avg median among clusters). Feature importance is reported on the right.'

+ positions = [.15 * x for x in list(range(0,len(list_cluster)-1))]
+ positions.append(positions[-1] +.5)
+ widths = [.1] * (len(list_cluster) - 1) + [.2]

-data_plot = dict_stat.get('cluster_medians_cent')
-qmd.plot_cluster_comp_loop(
- data_plot = data_plot,
- data_fi=data_fi,
- colors = cluster_colors,
- ncols = 3,
- xmin=-2.5,
- xmax=2.5,
- xlabel="Value",
- ylabel=None
-)
-```

+ # set alpha according to feature importance
+ max_fi = data_fi[cluster_lab].max()
+ min_fi = data_fi[cluster_lab].min()
+ delta_fi = max_fi - min_fi
+ alpha = .05 + .95 * (data_fi.loc[sel_var, cluster_lab] - min_fi)/ delta_fi

-\clearpage

+ for i, letter in enumerate(list_cluster):
+ mask_iter = data_cluster['cluster_lab'] == letter
+ color_alpha = (*cluster_colors[letter], alpha)
+ ax.boxplot(
+ data_cluster.loc[mask_iter,sel_var], positions=[positions[i]], vert=False, widths = widths[i],
+ patch_artist=True,
+ boxprops=dict(facecolor= color_alpha)
+ )
+ ax.plot(
+ data_plot_means.loc[sel_var,letter],
+ positions[i],
+ 's', color="k", marker="*", markeredgecolor="k")

- """
- cluster_profiles = f"""
-## Cluster Profiles {graph_open}#sec-cluster-profiles{graph_close}

+ # add overall mean and median
+ overall_mean = data_plot_means.loc[sel_var,:].mean()
+ ax.axvline(overall_mean, color='r', linestyle='--', alpha=0.5)
+ overall_median = data_plot_median.loc[sel_var,:].mean()
+ ax.axvline(overall_median, color='r', linestyle='-', alpha=0.5)

-```{graph_open}python{graph_close}
-#| output: asis
-#| fig-align: center
-#| fig-pos: 'H'
-#| warning: false

+ # annotate the feature importance above the selected cluster's box
+ ax.text(
+ data_plot_means.loc[sel_var,cluster_lab], positions[-1] + .15,
+ f'fi: {data_fi.loc[sel_var, cluster_lab]*100:.1f}%',
+ horizontalalignment='center', verticalalignment = 'center',
+ size='small', color='black')
+ ax.set_yticks(positions, list_cluster)
+ ax.set_ylim(-.1, positions[-1] +.2 )
+ ax.set_xlabel(sel_var)

-for cluster_lab in cluster_labels.values():
- if(cluster_lab == 'Z'): continue

+#---- 
plot_boxplot_comp_cluster_loop ----

- mask = selected['cluster_lab'] == cluster_lab

+def plot_boxplot_comp_cluster_loop(
+ data_cluster,
+ cluster_lab,
+ list_vars,
+ data_fi,
+ data_plot_means,
+ data_plot_median,
+ cluster_colors,
+ figsize,
+ sorted = True
+ ):

- graph_par_open = "{graph_open}"
- graph_par_close = "{graph_close}"
- print(f'\\subsubsection{graph_open}graph_par_open{graph_close}Cluster {graph_open}cluster_lab{graph_close} (n = {graph_open}selected.loc[mask].shape[0]{graph_close}){graph_open}graph_par_close{graph_close}')
-
- qmd.plot_map_single_cluster(
- data_plot=selected,
- mask = mask,
- cluster_lab=cluster_lab,
- cluster_colors=cluster_colors
- )

+ # optionally order the variables by the selected cluster's feature importance
+ if sorted is True:
+ list_vars = list(data_fi.loc[list_vars]\
+ .sort_values(by = [cluster_lab], ascending=False).index)

- qmd.plot_cluster_comp_single(
- dict_stat=dict_stat,
- data_fi = data_fi,
- cluster_lab=cluster_lab,
- colors=cluster_colors[cluster_lab],
- sorted = True
- )

+ n_plots = len(list_vars)

- qmd.plot_boxplot_comp_cluster_loop(
- data_cluster = data_cluster,
- cluster_lab = cluster_lab,
- list_vars = selected_col_all,
- data_fi = data_fi,
- data_plot_means = dict_stat['cluster_means'],
- data_plot_median = dict_stat['cluster_medians'],
- cluster_colors = cluster_colors,
- figsize = (20,25)
- )
-
- print('\\clearpage')
-
-```

+ if n_plots <= 3:
+ ncols = n_plots
+ elif n_plots == 4:
+ ncols = 2
+ else:
+ ncols = 3

-```{graph_open}python{graph_close}
-#| output: asis
-#| fig-align: center
-#| fig-pos: 'H'
-#| warning: false
-cluster_lab = 'Z'

+ nrows = ceil(n_plots / ncols)

-mask = selected['cluster_lab'] == cluster_lab
-print(f'\\subsubsection{graph_open}graph_par_open{graph_close}Cluster {graph_open}cluster_lab{graph_close} (n = {graph_open}selected.loc[mask].shape[0]{graph_close}){graph_open}graph_par_close{graph_close}')
-
-qmd.plot_map_single_cluster(
- data_plot=selected,
- mask = mask,
- cluster_lab=cluster_lab,
- cluster_colors=cluster_colors
+ fig, axes = plt.subplots(
+ nrows=nrows,
+ ncols=ncols,
+ sharex=False,
+ sharey=False,
+ figsize=figsize
 )
+ for i, sel_var in enumerate(list_vars):
+ irow = floor(i / ncols)
+ icol = floor(i - (irow * ncols))
+
+ # with a single row plt.subplots returns a 1-D array of axes
+ if nrows > 1:
+ ax = axes[irow][icol]
+ else:
+ ax = axes[icol]

-```
-```{graph_open}python{graph_close}
-#| fig-align: center
-#| fig-pos: 'H'
-#| fig-width: 20
-#| fig-height: 25
-#| warning: false
-qmd.plot_summary(
- data = selected.loc[mask, selected_col_all],
- color = cluster_colors[cluster_lab]
-)
-
-```
-
-\clearpage
-

+ plot_boxplot_comp_cluster_single(
+ ax = ax,
+ data_cluster = data_cluster,
+ cluster_lab = cluster_lab,
+ sel_var = sel_var,
+ data_fi = data_fi,
+ data_plot_means = data_plot_means,
+ data_plot_median = data_plot_median,
+ cluster_colors = cluster_colors
+ )
+ # hide the unused slots in the grid
+ n_grid = nrows * ncols
+ if n_grid > n_plots:

- """
-
- file = open(cdir / f'report-{city}-template.qmd', 'w')
-

+ for i in range(n_grid - n_plots, 0, -1):
+ if nrows > 1:
+ axes[-1, -i].axis('off')
+ else:
+ axes[-i].axis('off')
+ plt.show()

- file.write(header + settings + data_load)
- file.write(descriptive_intro + descriptive_feature + descriptive_overview)
- file.write(cluster_intro + cluster_results + cluster_profiles)
-
- file.close()