diff --git a/.gitignore b/.gitignore index 28231e5c3f9acde54d13916f3812b97feaa103ba..d2984003d11172144786fc51b65778be41e79169 100644 --- a/.gitignore +++ b/.gitignore @@ -4,5 +4,5 @@ __pycache__ # Output folder bolzano -report/bolzano +report/* /.luarc.json diff --git a/justclust/analysis/bolzano.py b/justclust/analysis/bolzano.py index 733047ef490e5c6413ca091db6ad1a006b606761..d5bd09dba09e4a38a57e272c5bf8f184ab36274b 100644 --- a/justclust/analysis/bolzano.py +++ b/justclust/analysis/bolzano.py @@ -10,17 +10,12 @@ import pandas as pd from justclust.data.bolzano import cols, conv, read_data, sez_cols, wcols from justclust.explore.algclusters import explore_models, get_algclusters from justclust.explore.preprocs import ApplyW, explore_preprocs, get_preprocs -from justclust.paths.bolzano import ( - clfile, - clscorefile, - fimpfile, - prescorefile, - report_dir, - selclfile, -) +import justclust.paths.paths as paths from justclust.plots import reporting import justclust.quarto as qmd +my_paths = paths.define_paths(city = 'bolzano') + # %% # read and filter/select data raw = read_data() @@ -34,14 +29,14 @@ selected.rename(columns=conv, inplace=True) # %% # Remove error observation -mask = selected['Carbon emission building [ton CO2/m²]'] > 8 +mask = selected.index == 8888888 selected = selected[~ mask] # %% # Explore clustering pre-processing # create a directory if does not exists -os.makedirs(report_dir, exist_ok=True) +os.makedirs(my_paths.report_dir, exist_ok=True) # weighted = selected * wcols apply_weight = ApplyW(wcols) @@ -51,16 +46,16 @@ preprocs = get_preprocs(selected, apply_weight) # pkeys = sorted(preprocs.keys())[:10] # preprocs = {k: preprocs[k] for k in pkeys} -if prescorefile.exists(): - print(f"Reading pre-processed scores from: {prescorefile}") - pre_scores = pd.read_excel(prescorefile) +if my_paths.prescorefile.exists(): + print(f"Reading pre-processed scores from: {my_paths.prescorefile}") + pre_scores = pd.read_excel(my_paths.prescorefile) else: print( - f"Pre-processed file ({prescorefile}) not found, " + f"Pre-processed file ({my_paths.prescorefile}) not found, " "start computing the main metrics" ) pre_scores = explore_preprocs(selected, preprocs) - pre_scores.to_excel(prescorefile) + pre_scores.to_excel(my_paths.prescorefile) print("=" * 50) print(f"BEST PRE-PROCESSING (10/{len(pre_scores)}):") @@ -95,25 +90,25 @@ pre = preprocs[pre_key] # %% # explore the clustering space -if clfile.exists(): - print(f"File: {clfile} exists load cluster from the file") +if my_paths.clfile.exists(): + print(f"File: {my_paths.clfile} exists load cluster from the file") print("WARNING: clusters read from file, not computed from scratch!") - geo_out = gpd.read_file(clfile) + geo_out = gpd.read_file(my_paths.clfile) # force index to maintain consistency with raw and selected DF geo_out.set_index("SEZ", drop=False, verify_integrity=True, inplace=True) - print(f"Reading clusters' scores from: {clscorefile}") - sc = pd.read_excel(clscorefile, header=0, index_col=0) - print(f"Reading feature importances from: {fimpfile}") - fi = pd.read_excel(fimpfile, header=0, index_col=0) + print(f"Reading clusters' scores from: {my_paths.clscorefile}") + sc = pd.read_excel(my_paths.clscorefile, header=0, index_col=0) + print(f"Reading feature importances from: {my_paths.fimpfile}") + fi = pd.read_excel(my_paths.fimpfile, header=0, index_col=0) else: - print(f"File: {clfile} does not exist. Start computing the clusters") + print(f"File: {my_paths.clfile} does not exist. Start computing the clusters") geo_out = raw.loc[:, sez_cols] models = get_algclusters(preprocs=pre) sc, fi = explore_models( data=selected, cldf=geo_out, - rdir=report_dir, + rdir=my_paths.report_dir, models=models, n_jobs=-1, filters=[ @@ -149,19 +144,19 @@ else: for alg, adf in algs: if len(adf) > 20: for k, kdf in adf.groupby("n_clusters"): - klfile = report_dir / f"clusters_{alg.lower()}_k{k:03d}.gpkg" + klfile = my_paths.report_dir / f"clusters_{alg.lower()}_k{k:03d}.gpkg" print(f"Saving {klfile}") geo_out.loc[:, sez_cols + list(kdf.index)].to_file( klfile, driver="GPKG", index=True ) # save all columns to open all the clusters - print(f"Saving {clfile}") - geo_out.to_file(clfile, driver="GPKG", index=True) - print(f"Saving {clscorefile}") - sc.to_excel(clscorefile) - print(f"Saving {fimpfile}") - fi.to_excel(fimpfile) + print(f"Saving {my_paths.clfile}") + geo_out.to_file(my_paths.clfile, driver="GPKG", index=True) + print(f"Saving {my_paths.clscorefile}") + sc.to_excel(my_paths.clscorefile) + print(f"Saving {my_paths.fimpfile}") + fi.to_excel(my_paths.fimpfile) # %% # Select cluster to be selected @@ -183,7 +178,7 @@ sel_clsts = [ if len(sel_clsts) == 0: print( - f"You must select the clusters to be analyzed from {clscorefile} " + f"You must select the clusters to be analyzed from {my_paths.clscorefile} " "using the cluster label as UUID" ) sccols = [ @@ -225,7 +220,7 @@ clsts = geo_out.loc[ ], ] print(f"Saving selected clusters {clsts}") -clsts.to_file(selclfile, driver="GPKG", index=True) +clsts.to_file(my_paths.selclfile, driver="GPKG", index=True) for clname in sel_clsts: print(f"Reporting: {clname}...") @@ -233,7 +228,7 @@ for clname in sel_clsts: data=selected.loc[clsts.index], clname=clname, clsts=clsts, - cdir=report_dir / clname, + cdir=my_paths.report_dir / clname, pal="turbo", preprocs=pre, ) @@ -241,7 +236,7 @@ for clname in sel_clsts: qmd.create_qmd_report( city = 'bolzano', clname= clname, - cdir = report_dir / clname + cdir = my_paths.report_dir / clname ) # %% diff --git a/justclust/data/bolzano.py b/justclust/data/bolzano.py index e9de98a07dfff7b657ed9b1cb5db01bce63063b9..61acd104b102a99abce3bda9bdf97cc9bb51b727 100644 --- a/justclust/data/bolzano.py +++ b/justclust/data/bolzano.py @@ -2,14 +2,17 @@ import geopandas as gpd import numpy as np import pandas as pd -from justclust.paths import bolzano as paths +import justclust.paths.paths as paths +my_paths = paths.define_paths(city = 'bolzano') # %% -# Define columns and other constant values -quantile_range = (20, 80) +#---- Air Quality ---- + +# Define available files +aq_scr_shp = my_paths.rawdata_dir / "Air quality" / "street_canyon_risk.shp" -# air quality +# Define columns and other constant values aq_raw_cols = ["Area_r1", "Area_r2", "Area_r3", "Area_r4", "Area_r6", "Area_r9"] aq_nor_cols = [ "P_Area_r1", @@ -20,34 +23,68 @@ aq_nor_cols = [ "P_Area_r9", ] -# carbon -# absorption +#---- Carbon ---- + +# Define available files +co2_ab_shp = my_paths.rawdata_dir / "Carbon" / "c_absorption.shp" +co2_em_shp = my_paths.rawdata_dir / "Carbon" / "c_emission.shp" + +# Define columns and other constant values +# Absorption co2_raw_cols = ["CO2"] co2_nor_cols = ["kgCO2/m²"] -# emmissions +# Emmissions ems = ["gasolio_em", "gpl_em", "ee_em", "metano_em", "legno_em", "tot_em", "c_ee_em"] co2_em_raw_cols = ["finale"] co2_em_nor_cols = ["finale/m²"] -# flora fauna and habitat +# columns selected for the columns of the outputs +sez_cols = ["geometry", "SEZ", "SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"] + +#---- FFH ---- + +# Define available files +ffh_pa_shp = my_paths.rawdata_dir / "FFH" / "protected_areas.shp" + +# Define columns and other constant values ffh_raw_cols = ["F_Area_m2"] ffh_nor_cols = ["P_Prot_are"] -# spatial +#---- Spatial ---- + +# Define available files +spt_ag_shp = my_paths.rawdata_dir / "Spatial" / "accessibility_green_areas.shp" +spt_wk_shp = my_paths.rawdata_dir / "Spatial" / "walkability.shp" + +# Define columns and other constant values spt_ag_raw_cols = ["Area_300m_", "Area_750m_"] spt_ag_nor_cols = ["P_area_300", "P_area_750"] - spt_wk_nor_cols = ["MEAN_Resul"] -# temporary +#---- Temporal ---- + +# Define available files +tem_ss_shp = my_paths.rawdata_dir / "Temporal" / "soil_sealing.shp" + +# Define columns and other constant values tem_raw_cols = ["Area_diff"] tem_nor_cols = ["P_Area_dif"] -# thermal +#---- Thermal ---- + +# Define available files +thm_sh_shp = my_paths.rawdata_dir / "Thermal" / "suhi.shp" + +# Define columns and other constant values thm_nor_cols = ["mean_norm"] -# socio-demographic +#---- Socio ---- + +# Define available files +socio_gpkg = my_paths.rawdata_dir / "Socio" / "bz_pop_2021.gpkg" + +# Define columns and other constant values socio_abs_cols = [ "Maschi", "Femmine", @@ -73,163 +110,90 @@ socio_perc_cols = [ "%Single", ] -# columns selected for the columns of the outputs -sez_cols = ["geometry", "SEZ", "SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"] # %% -## SELECT COLUMNS TO BE USED +#---- SELECT COLUMNS TO BE USED ---- + +cols_dict = { + # Air quality + 'air_q':{ + "P_Area_r1": "area very-low P-risk [%]", + "P_Area_r2": "area low P-risk [%]", + "P_Area_r3": "area medium-low P-risk [%]", + "P_Area_r4": "area medium-high P-risk [%]", + "P_Area_r6": "area high P-risk [%]", + "P_Area_r9": "area very-high P-risk [%]" + }, + # Carbon + 'carbon':{ + "finale/m²": "Carbon emission building [ton CO2/m²]", + "kgCO2/m²": "Carbon absorption vegetation [kg CO2/m²]" + }, + # FFH + 'ffh':{"P_Prot_are": "Protected area [%]"}, + # Spatial + 'spatial':{ + "P_area_300": "Accessibility public gardens (<5 min) [%]", + "MEAN_Resul": "Walkability Index" + }, + # Temporal + 'temporal':{"P_Area_dif":"Soil sealing between 2022 and 2018 [%]"}, + # Thermal + 'thermal':{"mean_norm": "Surface urban heat island"}, + # Socio + 'socio':{ + "indice di vecchiaia": "Age Index", + "%stranieri": "Foreign population [%]", + "%con figli": "Families with children [%]", + # "%senza figli": "Families without children [%]", + "%Single": "Families with one component [%]" + } +} -cols = [ - # air-quality: % risk based on SVF and road classification - "P_Area_r1", - "P_Area_r2", - "P_Area_r3", - "P_Area_r4", - "P_Area_r6", - "P_Area_r9", - # carbon: absorption and emmissions - "kgCO2/m²", - "finale/m²", - # FFH: % protected area, % to 5 min to prot. areas - "P_Prot_are", - # spatial: % of area within a distance from protected areas and walkability index - "P_area_300", - # "P_area_750", - "MEAN_Resul", - # temporal: % area changed - "P_Area_dif", - # thermal - "mean_norm", - # socio demographic - # "%65+", - # "%minori", - "indice di vecchiaia", - "%stranieri", - "%con figli", - # "%senza figli", # same as con figli - "%Single", -] +# Get only selected columns old-names +cols = [col for value in cols_dict.values() for col in list(value.keys())] -# define weights +# define weights columns wcols = np.array( - [ - # air-quality: % risk based on SVF and road classification - 1.0 / 6.0, - 1.0 / 6.0, - 1.0 / 6.0, - 1.0 / 6.0, - 1.0 / 6.0, - 1.0 / 6.0, - # carbon: absorption and emmissions - 1.0 / 2.0, - 1.0 / 2.0, - # FFH: % protected area, % to 5 min to prot. areas - 1.0, - # spatial: % of area within a distance from protected areas and walkability index - 1.0 / 2.0, - # "P_area_750", - 1.0 / 2.0, - # temporal: % area changed - 1.0, - # thermal - 1.0, - # socio demographic - # "%65+", - # "%minori", - 1.0 / 4.0, - 1.0 / 4.0, - 1.0 / 4.0, - # "%senza figli" - 1.0 / 4.0, - ], + [weight for value in cols_dict.values() for weight in [1/len(value)] * len(value)], dtype=float, ) # define nice text label to not expose the internal nomenclature -conv = { - # air quality: Street canyon risk per census unit [%] - "P_Area_r1": "area very-low AQ-risk [%]", - "P_Area_r2": "area low AQ-risk [%]", - "P_Area_r3": "area medium-low AQ-risk [%]", - "P_Area_r4": "area medium-high AQ-risk [%]", - "P_Area_r6": "area high AQ-risk [%]", - "P_Area_r9": "area very-high AQ-risk [%]", - # carbon - "finale/m²": "Carbon emission building [ton CO2/m²]", - "kgCO2/m²": "Carbon absorption vegetation [kg CO2/m²]", - # FFH - "P_Prot_are": "Protected area [%]", - # spatial - "P_area_300": "Accessibility urban green areas (<5 min) [%]", - "MEAN_Resul": "Walkability Index", - # temporal - "P_Area_dif": "Soil sealing between 2022 and 2018 [%]", - # thermal - "mean_norm": "Surface urban heat island", - # socio demographic - "indice di vecchiaia": "Age Index", - "%stranieri": "Foreign population [%]", - "%con figli": "Families with children [%]", - # "%senza figli": "Families without children [%]", - "%Single": "Families with one component [%]", -} +conv = {k:v for values in cols_dict.values() for k,v in values.items()} -# %% # define subset of columns according to categories selected_col = { - 'col_aq':[ - 'area very-low AQ-risk [%]', - 'area low AQ-risk [%]', - 'area medium-low AQ-risk [%]', - 'area medium-high AQ-risk [%]', - 'area high AQ-risk [%]', - 'area very-high AQ-risk [%]' - ], - 'col_carbon':[ - "Carbon emission building [ton CO2/m²]", - "Carbon absorption vegetation [kg CO2/m²]" - ], - 'col_char':[ - "Protected area [%]", - "Accessibility urban green areas (<5 min) [%]", - "Walkability Index", - "Soil sealing between 2022 and 2018 [%]", - "Surface urban heat island" - ], - 'col_socio':[ - "Age Index", - "Foreign population [%]", - "Families with children [%]", - "Families with one component [%]" - ] + 'col_aq': list(cols_dict.get('air_q').values()), + 'col_carbon':list(cols_dict.get('carbon').values()), + 'col_char':\ + list(cols_dict.get('ffh').values())+\ + list(cols_dict.get('spatial').values())+\ + list(cols_dict.get('temporal').values())+\ + list(cols_dict.get('thermal').values()), + 'col_socio':list(cols_dict.get('socio').values()) } - - - - - - # %% # Define function to read files def read_data(): - aq_scr = gpd.read_file(paths.aq_scr_shp) - co2_ab = gpd.read_file(paths.co2_ab_shp) + aq_scr = gpd.read_file(aq_scr_shp) + co2_ab = gpd.read_file(co2_ab_shp) co2_ab["SEZ"] = co2_ab["SEZ"].astype(int) co2_ab["kgCO2/m²"] = co2_ab["CO2"] / co2_ab.area - co2_em = gpd.read_file(paths.co2_em_shp) + co2_em = gpd.read_file(co2_em_shp) co2_em["finale/m²"] = co2_em["finale"] / co2_em.area - ffh_pa = gpd.read_file(paths.ffh_pa_shp) - spt_ag = gpd.read_file(paths.spt_ag_shp) - spt_wk = gpd.read_file(paths.spt_wk_shp) - tem_ss = gpd.read_file(paths.tem_ss_shp) - thm_sh = gpd.read_file(paths.thm_sh_shp) + ffh_pa = gpd.read_file(ffh_pa_shp) + spt_ag = gpd.read_file(spt_ag_shp) + spt_wk = gpd.read_file(spt_wk_shp) + tem_ss = gpd.read_file(tem_ss_shp) + thm_sh = gpd.read_file(thm_sh_shp) - socio_pk = gpd.read_file(paths.socio_gpkg, driver="GPKG") + socio_pk = gpd.read_file(socio_gpkg, driver="GPKG") # fix column res = [] for val in socio_pk["indice di vecchiaia"]: diff --git a/justclust/paths/bolzano.py b/justclust/paths/bolzano.py deleted file mode 100644 index 0522dd2f3d327091e9bd2a2a22f2c148ab394fba..0000000000000000000000000000000000000000 --- a/justclust/paths/bolzano.py +++ /dev/null @@ -1,24 +0,0 @@ -from justclust.paths import drawdir, repdir - -rawdata_dir = drawdir / "bolzano" -report_dir = repdir / "bolzano" - -aq_scr_shp = rawdata_dir / "Air quality" / "street_canyon_risk.shp" -co2_ab_shp = rawdata_dir / "Carbon" / "c_absorption.shp" -co2_em_shp = rawdata_dir / "Carbon" / "c_emission.shp" -ffh_pa_shp = rawdata_dir / "FFH" / "protected_areas.shp" -spt_ag_shp = rawdata_dir / "Spatial" / "accessibility_green_areas.shp" -spt_wk_shp = rawdata_dir / "Spatial" / "walkability.shp" -tem_ss_shp = rawdata_dir / "Temporal" / "soil_sealing.shp" -thm_sh_shp = rawdata_dir / "Thermal" / "suhi.shp" - -socio_gpkg = rawdata_dir / "Socio" / "bz_pop_2021.gpkg" - -# main outputs into the report folder -# - pre-processing -prescorefile = report_dir / "scores_preprocess.xlsx" -# - clustering -clfile = report_dir / "clusters_all.gpkg" -selclfile = report_dir / "selected_clusters.gpkg" -clscorefile = report_dir / "scores_clusters.xlsx" -fimpfile = report_dir / "feature_importances.xlsx" diff --git a/justclust/paths/paths.py b/justclust/paths/paths.py new file mode 100644 index 0000000000000000000000000000000000000000..2dfe7f12c261833499527dbc0bf514c08366128f --- /dev/null +++ b/justclust/paths/paths.py @@ -0,0 +1,18 @@ +from justclust.paths import drawdir, repdir + +class define_paths(object): + + def __init__(self, city = 'city-example'): + self.drawdir = drawdir + self.repdir = repdir + self.rawdata_dir = drawdir / city + self.report_dir = repdir / city + + # main outputs into the report folder + # - pre-processing + self.prescorefile = self.report_dir / "scores_preprocess.xlsx" + # - clustering + self.clfile = self.report_dir / "clusters_all.gpkg" + self.selclfile = self.report_dir / "selected_clusters.gpkg" + self.clscorefile = self.report_dir / "scores_clusters.xlsx" + self.fimpfile = self.report_dir / "feature_importances.xlsx" diff --git a/justclust/plots.py b/justclust/plots.py index 7083282e11483195280a702c021b7acb1206f84d..94b9a9f0edac89f7ec86778d3120ccdaa2618c0c 100644 --- a/justclust/plots.py +++ b/justclust/plots.py @@ -867,9 +867,14 @@ def reporting( ) fig.savefig(cdir / "cardinality.png", dpi=600) + # check for invalid vlae 0 se to None + min_samples = int(ms.split("-")[1]) + if int(ms.split("-")[1]) == 0: + min_samples = None + clusterer = hdbscan.HDBSCAN( min_cluster_size=int(mcs.split("-")[1]), - min_samples=int(ms.split("-")[1]), + min_samples=min_samples, metric=dist.split("-")[1], cluster_selection_method=csm.split("-")[1], ) diff --git a/justclust/quarto.py b/justclust/quarto.py index 50a83a54774b29b674370bf57985b9f52ef63ddf..bfda5e500129df7c14bdb9c7246cf00421cda130 100644 --- a/justclust/quarto.py +++ b/justclust/quarto.py @@ -805,8 +805,8 @@ execute: --- \pagebreak """ - settings =r""" -```{python} + settings =f""" +```{graph_open}python{graph_close} import os import sys import geopandas as gpd @@ -817,12 +817,14 @@ import matplotlib.pyplot as plt # Custom modules from justclust.data.bolzano import cols, conv, read_data, selected_col -from justclust.paths.bolzano import clfile +import justclust.paths.paths as paths import justclust.quarto as qmd plt.rcParams['figure.dpi'] = 600 plt.rcParams['savefig.dpi'] = 600 +my_paths = paths.define_paths(city = '{city}') + ``` """ data_load = f""" @@ -834,12 +836,12 @@ selected = raw.loc[:, cols].dropna() selected.rename(columns=conv, inplace=True) # Remove error observation -# mask = selected['Carbon emission building [ton CO2/m²]'] > 8 -# selected = selected[~ mask] +mask = selected.index == 8888888 +selected = selected[~ mask] clname = '{clname}' -geo_out = gpd.read_file(clfile) +geo_out = gpd.read_file(my_paths.clfile) geo_out.set_index('SEZ', drop=False, verify_integrity=True, inplace=True) clsts = geo_out.loc[selected.index,[clname, 'geometry']] selected = clsts.join(selected, validate='1:1')