Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
 import os
 import sys
+import geopandas as gpd
 import pandas as pd
-from justclust.data.bolzano import cols, read_data, sez_cols, wcols
+from justclust.data.bolzano import cols, conv, read_data, sez_cols, wcols
 from justclust.explore.algclusters import explore_models, get_algclusters
 from justclust.explore.preprocs import ApplyW, explore_preprocs, get_preprocs
-from justclust.paths.bolzano import report_dir
+from justclust.paths.bolzano import (
+    clfile,
+    clscorefile,
+    fimpfile,
+    prescorefile,
+    report_dir,
+    selclfile,
+)
+from justclust.plots import reporting
 # %%
 # read and filter/select data

@@ -17,7 +26,7 @@ raw = read_data()
 # use selected columns
 selected = raw.loc[:, cols].dropna()
+selected.rename(columns=conv, inplace=True)

 # %%
 # Explore clustering pre-processing
@@ -33,13 +42,13 @@ preprocs = get_preprocs(selected, apply_weight)
 # pkeys = sorted(preprocs.keys())[:10]
 # preprocs = {k: preprocs[k] for k in pkeys}

-prescorefile = report_dir / "scores_preprocess.xlsx"
 if prescorefile.exists():
     print(f"Reading pre-processed scores from: {prescorefile}")
     pre_scores = pd.read_excel(prescorefile)
 else:
     print(
-        f"Pre-processed file ({prescorefile}) not found, start computing the main metrics"
+        f"Pre-processed file ({prescorefile}) not found, "
+        "start computing the main metrics"
     )
     pre_scores = explore_preprocs(selected, preprocs)
     pre_scores.to_excel(prescorefile)
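Note: this hunk turns the pre-processing scores into a cached artifact: computed once, written to Excel, and re-read on later runs. A minimal sketch of the same read-or-compute idiom, with a hypothetical `compute` callable standing in for `explore_preprocs`:

    from pathlib import Path

    import pandas as pd

    def read_or_compute(path: Path, compute) -> pd.DataFrame:
        # reuse the cached spreadsheet when it is already on disk
        if path.exists():
            return pd.read_excel(path, header=0, index_col=0)
        # otherwise compute the scores and persist them for the next run
        scores = compute()
        scores.to_excel(path)
        return scores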
@@ -70,58 +79,142 @@ pre = preprocs[pre_key]
 # %%
 # explore the clustering space
-geo_out = raw.loc[selected.index, sez_cols]
-models = get_algclusters(preprocs=pre)
-
-sc, fi = explore_models(
-    data=selected,
-    cldf=geo_out,
-    rdir=report_dir,
-    models=models,
-    n_jobs=-1,
-    filters=[
-        ("n_clusters", (5.0, 15.0)),
-        # silhouette_score: The best value is 1 and the worst value is -1.
-        # Values near 0 indicate overlapping clusters. Negative values generally
-        # indicate that a sample has been assigned to the wrong cluster, as a
-        # different cluster is more similar.
-        ("silhouette", (0.5, None)),
-        # davies_bouldin_score: The minimum score is zero, with lower values
-        # indicating better clustering.
-        # ("davies bouldin", (None, 0.6)),
-        # calinski_harabasz_score: a higher Calinski-Harabasz score relates
-        # to a model with better defined clusters
-        # ("calinski harabasz", (10, None)),
-        # mean_el_per_cl: if you need to filter clusters with a mean
-        # number of elements
-        # ("mean_el_per_cl", (10, None)),
-        # % covered by clusters: percentage of elements that have been assigned
-        # to a cluster
-        ("% covered by clusters", (50, None)),
-    ],
-)
-# split the files to keep them from getting too heavy
-cc = [c for c in geo_out.columns if c not in sez_cols]
-algs = sorted(set([c.split("__")[0].split("_")[-1] for c in cc]))
-
-for alg in algs:
-    acols = [c for c in cc if alg in c]
-    if len(acols) > 20:
-        kl = [c.split("_")[0] for c in acols]
-        for k in kl:
-            klfile = report_dir / f"clusters_{alg}_{k}.gpkg"
-            kcols = [c for c in acols if c.startswith(k)]
-            print(f"Saving {klfile} with cols: {kcols}")
-            geo_out.loc[:, sez_cols + kcols].to_file(klfile, driver="GPKG")
-
-    # save all columns to open all the clusters
-    clfile = report_dir / f"clusters_{alg}.gpkg"
-    print(f"Saving {clfile}")
-    geo_out.to_file(clfile, driver="GPKG")
-
-clscorefile = report_dir / "scores_clusters.xlsx"
-sc.to_excel(clscorefile)
-
-fimpfile = report_dir / "feature_importances.xlsx"
-fi.to_excel(fimpfile)
+if clfile.exists():
+    print(f"File: {clfile} exists, loading clusters from the file")
+    print("WARNING: clusters read from file, not computed from scratch!")
+    geo_out = gpd.read_file(clfile)
+    # force the index to maintain consistency with the raw and selected DF
+    geo_out.set_index("SEZ", drop=False, verify_integrity=True, inplace=True)
+    print(f"Reading clusters' scores from: {clscorefile}")
+    sc = pd.read_excel(clscorefile, header=0, index_col=0)
+    print(f"Reading feature importances from: {fimpfile}")
+    fi = pd.read_excel(fimpfile, header=0, index_col=0)
+else:
+    print(f"File: {clfile} does not exist. Start computing the clusters")
+    geo_out = raw.loc[:, sez_cols]
+    models = get_algclusters(preprocs=pre)
+
+    sc, fi = explore_models(
+        data=selected,
+        cldf=geo_out,
+        rdir=report_dir,
+        models=models,
+        n_jobs=-1,
+        filters=[
+            ("n_clusters", (5.0, 15.0)),
+            # silhouette_score: The best value is 1 and the worst value is -1.
+            # Values near 0 indicate overlapping clusters. Negative values
+            # generally indicate that a sample has been assigned to the wrong
+            # cluster, as a different cluster is more similar.
+            ("silhouette", (0.5, None)),
+            # davies_bouldin_score: The minimum score is zero, with lower
+            # values indicating better clustering.
+            # ("davies bouldin", (None, 0.6)),
+            # calinski_harabasz_score: a higher Calinski-Harabasz score relates
+            # to a model with better defined clusters
+            # ("calinski harabasz", (10, None)),
+            # mean_el_per_cl: if you need to filter clusters with a mean
+            # number of elements
+            # ("mean_el_per_cl", (10, None)),
+            # % covered by clusters: percentage of elements that have been
+            # assigned to a cluster
+            ("% covered by clusters", (50, None)),
+        ],
+    )
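Note: the thresholds in `filters` above refer to standard cluster-validity metrics. The scikit-learn functions of the same names can be demonstrated on synthetic data (a toy sketch, not the project's data or pipeline):

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.metrics import (
        calinski_harabasz_score,
        davies_bouldin_score,
        silhouette_score,
    )

    rng = np.random.default_rng(0)
    # two well-separated blobs, so the metrics have structure to reward
    X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])
    labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

    print(silhouette_score(X, labels))         # best 1, worst -1
    print(davies_bouldin_score(X, labels))     # lower is better, minimum 0
    print(calinski_harabasz_score(X, labels))  # higher is better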
+
+# %%
+# Export the clusters into separate files
+# split the files to keep them from getting too heavy
+scs = sc.set_index("label", drop=False, verify_integrity=True).loc[
+    sc.selected.values, :
+]
+algs = scs.groupby("model")
+for alg, adf in algs:
+    if len(adf) > 20:
+        for k, kdf in adf.groupby("n_clusters"):
+            klfile = report_dir / f"clusters_{alg.lower()}_k{k:03d}.gpkg"
+            print(f"Saving {klfile}")
+            geo_out.loc[:, sez_cols + list(kdf.index)].to_file(
+                klfile, driver="GPKG", index=True
+            )
+
+# save all columns to open all the clusters
+print(f"Saving {clfile}")
+geo_out.to_file(clfile, driver="GPKG", index=True)
+
+print(f"Saving {clscorefile}")
+sc.to_excel(clscorefile)
+
+print(f"Saving {fimpfile}")
+fi.to_excel(fimpfile)
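Note: the export now iterates over the score table instead of parsing column names, grouping by model and number of clusters; the `k{k:03d}` suffix zero-pads so file listings sort numerically. A toy score table (hypothetical labels) shows the naming this produces:

    import pandas as pd

    scs = pd.DataFrame(
        {
            "label": ["a", "b", "c"],
            "model": ["HDBSCAN", "HDBSCAN", "KMeans"],
            "n_clusters": [5, 12, 100],
        }
    ).set_index("label", drop=False)

    for model, adf in scs.groupby("model"):
        for k, kdf in adf.groupby("n_clusters"):
            # e.g. clusters_hdbscan_k005.gpkg, clusters_hdbscan_k012.gpkg, ...
            print(f"clusters_{model.lower()}_k{k:03d}.gpkg", list(kdf.index))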
+
+# %%
+# Select the clusters to be analysed
+# Select the cluster IDs that need to be compared;
+# the IDs can be taken from the `clscorefile`
+sel_clsts = [
+    # label
+    # "hdbscan__mcs-15_ms-05_m-euclidean_csm-eom",  # k06 @51.6%
+    # "hdbscan__mcs-15_ms-05_m-sqeuclidean_csm-eom",  # k07 @50.2%
+    # "hdbscan__mcs-10_ms-05_m-sqeuclidean_csm-eom",  # k09 @60.3%
+    # "hdbscan__mcs-10_ms-01_m-euclidean_csm-eom",  # k12 @74.5%
+    # "hdbscan__mcs-07_ms-05_m-chebyshev_csm-eom",  # k15 @77.5%
+]
+
+if len(sel_clsts) == 0:
+    print(
+        f"You must select the clusters to be analyzed from {clscorefile} "
+        "using the cluster label as unique ID"
+    )
+    sccols = [
+        "label",
+        "% covered by clusters",
+        "n_clusters",
+        "silhouette",
+        "davies bouldin",
+        "calinski harabasz",
+    ]
+    # save the default value
+    old_max_rows = pd.get_option("display.max_rows")
+    pd.set_option("display.max_rows", None)
+    print(
+        "Available clusters are:\n",
+        sc.loc[sc["selected"], sccols].sort_values(
+            by=["n_clusters", "silhouette", "davies bouldin", "calinski harabasz"],
+            ascending=[True, False, True, False],
+        ),
+    )
+    # restore the default value
+    pd.set_option("display.max_rows", old_max_rows)
+    sys.exit(1)
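Note: the guard branch widens pandas' row display and restores it by hand; `pandas.option_context` is an equivalent scoped override that restores the option even if the print raises (an alternative sketch, not the committed code):

    import pandas as pd

    df = pd.DataFrame({"n_clusters": range(100)})
    with pd.option_context("display.max_rows", None):
        print(df)  # the full table is shown inside the block
    # display.max_rows is restored automatically here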
+
+for clname in sel_clsts:
+    if clname not in geo_out.columns:
+        raise ValueError(
+            f"The selected cluster is: {clname!r} but "
+            "it is not available. Select one of the "
+            f"following: {geo_out.columns!r}."
+        )
+
+# select only the clusters of interest
+clsts = geo_out.loc[
+    selected.index,
+    sel_clsts
+    + [
+        "geometry",
+    ],
+]
+
+print(f"Saving selected clusters to {selclfile}")
+clsts.to_file(selclfile, driver="GPKG", index=True)
+
+for clname in sel_clsts:
+    print(f"Reporting: {clname}...")
+    reporting(
+        data=selected.loc[clsts.index],
+        clname=clname,
+        clsts=clsts,
+        cdir=report_dir / clname,
+        pal="turbo",
+        preprocs=pre,
+    )
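Note: both branches above leave `geo_out` indexed by the `SEZ` census-unit ID (read back from file, or sliced from `raw`, whose index is set in `read_data` below), so `.loc[selected.index, ...]` aligns rows by ID rather than by position. `verify_integrity=True` makes duplicated IDs fail loudly, as this toy illustration shows:

    import pandas as pd

    df = pd.DataFrame({"SEZ": [1, 2, 2], "x": [0.1, 0.2, 0.3]})
    try:
        df.set_index("SEZ", drop=False, verify_integrity=True, inplace=True)
    except ValueError as err:
        # duplicate census-unit IDs are rejected up front
        print("refusing to index:", err)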
@@ -145,6 +145,34 @@ wcols = np.array(
     dtype=float,
 )
+
+# define nice text labels that do not expose the internal nomenclature
+conv = {
+    # air quality: street-canyon risk per census unit [%]
+    "P_Area_r1": "area very-low AQ-risk [%]",
+    "P_Area_r2": "area low AQ-risk [%]",
+    "P_Area_r3": "area medium-low AQ-risk [%]",
+    "P_Area_r4": "area medium-high AQ-risk [%]",
+    "P_Area_r6": "area high AQ-risk [%]",
+    "P_Area_r9": "area very-high AQ-risk [%]",
+    # carbon
+    "finale/m²": "Carbon emission building [ton CO2/m²]",
+    "kgCO2/m²": "Carbon absorption vegetation [kg CO2/m²]",
+    # FFH
+    "P_Prot_are": "Protected area [%]",
+    # spatial
+    "P_area_300": "Accessibility urban green areas (<5 min) [%]",
+    "MEAN_Resul": "Walkability Index",
+    # temporal
+    "P_Area_dif": "Soil sealing between 2022 and 2018 [%]",
+    # thermal
+    "mean_norm": "Surface urban heat island",
+    # socio-demographic
+    "indice di vecchiaia": "Age Index",
+    "%stranieri": "Foreign population [%]",
+    "%con figli": "Families with children [%]",
+    "%senza figli": "Families without children [%]",
+    "%Single": "Families with one component [%]",
+}
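Note: this mapping is consumed by `selected.rename(columns=conv, inplace=True)` in the script above; keys missing from the frame are ignored by default, so the dictionary may safely cover more indicators than any one selection uses. A toy example:

    import pandas as pd

    conv = {"P_Prot_are": "Protected area [%]", "mean_norm": "Surface urban heat island"}
    df = pd.DataFrame({"P_Prot_are": [12.5], "other": [1]})
    # the unmatched key ("mean_norm") is skipped silently
    print(df.rename(columns=conv).columns.tolist())
    # -> ['Protected area [%]', 'other']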

 # %%
 # Define function to read files

@@ -153,6 +181,7 @@ wcols = np.array(
 def read_data():
     aq_scr = gpd.read_file(paths.aq_scr_shp)
     co2_ab = gpd.read_file(paths.co2_ab_shp)
+    co2_ab["SEZ"] = co2_ab["SEZ"].astype(int)
     co2_ab["kgCO2/m²"] = co2_ab["CO2"] / co2_ab.area
     co2_em = gpd.read_file(paths.co2_em_shp)
     co2_em["finale/m²"] = co2_em["finale"] / co2_em.area
@@ -188,4 +217,6 @@ def read_data():
         ],
         axis="columns",
     )
+    raw.index = co2_ab["SEZ"]
+    raw.index.name = None
     return raw
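Note: casting `SEZ` to int before it becomes the shared index matters because pandas aligns on both index value and dtype: an integer 1 and the string "1" never match. A minimal illustration with hypothetical frames:

    import pandas as pd

    left = pd.DataFrame({"v": [10]}, index=pd.Index(["1"], name="SEZ"))  # str IDs
    right = pd.DataFrame({"w": [20]}, index=pd.Index([1], name="SEZ"))   # int IDs

    # no overlap: the outer join yields two NaN-padded rows
    print(left.join(right, how="outer"))

    # after harmonising the dtype, the rows line up
    left.index = left.index.astype(int)
    print(left.join(right, how="outer"))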
@@ -370,7 +370,7 @@ def model_worker(
     if rdir is not None:
         clcount.to_excel(rdir / f"{long_label}.xlsx")
     fimp = feature_importances(tdata, labels, label)
-    return long_label, labels, xcores, fimp
+    return label, labels, xcores, fimp

 def apply_filter(
 ...
@@ -13,3 +13,12 @@ tem_ss_shp = rawdata_dir / "Temporal" / "soil_sealing.shp"
 thm_sh_shp = rawdata_dir / "Thermal" / "suhi.shp"
 socio_gpkg = rawdata_dir / "Socio" / "bz_pop_2021.gpkg"
+
+# main outputs into the report folder
+# - pre-processing
+prescorefile = report_dir / "scores_preprocess.xlsx"
+# - clustering
+clfile = report_dir / "clusters_all.gpkg"
+selclfile = report_dir / "selected_clusters.gpkg"
+clscorefile = report_dir / "scores_clusters.xlsx"
+fimpfile = report_dir / "feature_importances.xlsx"