Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
 import os
 import sys
+import geopandas as gpd
 import pandas as pd
-from justclust.data.bolzano import cols, read_data, sez_cols, wcols
+from justclust.data.bolzano import cols, conv, read_data, sez_cols, wcols
 from justclust.explore.algclusters import explore_models, get_algclusters
 from justclust.explore.preprocs import ApplyW, explore_preprocs, get_preprocs
-from justclust.paths.bolzano import report_dir
+from justclust.paths.bolzano import (
+    clfile,
+    clscorefile,
+    fimpfile,
+    prescorefile,
+    report_dir,
+    selclfile,
+)
+from justclust.plots import reporting
 # %%
 # read and filter/select data

@@ -17,7 +26,7 @@ raw = read_data()
 # use selected columns
 selected = raw.loc[:, cols].dropna()
+selected.rename(columns=conv, inplace=True)

 # %%
 # Explore clustering pre-processing
@@ -33,13 +42,13 @@ preprocs = get_preprocs(selected, apply_weight)
 # pkeys = sorted(preprocs.keys())[:10]
 # preprocs = {k: preprocs[k] for k in pkeys}

-prescorefile = report_dir / "scores_preprocess.xlsx"
 if prescorefile.exists():
     print(f"Reading pre-processed scores from: {prescorefile}")
     pre_scores = pd.read_excel(prescorefile)
 else:
     print(
-        f"Pre-processed file ({prescorefile}) not found, start computing the main metrics"
+        f"Pre-processed file ({prescorefile}) not found, "
+        "start computing the main metrics"
     )
     pre_scores = explore_preprocs(selected, preprocs)
     pre_scores.to_excel(prescorefile)
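Note: this hunk turns the pre-processing scores into a cached artifact: computed once, written to Excel, and re-read on later runs. A minimal sketch of the same read-or-compute idiom, with a hypothetical `compute` callable standing in for `explore_preprocs`:

    from pathlib import Path

    import pandas as pd

    def read_or_compute(path: Path, compute) -> pd.DataFrame:
        # reuse the cached spreadsheet when it is already on disk
        if path.exists():
            return pd.read_excel(path, header=0, index_col=0)
        # otherwise compute the scores and persist them for the next run
        scores = compute()
        scores.to_excel(path)
        return scores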
@@ -70,58 +79,142 @@ pre = preprocs[pre_key]
 # %%
 # explore the clustering space
-geo_out = raw.loc[selected.index, sez_cols]
-models = get_algclusters(preprocs=pre)
-
-sc, fi = explore_models(
-    data=selected,
-    cldf=geo_out,
-    rdir=report_dir,
-    models=models,
-    n_jobs=-1,
-    filters=[
-        ("n_clusters", (5.0, 15.0)),
-        # silhouette_score: The best value is 1 and the worst value is -1.
-        # Values near 0 indicate overlapping clusters. Negative values generally
-        # indicate that a sample has been assigned to the wrong cluster, as a
-        # different cluster is more similar.
-        ("silhouette", (0.5, None)),
-        # davies_bouldin_score: The minimum score is zero, with lower values
-        # indicating better clustering.
-        # ("davies bouldin", (None, 0.6)),
-        # calinski_harabasz_score: a higher Calinski-Harabasz score relates
-        # to a model with better defined clusters
-        # ("calinski harabasz", (10, None)),
-        # mean_el_per_cl: if you need to filter clusters with a mean
-        # number of elements
-        # ("mean_el_per_cl", (10, None)),
-        # % covered by clusters: percentage of elements that have been assigned
-        # to a cluster
-        ("% covered by clusters", (50, None)),
-    ],
-)
-# split the files to keep them from getting too heavy
-cc = [c for c in geo_out.columns if c not in sez_cols]
-algs = sorted(set([c.split("__")[0].split("_")[-1] for c in cc]))
-
-for alg in algs:
-    acols = [c for c in cc if alg in c]
-    if len(acols) > 20:
-        kl = [c.split("_")[0] for c in acols]
-        for k in kl:
-            klfile = report_dir / f"clusters_{alg}_{k}.gpkg"
-            kcols = [c for c in acols if c.startswith(k)]
-            print(f"Saving {klfile} with cols: {kcols}")
-            geo_out.loc[:, sez_cols + kcols].to_file(klfile, driver="GPKG")
-
-    # save all columns to open all the clusters
-    clfile = report_dir / f"clusters_{alg}.gpkg"
-    print(f"Saving {clfile}")
-    geo_out.to_file(clfile, driver="GPKG")
-
-clscorefile = report_dir / "scores_clusters.xlsx"
-sc.to_excel(clscorefile)
-
-fimpfile = report_dir / "feature_importances.xlsx"
-fi.to_excel(fimpfile)
+if clfile.exists():
+    print(f"File: {clfile} exists, loading clusters from the file")
+    print("WARNING: clusters read from file, not computed from scratch!")
+    geo_out = gpd.read_file(clfile)
+    # force the index to maintain consistency with the raw and selected DF
+    geo_out.set_index("SEZ", drop=False, verify_integrity=True, inplace=True)
+    print(f"Reading clusters' scores from: {clscorefile}")
+    sc = pd.read_excel(clscorefile, header=0, index_col=0)
+    print(f"Reading feature importances from: {fimpfile}")
+    fi = pd.read_excel(fimpfile, header=0, index_col=0)
+else:
+    print(f"File: {clfile} does not exist. Start computing the clusters")
+    geo_out = raw.loc[:, sez_cols]
+    models = get_algclusters(preprocs=pre)
+
+    sc, fi = explore_models(
+        data=selected,
+        cldf=geo_out,
+        rdir=report_dir,
+        models=models,
+        n_jobs=-1,
+        filters=[
+            ("n_clusters", (5.0, 15.0)),
+            # silhouette_score: The best value is 1 and the worst value is -1.
+            # Values near 0 indicate overlapping clusters. Negative values
+            # generally indicate that a sample has been assigned to the wrong
+            # cluster, as a different cluster is more similar.
+            ("silhouette", (0.5, None)),
+            # davies_bouldin_score: The minimum score is zero, with lower
+            # values indicating better clustering.
+            # ("davies bouldin", (None, 0.6)),
+            # calinski_harabasz_score: a higher Calinski-Harabasz score relates
+            # to a model with better defined clusters
+            # ("calinski harabasz", (10, None)),
+            # mean_el_per_cl: if you need to filter clusters with a mean
+            # number of elements
+            # ("mean_el_per_cl", (10, None)),
+            # % covered by clusters: percentage of elements that have been
+            # assigned to a cluster
+            ("% covered by clusters", (50, None)),
+        ],
+    )
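Note: the thresholds in `filters` above refer to standard cluster-validity metrics. The scikit-learn functions of the same names can be demonstrated on synthetic data (a toy sketch, not the project's data or pipeline):

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.metrics import (
        calinski_harabasz_score,
        davies_bouldin_score,
        silhouette_score,
    )

    rng = np.random.default_rng(0)
    # two well-separated blobs, so the metrics have structure to reward
    X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])
    labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

    print(silhouette_score(X, labels))         # best 1, worst -1
    print(davies_bouldin_score(X, labels))     # lower is better, minimum 0
    print(calinski_harabasz_score(X, labels))  # higher is better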
+
+# %%
+# Export the clusters into separate files
+# split the files to keep them from getting too heavy
+scs = sc.set_index("label", drop=False, verify_integrity=True).loc[
+    sc.selected.values, :
+]
+algs = scs.groupby("model")
+for alg, adf in algs:
+    if len(adf) > 20:
+        for k, kdf in adf.groupby("n_clusters"):
+            klfile = report_dir / f"clusters_{alg.lower()}_k{k:03d}.gpkg"
+            print(f"Saving {klfile}")
+            geo_out.loc[:, sez_cols + list(kdf.index)].to_file(
+                klfile, driver="GPKG", index=True
+            )
+
+# save all columns to open all the clusters
+print(f"Saving {clfile}")
+geo_out.to_file(clfile, driver="GPKG", index=True)
+
+print(f"Saving {clscorefile}")
+sc.to_excel(clscorefile)
+
+print(f"Saving {fimpfile}")
+fi.to_excel(fimpfile)
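Note: the export now iterates over the score table instead of parsing column names, grouping by model and number of clusters; the `k{k:03d}` suffix zero-pads so file listings sort numerically. A toy score table (hypothetical labels) shows the naming this produces:

    import pandas as pd

    scs = pd.DataFrame(
        {
            "label": ["a", "b", "c"],
            "model": ["HDBSCAN", "HDBSCAN", "KMeans"],
            "n_clusters": [5, 12, 100],
        }
    ).set_index("label", drop=False)

    for model, adf in scs.groupby("model"):
        for k, kdf in adf.groupby("n_clusters"):
            # e.g. clusters_hdbscan_k005.gpkg, clusters_hdbscan_k012.gpkg, ...
            print(f"clusters_{model.lower()}_k{k:03d}.gpkg", list(kdf.index))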
+
+# %%
+# Select the clusters to be analysed
+# Select the cluster IDs that need to be compared;
+# the IDs can be taken from the `clscorefile`
+sel_clsts = [
+    # label
+    # "hdbscan__mcs-15_ms-05_m-euclidean_csm-eom",  # k06 @51.6%
+    # "hdbscan__mcs-15_ms-05_m-sqeuclidean_csm-eom",  # k07 @50.2%
+    # "hdbscan__mcs-10_ms-05_m-sqeuclidean_csm-eom",  # k09 @60.3%
+    # "hdbscan__mcs-10_ms-01_m-euclidean_csm-eom",  # k12 @74.5%
+    # "hdbscan__mcs-07_ms-05_m-chebyshev_csm-eom",  # k15 @77.5%
+]
+
+if len(sel_clsts) == 0:
+    print(
+        f"You must select the clusters to be analyzed from {clscorefile} "
+        "using the cluster label as unique ID"
+    )
+    sccols = [
+        "label",
+        "% covered by clusters",
+        "n_clusters",
+        "silhouette",
+        "davies bouldin",
+        "calinski harabasz",
+    ]
+    # save the default value
+    old_max_rows = pd.get_option("display.max_rows")
+    pd.set_option("display.max_rows", None)
+    print(
+        "Available clusters are:\n",
+        sc.loc[sc["selected"], sccols].sort_values(
+            by=["n_clusters", "silhouette", "davies bouldin", "calinski harabasz"],
+            ascending=[True, False, True, False],
+        ),
+    )
+    # restore the default value
+    pd.set_option("display.max_rows", old_max_rows)
+    sys.exit(1)
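Note: the guard branch widens pandas' row display and restores it by hand; `pandas.option_context` is an equivalent scoped override that restores the option even if the print raises (an alternative sketch, not the committed code):

    import pandas as pd

    df = pd.DataFrame({"n_clusters": range(100)})
    with pd.option_context("display.max_rows", None):
        print(df)  # the full table is shown inside the block
    # display.max_rows is restored automatically here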
+
+for clname in sel_clsts:
+    if clname not in geo_out.columns:
+        raise ValueError(
+            f"The selected cluster is: {clname!r} but "
+            "it is not available. Select one of the "
+            f"following: {geo_out.columns!r}."
+        )
+
+# select only the clusters of interest
+clsts = geo_out.loc[
+    selected.index,
+    sel_clsts
+    + [
+        "geometry",
+    ],
+]
+
+print(f"Saving selected clusters to {selclfile}")
+clsts.to_file(selclfile, driver="GPKG", index=True)
+
+for clname in sel_clsts:
+    print(f"Reporting: {clname}...")
+    reporting(
+        data=selected.loc[clsts.index],
+        clname=clname,
+        clsts=clsts,
+        cdir=report_dir / clname,
+        pal="turbo",
+        preprocs=pre,
+    )
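Note: both branches above leave `geo_out` indexed by the `SEZ` census-unit ID (read back from file, or sliced from `raw`, whose index is set in `read_data` below), so `.loc[selected.index, ...]` aligns rows by ID rather than by position. `verify_integrity=True` makes duplicated IDs fail loudly, as this toy illustration shows:

    import pandas as pd

    df = pd.DataFrame({"SEZ": [1, 2, 2], "x": [0.1, 0.2, 0.3]})
    try:
        df.set_index("SEZ", drop=False, verify_integrity=True, inplace=True)
    except ValueError as err:
        # duplicate census-unit IDs are rejected up front
        print("refusing to index:", err)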
@@ -145,6 +145,34 @@ wcols = np.array(
     dtype=float,
 )
+
+# define nice text labels that do not expose the internal nomenclature
+conv = {
+    # air quality: street-canyon risk per census unit [%]
+    "P_Area_r1": "area very-low AQ-risk [%]",
+    "P_Area_r2": "area low AQ-risk [%]",
+    "P_Area_r3": "area medium-low AQ-risk [%]",
+    "P_Area_r4": "area medium-high AQ-risk [%]",
+    "P_Area_r6": "area high AQ-risk [%]",
+    "P_Area_r9": "area very-high AQ-risk [%]",
+    # carbon
+    "finale/m²": "Carbon emission building [ton CO2/m²]",
+    "kgCO2/m²": "Carbon absorption vegetation [kg CO2/m²]",
+    # FFH
+    "P_Prot_are": "Protected area [%]",
+    # spatial
+    "P_area_300": "Accessibility urban green areas (<5 min) [%]",
+    "MEAN_Resul": "Walkability Index",
+    # temporal
+    "P_Area_dif": "Soil sealing between 2022 and 2018 [%]",
+    # thermal
+    "mean_norm": "Surface urban heat island",
+    # socio-demographic
+    "indice di vecchiaia": "Age Index",
+    "%stranieri": "Foreign population [%]",
+    "%con figli": "Families with children [%]",
+    "%senza figli": "Families without children [%]",
+    "%Single": "Families with one component [%]",
+}
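Note: this mapping is consumed by `selected.rename(columns=conv, inplace=True)` in the script above; keys missing from the frame are ignored by default, so the dictionary may safely cover more indicators than any one selection uses. A toy example:

    import pandas as pd

    conv = {"P_Prot_are": "Protected area [%]", "mean_norm": "Surface urban heat island"}
    df = pd.DataFrame({"P_Prot_are": [12.5], "other": [1]})
    # the unmatched key ("mean_norm") is skipped silently
    print(df.rename(columns=conv).columns.tolist())
    # -> ['Protected area [%]', 'other']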

 # %%
 # Define function to read files

@@ -153,6 +181,7 @@ wcols = np.array(
 def read_data():
     aq_scr = gpd.read_file(paths.aq_scr_shp)
     co2_ab = gpd.read_file(paths.co2_ab_shp)
+    co2_ab["SEZ"] = co2_ab["SEZ"].astype(int)
     co2_ab["kgCO2/m²"] = co2_ab["CO2"] / co2_ab.area
     co2_em = gpd.read_file(paths.co2_em_shp)
     co2_em["finale/m²"] = co2_em["finale"] / co2_em.area
@@ -188,4 +217,6 @@ def read_data():
         ],
         axis="columns",
     )
+    raw.index = co2_ab["SEZ"]
+    raw.index.name = None
     return raw
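Note: casting `SEZ` to int before it becomes the shared index matters because pandas aligns on both index value and dtype: an integer 1 and the string "1" never match. A minimal illustration with hypothetical frames:

    import pandas as pd

    left = pd.DataFrame({"v": [10]}, index=pd.Index(["1"], name="SEZ"))  # str IDs
    right = pd.DataFrame({"w": [20]}, index=pd.Index([1], name="SEZ"))   # int IDs

    # no overlap: the outer join yields two NaN-padded rows
    print(left.join(right, how="outer"))

    # after harmonising the dtype, the rows line up
    left.index = left.index.astype(int)
    print(left.join(right, how="outer"))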
@@ -370,7 +370,7 @@ def model_worker(
     if rdir is not None:
         clcount.to_excel(rdir / f"{long_label}.xlsx")
     fimp = feature_importances(tdata, labels, label)
-    return long_label, labels, xcores, fimp
+    return label, labels, xcores, fimp

 def apply_filter(
 ...
@@ -13,3 +13,12 @@ tem_ss_shp = rawdata_dir / "Temporal" / "soil_sealing.shp"
 thm_sh_shp = rawdata_dir / "Thermal" / "suhi.shp"
 socio_gpkg = rawdata_dir / "Socio" / "bz_pop_2021.gpkg"
+
+# main outputs into the report folder
+# - pre-processing
+prescorefile = report_dir / "scores_preprocess.xlsx"
+# - clustering
+clfile = report_dir / "clusters_all.gpkg"
+selclfile = report_dir / "selected_clusters.gpkg"
+clscorefile = report_dir / "scores_clusters.xlsx"
+fimpfile = report_dir / "feature_importances.xlsx"