Compare revisions
import os
import sys
import geopandas as gpd
import pandas as pd
from justclust.data.bolzano import cols, read_data, sez_cols, wcols
from justclust.data.bolzano import cols, conv, read_data, sez_cols, wcols
from justclust.explore.algclusters import explore_models, get_algclusters
from justclust.explore.preprocs import ApplyW, explore_preprocs, get_preprocs
from justclust.paths.bolzano import report_dir
from justclust.paths.bolzano import (
    clfile,
    clscorefile,
    fimpfile,
    prescorefile,
    report_dir,
    selclfile,
)
from justclust.plots import reporting
# %%
# read and filter/select data
@@ -17,7 +26,7 @@ raw = read_data()
# use selected columns
selected = raw.loc[:, cols].dropna()
selected.rename(columns=conv, inplace=True)
# %%
# Explore clustering pre-processing
@@ -33,13 +42,13 @@ preprocs = get_preprocs(selected, apply_weight)
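# uncomment the two lines below to restrict the exploration to the first
# ten pre-processing pipelines (e.g. for a quick test run):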
# pkeys = sorted(preprocs.keys())[:10]
# preprocs = {k: preprocs[k] for k in pkeys}
prescorefile = report_dir / "scores_preprocess.xlsx"
if prescorefile.exists():
    print(f"Reading pre-processed scores from: {prescorefile}")
    pre_scores = pd.read_excel(prescorefile)
else:
    print(
        f"Pre-processed file ({prescorefile}) not found, start computing the main metrics"
        f"Pre-processed file ({prescorefile}) not found, "
        "start computing the main metrics"
    )
    pre_scores = explore_preprocs(selected, preprocs)
    pre_scores.to_excel(prescorefile)
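# NOTE: delete the Excel file to force the pre-processing scores to be
# recomputed from scratch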
@@ -70,58 +79,142 @@ pre = preprocs[pre_key]
# %%
# explore the clustering space
geo_out = raw.loc[selected.index, sez_cols]
models = get_algclusters(preprocs=pre)
sc, fi = explore_models(
    data=selected,
    cldf=geo_out,
    rdir=report_dir,
    models=models,
    n_jobs=-1,
    filters=[
        ("n_clusters", (5.0, 15.0)),
        # silhouette_score: the best value is 1 and the worst value is -1.
        # Values near 0 indicate overlapping clusters. Negative values
        # generally indicate that a sample has been assigned to the wrong
        # cluster, as a different cluster is more similar.
        ("silhouette", (0.5, None)),
        # davies_bouldin_score: the minimum score is zero, with lower values
        # indicating better clustering.
        # ("davies bouldin", (None, 0.6)),
        # calinski_harabasz_score: a higher Calinski-Harabasz score relates
        # to a model with better-defined clusters.
        # ("calinski harabasz", (10, None)),
        # mean_el_per_cl: if you need to filter clusters by the mean
        # number of elements per cluster
        # ("mean_el_per_cl", (10, None)),
        # % covered by clusters: percentage of elements that have been
        # assigned to a cluster
        ("% covered by clusters", (50, None)),
    ],
)
# split the output into several files to keep them from getting too heavy
cc = [c for c in geo_out.columns if c not in sez_cols]
algs = sorted(set([c.split("__")[0].split("_")[-1] for c in cc]))
for alg in algs:
    acols = [c for c in cc if alg in c]
    if len(acols) > 20:
        kl = [c.split("_")[0] for c in acols]
        for k in kl:
            klfile = report_dir / f"clusters_{alg}_{k}.gpkg"
            kcols = [c for c in acols if c.startswith(k)]
            print(f"Saving {klfile} with cols: {kcols}")
            geo_out.loc[:, sez_cols + kcols].to_file(klfile, driver="GPKG")
    # save all the columns to be able to open all the clusters at once
    clfile = report_dir / f"clusters_{alg}.gpkg"
    print(f"Saving {clfile}")
    geo_out.to_file(clfile, driver="GPKG")
clscorefile = report_dir / "scores_clusters.xlsx"
sc.to_excel(clscorefile)
fimpfile = report_dir / "feature_importances.xlsx"
fi.to_excel(fimpfile)
if clfile.exists():
    print(f"File: {clfile} exists, loading clusters from the file")
    print("WARNING: clusters read from file, not computed from scratch!")
    geo_out = gpd.read_file(clfile)
    # force the index to maintain consistency with the raw and selected DF
    geo_out.set_index("SEZ", drop=False, verify_integrity=True, inplace=True)
    print(f"Reading clusters' scores from: {clscorefile}")
    sc = pd.read_excel(clscorefile, header=0, index_col=0)
    print(f"Reading feature importances from: {fimpfile}")
    fi = pd.read_excel(fimpfile, header=0, index_col=0)
else:
    print(f"File: {clfile} does not exist. Start computing the clusters")
    geo_out = raw.loc[:, sez_cols]
    models = get_algclusters(preprocs=pre)
    sc, fi = explore_models(
        data=selected,
        cldf=geo_out,
        rdir=report_dir,
        models=models,
        n_jobs=-1,
        filters=[
            ("n_clusters", (5.0, 15.0)),
            # silhouette_score: the best value is 1 and the worst value is -1.
            # Values near 0 indicate overlapping clusters. Negative values
            # generally indicate that a sample has been assigned to the wrong
            # cluster, as a different cluster is more similar.
            ("silhouette", (0.5, None)),
            # davies_bouldin_score: the minimum score is zero, with lower
            # values indicating better clustering.
            # ("davies bouldin", (None, 0.6)),
            # calinski_harabasz_score: a higher Calinski-Harabasz score
            # relates to a model with better-defined clusters.
            # ("calinski harabasz", (10, None)),
            # mean_el_per_cl: if you need to filter clusters by the mean
            # number of elements per cluster
            # ("mean_el_per_cl", (10, None)),
            # % covered by clusters: percentage of elements that have been
            # assigned to a cluster
            ("% covered by clusters", (50, None)),
        ],
    )
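# For reference, the metric thresholds above follow the scikit-learn
# conventions; a minimal sketch (assuming `X` is the pre-processed matrix
# and `labels` a cluster assignment):
#     from sklearn.metrics import (
#         silhouette_score, davies_bouldin_score, calinski_harabasz_score)
#     silhouette_score(X, labels)         # in [-1, 1], higher is better
#     davies_bouldin_score(X, labels)     # >= 0, lower is better
#     calinski_harabasz_score(X, labels)  # >= 0, higher is better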
# %%
# Export the clusters into separate files
# split the output to keep individual files from getting too heavy
# keep only the cluster columns that passed the filters, indexed by label
scs = sc.set_index("label", drop=False, verify_integrity=True).loc[
    sc.selected.values, :
]
algs = scs.groupby("model")
for alg, adf in algs:
    if len(adf) > 20:
        for k, kdf in adf.groupby("n_clusters"):
            klfile = report_dir / f"clusters_{alg.lower()}_k{k:03d}.gpkg"
            print(f"Saving {klfile}")
            geo_out.loc[:, sez_cols + list(kdf.index)].to_file(
                klfile, driver="GPKG", index=True
            )
# save all the columns to be able to open all the clusters at once
print(f"Saving {clfile}")
geo_out.to_file(clfile, driver="GPKG", index=True)
print(f"Saving {clscorefile}")
sc.to_excel(clscorefile)
print(f"Saving {fimpfile}")
fi.to_excel(fimpfile)
# %%
# Select the clusters to be analyzed
# Choose the cluster IDs that need to be compared;
# the IDs can be taken from the `clscorefile`
sel_clsts = [
    # label
    # "hdbscan__mcs-15_ms-05_m-euclidean_csm-eom",  # k06 @51.6%
    # "hdbscan__mcs-15_ms-05_m-sqeuclidean_csm-eom",  # k07 @50.2%
    # "hdbscan__mcs-10_ms-05_m-sqeuclidean_csm-eom",  # k09 @60.3%
    # "hdbscan__mcs-10_ms-01_m-euclidean_csm-eom",  # k12 @74.5%
    # "hdbscan__mcs-07_ms-05_m-chebyshev_csm-eom",  # k15 @77.5%
]
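# NOTE: each label encodes the algorithm and its hyper-parameters; for hdbscan
# the tokens presumably map to mcs=min_cluster_size, ms=min_samples, m=metric
# and csm=cluster_selection_method, while the trailing comment reports the
# number of clusters (k) and the share of units covered (@%)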
if len(sel_clsts) == 0:
    print(
        f"You must select the clusters to be analyzed from {clscorefile} "
        "using the cluster label as unique ID"
    )
    sccols = [
        "label",
        "% covered by clusters",
        "n_clusters",
        "silhouette",
        "davies bouldin",
        "calinski harabasz",
    ]
    # save the default value
    old_max_rows = pd.get_option("display.max_rows")
    pd.set_option("display.max_rows", None)
    print(
        "Available clusters are:\n",
        sc.loc[sc["selected"], sccols].sort_values(
            by=["n_clusters", "silhouette", "davies bouldin", "calinski harabasz"],
            ascending=[True, False, True, False],
        ),
    )
    # restore the default value
    pd.set_option("display.max_rows", old_max_rows)
    sys.exit(1)
for clname in sel_clsts:
    if clname not in geo_out.columns:
        raise ValueError(
            f"The selected cluster is: {clname!r} but "
            "it is not available. Select one of the "
            f"following: {geo_out.columns!r}."
        )
# select only the clusters of interest
clsts = geo_out.loc[
    selected.index,
    sel_clsts
    + [
        "geometry",
    ],
]
print(f"Saving the selected clusters to: {selclfile}")
clsts.to_file(selclfile, driver="GPKG", index=True)
for clname in sel_clsts:
    print(f"Reporting: {clname}...")
    reporting(
        data=selected.loc[clsts.index],
        clname=clname,
        clsts=clsts,
        cdir=report_dir / clname,
        pal="turbo",
        preprocs=pre,
    )
justclust/data/bolzano.py
@@ -145,6 +145,34 @@ wcols = np.array(
    dtype=float,
)
# define nice text labels to avoid exposing the internal nomenclature
conv = {
    # air quality: street canyon risk per census unit [%]
    "P_Area_r1": "area very-low AQ-risk [%]",
    "P_Area_r2": "area low AQ-risk [%]",
    "P_Area_r3": "area medium-low AQ-risk [%]",
    "P_Area_r4": "area medium-high AQ-risk [%]",
    "P_Area_r6": "area high AQ-risk [%]",
    "P_Area_r9": "area very-high AQ-risk [%]",
    # carbon
    "finale/m²": "Carbon emission building [ton CO2/m²]",
    "kgCO2/m²": "Carbon absorption vegetation [kg CO2/m²]",
    # FFH
    "P_Prot_are": "Protected area [%]",
    # spatial
    "P_area_300": "Accessibility urban green areas (<5 min) [%]",
    "MEAN_Resul": "Walkability Index",
    # temporal
    "P_Area_dif": "Soil sealing between 2018 and 2022 [%]",
    # thermal
    "mean_norm": "Surface urban heat island",
    # socio-demographic
    "indice di vecchiaia": "Age Index",
    "%stranieri": "Foreign population [%]",
    "%con figli": "Families with children [%]",
    "%senza figli": "Families without children [%]",
    "%Single": "Families with one component [%]",
}
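# minimal usage sketch: the mapping is meant for pandas' rename, e.g.
#     selected.rename(columns=conv, inplace=True)
# as done in the main analysis script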
# %%
# Define function to read files
@@ -153,6 +181,7 @@ wcols = np.array(
def read_data():
    aq_scr = gpd.read_file(paths.aq_scr_shp)
    co2_ab = gpd.read_file(paths.co2_ab_shp)
    co2_ab["SEZ"] = co2_ab["SEZ"].astype(int)
    co2_ab["kgCO2/m²"] = co2_ab["CO2"] / co2_ab.area
    co2_em = gpd.read_file(paths.co2_em_shp)
    co2_em["finale/m²"] = co2_em["finale"] / co2_em.area
@@ -188,4 +217,6 @@ def read_data():
        ],
        axis="columns",
    )
    # use SEZ (the census-unit ID) as index to keep raw, selected and
    # geo_out aligned across the pipeline
    raw.index = co2_ab["SEZ"]
    raw.index.name = None
    return raw
justclust/explore/algclusters.py
@@ -370,7 +370,7 @@ def model_worker(
    if rdir is not None:
        clcount.to_excel(rdir / f"{long_label}.xlsx")
    fimp = feature_importances(tdata, labels, label)
    return long_label, labels, xcores, fimp
    return label, labels, xcores, fimp
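# NOTE: the worker now returns the short label instead of long_label,
# presumably so the returned key matches the cluster-column names used
# by the score table and the export step in the main script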
def apply_filter(
justclust/paths/bolzano.py
@@ -13,3 +13,12 @@ tem_ss_shp = rawdata_dir / "Temporal" / "soil_sealing.shp"
thm_sh_shp = rawdata_dir / "Thermal" / "suhi.shp"
socio_gpkg = rawdata_dir / "Socio" / "bz_pop_2021.gpkg"
# main outputs into the report folder
# - pre-processing
prescorefile = report_dir / "scores_preprocess.xlsx"
# - clustering
clfile = report_dir / "clusters_all.gpkg"
selclfile = report_dir / "selected_clusters.gpkg"
clscorefile = report_dir / "scores_clusters.xlsx"
fimpfile = report_dir / "feature_importances.xlsx"
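# NOTE: these centralize the output paths that the analysis script
# previously defined inline (prescorefile, clfile, clscorefile, fimpfile)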