Commit 339d6454 authored by Claudio Zandonella

Adapt code for Bolzano results:

- remove variable 'family without children' (same as 'family with children')
- set selected preprocessing and model
- remove redundant metrics
- remove outlier
- update module dependencies
- check that preprocessing steps are not None before running them (see the sketch below)
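The last bullet refers to the guard pattern added around optional pipeline steps in the diff below. A minimal, hypothetical sketch of that pattern, assuming the steps are kept in a list that may contain None entries (names are illustrative, not the project's API):

# illustration only (not part of the commit)
from sklearn.preprocessing import Normalizer

steps = [Normalizer(), None]            # a disabled step is stored as None
data = [[1.0, 2.0], [3.0, 4.0]]
for step in steps:
    if step is not None:                # skip disabled steps instead of calling them
        data = step.fit_transform(data)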
parent 38d224ce
Branches: main
.ruff_cache
.venv
.DS_Store
__pycache__
# Output folder bolzano
report/bolzano
/.luarc.json
# %%
import os
import sys
import geopandas as gpd
import numpy as np
import pandas as pd
from justclust.data.bolzano import cols, conv, read_data, sez_cols, wcols
@@ -28,6 +31,11 @@ raw = read_data()
selected = raw.loc[:, cols].dropna()
selected.rename(columns=conv, inplace=True)
# %%
# Remove erroneous observation (outlier)
mask = selected['Carbon emission building [ton CO2/m²]'] > 8
selected = selected[~ mask]
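A quick, optional sanity check of how many observations the mask above drops (a sketch reusing the mask and selected objects defined just above):

# illustration only (not part of the diff)
print(f"Removed {int(mask.sum())} outlier row(s); {len(selected)} observations remain")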
# %%
# Explore clustering pre-processing
@@ -67,9 +75,15 @@ print(pre_scores["hopkins"].describe())
# %%
# select a preprocessing
pre_key = (
"s:RobustScaler(quantile_range=(2, 98))|"
# "s:RobustScaler(quantile_range=(2, 98))|"
# "w:true|"
# "d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
"s:Normalizer()|"
"w:true|"
"d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
# "s:QuantileTransformer()|w:false|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
)
if pre_key not in preprocs.keys():
skeys = "\n".join([f" - {k!r}" for k in sorted(preprocs.keys())])
@@ -84,7 +98,7 @@ if clfile.exists():
print(f"File: {clfile} exists load cluster from the file")
print("WARNING: clusters read from file, not computed from scratch!")
geo_out = gpd.read_file(clfile)
# force index to mantain consistency with raw and selected DF
# force index to maintain consistency with raw and selected DF
geo_out.set_index("SEZ", drop=False, verify_integrity=True, inplace=True)
print(f"Reading clusters' scores from: {clscorefile}")
sc = pd.read_excel(clscorefile, header=0, index_col=0)
@@ -102,7 +116,7 @@ else:
models=models,
n_jobs=-1,
filters=[
("n_clusters", (5.0, 15.0)),
("n_clusters", (5.0, 12.0)),
# silhouette_score: The best value is 1 and the worst value is -1.
# Values near 0 indicate overlapping clusters. Negative values generally
# indicate that a sample has been assigned to the wrong cluster, as a
@@ -117,9 +131,9 @@ else:
# mean_el_per_cl: if you need to filter clusters with a mean
# number of elements
# ("mean_el_per_cl", (10, None)),
# % covered by clusters: percentage o, f elements that have been assigned
# % covered by clusters: percentage of elements that have been assigned
# to a cluster
("% covered by clusters", (50, None)),
("% covered by clusters", (80, None)),
],
)
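For readability, the two value filters passed above are range thresholds on the cluster-scores table: keep solutions with 5 to 12 clusters that cover at least 80% of the elements. A hedged sketch of the equivalent post-hoc selection on a scores DataFrame sc (column names taken from the filters; this is an illustration, not the library's internal logic):

# illustration only (not part of the diff)
flt = (
    sc["n_clusters"].between(5, 12)
    & (sc["% covered by clusters"] >= 80)
)
candidates = sc.loc[flt]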
@@ -160,6 +174,10 @@ sel_clsts = [
# "hdbscan__mcs-10_ms-05_m-sqeuclidean_csm-eom", # k09 @60.3%
# "hdbscan__mcs-10_ms-01_m-euclidean_csm-eom", # k12 @74.5%
# "hdbscan__mcs-07_ms-05_m-chebyshev_csm-eom", # k15 @77.5%
"hdbscan__mcs-10_ms-00_m-euclidean_csm-eom", # k6 96% .91 .17 2653
# "hdbscan__mcs-12_ms-02_m-euclidean_csm-eom", # k7 91% .85 .27 2338
# "hdbscan__mcs-02_ms-05_m-euclidean_csm-eom", # k10 96% .87 .25 1674
]
if len(sel_clsts) == 0:
@@ -180,7 +198,7 @@ if len(sel_clsts) == 0:
pd.set_option("display.max_rows", None)
print(
"Available clusters are:\n",
sc.loc[sc["selected"] is True, sccols].sort_values(
sc.loc[sc["selected"], sccols].sort_values(
by=["n_clusters", "silhouette", "davies bouldin", "calinski harabasz"],
ascending=[True, False, True, False],
),
@@ -218,3 +236,5 @@ for clname in sel_clsts:
pal="turbo",
preprocs=pre,
)
# %%
@@ -106,7 +106,7 @@ cols = [
"indice di vecchiaia",
"%stranieri",
"%con figli",
"%senza figli",
# "%senza figli", # same as con figli
"%Single",
]
@@ -136,11 +136,11 @@ wcols = np.array(
# socio demographic
# "%65+",
# "%minori",
1.0 / 5.0,
1.0 / 5.0,
1.0 / 5.0,
1.0 / 5.0,
1.0 / 5.0,
1.0 / 4.0,
1.0 / 4.0,
1.0 / 4.0,
# "%senza figli"
1.0 / 4.0,
],
dtype=float,
)
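Because '%senza figli' is dropped, the socio-demographic block now holds four variables, so each weight moves from 1/5 to 1/4 and the block still sums to one. A trivial check of that reasoning (values copied from the array above):

# illustration only (not part of the diff)
import numpy as np
socio_weights = np.array([1.0 / 4.0] * 4)
assert np.isclose(socio_weights.sum(), 1.0)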
@@ -170,7 +170,7 @@ conv = {
"indice di vecchiaia": "Age Index",
"%stranieri": "Foreign population [%]",
"%con figli": "Families with children [%]",
"%senza figli": "Families without children [%]",
# "%senza figli": "Families without children [%]",
"%Single": "Families with one component [%]",
}
@@ -92,10 +92,10 @@ def get_algclusters(preprocs=None):
[
"euclidean",
# "haversine", # only 2D
"cityblock",
# "cityblock", # same as manhattan
# "cosine",
"l1",
"l2",
# "l1", # same as manhattan
# "l2", # same as euclidean
"manhattan",
"braycurtis",
"canberra",
@@ -175,7 +175,11 @@ def get_algclusters(preprocs=None):
def feature_importances(
data: pd.DataFrame, labels: List[int], label: str, clf: Callable = None
data: pd.DataFrame,
labels: List[int],
label: str,
clf: Callable = None,
max_depth = 10
) -> pd.DataFrame:
"""Compute feature importance.
@@ -186,7 +190,7 @@ def feature_importances(
data : pd.DataFrame
DataFrame with the transformed data used for clustering
labels : List[int]
Label with the assignd cluster for each element in the data
Label with the assigned cluster for each element in the data
label : str
name of the column that will be added in the returned DataFrame
clf: Callable
@@ -200,7 +204,7 @@ def feature_importances(
DataFrame with the feature importances computed
"""
if clf is None:
clf = ens.RandomForestClassifier(max_depth=10, n_estimators=500, random_state=1)
clf = ens.RandomForestClassifier(max_depth=max_depth, n_estimators=500, random_state=1)
clf.fit(data.values, labels)
return pd.DataFrame(
clf.feature_importances_,
@@ -281,7 +285,8 @@ def exec_model(
# apply all the post-processing actions
for post in model.get("post-processing", []):
post(clst, data, labels)
if post is not None:
post(clst, data, labels)
return clst, labels
@@ -514,9 +519,10 @@ def explore_models(
tdata = data.copy()
# apply all the transformations
for pre in model.get("pre-processing", []):
tdata = pre.fit_transform(tdata)
if pre is not None:
tdata = pre.fit_transform(tdata)
# avoit to pre-process the data for every attempt
# avoid to pre-process the data for every attempt
trans[mname] = tdata
# compute Hopkins statistics to test the clusterability of the daset
hop = hopkins(tdata, sampling_size=150)
@@ -715,7 +715,8 @@ def reporting(
preprocs = [] if preprocs is None else preprocs
tdata = data.copy()
for pre in preprocs:
tdata = pre.fit_transform(tdata)
if pre is not None:
tdata = pre.fit_transform(tdata)
tdata = pd.DataFrame(
tdata,
@@ -15,10 +15,13 @@ scikit-learn = "^1.2.0"
tqdm = "^4.64.1"
joblib = "^1.2.0"
openpyxl = "^3.0.10"
nbformat = "^5.7.3"
nbclient = "^0.7.2"
[tool.poetry.group.dev.dependencies]
pre-commit = "^2.20.0"
ruff = "^0.0.189"
ipykernel = "^6.20.2"
[virtualenvs]
create = true