small fixes

90744cad · Claudio Zandonella · 5b4aacef · 90744cad · 90744cad · 90744cad
Commit 90744cad authored 2 years ago by Claudio Zandonella
--- a/justclust/analysis/bolzano.py
+++ b/justclust/analysis/bolzano.py
@@ -27,11 +27,6 @@ raw = read_data()
 selected = raw.loc[:, cols].dropna()
 selected.rename(columns=conv, inplace=True)

-# %%
-# Remove error observation
-mask = selected.index == 8888888
-selected = selected[~ mask]
-
 # %%
 # Explore clustering pre-processing

@@ -77,7 +72,7 @@ pre_key = (

    "s:Normalizer()|"
    "w:true|"
-    "d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
+    "d:DictionaryLearning(alpha=0.1, fit_algorithm='cd')"

    # "s:QuantileTransformer()|w:false|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
 )
@@ -102,6 +97,7 @@ if my_paths.clfile.exists():
    fi = pd.read_excel(my_paths.fimpfile, header=0, index_col=0)
 else:
    print(f"File: {my_paths.clfile} does not exist. Start computing the clusters")
+    print(f"Preprocessing: {pre_key}")
    geo_out = raw.loc[:, sez_cols]
    models = get_algclusters(preprocs=pre)


--- a/justclust/data/bolzano.py
+++ b/justclust/data/bolzano.py
@@ -7,14 +7,22 @@ import justclust.paths.paths as paths
 my_paths = paths.define_paths(city = 'bolzano')
 # %%

+# Columns carried over into the outputs
+sez_cols = [
+    "SEZ", "geometry",
+    "SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"
+    ]
+
+id_col = ['SEZ']
+
 #----    Air Quality    ----

 # Define available files
-aq_scr_shp = my_paths.rawdata_dir / "Air quality" / "street_canyon_risk.shp"
+prisk_scr_shp = my_paths.rawdata_dir / "Air quality" / "street_canyon_risk.shp"

 # Define columns and other constant values
-aq_raw_cols = ["Area_r1", "Area_r2", "Area_r3", "Area_r4", "Area_r6", "Area_r9"]
-aq_nor_cols = [
+prisk_raw_cols = ["Area_r1", "Area_r2", "Area_r3", "Area_r4", "Area_r6", "Area_r9"]
+prisk_nor_cols = [
    "P_Area_r1",
    "P_Area_r2",
    "P_Area_r3",
@@ -39,9 +47,6 @@ ems = ["gasolio_em", "gpl_em", "ee_em", "metano_em", "legno_em", "tot_em", "c_ee
 co2_em_raw_cols = ["finale"]
 co2_em_nor_cols = ["finale/m²"]

-# columns selected for the columns of the outputs
-sez_cols = ["geometry", "SEZ", "SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"]
-
 #----    FFH    ----

 # Define available files
@@ -116,13 +121,13 @@ socio_perc_cols = [

 cols_dict = {
    # Air quality
-    'air_q':{
-        "P_Area_r1": "area very-low P-risk [%]",
-        "P_Area_r2": "area low P-risk [%]",
-        "P_Area_r3": "area medium-low P-risk [%]",
-        "P_Area_r4": "area medium-high P-risk [%]",
-        "P_Area_r6": "area high P-risk [%]",
-        "P_Area_r9": "area very-high P-risk [%]"
+    'prisk':{
+        "P_Area_r1": "Area very-low P-risk [%]",
+        "P_Area_r2": "Area low P-risk [%]",
+        "P_Area_r3": "Area medium-low P-risk [%]",
+        "P_Area_r4": "Area medium-high P-risk [%]",
+        "P_Area_r6": "Area high P-risk [%]",
+        "P_Area_r9": "Area very-high P-risk [%]"
        },
    # Carbon
    'carbon':{
@@ -165,7 +170,7 @@ conv = {k:v for values in cols_dict.values() for k,v in values.items()}
 # define subset of columns according to categories

 selected_col = {
-    'col_aq': list(cols_dict.get('air_q').values()),
+    'col_prisk': list(cols_dict.get('prisk').values()),
    'col_carbon':list(cols_dict.get('carbon').values()),
    'col_char':\
        list(cols_dict.get('ffh').values())+\
@@ -181,7 +186,7 @@ selected_col = {


 def read_data():
-    aq_scr = gpd.read_file(aq_scr_shp)
+    prisk_scr = gpd.read_file(prisk_scr_shp)
    co2_ab = gpd.read_file(co2_ab_shp)
    co2_ab["SEZ"] = co2_ab["SEZ"].astype(int)
    co2_ab["kgCO2/m²"] = co2_ab["CO2"] / co2_ab.area
@@ -207,7 +212,7 @@ def read_data():
    raw = pd.concat(
        [
            co2_ab.loc[:, sez_cols],
-            aq_scr.loc[:, aq_raw_cols + aq_nor_cols],
+            prisk_scr.loc[:, prisk_raw_cols + prisk_nor_cols],
            co2_ab.loc[:, co2_raw_cols + co2_nor_cols],
            co2_em.loc[:, co2_em_raw_cols + co2_em_nor_cols + ems],
            ffh_pa.loc[:, ffh_raw_cols + ffh_nor_cols],
@@ -221,4 +226,9 @@ def read_data():
    )
    raw.index = co2_ab["SEZ"]
    raw.index.name = None
+
+    # Remove the erroneous observation (index 8888888)
+    mask = raw.index == 8888888
+    raw = raw[~ mask]
+
    return raw
--- a/justclust/explore/preprocs.py
+++ b/justclust/explore/preprocs.py
@@ -32,7 +32,7 @@ def get_preprocs(selected=None, apply_weight=None):
        pre.StandardScaler(with_mean=True, with_std=True),
        pre.Normalizer(norm="l2"),
        pre.Normalizer(norm="l1"),
-        pre.Normalizer(norm="max"),
+        pre.Normalizer(norm="max"), # creates sparse matrices that lead to an error in FastICA: https://github.com/scikit-learn/scikit-learn/issues/2089#issuecomment-1126183560
        pre.QuantileTransformer(
            n_quantiles=1000,
            output_distribution="uniform",
@@ -81,11 +81,11 @@ def get_preprocs(selected=None, apply_weight=None):
        dec.PCA(),
        dec.PCA(n_components="mle", svd_solver="full"),
        dec.PCA(svd_solver="randomized", whiten=True),
-        dec.FastICA(whiten="warn"),
-        dec.FastICA(whiten="unit-variance"),
-        dec.FactorAnalysis(max_iter=200),
-        dec.DictionaryLearning(fit_algorithm="lars", alpha=0.1, n_jobs=-1),
-        dec.DictionaryLearning(fit_algorithm="cd", alpha=0.1, n_jobs=-1),
+        dec.FastICA(whiten="warn", max_iter= 500),
+        dec.FastICA(whiten="unit-variance", max_iter= 500),
+        dec.FactorAnalysis(max_iter=500),
+        dec.DictionaryLearning(fit_algorithm="lars", alpha=0.1, n_jobs=None),
+        dec.DictionaryLearning(fit_algorithm="cd", alpha=0.1, n_jobs=None),
    ]
    preprocs = {
        f"s:{s}|w:{'true' if w else 'false'}|d:{d}": (s, w, d)

--- a/justclust/plots.py
+++ b/justclust/plots.py
@@ -311,7 +311,11 @@ def plot_profiles2(
        row = floor(c / ncols)
        col = floor(c - (row * ncols))
        print(f"cl: {cl}, row: {row}, col: {col}")
-        ax1 = axes[row][col]
+
+        if nrows > 1:
+            ax1 = axes[row][col]
+        else:
+            ax1 = axes[col]

        data = [clf[vname][cl] for vname in vnames]

@@ -530,7 +534,12 @@ def plot_stats(
    for c, cl in enumerate(cl_cat):
        irow = floor(c / ncols)
        icol = floor(c - (irow * ncols))
-        ax = axes[irow][icol]
+        
+        if nrows > 1:
+            ax = axes[irow][icol]
+        else:
+            ax = axes[icol]
+
        light_bg = is_light(*cmap[cl], hsp_threshold=hsp_threshold)
        sdf = stdfs[cl]
        if idx_order is None:

--- a/justclust/quarto.py
+++ b/justclust/quarto.py