
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (2)
@@ -27,11 +27,6 @@ raw = read_data()
selected = raw.loc[:, cols].dropna()
selected.rename(columns=conv, inplace=True)
# %%
# Remove error observation
mask = selected.index == 8888888
selected = selected[~ mask]
# %%
# Explore clustering pre-processing
@@ -77,7 +72,7 @@ pre_key = (
"s:Normalizer()|"
"w:true|"
"d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
"d:DictionaryLearning(alpha=0.1, fit_algorithm='cd')"
# "s:QuantileTransformer()|w:false|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
)
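
The pre_key above selects one preprocessing combination by its label; the format matches the "s:{scaler}|w:{weight flag}|d:{decomposition}" keys built in get_preprocs further down. A minimal sketch of composing such a key from scikit-learn estimators (make_pre_key is an illustrative helper, not part of the project):

# Illustrative only: compose a lookup key in the same "s:...|w:...|d:..." format.
from sklearn import preprocessing as pre, decomposition as dec

def make_pre_key(scaler, weighted: bool, decomp) -> str:
    return f"s:{scaler}|w:{'true' if weighted else 'false'}|d:{decomp}"

key = make_pre_key(
    pre.Normalizer(),
    True,
    dec.DictionaryLearning(alpha=0.1, fit_algorithm="cd"),
)
# -> "s:Normalizer()|w:true|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd')"

Dropping n_jobs=-1 from the label matters because the key embeds the estimator's repr: any non-default constructor argument changes the string used for the lookup.
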
@@ -102,6 +97,7 @@ if my_paths.clfile.exists():
fi = pd.read_excel(my_paths.fimpfile, header=0, index_col=0)
else:
print(f"File: {my_paths.clfile} does not exist. Start computing the clusters")
print(f"Preprocessing: {pre_key}")
geo_out = raw.loc[:, sez_cols]
models = get_algclusters(preprocs=pre)
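
This block reuses the cached clusters from clfile when the file already exists and only recomputes otherwise. A minimal sketch of that load-or-compute pattern (the path and compute function are placeholders, not names from this repository):

# Sketch of the load-or-compute pattern; path and compute() are illustrative.
from pathlib import Path
import pandas as pd

def load_or_compute(path: Path, compute) -> pd.DataFrame:
    if path.exists():
        # reuse previously computed clusters
        return pd.read_excel(path, header=0, index_col=0)
    print(f"File: {path} does not exist. Start computing the clusters")
    res = compute()
    res.to_excel(path)
    return res
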
@@ -133,7 +129,7 @@ else:
],
)
# %%
# Export the clusters into separate files
# split into separate files so that none gets too heavy
......
@@ -7,14 +7,22 @@ import justclust.paths.paths as paths
my_paths = paths.define_paths(city = 'bolzano')
# %%
# columns selected for the outputs
sez_cols = [
"SEZ", "geometry",
"SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"
]
id_col = ['SEZ']
#---- Air Quality ----
# Define available files
aq_scr_shp = my_paths.rawdata_dir / "Air quality" / "street_canyon_risk.shp"
prisk_scr_shp = my_paths.rawdata_dir / "Air quality" / "street_canyon_risk.shp"
# Define columns and other constant values
aq_raw_cols = ["Area_r1", "Area_r2", "Area_r3", "Area_r4", "Area_r6", "Area_r9"]
aq_nor_cols = [
prisk_raw_cols = ["Area_r1", "Area_r2", "Area_r3", "Area_r4", "Area_r6", "Area_r9"]
prisk_nor_cols = [
"P_Area_r1",
"P_Area_r2",
"P_Area_r3",
@@ -39,9 +47,6 @@ ems = ["gasolio_em", "gpl_em", "ee_em", "metano_em", "legno_em", "tot_em", "c_ee
co2_em_raw_cols = ["finale"]
co2_em_nor_cols = ["finale/m²"]
# columns selected for the columns of the outputs
sez_cols = ["geometry", "SEZ", "SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"]
#---- FFH ----
# Define available files
@@ -116,13 +121,13 @@ socio_perc_cols = [
cols_dict = {
# Air quality
'air_q':{
"P_Area_r1": "area very-low P-risk [%]",
"P_Area_r2": "area low P-risk [%]",
"P_Area_r3": "area medium-low P-risk [%]",
"P_Area_r4": "area medium-high P-risk [%]",
"P_Area_r6": "area high P-risk [%]",
"P_Area_r9": "area very-high P-risk [%]"
'prisk':{
"P_Area_r1": "Area very-low P-risk [%]",
"P_Area_r2": "Area low P-risk [%]",
"P_Area_r3": "Area medium-low P-risk [%]",
"P_Area_r4": "Area medium-high P-risk [%]",
"P_Area_r6": "Area high P-risk [%]",
"P_Area_r9": "Area very-high P-risk [%]"
},
# Carbon
'carbon':{
@@ -165,7 +170,7 @@ conv = {k:v for values in cols_dict.values() for k,v in values.items()}
# define subset of columns according to categories
selected_col = {
'col_aq': list(cols_dict.get('air_q').values()),
'col_prisk': list(cols_dict.get('prisk').values()),
'col_carbon':list(cols_dict.get('carbon').values()),
'col_char':\
list(cols_dict.get('ffh').values())+\
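
cols_dict maps raw column names to readable labels grouped by theme, and conv flattens it into the single rename mapping used by selected.rename(columns=conv). A small sketch of that pattern with two entries (the carbon label below is made up for the example):

# Flatten {category: {raw name: label}} into one rename dict and apply it.
import pandas as pd

cols_dict = {
    "prisk": {"P_Area_r1": "Area very-low P-risk [%]"},
    "carbon": {"finale/m²": "final CO2 emissions per m²"},  # illustrative label
}
conv = {k: v for values in cols_dict.values() for k, v in values.items()}

df = pd.DataFrame(columns=["P_Area_r1", "finale/m²"])
df = df.rename(columns=conv)  # columns now carry the readable labels

Renaming the 'air_q' category to 'prisk' is also why 'col_aq' becomes 'col_prisk' in selected_col: the subsets are derived from the same dictionary keys.
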
@@ -181,7 +186,7 @@ selected_col = {
def read_data():
aq_scr = gpd.read_file(aq_scr_shp)
prisk_scr = gpd.read_file(prisk_scr_shp)
co2_ab = gpd.read_file(co2_ab_shp)
co2_ab["SEZ"] = co2_ab["SEZ"].astype(int)
co2_ab["kgCO2/m²"] = co2_ab["CO2"] / co2_ab.area
@@ -207,7 +212,7 @@ def read_data():
raw = pd.concat(
[
co2_ab.loc[:, sez_cols],
aq_scr.loc[:, aq_raw_cols + aq_nor_cols],
prisk_scr.loc[:, prisk_raw_cols + prisk_nor_cols],
co2_ab.loc[:, co2_raw_cols + co2_nor_cols],
co2_em.loc[:, co2_em_raw_cols + co2_em_nor_cols + ems],
ffh_pa.loc[:, ffh_raw_cols + ffh_nor_cols],
@@ -221,4 +226,9 @@ def read_data():
)
raw.index = co2_ab["SEZ"]
raw.index.name = None
# Remove the erroneous observation
mask = raw.index == 8888888
raw = raw[~ mask]
return raw
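
read_data now also drops the sentinel section 8888888 before returning, so every caller receives the already-cleaned table. A minimal sketch of the read / index / filter pattern (the shapefile path is a placeholder):

# Sketch of the loader pattern used in read_data; "zones.shp" is illustrative.
import geopandas as gpd

gdf = gpd.read_file("zones.shp")
gdf["SEZ"] = gdf["SEZ"].astype(int)
gdf.index = gdf["SEZ"]
gdf.index.name = None

# remove the known erroneous observation once, inside the loader
mask = gdf.index == 8888888
gdf = gdf[~mask]

Doing the filtering here removes the need for the per-script mask that the first hunk deletes from the analysis script.
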
@@ -525,7 +525,10 @@ def explore_models(
# avoid pre-processing the data for every attempt
trans[mname] = tdata
# compute the Hopkins statistic to test the clusterability of the dataset
hop = hopkins(tdata, sampling_size=150)
# ensure the sample size is no greater than the number of observations
sampling_size = min(tdata.shape[0], 150)
hop = hopkins(tdata, sampling_size=sampling_size)
print(f"{mname}, Hopkins: {hop:.5f}")
# print_hopkins(mname, hopkins)
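
The Hopkins statistic compares nearest-neighbour distances of real observations against uniformly simulated points spanning the same per-feature ranges; values close to 1 suggest clusterable data, values around 0.5 suggest essentially random data. A self-contained sketch of the standard definition (independent of the project's hopkins helper), including the same sample-size guard introduced above:

# Standard Hopkins statistic sketch; not the project's hopkins() implementation.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def hopkins_sketch(x: np.ndarray, sampling_size: int = 150, seed: int = 0) -> float:
    rng = np.random.default_rng(seed)
    sampling_size = min(sampling_size, x.shape[0])  # never sample more rows than exist
    idx = rng.choice(x.shape[0], sampling_size, replace=False)
    # uniform points drawn within the per-feature min/max of the data
    uniform = rng.uniform(x.min(axis=0), x.max(axis=0), (sampling_size, x.shape[1]))
    nn = NearestNeighbors(n_neighbors=2).fit(x)
    # distance from each sampled real point to its nearest other real point
    w = nn.kneighbors(x[idx], n_neighbors=2)[0][:, 1]
    # distance from each uniform point to its nearest real point
    u = nn.kneighbors(uniform, n_neighbors=1)[0][:, 0]
    return u.sum() / (u.sum() + w.sum())
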
......
@@ -32,7 +32,7 @@ def get_preprocs(selected=None, apply_weight=None):
pre.StandardScaler(with_mean=True, with_std=True),
pre.Normalizer(norm="l2"),
pre.Normalizer(norm="l1"),
pre.Normalizer(norm="max"),
pre.Normalizer(norm="max"), # creates sparse matrices that lead to errors in FastICA https://github.com/scikit-learn/scikit-learn/issues/2089#issuecomment-1126183560
pre.QuantileTransformer(
n_quantiles=1000,
output_distribution="uniform",
@@ -81,11 +81,11 @@ def get_preprocs(selected=None, apply_weight=None):
dec.PCA(),
dec.PCA(n_components="mle", svd_solver="full"),
dec.PCA(svd_solver="randomized", whiten=True),
dec.FastICA(whiten="warn"),
dec.FastICA(whiten="unit-variance"),
dec.FactorAnalysis(max_iter=200),
dec.DictionaryLearning(fit_algorithm="lars", alpha=0.1, n_jobs=-1),
dec.DictionaryLearning(fit_algorithm="cd", alpha=0.1, n_jobs=-1),
dec.FastICA(whiten="warn", max_iter= 500),
dec.FastICA(whiten="unit-variance", max_iter= 500),
dec.FactorAnalysis(max_iter=500),
dec.DictionaryLearning(fit_algorithm="lars", alpha=0.1, n_jobs=None),
dec.DictionaryLearning(fit_algorithm="cd", alpha=0.1, n_jobs=None),
]
preprocs = {
f"s:{s}|w:{'true' if w else 'false'}|d:{d}": (s, w, d)
@@ -142,7 +142,10 @@ def preprocs_worker(
else:
dn = "None"
hop = hopkins(df, sampling_size=150)
# ensure the sample size is no greater than the number of observations
sampling_size = min(df.shape[0], 150)
hop = hopkins(df, sampling_size=sampling_size)
long_label = f"s:{s!r}|w:{w!r}|d:{d!r}"
pre_id = shake_128(long_label.encode()).hexdigest(4)
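
shake_128 is a variable-length (XOF) hash, so hexdigest(4) yields a stable 8-character hexadecimal id for the long preprocessing label; a quick usage sketch:

# Derive a short, stable identifier from the long pipeline label.
from hashlib import shake_128

long_label = "s:Normalizer()|w:true|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd')"
pre_id = shake_128(long_label.encode()).hexdigest(4)  # 4 bytes -> 8 hex characters
print(pre_id)
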
......
@@ -53,26 +53,17 @@ def get_nearest_sample(df: pd.DataFrame, uniformly_selected_observations: pd.Dat
def simulate_df_with_same_variation(
df: pd.DataFrame, sampling_size: int
) -> pd.DataFrame:
max_data_frame = df.max()
min_data_frame = df.min()
uniformly_selected_values_0 = np.random.uniform(
min_data_frame[0], max_data_frame[0], sampling_size
)
uniformly_selected_values_1 = np.random.uniform(
min_data_frame[1], max_data_frame[1], sampling_size
)
uniformly_selected_observations = np.column_stack(
(uniformly_selected_values_0, uniformly_selected_values_1)
)
if len(max_data_frame) >= 2:
for i in range(2, len(max_data_frame)):
uniformly_selected_values_i = np.random.uniform(
min_data_frame[i], max_data_frame[i], sampling_size
)
to_stack = (uniformly_selected_observations, uniformly_selected_values_i)
uniformly_selected_observations = np.column_stack(to_stack)
uniformly_selected_observations_df = pd.DataFrame(uniformly_selected_observations)
return uniformly_selected_observations_df
max_data_frame = df.max()
obs_all = []
for min, max in zip(min_data_frame, max_data_frame):
obs = np.random.uniform(min, max, sampling_size)
obs_all.append(obs)
res = pd.DataFrame(obs_all).T
return res
def get_distance_sample_to_nearest_neighbours(df: pd.DataFrame, data_frame_sample):
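
np.random.uniform also accepts per-column bounds, so the column-wise loop in the rewritten simulate_df_with_same_variation could be expressed as a single draw; a vectorized sketch with the same intent (as a side note, the loop variables min and max in the version above shadow the Python builtins, which names like lo/hi avoid):

# Vectorized alternative sketch: one uniform draw with per-column bounds.
import numpy as np
import pandas as pd

def simulate_uniform_like(df: pd.DataFrame, sampling_size: int) -> pd.DataFrame:
    lo = df.min().to_numpy()
    hi = df.max().to_numpy()
    # bounds broadcast across the (sampling_size, n_columns) output shape
    sample = np.random.uniform(lo, hi, size=(sampling_size, len(lo)))
    return pd.DataFrame(sample)
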
......
@@ -311,7 +311,11 @@ def plot_profiles2(
row = floor(c / ncols)
col = floor(c - (row * ncols))
print(f"cl: {cl}, row: {row}, col: {col}")
ax1 = axes[row][col]
if nrows > 1:
ax1 = axes[row][col]
else:
ax1 = axes[col]
data = [clf[vname][cl] for vname in vnames]
@@ -530,7 +534,12 @@ def plot_stats(
for c, cl in enumerate(cl_cat):
irow = floor(c / ncols)
icol = floor(c - (irow * ncols))
ax = axes[irow][icol]
if nrows > 1:
ax = axes[irow][icol]
else:
ax = axes[icol]
light_bg = is_light(*cmap[cl], hsp_threshold=hsp_threshold)
sdf = stdfs[cl]
if idx_order is None:
......
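
Both plot_profiles2 and plot_stats need the nrows branch because plt.subplots returns a 1-D array of axes when only a single row is requested. Flattening the axes array to a fixed shape handles every grid with one code path; a small sketch of that alternative:

# Sketch: normalise the axes array so one indexing scheme works for any grid.
import numpy as np
import matplotlib.pyplot as plt

nrows, ncols = 1, 3
fig, axes = plt.subplots(nrows, ncols)
axes = np.atleast_2d(axes)       # a 1-D (or scalar) axes result becomes shape (1, ncols)
for c in range(nrows * ncols):
    row, col = divmod(c, ncols)  # same row/col arithmetic as floor(c / ncols) above
    axes[row][col].set_title(f"cluster {c}")
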
This diff is collapsed.