
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (2)
@@ -27,11 +27,6 @@ raw = read_data()
selected = raw.loc[:, cols].dropna()
selected.rename(columns=conv, inplace=True)
# %%
# Remove error observation
mask = selected.index == 8888888
selected = selected[~ mask]
# %%
# Explore clustering pre-processing
@@ -77,7 +72,7 @@ pre_key = (
"s:Normalizer()|"
"w:true|"
"d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
"d:DictionaryLearning(alpha=0.1, fit_algorithm='cd')"
# "s:QuantileTransformer()|w:false|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
)
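
The pre_key above selects one preprocessing combination by its label; the format matches the "s:{scaler}|w:{weight flag}|d:{decomposition}" keys built in get_preprocs further down. A minimal sketch of composing such a key from scikit-learn estimators (make_pre_key is an illustrative helper, not part of the project):

# Illustrative only: compose a lookup key in the same "s:...|w:...|d:..." format.
from sklearn import preprocessing as pre, decomposition as dec

def make_pre_key(scaler, weighted: bool, decomp) -> str:
    return f"s:{scaler}|w:{'true' if weighted else 'false'}|d:{decomp}"

key = make_pre_key(
    pre.Normalizer(),
    True,
    dec.DictionaryLearning(alpha=0.1, fit_algorithm="cd"),
)
# -> "s:Normalizer()|w:true|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd')"

Dropping n_jobs=-1 from the label matters because the key embeds the estimator's repr: any non-default constructor argument changes the string used for the lookup.
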
@@ -102,6 +97,7 @@ if my_paths.clfile.exists():
fi = pd.read_excel(my_paths.fimpfile, header=0, index_col=0)
else:
print(f"File: {my_paths.clfile} does not exist. Start computing the clusters")
print(f"Preprocessing: {pre_key}")
geo_out = raw.loc[:, sez_cols]
models = get_algclusters(preprocs=pre)
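
This block reuses the cached clusters from clfile when the file already exists and only recomputes otherwise. A minimal sketch of that load-or-compute pattern (the path and compute function are placeholders, not names from this repository):

# Sketch of the load-or-compute pattern; path and compute() are illustrative.
from pathlib import Path
import pandas as pd

def load_or_compute(path: Path, compute) -> pd.DataFrame:
    if path.exists():
        # reuse previously computed clusters
        return pd.read_excel(path, header=0, index_col=0)
    print(f"File: {path} does not exist. Start computing the clusters")
    res = compute()
    res.to_excel(path)
    return res
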
@@ -133,7 +129,7 @@ else:
],
)
# %%
# Export the clusters into separate files
# split into separate files so that none gets too heavy
......
@@ -7,14 +7,22 @@ import justclust.paths.paths as paths
my_paths = paths.define_paths(city = 'bolzano')
# %%
# columns selected for the outputs
sez_cols = [
"SEZ", "geometry",
"SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"
]
id_col = ['SEZ']
#---- Air Quality ----
# Define available files
aq_scr_shp = my_paths.rawdata_dir / "Air quality" / "street_canyon_risk.shp"
prisk_scr_shp = my_paths.rawdata_dir / "Air quality" / "street_canyon_risk.shp"
# Define columns and other constant values
aq_raw_cols = ["Area_r1", "Area_r2", "Area_r3", "Area_r4", "Area_r6", "Area_r9"]
aq_nor_cols = [
prisk_raw_cols = ["Area_r1", "Area_r2", "Area_r3", "Area_r4", "Area_r6", "Area_r9"]
prisk_nor_cols = [
"P_Area_r1",
"P_Area_r2",
"P_Area_r3",
@@ -39,9 +47,6 @@ ems = ["gasolio_em", "gpl_em", "ee_em", "metano_em", "legno_em", "tot_em", "c_ee
co2_em_raw_cols = ["finale"]
co2_em_nor_cols = ["finale/m²"]
# columns selected for the columns of the outputs
sez_cols = ["geometry", "SEZ", "SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"]
#---- FFH ----
# Define available files
@@ -116,13 +121,13 @@ socio_perc_cols = [
cols_dict = {
# Air quality
'air_q':{
"P_Area_r1": "area very-low P-risk [%]",
"P_Area_r2": "area low P-risk [%]",
"P_Area_r3": "area medium-low P-risk [%]",
"P_Area_r4": "area medium-high P-risk [%]",
"P_Area_r6": "area high P-risk [%]",
"P_Area_r9": "area very-high P-risk [%]"
'prisk':{
"P_Area_r1": "Area very-low P-risk [%]",
"P_Area_r2": "Area low P-risk [%]",
"P_Area_r3": "Area medium-low P-risk [%]",
"P_Area_r4": "Area medium-high P-risk [%]",
"P_Area_r6": "Area high P-risk [%]",
"P_Area_r9": "Area very-high P-risk [%]"
},
# Carbon
'carbon':{
@@ -165,7 +170,7 @@ conv = {k:v for values in cols_dict.values() for k,v in values.items()}
# define subset of columns according to categories
selected_col = {
'col_aq': list(cols_dict.get('air_q').values()),
'col_prisk': list(cols_dict.get('prisk').values()),
'col_carbon':list(cols_dict.get('carbon').values()),
'col_char':\
list(cols_dict.get('ffh').values())+\
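
cols_dict maps raw column names to readable labels grouped by theme, and conv flattens it into the single rename mapping used by selected.rename(columns=conv). A small sketch of that pattern with two entries (the carbon label below is made up for the example):

# Flatten {category: {raw name: label}} into one rename dict and apply it.
import pandas as pd

cols_dict = {
    "prisk": {"P_Area_r1": "Area very-low P-risk [%]"},
    "carbon": {"finale/m²": "final CO2 emissions per m²"},  # illustrative label
}
conv = {k: v for values in cols_dict.values() for k, v in values.items()}

df = pd.DataFrame(columns=["P_Area_r1", "finale/m²"])
df = df.rename(columns=conv)  # columns now carry the readable labels

Renaming the 'air_q' category to 'prisk' is also why 'col_aq' becomes 'col_prisk' in selected_col: the subsets are derived from the same dictionary keys.
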
@@ -181,7 +186,7 @@ selected_col = {
def read_data():
aq_scr = gpd.read_file(aq_scr_shp)
prisk_scr = gpd.read_file(prisk_scr_shp)
co2_ab = gpd.read_file(co2_ab_shp)
co2_ab["SEZ"] = co2_ab["SEZ"].astype(int)
co2_ab["kgCO2/m²"] = co2_ab["CO2"] / co2_ab.area
@@ -207,7 +212,7 @@ def read_data():
raw = pd.concat(
[
co2_ab.loc[:, sez_cols],
aq_scr.loc[:, aq_raw_cols + aq_nor_cols],
prisk_scr.loc[:, prisk_raw_cols + prisk_nor_cols],
co2_ab.loc[:, co2_raw_cols + co2_nor_cols],
co2_em.loc[:, co2_em_raw_cols + co2_em_nor_cols + ems],
ffh_pa.loc[:, ffh_raw_cols + ffh_nor_cols],
@@ -221,4 +226,9 @@ def read_data():
)
raw.index = co2_ab["SEZ"]
raw.index.name = None
# Remove the erroneous observation
mask = raw.index == 8888888
raw = raw[~ mask]
return raw
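
read_data now also drops the sentinel section 8888888 before returning, so every caller receives the already-cleaned table. A minimal sketch of the read / index / filter pattern (the shapefile path is a placeholder):

# Sketch of the loader pattern used in read_data; "zones.shp" is illustrative.
import geopandas as gpd

gdf = gpd.read_file("zones.shp")
gdf["SEZ"] = gdf["SEZ"].astype(int)
gdf.index = gdf["SEZ"]
gdf.index.name = None

# remove the known erroneous observation once, inside the loader
mask = gdf.index == 8888888
gdf = gdf[~mask]

Doing the filtering here removes the need for the per-script mask that the first hunk deletes from the analysis script.
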
@@ -525,7 +525,10 @@ def explore_models(
# avoid pre-processing the data for every attempt
trans[mname] = tdata
# compute the Hopkins statistic to test the clusterability of the dataset
hop = hopkins(tdata, sampling_size=150)
# ensure the sample size is no greater than the number of observations
sampling_size = min(tdata.shape[0], 150)
hop = hopkins(tdata, sampling_size=sampling_size)
print(f"{mname}, Hopkins: {hop:.5f}")
# print_hopkins(mname, hopkins)
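
The Hopkins statistic compares nearest-neighbour distances of real observations against uniformly simulated points spanning the same per-feature ranges; values close to 1 suggest clusterable data, values around 0.5 suggest essentially random data. A self-contained sketch of the standard definition (independent of the project's hopkins helper), including the same sample-size guard introduced above:

# Standard Hopkins statistic sketch; not the project's hopkins() implementation.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def hopkins_sketch(x: np.ndarray, sampling_size: int = 150, seed: int = 0) -> float:
    rng = np.random.default_rng(seed)
    sampling_size = min(sampling_size, x.shape[0])  # never sample more rows than exist
    idx = rng.choice(x.shape[0], sampling_size, replace=False)
    # uniform points drawn within the per-feature min/max of the data
    uniform = rng.uniform(x.min(axis=0), x.max(axis=0), (sampling_size, x.shape[1]))
    nn = NearestNeighbors(n_neighbors=2).fit(x)
    # distance from each sampled real point to its nearest other real point
    w = nn.kneighbors(x[idx], n_neighbors=2)[0][:, 1]
    # distance from each uniform point to its nearest real point
    u = nn.kneighbors(uniform, n_neighbors=1)[0][:, 0]
    return u.sum() / (u.sum() + w.sum())
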
......
@@ -32,7 +32,7 @@ def get_preprocs(selected=None, apply_weight=None):
pre.StandardScaler(with_mean=True, with_std=True),
pre.Normalizer(norm="l2"),
pre.Normalizer(norm="l1"),
pre.Normalizer(norm="max"),
pre.Normalizer(norm="max"), # creates sparse matrices that lead to errors in FastICA https://github.com/scikit-learn/scikit-learn/issues/2089#issuecomment-1126183560
pre.QuantileTransformer(
n_quantiles=1000,
output_distribution="uniform",
@@ -81,11 +81,11 @@ def get_preprocs(selected=None, apply_weight=None):
dec.PCA(),
dec.PCA(n_components="mle", svd_solver="full"),
dec.PCA(svd_solver="randomized", whiten=True),
dec.FastICA(whiten="warn"),
dec.FastICA(whiten="unit-variance"),
dec.FactorAnalysis(max_iter=200),
dec.DictionaryLearning(fit_algorithm="lars", alpha=0.1, n_jobs=-1),
dec.DictionaryLearning(fit_algorithm="cd", alpha=0.1, n_jobs=-1),
dec.FastICA(whiten="warn", max_iter= 500),
dec.FastICA(whiten="unit-variance", max_iter= 500),
dec.FactorAnalysis(max_iter=500),
dec.DictionaryLearning(fit_algorithm="lars", alpha=0.1, n_jobs=None),
dec.DictionaryLearning(fit_algorithm="cd", alpha=0.1, n_jobs=None),
]
preprocs = {
f"s:{s}|w:{'true' if w else 'false'}|d:{d}": (s, w, d)
@@ -142,7 +142,10 @@ def preprocs_worker(
else:
dn = "None"
hop = hopkins(df, sampling_size=150)
# ensure the sample size is no greater than the number of observations
sampling_size = min(df.shape[0], 150)
hop = hopkins(df, sampling_size=sampling_size)
long_label = f"s:{s!r}|w:{w!r}|d:{d!r}"
pre_id = shake_128(long_label.encode()).hexdigest(4)
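
shake_128 is a variable-length (XOF) hash, so hexdigest(4) yields a stable 8-character hexadecimal id for the long preprocessing label; a quick usage sketch:

# Derive a short, stable identifier from the long pipeline label.
from hashlib import shake_128

long_label = "s:Normalizer()|w:true|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd')"
pre_id = shake_128(long_label.encode()).hexdigest(4)  # 4 bytes -> 8 hex characters
print(pre_id)
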
......
@@ -53,26 +53,17 @@ def get_nearest_sample(df: pd.DataFrame, uniformly_selected_observations: pd.Dat
def simulate_df_with_same_variation(
df: pd.DataFrame, sampling_size: int
) -> pd.DataFrame:
max_data_frame = df.max()
min_data_frame = df.min()
uniformly_selected_values_0 = np.random.uniform(
min_data_frame[0], max_data_frame[0], sampling_size
)
uniformly_selected_values_1 = np.random.uniform(
min_data_frame[1], max_data_frame[1], sampling_size
)
uniformly_selected_observations = np.column_stack(
(uniformly_selected_values_0, uniformly_selected_values_1)
)
if len(max_data_frame) >= 2:
for i in range(2, len(max_data_frame)):
uniformly_selected_values_i = np.random.uniform(
min_data_frame[i], max_data_frame[i], sampling_size
)
to_stack = (uniformly_selected_observations, uniformly_selected_values_i)
uniformly_selected_observations = np.column_stack(to_stack)
uniformly_selected_observations_df = pd.DataFrame(uniformly_selected_observations)
return uniformly_selected_observations_df
max_data_frame = df.max()
obs_all = []
for min, max in zip(min_data_frame, max_data_frame):
obs = np.random.uniform(min, max, sampling_size)
obs_all.append(obs)
res = pd.DataFrame(obs_all).T
return res
def get_distance_sample_to_nearest_neighbours(df: pd.DataFrame, data_frame_sample):
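
np.random.uniform also accepts per-column bounds, so the column-wise loop in the rewritten simulate_df_with_same_variation could be expressed as a single draw; a vectorized sketch with the same intent (as a side note, the loop variables min and max in the version above shadow the Python builtins, which names like lo/hi avoid):

# Vectorized alternative sketch: one uniform draw with per-column bounds.
import numpy as np
import pandas as pd

def simulate_uniform_like(df: pd.DataFrame, sampling_size: int) -> pd.DataFrame:
    lo = df.min().to_numpy()
    hi = df.max().to_numpy()
    # bounds broadcast across the (sampling_size, n_columns) output shape
    sample = np.random.uniform(lo, hi, size=(sampling_size, len(lo)))
    return pd.DataFrame(sample)
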
......
@@ -311,7 +311,11 @@ def plot_profiles2(
row = floor(c / ncols)
col = floor(c - (row * ncols))
print(f"cl: {cl}, row: {row}, col: {col}")
ax1 = axes[row][col]
if nrows > 1:
ax1 = axes[row][col]
else:
ax1 = axes[col]
data = [clf[vname][cl] for vname in vnames]
@@ -530,7 +534,12 @@ def plot_stats(
for c, cl in enumerate(cl_cat):
irow = floor(c / ncols)
icol = floor(c - (irow * ncols))
ax = axes[irow][icol]
if nrows > 1:
ax = axes[irow][icol]
else:
ax = axes[icol]
light_bg = is_light(*cmap[cl], hsp_threshold=hsp_threshold)
sdf = stdfs[cl]
if idx_order is None:
......
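
Both plot_profiles2 and plot_stats need the nrows branch because plt.subplots returns a 1-D array of axes when only a single row is requested. Flattening the axes array to a fixed shape handles every grid with one code path; a small sketch of that alternative:

# Sketch: normalise the axes array so one indexing scheme works for any grid.
import numpy as np
import matplotlib.pyplot as plt

nrows, ncols = 1, 3
fig, axes = plt.subplots(nrows, ncols)
axes = np.atleast_2d(axes)       # a 1-D (or scalar) axes result becomes shape (1, ncols)
for c in range(nrows * ncols):
    row, col = divmod(c, ncols)  # same row/col arithmetic as floor(c / ncols) above
    axes[row][col].set_title(f"cluster {c}")
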
This diff is collapsed.