Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
clustering
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
URS
JustNature
clustering
Commits
0f867da4
Commit
0f867da4
authored
1 year ago
by
Claudio Zandonella
Browse files
Options
Downloads
Patches
Plain Diff
update bolzano analysis
parent
88ac748d
Branches
main
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
justclust/analysis/bolzano.py
+30
-20
30 additions, 20 deletions
justclust/analysis/bolzano.py
justclust/data/bolzano.py
+77
-118
77 additions, 118 deletions
justclust/data/bolzano.py
with
107 additions
and
138 deletions
justclust/analysis/bolzano.py
+
30
−
20
View file @
0f867da4
...
...
@@ -7,7 +7,7 @@ import geopandas as gpd
import
numpy
as
np
import
pandas
as
pd
from
justclust.data.bolzano
import
cols
,
conv
,
read_data
,
sez_cols
,
wcols
from
justclust.data.bolzano
import
cols
,
conv
,
read_data
,
sez_cols
,
wcols
,
selected_col
from
justclust.explore.algclusters
import
explore_models
,
get_algclusters
from
justclust.explore.preprocs
import
ApplyW
,
explore_preprocs
,
get_preprocs
import
justclust.paths.paths
as
paths
...
...
@@ -66,15 +66,19 @@ print(pre_scores["hopkins"].describe())
# %%
# select a preprocessing
pre_key
=
(
# "s:RobustScaler(quantile_range=(2, 98))|"
# "w:true|"
# "d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
"
s:Normalizer()|
"
"
w:true|
"
"
d:DictionaryLearning(alpha=0.1, fit_algorithm=
'
cd
'
)
"
# "s:QuantileTransformer()|w:false|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', n_jobs=-1)"
"
s:MaxAbsScaler()|w:true|d:DictionaryLearning(alpha=0.1, max_iter=2000, n_jobs=-1, random_state=2023)
"
# try-1
# "s:MinMaxScaler()|w:true|d:DictionaryLearning(alpha=0.1, max_iter=2000, n_jobs=-1, random_state=2023)" # try-2
# "s:Normalizer(norm='max')|w:true|d:DictionaryLearning(alpha=0.1, max_iter=2000, n_jobs=-1, random_state=2023)" # try-3
# "s:Normalizer(norm='l1')|w:true|d:DictionaryLearning(alpha=0.1, max_iter=2000, n_jobs=-1, random_state=2023)" # try-4
# "s:QuantileTransformer(random_state=2023)|w:true|d:DictionaryLearning(alpha=0.1, max_iter=2000, n_jobs=-1, random_state=2023)" # try-5
# "s:Normalizer()|w:true|d:DictionaryLearning(alpha=0.1, max_iter=2000, n_jobs=-1, random_state=2023)" # try-6
# "s:RobustScaler(quantile_range=(20, 80))|w:true|d:DictionaryLearning(alpha=0.1, fit_algorithm='cd', max_iter=2000, n_jobs=-1, random_state=2023)" # try-7
# "s:Normalizer(norm='max')|w:false|d:DictionaryLearning(alpha=0.1, max_iter=2000, n_jobs=-1, random_state=2023)" # try-8
)
if
pre_key
not
in
preprocs
.
keys
():
skeys
=
"
\n
"
.
join
([
f
"
-
{
k
!r}
"
for
k
in
sorted
(
preprocs
.
keys
())])
...
...
@@ -108,12 +112,12 @@ else:
models
=
models
,
n_jobs
=-
1
,
filters
=
[
(
"
n_clusters
"
,
(
5.0
,
1
2
.0
)),
(
"
n_clusters
"
,
(
5.0
,
1
0
.0
)),
# silhouette_score: The best value is 1 and the worst value is -1.
# Values near 0 indicate overlapping clusters. Negative values generally
# indicate that a sample has been assigned to the wrong cluster, as a
# different cluster is more similar.
(
"
silhouette
"
,
(
0.5
,
None
)),
(
"
silhouette
"
,
(
0.
8
5
,
None
)),
# davies_bouldin_score: The minimum score is zero, with lower values
# indicating better clustering.
# ("davies bouldin", (None, 0.6)),
...
...
@@ -160,16 +164,21 @@ else:
# Select the cluster id that need to be compared
# the ID can be taken from the `clscorefile`
sel_clsts
=
[
# label
# "hdbscan__mcs-15_ms-05_m-euclidean_csm-eom", # k06 @51.6%
# "hdbscan__mcs-15_ms-05_m-sqeuclidean_csm-eom", # k07 @50.2%
# "hdbscan__mcs-10_ms-05_m-sqeuclidean_csm-eom", # k09 @60.3%
# "hdbscan__mcs-10_ms-01_m-euclidean_csm-eom", # k12 @74.5%
# "hdbscan__mcs-07_ms-05_m-chebyshev_csm-eom", # k15 @77.5%
"
hdbscan__mcs-10_ms-00_m-euclidean_csm-eom
"
,
# k6 96% .91 .17 2653
# "hdbscan__mcs-12_ms-02_m-euclidean_csm-eom", # k7 91% .85 .27 2338
# "hdbscan__mcs-02_ms-05_m-euclidean_csm-eom", # k10 96% .87 .25 1674
# try-1
# "hdbscan__mcs-10_ms-00_m-braycurtis_csm-eom", # 8 clst 84.558824 0.961501 0.097361 7967.110672
"
hdbscan__mcs-10_ms-00_m-euclidean_csm-eom
"
# 9 clst 91.911765 0.916117 0.488055 1082.154275
# # try-2
# "hdbscan__mcs-10_ms-00_m-braycurtis_csm-eom", # 8 clst 84.926471 0.938385 0.306445 1210.865543
# "hdbscan__mcs-10_ms-05_m-chebyshev_csm-eom" # 9 clst 89.705882 0.921724 0.379492 2048.702598
# # try-8
# "hdbscan__mcs-10_ms-00_m-braycurtis_csm-eom", # 5 clst 86.397059 0.991400 0.068170 38957.400453
# "hdbscan__mcs-07_ms-00_m-euclidean_csm-eom", # 6 clst 90.073529 0.991330 0.067792 34901.021320
# "hdbscan__mcs-07_ms-03_m-euclidean_csm-eom", # 7 clst 92.647059 0.989539 0.074449 25493.584344
]
if
len
(
sel_clsts
)
==
0
:
...
...
@@ -232,6 +241,7 @@ for clname in sel_clsts:
qmd
.
create_qmd_report
(
city
=
'
bolzano
'
,
clname
=
clname
,
selected_col
=
selected_col
,
cdir
=
my_paths
.
report_dir
/
clname
)
...
...
This diff is collapsed.
Click to expand it.
justclust/data/bolzano.py
+
77
−
118
View file @
0f867da4
# %%
import
geopandas
as
gpd
import
numpy
as
np
import
pandas
as
pd
...
...
@@ -8,64 +10,47 @@ my_paths = paths.define_paths(city = 'bolzano')
# %%
# columns selected for the columns of the outputs
sez_cols
=
[
"
SEZ
"
,
"
geometry
"
,
"
SEZ2011
"
,
"
COD_REG
"
,
"
COD_ISTAT
"
,
"
PRO_COM
"
]
sez_cols
=
[
"
SEZ
"
,
"
geometry
"
]
id_col
=
[
'
SEZ
'
]
#---- Air Quality ----
# Define available files
prisk_scr_shp
=
my_paths
.
rawdata_dir
/
"
Air quality
"
/
"
street_canyon_risk.shp
"
airq_scr_shp
=
my_paths
.
rawdata_dir
/
"
Air quality
"
/
"
street_canyon_risk.shp
"
pm25_scr_shp
=
my_paths
.
rawdata_dir
/
"
Air quality
"
/
"
pm_25_concentration.shp
"
# Define columns and other constant values
prisk_raw_cols
=
[
"
Area_r1
"
,
"
Area_r2
"
,
"
Area_r3
"
,
"
Area_r4
"
,
"
Area_r6
"
,
"
Area_r9
"
]
prisk_nor_cols
=
[
"
P_Area_r1
"
,
"
P_Area_r2
"
,
"
P_Area_r3
"
,
"
P_Area_r4
"
,
"
P_Area_r6
"
,
"
P_Area_r9
"
,
]
airq_cols
=
[
"
risk_1_n
"
,
"
risk_2_n
"
,
"
risk_3_n
"
]
pm25_cols
=
[
"
pm_25_n
"
]
#---- Carbon ----
# Define available files
co2_ab_shp
=
my_paths
.
rawdata_dir
/
"
Carbon
"
/
"
c_absorption.shp
"
co2_em_shp
=
my_paths
.
rawdata_dir
/
"
Carbon
"
/
"
c_emission.shp
"
co2_ab_shp
=
my_paths
.
rawdata_dir
/
"
Carbon
"
/
"
c
arbon
_absorption.shp
"
co2_em_shp
=
my_paths
.
rawdata_dir
/
"
Carbon
"
/
"
c
arbon
_emission.shp
"
# Define columns and other constant values
# Absorption
co2_raw_cols
=
[
"
CO2
"
]
co2_nor_cols
=
[
"
kgCO2/m²
"
]
# Emmissions
ems
=
[
"
gasolio_em
"
,
"
gpl_em
"
,
"
ee_em
"
,
"
metano_em
"
,
"
legno_em
"
,
"
tot_em
"
,
"
c_ee_em
"
]
co2_em_raw_cols
=
[
"
finale
"
]
co2_em_nor_cols
=
[
"
finale/m²
"
]
co2_ab_cols
=
[
"
c_ab_n
"
]
co2_em_cols
=
[
"
c_em_n
"
]
#---- FFH ----
# Define available files
ffh_pa_shp
=
my_paths
.
rawdata_dir
/
"
FFH
"
/
"
protected_areas.shp
"
ffh_vegetation_shp
=
my_paths
.
rawdata_dir
/
"
FFH
"
/
"
degree_of_vegetation.shp
"
ffh_na_shp
=
my_paths
.
rawdata_dir
/
"
FFH
"
/
"
connectivity_natural_areas.shp
"
# Define columns and other constant values
ffh_
raw
_cols
=
[
"
F_Area_m2
"
]
ffh_n
or
_cols
=
[
"
P_Prot_are
"
]
ffh_
vegetation
_cols
=
[
"
veg_n
"
]
ffh_n
a
_cols
=
[
"
conn_n
"
]
#---- Spatial ----
# Define available files
spt_ag_shp
=
my_paths
.
rawdata_dir
/
"
Spatial
"
/
"
accessibility_green_areas.shp
"
spt_wk_shp
=
my_paths
.
rawdata_dir
/
"
Spatial
"
/
"
walkability.shp
"
spt_ag_shp
=
my_paths
.
rawdata_dir
/
"
Spatial
"
/
"
accessibility_urban_green_areas.shp
"
# Define columns and other constant values
spt_ag_raw_cols
=
[
"
Area_300m_
"
,
"
Area_750m_
"
]
spt_ag_nor_cols
=
[
"
P_area_300
"
,
"
P_area_750
"
]
spt_wk_nor_cols
=
[
"
MEAN_Resul
"
]
spt_ag_cols
=
[
"
area_800_n
"
]
#---- Temporal ----
...
...
@@ -73,47 +58,28 @@ spt_wk_nor_cols = ["MEAN_Resul"]
tem_ss_shp
=
my_paths
.
rawdata_dir
/
"
Temporal
"
/
"
soil_sealing.shp
"
# Define columns and other constant values
tem_raw_cols
=
[
"
Area_diff
"
]
tem_nor_cols
=
[
"
P_Area_dif
"
]
tem_ss_cols
=
[
"
s_area_n
"
]
#---- Thermal ----
# Define available files
thm_sh_shp
=
my_paths
.
rawdata_dir
/
"
Thermal
"
/
"
suhi
.shp
"
thm_sh_shp
=
my_paths
.
rawdata_dir
/
"
Thermal
"
/
"
heat_stress_zones
.shp
"
# Define columns and other constant values
thm_
nor
_cols
=
[
"
mean_norm
"
]
thm_
sh
_cols
=
[
"
stress_n
"
]
#---- Socio ----
# Define available files
socio_
gpkg
=
my_paths
.
rawdata_dir
/
"
Socio
"
/
"
bz_pop_2021.gpkg
"
socio_
shp
=
my_paths
.
rawdata_dir
/
"
Socio
"
/
"
demographic_data.shp
"
# Define columns and other constant values
socio_abs_cols
=
[
"
Maschi
"
,
"
Femmine
"
,
"
Totale popolazione
"
,
"
0-14
"
,
"
15-64
"
,
"
65 e oltre
"
,
"
indice di vecchiaia
"
,
"
0-17
"
,
"
Stranieri
"
,
"
Media componenti
"
,
"
Totale famiglie con figli
"
,
"
Totale famiglie senza figli
"
,
"
Totale famiglie
"
,
"
Single
"
,
]
socio_perc_cols
=
[
"
%65+
"
,
"
%minori
"
,
"
%stranieri
"
,
"
%con figli
"
,
"
%senza figli
"
,
"
%Single
"
,
]
socio_cols
=
[
"
pop_den
"
,
'
aging_in
'
,
'
foreign
'
,
'
fam_w_ch
'
]
# %%
...
...
@@ -121,37 +87,36 @@ socio_perc_cols = [
cols_dict
=
{
# Air quality
'
prisk
'
:{
"
P_Area_r1
"
:
"
Area very-low P-risk [%]
"
,
"
P_Area_r2
"
:
"
Area low P-risk [%]
"
,
"
P_Area_r3
"
:
"
Area medium-low P-risk [%]
"
,
"
P_Area_r4
"
:
"
Area medium-high P-risk [%]
"
,
"
P_Area_r6
"
:
"
Area high P-risk [%]
"
,
"
P_Area_r9
"
:
"
Area very-high P-risk [%]
"
'
airq
'
:{
"
pm_25_n
"
:
"
pm 2.5 concentrations [µg/m3]
"
,
"
risk_1_n
"
:
"
Area low AQ-risk [%]
"
,
"
risk_2_n
"
:
"
Area medium AQ-risk [%]
"
,
"
risk_3_n
"
:
"
Area high AQ-risk [%]
"
},
# Carbon
'
carbon
'
:{
"
finale/m²
"
:
"
Carbon emission building [ton
CO2/m
²
]
"
,
"
kgCO2/m²
"
:
"
Carbon
absorption vegetation [kg
CO2/m²]
"
"
c_ab_n
"
:
"
Carbon absorption vegetation [kg
CO2/m
2
]
"
,
"
c_em_n
"
:
"
Carbon
emission building [ton
CO2/m²]
"
},
# FFH
'
ffh
'
:{
"
P_Prot_are
"
:
"
Protected area [%]
"
},
'
ffh
'
:{
"
veg_n
"
:
"
Vegetation quality
"
,
"
conn_n
"
:
"
Betweenness centrality
"
},
# Spatial
'
spatial
'
:{
"
P_area_300
"
:
"
Accessibility public gardens (<5 min) [%]
"
,
"
MEAN_Resul
"
:
"
Walkability Index
"
"
area_800_n
"
:
"
Accessibility urban green areas (<10 min) [%]
"
,
},
# Temporal
'
temporal
'
:{
"
P_A
rea_
dif
"
:
"
Soil sealing between 2022 and 2018 [%]
"
},
'
temporal
'
:{
"
s_a
rea_
n
"
:
"
Soil sealing between 2022 and 2018 [%]
"
},
# Thermal
'
thermal
'
:{
"
mean_norm
"
:
"
Surface urban heat island
"
},
'
thermal
'
:{
"
stress_n
"
:
"
Heat stress zones
"
},
# Socio
'
socio
'
:{
"
indice di vecchiaia
"
:
"
Age Index
"
,
"
%stranieri
"
:
"
Foreign population [%]
"
,
"
%con figli
"
:
"
Families with children [%]
"
,
# "%senza figli": "Families without children [%]",
"
%Single
"
:
"
Families with one component [%]
"
"
pop_den
"
:
"
Population density [inhab/km2]
"
,
"
aging_in
"
:
"
Aging index
"
,
"
foreign
"
:
"
Foreign population [%]
"
,
"
fam_w_ch
"
:
"
Families with children [%]
"
}
}
...
...
@@ -170,7 +135,7 @@ conv = {k:v for values in cols_dict.values() for k,v in values.items()}
# define subset of columns according to categories
selected_col
=
{
'
col_
prisk
'
:
list
(
cols_dict
.
get
(
'
prisk
'
).
values
()),
'
col_
airq
'
:
list
(
cols_dict
.
get
(
'
airq
'
).
values
()),
'
col_carbon
'
:
list
(
cols_dict
.
get
(
'
carbon
'
).
values
()),
'
col_char
'
:
\
list
(
cols_dict
.
get
(
'
ffh
'
).
values
())
+
\
...
...
@@ -186,49 +151,43 @@ selected_col = {
def
read_data
():
prisk_scr
=
gpd
.
read_file
(
prisk_scr_shp
)
pm25_scr
=
gpd
.
read_file
(
pm25_scr_shp
)
airq_scr
=
gpd
.
read_file
(
airq_scr_shp
)
co2_ab
=
gpd
.
read_file
(
co2_ab_shp
)
co2_ab
[
"
SEZ
"
]
=
co2_ab
[
"
SEZ
"
].
astype
(
int
)
co2_ab
[
"
kgCO2/m²
"
]
=
co2_ab
[
"
CO2
"
]
/
co2_ab
.
area
co2_em
=
gpd
.
read_file
(
co2_em_shp
)
co2_em
[
"
finale/m²
"
]
=
co2_em
[
"
finale
"
]
/
co2_em
.
area
ffh_
pa
=
gpd
.
read_file
(
ffh_
pa
_shp
)
ffh_na
=
gpd
.
read_file
(
ffh_na_shp
)
ffh_
vegetation
=
gpd
.
read_file
(
ffh_
vegetation
_shp
)
spt_ag
=
gpd
.
read_file
(
spt_ag_shp
)
spt_wk
=
gpd
.
read_file
(
spt_wk_shp
)
tem_ss
=
gpd
.
read_file
(
tem_ss_shp
)
thm_sh
=
gpd
.
read_file
(
thm_sh_shp
)
socio_pk
=
gpd
.
read_file
(
socio_gpkg
,
driver
=
"
GPKG
"
)
# fix column
res
=
[]
for
val
in
socio_pk
[
"
indice di vecchiaia
"
]:
try
:
v
=
float
(
val
.
strip
().
replace
(
"
,
"
,
"
.
"
))
except
Exception
:
v
=
np
.
nan
res
.
append
(
v
)
socio_pk
[
"
indice di vecchiaia
"
]
=
np
.
array
(
res
,
dtype
=
float
)
raw
=
pd
.
concat
(
[
co2_ab
.
loc
[:,
sez_cols
],
prisk_scr
.
loc
[:,
prisk_raw_cols
+
prisk_nor_cols
],
co2_ab
.
loc
[:,
co2_raw_cols
+
co2_nor_cols
],
co2_em
.
loc
[:,
co2_em_raw_cols
+
co2_em_nor_cols
+
ems
],
ffh_pa
.
loc
[:,
ffh_raw_cols
+
ffh_nor_cols
],
spt_ag
.
loc
[:,
spt_ag_raw_cols
+
spt_ag_nor_cols
],
spt_wk
.
loc
[:,
spt_wk_nor_cols
],
tem_ss
.
loc
[:,
tem_raw_cols
+
tem_nor_cols
],
thm_sh
.
loc
[:,
thm_nor_cols
],
socio_pk
.
loc
[:,
socio_abs_cols
+
socio_perc_cols
],
],
axis
=
"
columns
"
,
)
raw
.
index
=
co2_ab
[
"
SEZ
"
]
socio_pk
=
gpd
.
read_file
(
socio_shp
)
list_data
=
[
pm25_scr
.
loc
[:,
id_col
+
[
'
geometry
'
]
+
pm25_cols
],
airq_scr
.
loc
[:,
id_col
+
airq_cols
],
co2_ab
.
loc
[:,
id_col
+
co2_ab_cols
],
co2_em
.
loc
[:,
id_col
+
co2_em_cols
],
ffh_na
.
loc
[:,
id_col
+
ffh_na_cols
],
ffh_vegetation
.
loc
[:,
id_col
+
ffh_vegetation_cols
],
spt_ag
.
loc
[:,
id_col
+
spt_ag_cols
],
tem_ss
.
loc
[:,
id_col
+
tem_ss_cols
],
thm_sh
.
loc
[:,
id_col
+
thm_sh_cols
],
socio_pk
.
loc
[:,
id_col
+
socio_cols
],
]
raw
=
list_data
[
0
]
for
df
in
list_data
[
1
:]:
raw
=
pd
.
merge
(
raw
,
df
,
on
=
id_col
)
raw
=
raw
.
set_index
(
id_col
,
drop
=
False
)
raw
.
index
.
name
=
None
# Remove error observation
mask
=
raw
.
index
==
8888888
raw
=
raw
[
~
mask
]
# rename columns
raw
=
raw
.
rename
(
columns
=
{
id_col
[
0
]:
'
SEZ
'
})
# set not valid values to Nan
raw
=
raw
.
replace
(
88888
,
np
.
nan
)
return
raw
# %%
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment