Skip to content
Snippets Groups Projects
Commit 9323434f authored by Claudio Zandonella's avatar Claudio Zandonella
Browse files

revise management paths and data import

- paths are defined by a class object define_paths()
- initialized according to the specific city
- all files with the column names are defined in the data file
- define a dict of selected columns and their labels (cols_dict)
- then used to automatically define weights and other objects
- fix issue on plot.py
parent f842af4a
No related branches found
No related tags found
No related merge requests found
......@@ -4,5 +4,5 @@
__pycache__
# Output folder bolzano
report/bolzano
report/*
/.luarc.json
......@@ -10,17 +10,12 @@ import pandas as pd
from justclust.data.bolzano import cols, conv, read_data, sez_cols, wcols
from justclust.explore.algclusters import explore_models, get_algclusters
from justclust.explore.preprocs import ApplyW, explore_preprocs, get_preprocs
from justclust.paths.bolzano import (
clfile,
clscorefile,
fimpfile,
prescorefile,
report_dir,
selclfile,
)
import justclust.paths.paths as paths
from justclust.plots import reporting
import justclust.quarto as qmd
my_paths = paths.define_paths(city = 'bolzano')
# %%
# read and filter/select data
raw = read_data()
......@@ -34,14 +29,14 @@ selected.rename(columns=conv, inplace=True)
# %%
# Remove error observation
mask = selected['Carbon emission building [ton CO2/m²]'] > 8
mask = selected.index == 8888888
selected = selected[~ mask]
# %%
# Explore clustering pre-processing
# create a directory if does not exists
os.makedirs(report_dir, exist_ok=True)
os.makedirs(my_paths.report_dir, exist_ok=True)
# weighted = selected * wcols
apply_weight = ApplyW(wcols)
......@@ -51,16 +46,16 @@ preprocs = get_preprocs(selected, apply_weight)
# pkeys = sorted(preprocs.keys())[:10]
# preprocs = {k: preprocs[k] for k in pkeys}
if prescorefile.exists():
print(f"Reading pre-processed scores from: {prescorefile}")
pre_scores = pd.read_excel(prescorefile)
if my_paths.prescorefile.exists():
print(f"Reading pre-processed scores from: {my_paths.prescorefile}")
pre_scores = pd.read_excel(my_paths.prescorefile)
else:
print(
f"Pre-processed file ({prescorefile}) not found, "
f"Pre-processed file ({my_paths.prescorefile}) not found, "
"start computing the main metrics"
)
pre_scores = explore_preprocs(selected, preprocs)
pre_scores.to_excel(prescorefile)
pre_scores.to_excel(my_paths.prescorefile)
print("=" * 50)
print(f"BEST PRE-PROCESSING (10/{len(pre_scores)}):")
......@@ -95,25 +90,25 @@ pre = preprocs[pre_key]
# %%
# explore the clustering space
if clfile.exists():
print(f"File: {clfile} exists load cluster from the file")
if my_paths.clfile.exists():
print(f"File: {my_paths.clfile} exists load cluster from the file")
print("WARNING: clusters read from file, not computed from scratch!")
geo_out = gpd.read_file(clfile)
geo_out = gpd.read_file(my_paths.clfile)
# force index to maintain consistency with raw and selected DF
geo_out.set_index("SEZ", drop=False, verify_integrity=True, inplace=True)
print(f"Reading clusters' scores from: {clscorefile}")
sc = pd.read_excel(clscorefile, header=0, index_col=0)
print(f"Reading feature importances from: {fimpfile}")
fi = pd.read_excel(fimpfile, header=0, index_col=0)
print(f"Reading clusters' scores from: {my_paths.clscorefile}")
sc = pd.read_excel(my_paths.clscorefile, header=0, index_col=0)
print(f"Reading feature importances from: {my_paths.fimpfile}")
fi = pd.read_excel(my_paths.fimpfile, header=0, index_col=0)
else:
print(f"File: {clfile} does not exist. Start computing the clusters")
print(f"File: {my_paths.clfile} does not exist. Start computing the clusters")
geo_out = raw.loc[:, sez_cols]
models = get_algclusters(preprocs=pre)
sc, fi = explore_models(
data=selected,
cldf=geo_out,
rdir=report_dir,
rdir=my_paths.report_dir,
models=models,
n_jobs=-1,
filters=[
......@@ -149,19 +144,19 @@ else:
for alg, adf in algs:
if len(adf) > 20:
for k, kdf in adf.groupby("n_clusters"):
klfile = report_dir / f"clusters_{alg.lower()}_k{k:03d}.gpkg"
klfile = my_paths.report_dir / f"clusters_{alg.lower()}_k{k:03d}.gpkg"
print(f"Saving {klfile}")
geo_out.loc[:, sez_cols + list(kdf.index)].to_file(
klfile, driver="GPKG", index=True
)
# save all columns to open all the clusters
print(f"Saving {clfile}")
geo_out.to_file(clfile, driver="GPKG", index=True)
print(f"Saving {clscorefile}")
sc.to_excel(clscorefile)
print(f"Saving {fimpfile}")
fi.to_excel(fimpfile)
print(f"Saving {my_paths.clfile}")
geo_out.to_file(my_paths.clfile, driver="GPKG", index=True)
print(f"Saving {my_paths.clscorefile}")
sc.to_excel(my_paths.clscorefile)
print(f"Saving {my_paths.fimpfile}")
fi.to_excel(my_paths.fimpfile)
# %%
# Select clusters to be analyzed
......@@ -183,7 +178,7 @@ sel_clsts = [
if len(sel_clsts) == 0:
print(
f"You must select the clusters to be analyzed from {clscorefile} "
f"You must select the clusters to be analyzed from {my_paths.clscorefile} "
"using the cluster label as UUID"
)
sccols = [
......@@ -225,7 +220,7 @@ clsts = geo_out.loc[
],
]
print(f"Saving selected clusters {clsts}")
clsts.to_file(selclfile, driver="GPKG", index=True)
clsts.to_file(my_paths.selclfile, driver="GPKG", index=True)
for clname in sel_clsts:
print(f"Reporting: {clname}...")
......@@ -233,7 +228,7 @@ for clname in sel_clsts:
data=selected.loc[clsts.index],
clname=clname,
clsts=clsts,
cdir=report_dir / clname,
cdir=my_paths.report_dir / clname,
pal="turbo",
preprocs=pre,
)
......@@ -241,7 +236,7 @@ for clname in sel_clsts:
qmd.create_qmd_report(
city = 'bolzano',
clname= clname,
cdir = report_dir / clname
cdir = my_paths.report_dir / clname
)
# %%
......@@ -2,14 +2,17 @@ import geopandas as gpd
import numpy as np
import pandas as pd
from justclust.paths import bolzano as paths
import justclust.paths.paths as paths
my_paths = paths.define_paths(city = 'bolzano')
# %%
# Define columns and other constant values
quantile_range = (20, 80)
#---- Air Quality ----
# Define available files
aq_scr_shp = my_paths.rawdata_dir / "Air quality" / "street_canyon_risk.shp"
# air quality
# Define columns and other constant values
aq_raw_cols = ["Area_r1", "Area_r2", "Area_r3", "Area_r4", "Area_r6", "Area_r9"]
aq_nor_cols = [
"P_Area_r1",
......@@ -20,34 +23,68 @@ aq_nor_cols = [
"P_Area_r9",
]
# carbon
# absorption
#---- Carbon ----
# Define available files
co2_ab_shp = my_paths.rawdata_dir / "Carbon" / "c_absorption.shp"
co2_em_shp = my_paths.rawdata_dir / "Carbon" / "c_emission.shp"
# Define columns and other constant values
# Absorption
co2_raw_cols = ["CO2"]
co2_nor_cols = ["kgCO2/m²"]
# emmissions
# Emissions
ems = ["gasolio_em", "gpl_em", "ee_em", "metano_em", "legno_em", "tot_em", "c_ee_em"]
co2_em_raw_cols = ["finale"]
co2_em_nor_cols = ["finale/m²"]
# flora fauna and habitat
# columns selected for the columns of the outputs
sez_cols = ["geometry", "SEZ", "SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"]
#---- FFH ----
# Define available files
ffh_pa_shp = my_paths.rawdata_dir / "FFH" / "protected_areas.shp"
# Define columns and other constant values
ffh_raw_cols = ["F_Area_m2"]
ffh_nor_cols = ["P_Prot_are"]
# spatial
#---- Spatial ----
# Define available files
spt_ag_shp = my_paths.rawdata_dir / "Spatial" / "accessibility_green_areas.shp"
spt_wk_shp = my_paths.rawdata_dir / "Spatial" / "walkability.shp"
# Define columns and other constant values
spt_ag_raw_cols = ["Area_300m_", "Area_750m_"]
spt_ag_nor_cols = ["P_area_300", "P_area_750"]
spt_wk_nor_cols = ["MEAN_Resul"]
# temporary
#---- Temporal ----
# Define available files
tem_ss_shp = my_paths.rawdata_dir / "Temporal" / "soil_sealing.shp"
# Define columns and other constant values
tem_raw_cols = ["Area_diff"]
tem_nor_cols = ["P_Area_dif"]
# thermal
#---- Thermal ----
# Define available files
thm_sh_shp = my_paths.rawdata_dir / "Thermal" / "suhi.shp"
# Define columns and other constant values
thm_nor_cols = ["mean_norm"]
# socio-demographic
#---- Socio ----
# Define available files
socio_gpkg = my_paths.rawdata_dir / "Socio" / "bz_pop_2021.gpkg"
# Define columns and other constant values
socio_abs_cols = [
"Maschi",
"Femmine",
......@@ -73,163 +110,90 @@ socio_perc_cols = [
"%Single",
]
# columns selected for the columns of the outputs
sez_cols = ["geometry", "SEZ", "SEZ2011", "COD_REG", "COD_ISTAT", "PRO_COM"]
# %%
## SELECT COLUMNS TO BE USED
#---- SELECT COLUMNS TO BE USED ----
cols_dict = {
# Air quality
'air_q':{
"P_Area_r1": "area very-low P-risk [%]",
"P_Area_r2": "area low P-risk [%]",
"P_Area_r3": "area medium-low P-risk [%]",
"P_Area_r4": "area medium-high P-risk [%]",
"P_Area_r6": "area high P-risk [%]",
"P_Area_r9": "area very-high P-risk [%]"
},
# Carbon
'carbon':{
"finale/m²": "Carbon emission building [ton CO2/m²]",
"kgCO2/m²": "Carbon absorption vegetation [kg CO2/m²]"
},
# FFH
'ffh':{"P_Prot_are": "Protected area [%]"},
# Spatial
'spatial':{
"P_area_300": "Accessibility public gardens (<5 min) [%]",
"MEAN_Resul": "Walkability Index"
},
# Temporal
'temporal':{"P_Area_dif":"Soil sealing between 2022 and 2018 [%]"},
# Thermal
'thermal':{"mean_norm": "Surface urban heat island"},
# Socio
'socio':{
"indice di vecchiaia": "Age Index",
"%stranieri": "Foreign population [%]",
"%con figli": "Families with children [%]",
# "%senza figli": "Families without children [%]",
"%Single": "Families with one component [%]"
}
}
cols = [
# air-quality: % risk based on SVF and road classification
"P_Area_r1",
"P_Area_r2",
"P_Area_r3",
"P_Area_r4",
"P_Area_r6",
"P_Area_r9",
# carbon: absorption and emmissions
"kgCO2/m²",
"finale/m²",
# FFH: % protected area, % to 5 min to prot. areas
"P_Prot_are",
# spatial: % of area within a distance from protected areas and walkability index
"P_area_300",
# "P_area_750",
"MEAN_Resul",
# temporal: % area changed
"P_Area_dif",
# thermal
"mean_norm",
# socio demographic
# "%65+",
# "%minori",
"indice di vecchiaia",
"%stranieri",
"%con figli",
# "%senza figli", # same as con figli
"%Single",
]
# Get only selected columns old-names
cols = [col for value in cols_dict.values() for col in list(value.keys())]
# define weights
# define weights columns
wcols = np.array(
[
# air-quality: % risk based on SVF and road classification
1.0 / 6.0,
1.0 / 6.0,
1.0 / 6.0,
1.0 / 6.0,
1.0 / 6.0,
1.0 / 6.0,
# carbon: absorption and emmissions
1.0 / 2.0,
1.0 / 2.0,
# FFH: % protected area, % to 5 min to prot. areas
1.0,
# spatial: % of area within a distance from protected areas and walkability index
1.0 / 2.0,
# "P_area_750",
1.0 / 2.0,
# temporal: % area changed
1.0,
# thermal
1.0,
# socio demographic
# "%65+",
# "%minori",
1.0 / 4.0,
1.0 / 4.0,
1.0 / 4.0,
# "%senza figli"
1.0 / 4.0,
],
[weight for value in cols_dict.values() for weight in [1/len(value)] * len(value)],
dtype=float,
)
# define nice text label to not expose the internal nomenclature
conv = {
# air quality: Street canyon risk per census unit [%]
"P_Area_r1": "area very-low AQ-risk [%]",
"P_Area_r2": "area low AQ-risk [%]",
"P_Area_r3": "area medium-low AQ-risk [%]",
"P_Area_r4": "area medium-high AQ-risk [%]",
"P_Area_r6": "area high AQ-risk [%]",
"P_Area_r9": "area very-high AQ-risk [%]",
# carbon
"finale/m²": "Carbon emission building [ton CO2/m²]",
"kgCO2/m²": "Carbon absorption vegetation [kg CO2/m²]",
# FFH
"P_Prot_are": "Protected area [%]",
# spatial
"P_area_300": "Accessibility urban green areas (<5 min) [%]",
"MEAN_Resul": "Walkability Index",
# temporal
"P_Area_dif": "Soil sealing between 2022 and 2018 [%]",
# thermal
"mean_norm": "Surface urban heat island",
# socio demographic
"indice di vecchiaia": "Age Index",
"%stranieri": "Foreign population [%]",
"%con figli": "Families with children [%]",
# "%senza figli": "Families without children [%]",
"%Single": "Families with one component [%]",
}
conv = {k:v for values in cols_dict.values() for k,v in values.items()}
# %%
# define subset of columns according to categories
selected_col = {
'col_aq':[
'area very-low AQ-risk [%]',
'area low AQ-risk [%]',
'area medium-low AQ-risk [%]',
'area medium-high AQ-risk [%]',
'area high AQ-risk [%]',
'area very-high AQ-risk [%]'
],
'col_carbon':[
"Carbon emission building [ton CO2/m²]",
"Carbon absorption vegetation [kg CO2/m²]"
],
'col_char':[
"Protected area [%]",
"Accessibility urban green areas (<5 min) [%]",
"Walkability Index",
"Soil sealing between 2022 and 2018 [%]",
"Surface urban heat island"
],
'col_socio':[
"Age Index",
"Foreign population [%]",
"Families with children [%]",
"Families with one component [%]"
]
'col_aq': list(cols_dict.get('air_q').values()),
'col_carbon':list(cols_dict.get('carbon').values()),
'col_char':\
list(cols_dict.get('ffh').values())+\
list(cols_dict.get('spatial').values())+\
list(cols_dict.get('temporal').values())+\
list(cols_dict.get('thermal').values()),
'col_socio':list(cols_dict.get('socio').values())
}
# %%
# Define function to read files
def read_data():
aq_scr = gpd.read_file(paths.aq_scr_shp)
co2_ab = gpd.read_file(paths.co2_ab_shp)
aq_scr = gpd.read_file(aq_scr_shp)
co2_ab = gpd.read_file(co2_ab_shp)
co2_ab["SEZ"] = co2_ab["SEZ"].astype(int)
co2_ab["kgCO2/m²"] = co2_ab["CO2"] / co2_ab.area
co2_em = gpd.read_file(paths.co2_em_shp)
co2_em = gpd.read_file(co2_em_shp)
co2_em["finale/m²"] = co2_em["finale"] / co2_em.area
ffh_pa = gpd.read_file(paths.ffh_pa_shp)
spt_ag = gpd.read_file(paths.spt_ag_shp)
spt_wk = gpd.read_file(paths.spt_wk_shp)
tem_ss = gpd.read_file(paths.tem_ss_shp)
thm_sh = gpd.read_file(paths.thm_sh_shp)
ffh_pa = gpd.read_file(ffh_pa_shp)
spt_ag = gpd.read_file(spt_ag_shp)
spt_wk = gpd.read_file(spt_wk_shp)
tem_ss = gpd.read_file(tem_ss_shp)
thm_sh = gpd.read_file(thm_sh_shp)
socio_pk = gpd.read_file(paths.socio_gpkg, driver="GPKG")
socio_pk = gpd.read_file(socio_gpkg, driver="GPKG")
# fix column
res = []
for val in socio_pk["indice di vecchiaia"]:
......
from justclust.paths import drawdir, repdir
rawdata_dir = drawdir / "bolzano"
report_dir = repdir / "bolzano"
aq_scr_shp = rawdata_dir / "Air quality" / "street_canyon_risk.shp"
co2_ab_shp = rawdata_dir / "Carbon" / "c_absorption.shp"
co2_em_shp = rawdata_dir / "Carbon" / "c_emission.shp"
ffh_pa_shp = rawdata_dir / "FFH" / "protected_areas.shp"
spt_ag_shp = rawdata_dir / "Spatial" / "accessibility_green_areas.shp"
spt_wk_shp = rawdata_dir / "Spatial" / "walkability.shp"
tem_ss_shp = rawdata_dir / "Temporal" / "soil_sealing.shp"
thm_sh_shp = rawdata_dir / "Thermal" / "suhi.shp"
socio_gpkg = rawdata_dir / "Socio" / "bz_pop_2021.gpkg"
# main outputs into the report folder
# - pre-processing
prescorefile = report_dir / "scores_preprocess.xlsx"
# - clustering
clfile = report_dir / "clusters_all.gpkg"
selclfile = report_dir / "selected_clusters.gpkg"
clscorefile = report_dir / "scores_clusters.xlsx"
fimpfile = report_dir / "feature_importances.xlsx"
from justclust.paths import drawdir, repdir
class define_paths(object):
def __init__(self, city = 'city-example'):
self.drawdir = drawdir
self.repdir = repdir
self.rawdata_dir = drawdir / city
self.report_dir = repdir / city
# main outputs into the report folder
# - pre-processing
self.prescorefile = self.report_dir / "scores_preprocess.xlsx"
# - clustering
self.clfile = self.report_dir / "clusters_all.gpkg"
self.selclfile = self.report_dir / "selected_clusters.gpkg"
self.clscorefile = self.report_dir / "scores_clusters.xlsx"
self.fimpfile = self.report_dir / "feature_importances.xlsx"
......@@ -867,9 +867,14 @@ def reporting(
)
fig.savefig(cdir / "cardinality.png", dpi=600)
# check for invalid value: a min_samples of 0 is converted to None
min_samples = int(ms.split("-")[1])
if int(ms.split("-")[1]) == 0:
min_samples = None
clusterer = hdbscan.HDBSCAN(
min_cluster_size=int(mcs.split("-")[1]),
min_samples=int(ms.split("-")[1]),
min_samples=min_samples,
metric=dist.split("-")[1],
cluster_selection_method=csm.split("-")[1],
)
......
......@@ -805,8 +805,8 @@ execute:
---
\pagebreak
"""
settings =r"""
```{python}
settings =f"""
```{graph_open}python{graph_close}
import os
import sys
import geopandas as gpd
......@@ -817,12 +817,14 @@ import matplotlib.pyplot as plt
# Custom modules
from justclust.data.bolzano import cols, conv, read_data, selected_col
from justclust.paths.bolzano import clfile
import justclust.paths.paths as paths
import justclust.quarto as qmd
plt.rcParams['figure.dpi'] = 600
plt.rcParams['savefig.dpi'] = 600
my_paths = paths.define_paths(city = '{city}')
```
"""
data_load = f"""
......@@ -834,12 +836,12 @@ selected = raw.loc[:, cols].dropna()
selected.rename(columns=conv, inplace=True)
# Remove error observation
# mask = selected['Carbon emission building [ton CO2/m²]'] > 8
# selected = selected[~ mask]
mask = selected.index == 8888888
selected = selected[~ mask]
clname = '{clname}'
geo_out = gpd.read_file(clfile)
geo_out = gpd.read_file(my_paths.clfile)
geo_out.set_index('SEZ', drop=False, verify_integrity=True, inplace=True)
clsts = geo_out.loc[selected.index,[clname, 'geometry']]
selected = clsts.join(selected, validate='1:1')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment