{ "cells": [ { "cell_type": "markdown", "id": "52f38942", "metadata": {}, "source": [ "# DRAVOGRAD- DRAVA RIVER\n", "\n", "13/10/2021\n", "\n", "In this notebook results of different feature selections are compared for the Dravograd Basin (of which we have around 40 years of data)\n", "\n", "Input data are ERA5 metereological reanalysis quantile mapped and downscaled by ZAMG.\n", "\n", "Precipitation, temperature and potential evapotranspiration are selected as input.\n", "\n", "The settings are the following:\n", "\n", " A) 180 features are selected with PCA, the same numeriosity as setting C) ;\n", "\n", " B) 36 features are selectedwith PCA, the same numeriosity as setting D) ;\n", " \n", " C) metereological inputs spatial statistics are used as input: mean, the 5th, 25th, 75th and 95th quantiles are selected.\n", " \n", " D) metereological inputs are spatially averaged.\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "efcc49ff", "metadata": {}, "outputs": [], "source": [ "from sf_runoff import daily_climatology, spatial_avg_daily_input, spatial_stats_daily_input, compute_anomalies\n", "from nested_CV import SVR_nested_CV_gridsearch, SVR_PCA_nested_CV_gridsearch\n", "from test import evaluate_prediction, plot_prediction, plot_anomalies\n", "from test import nested_CV_PCA_SVR_predict, nested_CV_SVR_predict\n", "from classic_CV_predict import classic_CV_PCA_SVR_predict, classic_CV_SVR_predict\n", "\n", "\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import numpy as np\n", "from scipy.stats import gaussian_kde\n", "\n", "from sklearn.svm import SVR\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.compose import TransformedTargetRegressor\n", "from sklearn.model_selection import GridSearchCV,TimeSeriesSplit\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.decomposition import PCA\n", "\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "import os\n", "\n", "import pdb\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "id": "ac06f5c5", "metadata": {}, "outputs": [], "source": [ "path=r'C:\\Users\\mmazzolini\\OneDrive - Scientific Network South Tyrol\\Documents\\conda\\daily_input\\\\'\n", "# ## Feature selection\n", "daily_input = pd.read_csv(path+'HEDravograd_Drava_1952_2019.txt', index_col=0, parse_dates=True)\n", "\n", "daily_input_TPE = spatial_avg_daily_input(daily_input)\n", "\n", "daily_input_stat = spatial_stats_daily_input(daily_input)" ] }, { "cell_type": "markdown", "id": "0b7efaf0", "metadata": {}, "source": [ "import sys, importlib\n", "importlib.reload(sys.modules['test'])\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "a024e5fe", "metadata": {}, "outputs": [], "source": [ "#define the possible parameters value (where Gridsearch is applied)\n", "\n", "C_range=np.logspace(-2, 2, 10)\n", "epsilon_range=np.logspace(-6, -1, 5)\n", "#n_range = [17, 50, 200]\n", "components_range = [5*3*12]\n", "#do not enlarge t_range for now\n", "t_range=[12]\n", "n_splits=6\n", "test_size=730" ] }, { "cell_type": "markdown", "id": "e7d5c48a", "metadata": {}, "source": [ "# A) PCA+SVR" ] }, { "cell_type": "markdown", "id": "18861993", "metadata": {}, "source": [ "### TRAIN A PCA+SVR MODEL " ] }, { "cell_type": "code", "execution_count": null, "id": "aacb3a01", "metadata": { "scrolled": true }, "outputs": [], "source": [ "C,eps,n=SVR_PCA_nested_CV_gridsearch(daily_input, C_range, epsilon_range, components_range, t_range,n_splits,test_size)\n", "print(f'C={C}')\n", "print(f'eps={eps}')\n", "print(f'n={n}')" ] }, { "cell_type": "markdown", "id": "9e7bfb3b", "metadata": {}, "source": [ "C=0.21544346900318834\n", "epsilon=0.003593813663804626\n", "n=180\n" ] }, { "cell_type": "markdown", "id": "c4d61ea5", "metadata": {}, "source": [ "### PREDICT RUNOFF ON TEST SET AND QUANTIFY THE PERFORMANCE" ] }, { "cell_type": "code", "execution_count": null, "id": "db450a92", "metadata": { "scrolled": true }, "outputs": [], "source": [ "radius_for_ensemble = 1\n", "predictions = nested_CV_PCA_SVR_predict(daily_input, C, eps, n, 12, n_splits, test_size, radius_for_ensemble)\n", "plot_prediction(predictions)" ] }, { "cell_type": "code", "execution_count": null, "id": "5302ec81", "metadata": { "scrolled": true }, "outputs": [], "source": [ "r2_PCA180 = evaluate_prediction(predictions)\n", "#r2_PCA = evaluate_prediction(predictions.iloc[:,1:])" ] }, { "cell_type": "markdown", "id": "fad3f100", "metadata": {}, "source": [ "### ANALYSE RESULTS AS ANOMALIES" ] }, { "cell_type": "markdown", "id": "f8c1bbc4", "metadata": {}, "source": [ "### 1) PREDICT THE WHOLE SERIES WITH CLASSIC CV" ] }, { "cell_type": "code", "execution_count": null, "id": "efb89688", "metadata": {}, "outputs": [], "source": [ "predictions_for_clim = classic_CV_PCA_SVR_predict(daily_input, C, eps, n, 12, n_splits)\n", "climatologies =predictions_for_clim.groupby(by=predictions_for_clim.index.day_of_year).mean().loc[:,['prediction','true_runoff']]\n", "climatologies['doty']=climatologies.index\n" ] }, { "cell_type": "code", "execution_count": null, "id": "acf4e400", "metadata": { "scrolled": true }, "outputs": [], "source": [ "sns.set_theme(style=\"whitegrid\")\n", "ax,fig=plt.subplots(figsize=(20,10))\n", "sns.lineplot(data=climatologies,x='doty',y='prediction',label='prediction climatology')\n", "sns.lineplot(data=climatologies,x='doty',y='true_runoff',label='real climatology')\n", "plt.legend()" ] }, { "cell_type": "markdown", "id": "288e06d0", "metadata": {}, "source": [ "### 2) COMPUTE THE ANOMALIES AND PLOT THEM" ] }, { "cell_type": "code", "execution_count": null, "id": "59b6fec9", "metadata": { "scrolled": true }, "outputs": [], "source": [ "a=compute_anomalies(climatologies, predictions)\n", "a['split']=predictions.split\n", "plot_anomalies(a)" ] }, { "cell_type": "markdown", "id": "d2010fb8", "metadata": {}, "source": [ "## B) PCA+SVR " ] }, { "cell_type": "markdown", "id": "ca99ffdd", "metadata": {}, "source": [ "### TRAIN A PCA+SVR MODEL " ] }, { "cell_type": "code", "execution_count": null, "id": "9dfb2e78", "metadata": { "scrolled": true }, "outputs": [], "source": [ "components_range2=[12*3]\n", "C2,eps2,n2=SVR_PCA_nested_CV_gridsearch(daily_input, C_range, epsilon_range, components_range2, t_range,n_splits,test_size)\n", "print(f'C={C2}')\n", "print(f'epsilon={eps2}')\n", "print(f'n={n2}')" ] }, { "cell_type": "markdown", "id": "9c3703f0", "metadata": {}, "source": [ "C2 =0.5994842503189409\n", "eps2=10e-06\n", "n2 =36" ] }, { "cell_type": "markdown", "id": "73abca82", "metadata": {}, "source": [ "### PREDICT RUNOFF ON TEST SET AND QUANTIFY THE PERFORMANCE" ] }, { "cell_type": "code", "execution_count": null, "id": "952414ce", "metadata": { "scrolled": true }, "outputs": [], "source": [ "radius_for_ensemble = 1\n", "predictions2 = nested_CV_PCA_SVR_predict(daily_input, C2, eps2, n2, 12, n_splits, test_size, radius_for_ensemble)\n", "plot_prediction(predictions2)" ] }, { "cell_type": "markdown", "id": "b48d327e", "metadata": {}, "source": [ "predictions=pd.read_csv('tial.csv',date_parser=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "40451941", "metadata": { "scrolled": true }, "outputs": [], "source": [ "r2_PCA36 = evaluate_prediction(predictions2)\n", "#r2_PCA = evaluate_prediction(predictions.iloc[:,1:])" ] }, { "cell_type": "markdown", "id": "2a649917", "metadata": {}, "source": [ "### ANALYSE RESULTS AS ANOMALIES" ] }, { "cell_type": "markdown", "id": "6743d6e8", "metadata": {}, "source": [ "### 1) PREDICT THE WHOLE SERIES WITH CLASSIC CV" ] }, { "cell_type": "code", "execution_count": null, "id": "9740603c", "metadata": {}, "outputs": [], "source": [ "predictions_for_clim2 = classic_CV_PCA_SVR_predict(daily_input, C2, eps2, n2, 12, n_splits)\n", "climatologies2 =predictions_for_clim2.groupby(by=predictions_for_clim2.index.day_of_year).mean().loc[:,['prediction','true_runoff']]\n", "climatologies2['doty']=climatologies2.index\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9b999433", "metadata": { "scrolled": true }, "outputs": [], "source": [ "sns.set_theme(style=\"whitegrid\")\n", "ax,fig=plt.subplots(figsize=(20,10))\n", "sns.lineplot(data=climatologies2,x='doty',y='prediction',label='prediction climatology')\n", "sns.lineplot(data=climatologies2,x='doty',y='true_runoff',label='real climatology')\n", "plt.legend()" ] }, { "cell_type": "markdown", "id": "9052ba3a", "metadata": {}, "source": [ "### 2) COMPUTE THE ANOMALIES AND PLOT THEM" ] }, { "cell_type": "code", "execution_count": null, "id": "d8f33316", "metadata": {}, "outputs": [], "source": [ "a2=compute_anomalies(climatologies2, predictions2)\n", "a2['split']=predictions2.split\n", "plot_anomalies(a2)" ] }, { "cell_type": "markdown", "id": "b95dfd1e", "metadata": {}, "source": [ "# C) SVR \n", "## WITH SPATIAL STATISTICS AS INPUT" ] }, { "cell_type": "code", "execution_count": null, "id": "6bf784d0", "metadata": {}, "outputs": [], "source": [ "C3, eps3 = SVR_nested_CV_gridsearch(daily_input_stat, C_range,epsilon_range, t_range,n_splits,test_size)\n", "print(f'C3={C3}')\n", "print(f'eps3={eps3}')" ] }, { "cell_type": "markdown", "id": "c2ff44a9", "metadata": {}, "source": [ "### PREDICT RUNOFF ON TEST SET AND QUANTIFY THE PERFORMANCE" ] }, { "cell_type": "code", "execution_count": null, "id": "7868a11c", "metadata": { "scrolled": true }, "outputs": [], "source": [ "radius_for_ensemble = 1\n", "predictions3 = nested_CV_SVR_predict(daily_input_stat, C3, eps3, 12, n_splits, test_size, radius_for_ensemble)" ] }, { "cell_type": "code", "execution_count": null, "id": "12333e95", "metadata": { "scrolled": true }, "outputs": [], "source": [ "#predictions=pd.read_csv('tial.csv',date_parser=True)\n", "plot_prediction(predictions3)" ] }, { "cell_type": "code", "execution_count": null, "id": "136457d5", "metadata": { "scrolled": false }, "outputs": [], "source": [ "r2_STATS= evaluate_prediction(predictions3)" ] }, { "cell_type": "markdown", "id": "228f771a", "metadata": {}, "source": [ "### ANALYSE RESULTS AS ANOMALIES" ] }, { "cell_type": "markdown", "id": "ae5e6ce8", "metadata": {}, "source": [ "### 1) PREDICT THE WHOLE SERIES WITH CLASSIC CV" ] }, { "cell_type": "code", "execution_count": null, "id": "e2b07ef7", "metadata": {}, "outputs": [], "source": [ "predictions_for_clim3 = classic_CV_SVR_predict(daily_input_stat, C3, eps3, 12, n_splits)" ] }, { "cell_type": "code", "execution_count": null, "id": "82fd50a5", "metadata": {}, "outputs": [], "source": [ "climatologies3 =predictions_for_clim3.groupby(by=predictions_for_clim3.index.day_of_year).mean().loc[:,['prediction','true_runoff']]\n", "climatologies3['doty']=climatologies3.index\n" ] }, { "cell_type": "code", "execution_count": null, "id": "419702a5", "metadata": { "scrolled": true }, "outputs": [], "source": [ "sns.set_theme(style=\"whitegrid\")\n", "ax,fig=plt.subplots(figsize=(20,10))\n", "sns.lineplot(data=climatologies3,x='doty',y='prediction',label='prediction climatology')\n", "sns.lineplot(data=climatologies3,x='doty',y='true_runoff',label='real climatology')\n", "plt.legend()" ] }, { "cell_type": "markdown", "id": "f6fe45d0", "metadata": {}, "source": [ "### 2) COMPUTE THE ANOMALIES AND PLOT THEM" ] }, { "cell_type": "code", "execution_count": null, "id": "b09f1c65", "metadata": {}, "outputs": [], "source": [ "a3=compute_anomalies(climatologies3, predictions3)\n", "a3['split']=predictions3.split\n", "plot_anomalies(a3)" ] }, { "cell_type": "markdown", "id": "09620a81", "metadata": {}, "source": [ "# D) SVR \n", "## WITH SPATIAL AVERAGE AS INPUT" ] }, { "cell_type": "code", "execution_count": null, "id": "88f9717b", "metadata": {}, "outputs": [], "source": [ "C4, eps4 = SVR_nested_CV_gridsearch(daily_input_TPE, C_range,epsilon_range, t_range,n_splits,test_size)\n", "print(f'C4={C4}')\n", "print(f'eps4={eps4}')" ] }, { "cell_type": "markdown", "id": "02ac0761", "metadata": {}, "source": [ "### PREDICT RUNOFF ON TEST SET AND QUANTIFY THE PERFORMANCE" ] }, { "cell_type": "code", "execution_count": null, "id": "7023341f", "metadata": { "scrolled": true }, "outputs": [], "source": [ "radius_for_ensemble = 1\n", "predictions4 = nested_CV_SVR_predict(daily_input_stat, C4, eps4, 12, n_splits, test_size, radius_for_ensemble)" ] }, { "cell_type": "code", "execution_count": null, "id": "61a84773", "metadata": { "scrolled": true }, "outputs": [], "source": [ "#predictions=pd.read_csv('tial.csv',date_parser=True)\n", "plot_prediction(predictions4)" ] }, { "cell_type": "code", "execution_count": null, "id": "af30824e", "metadata": { "scrolled": false }, "outputs": [], "source": [ "r2_TPE = evaluate_prediction(predictions4)" ] }, { "cell_type": "markdown", "id": "0004c82c", "metadata": {}, "source": [ "### ANALYSE RESULTS AS ANOMALIES" ] }, { "cell_type": "markdown", "id": "d7b3d635", "metadata": {}, "source": [ "### 1) PREDICT THE WHOLE SERIES WITH CLASSIC CV" ] }, { "cell_type": "code", "execution_count": null, "id": "bcdad921", "metadata": {}, "outputs": [], "source": [ "predictions_for_clim4 = classic_CV_SVR_predict(daily_input_stat, C4, eps4, 12, n_splits)" ] }, { "cell_type": "code", "execution_count": null, "id": "ea57f052", "metadata": {}, "outputs": [], "source": [ "climatologies4 =predictions_for_clim4.groupby(by=predictions_for_clim4.index.day_of_year).mean().loc[:,['prediction','true_runoff']]\n", "climatologies4['doty']=climatologies4.index\n" ] }, { "cell_type": "code", "execution_count": null, "id": "57add82d", "metadata": { "scrolled": true }, "outputs": [], "source": [ "sns.set_theme(style=\"whitegrid\")\n", "ax,fig=plt.subplots(figsize=(20,10))\n", "sns.lineplot(data=climatologies4,x='doty',y='prediction',label='prediction climatology')\n", "sns.lineplot(data=climatologies4,x='doty',y='true_runoff',label='real climatology')\n", "plt.legend()" ] }, { "cell_type": "markdown", "id": "c2da6417", "metadata": {}, "source": [ "### 2) COMPUTE THE ANOMALIES AND PLOT THEM" ] }, { "cell_type": "code", "execution_count": null, "id": "752574e2", "metadata": {}, "outputs": [], "source": [ "a4=compute_anomalies(climatologies4, predictions4)\n", "a4['split']=predictions4.split\n", "plot_anomalies(a4)" ] }, { "cell_type": "markdown", "id": "4869891d", "metadata": {}, "source": [ "# COMPARE RESULTS" ] }, { "cell_type": "code", "execution_count": null, "id": "9bf593c1", "metadata": {}, "outputs": [], "source": [ "results = pd.concat([r2_PCA180,r2_PCA36,r2_STATS,r2_TPE],axis=1)\n", "results.columns=['A) PCA(180)','B) PCA(180)','C) TPE_STATS','D) TPE_AVG']\n", "results.iloc[2:].plot.bar()\n", "plt.title('R^2 RESULTS COMPARISON')\n", "plt.ylabel('r^2 [/]')" ] }, { "cell_type": "code", "execution_count": null, "id": "6a0f020b", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a628d04d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "07c80b4b", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "7574cbca", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 5 }