DRAVOGRAD_15-Copy1.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a5b7fa31",
   "metadata": {},
   "source": [
    "# DRAVOGRAD- DRAVA RIVER\n",
    "\n",
    "19/10/2021\n",
    "\n",
    "In this notebook, the results of different feature-selection settings are compared for the Dravograd basin (for which we have around 40 years of data).\n",
    "\n",
    "Input data are ERA5 meteorological reanalysis fields, quantile-mapped and downscaled by ZAMG.\n",
    "\n",
    "15-day averages of precipitation, temperature and potential evapotranspiration over the previous year are used as input.\n",
    "\n",
    "The settings are the following:\n",
    "\n",
    "    A) 180 features are selected with PCA, the same number of features as in setting C);\n",
    "\n",
    "    B) 36 features are selected with PCA, the same number of features as in setting D);\n",
    "        \n",
    "    C) spatial statistics of the meteorological inputs are used as input: the mean and the 5th, 25th, 75th and 95th quantiles;\n",
    "        \n",
    "    D) the meteorological inputs are spatially averaged.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "efcc49ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sf_runoff import daily_climatology, spatial_avg_daily_input, spatial_stats_daily_input, compute_anomalies\n",
    "from nested_CV import SVR_nested_CV_gridsearch, SVR_PCA_nested_CV_gridsearch\n",
    "from test import evaluate_prediction, plot_prediction, plot_anomalies\n",
    "from test import nested_CV_PCA_SVR_predict, nested_CV_SVR_predict\n",
    "from classic_CV_predict import classic_CV_PCA_SVR_predict, classic_CV_SVR_predict\n",
    "\n",
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.stats import gaussian_kde\n",
    "\n",
    "from sklearn.svm import SVR\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.compose import TransformedTargetRegressor\n",
    "from sklearn.model_selection import GridSearchCV,TimeSeriesSplit\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import os\n",
    "\n",
    "import pdb\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f11761aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the daily input files.\n",
    "# Set the DRAVOGRAD_DATA_DIR environment variable to run the notebook on another\n",
    "# machine; when it is unset, the original author's local folder is used.\n",
    "# os.path.join(..., '') guarantees a trailing separator, so `path + filename`\n",
    "# keeps working whether or not the configured directory ends with one.\n",
    "path = os.path.join(\n",
    "    os.environ.get(\n",
    "        'DRAVOGRAD_DATA_DIR',\n",
    "        r'C:\\Users\\mmazzolini\\OneDrive - Scientific Network South Tyrol\\Documents\\conda\\daily_input\\\\',\n",
    "    ),\n",
    "    '',\n",
    ")\n",
    "\n",
    "# Daily input table: first column parsed as the date index, values stored as float32.\n",
    "daily_input = pd.read_csv(path+'HEDravograd_Drava_1952_2019.txt', index_col=0, parse_dates=True).astype('float32')\n",
    "\n",
    "# Spatially aggregated version of the input (presumably the spatial mean used in setting D - confirm in sf_runoff).\n",
    "daily_input_TPE = spatial_avg_daily_input(daily_input).astype('float32')\n",
    "\n",
    "# Spatial statistics of the input (presumably the mean/quantiles used in setting C - confirm in sf_runoff).\n",
    "daily_input_stat = spatial_stats_daily_input(daily_input).astype('float32')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bab54ce7",
   "metadata": {},
   "source": [
    "import sys, importlib\n",
    "importlib.reload(sys.modules['test'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a024e5fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Temporal set-up ---\n",
    "# length of one temporal unit (the inputs are 15-day averages)\n",
    "t_unit = 15\n",
    "# presumably the number of t_unit-long periods fed to the model (confirm in nested_CV);\n",
    "# keep this to a single value for now\n",
    "t_range = [24]\n",
    "\n",
    "# --- Cross-validation set-up ---\n",
    "n_splits = 10\n",
    "test_size = 365\n",
    "\n",
    "# --- Hyper-parameter grids explored by the grid search ---\n",
    "# 10 log-spaced values between 1e-2 and 1e2\n",
    "C_range = np.logspace(start=-2, stop=2, num=10)\n",
    "# 5 log-spaced values between 1e-6 and 1e-2\n",
    "epsilon_range = np.logspace(start=-6, stop=-2, num=5)\n",
    "# single candidate: 5*3*24 = 360 components (earlier trial: n_range = [17, 50, 200])\n",
    "components_range = [5 * 3 * 24]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e7d5c48a",
   "metadata": {},
   "source": [
    "# A) PCA+SVR"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18861993",
   "metadata": {},
   "source": [
    "### TRAIN A PCA+SVR MODEL "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "aacb3a01",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 1 folds for each of 50 candidates, totalling 50 fits\n",
      "Fitting 1 folds for each of 50 candidates, totalling 50 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\mmazzolini\\.conda\\envs\\ado\\lib\\site-packages\\joblib\\externals\\loky\\process_executor.py:688: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 1 folds for each of 50 candidates, totalling 50 fits\n",
      "Fitting 1 folds for each of 50 candidates, totalling 50 fits\n",
      "Fitting 1 folds for each of 50 candidates, totalling 50 fits\n",
      "Fitting 1 folds for each of 50 candidates, totalling 50 fits\n",
      "Fitting 1 folds for each of 50 candidates, totalling 50 fits\n",
      "Fitting 1 folds for each of 50 candidates, totalling 50 fits\n",
      "Fitting 1 folds for each of 50 candidates, totalling 50 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\mmazzolini\\.conda\\envs\\ado\\lib\\site-packages\\sklearn\\model_selection\\_search.py:922: UserWarning: One or more of the test scores are non-finite: [0.56732319 0.56730931 0.56738403 0.56741894 0.56698353 0.64647372\n",
      " 0.64641015 0.64642175 0.64644063 0.64708145 0.6675348  0.66753546\n",
      " 0.66758763 0.6680089  0.66868631 0.66593785 0.66580957 0.6661129\n",
      " 0.66658101 0.66807678 0.6631981  0.66284985 0.66283214 0.66275594\n",
      " 0.66434368 0.65481724 0.65625838 0.65489764 0.65542396 0.65536472\n",
      " 0.63989756 0.64083217 0.64333993 0.64097485        nan 0.64402253\n",
      "        nan 0.64316301 0.64565183 0.64709809 0.65608431 0.66025241\n",
      "        nan        nan        nan        nan        nan 0.64122172\n",
      "        nan 0.62904516]\n",
      "  warnings.warn(\n",
      "C:\\Users\\mmazzolini\\.conda\\envs\\ado\\lib\\site-packages\\sklearn\\model_selection\\_search.py:922: UserWarning: One or more of the train scores are non-finite: [0.45772834 0.45772035 0.45772763 0.45770001 0.45719031 0.58781645\n",
      " 0.5878289  0.58779987 0.58776526 0.58765745 0.70777745 0.70774931\n",
      " 0.70775225 0.70778127 0.70794423 0.81819123 0.81816976 0.81811986\n",
      " 0.81820455 0.81818229 0.91899307 0.91895802 0.91898942 0.91900283\n",
      " 0.9190149  0.9790693  0.97906333 0.97907522 0.97905555 0.97908029\n",
      "        nan 0.99416289 0.99415837 0.99418102        nan 0.9981007\n",
      "        nan 0.9981009  0.99810992 0.9981407  0.99948531 0.99948513\n",
      "        nan        nan        nan        nan        nan 0.99992782\n",
      "        nan 0.99985103]\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 1 folds for each of 50 candidates, totalling 50 fits\n"
     ]
    },