Notebook to assess different distributions.

73b60483 · Frisinghelli Daniel · bd79fde1 · 73b60483
Commit 73b60483 authored 3 years ago by Frisinghelli Daniel
--- a/Notebooks/pr_distribution.ipynb
+++ b/Notebooks/pr_distribution.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "63805b4a-b30e-4c10-a948-bc59651ca7a6",
+   "metadata": {},
+   "source": [
+    "### Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28982ce9-bf0c-4eb1-8b9e-bec118359966",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# builtins\n",
+    "import datetime\n",
+    "import warnings\n",
+    "import calendar\n",
+    "\n",
+    "# externals\n",
+    "import xarray as xr\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import pandas as pd\n",
+    "import scipy.stats as stats\n",
+    "from mpl_toolkits.axes_grid1.inset_locator import inset_axes\n",
+    "import scipy.stats as stats\n",
+    "from IPython.display import Image\n",
+    "from sklearn.metrics import r2_score, roc_curve, auc, classification_report\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# locals\n",
+    "from climax.main.io import ERA5_PATH, OBS_PATH, TARGET_PATH, DEM_PATH\n",
+    "from climax.main.config import CALIB_PERIOD, VALID_PERIOD\n",
+    "from pysegcnn.core.utils import search_files\n",
+    "from pysegcnn.core.graphics import plot_classification_report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2373d894-e252-4f16-826b-88731e195259",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# model predictions and observations NetCDF \n",
+    "y_true = xr.open_dataset(search_files(OBS_PATH.joinpath('pr'), 'OBS_pr(.*).nc$').pop())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c2c1912-a947-4afe-84a7-895726be5cfd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# subset to calibration and validation period\n",
+    "y_calib = y_true.sel(time=CALIB_PERIOD).precipitation.values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed7d1686-968e-49e9-ba34-d03658ba3b32",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mask missing values\n",
+    "y_calib = y_calib[~np.isnan(y_calib)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b570e39-f242-49ef-8aef-eff8fbcf7c4d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# only use values greater 0\n",
+    "y_calib = y_calib[y_calib > 0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5de4933a-ef9d-4afe-8af6-ff68d91860ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fit gamma distribution to data\n",
+    "alpha, loc, beta = stats.gamma.fit(y_calib, loc=0.1)\n",
+    "gamma_calib = stats.gamma(alpha, loc=loc, scale=beta)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "75b74f7c-c9d7-4d52-b140-e0ad9de17b69",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fit generalized pareto distribution to data\n",
+    "alpha, loc, beta = stats.genpareto.fit(y_calib, loc=0.1)\n",
+    "genpareto_calib = stats.genpareto(alpha, loc=loc, scale=beta)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14ade547-443a-457a-bccd-d88d049b9d81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compute empirical quantiles\n",
+    "quantiles = np.arange(0.01, 1, 0.005)\n",
+    "\n",
+    "# empirical quantiles and theoretical quantiles\n",
+    "eq = np.quantile(y_calib, quantiles)\n",
+    "tq_gamma = gamma_calib.ppf(quantiles)\n",
+    "tq_genpareto = genpareto_calib.ppf(quantiles)\n",
+    "\n",
+    "# Q-Q plot \n",
+    "RANGE = 40\n",
+    "fig, ax = plt.subplots(1, 1, figsize=(6, 6))\n",
+    "ax.scatter(eq, tq_gamma, color='grey', label='Gamma')\n",
+    "ax.scatter(eq, tq_genpareto, color='k', label='GenPareto')\n",
+    "ax.plot(np.arange(0, RANGE), np.arange(0, RANGE), '--k')\n",
+    "ax.set_xlim(0, RANGE)\n",
+    "ax.set_ylim(0, RANGE)\n",
+    "ax.set_ylabel('Theoretical quantiles');\n",
+    "ax.set_xlabel('Empirical quantiles');\n",
+    "ax.legend(frameon=False, fontsize=12);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c0fea8ac-bac0-4096-bc81-90d799f8ab94",
+   "metadata": {},
+   "source": [
+    "### Empirical quantiles per grid point"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4dcb3348-5d22-4324-b840-2c305983e826",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "quantiles = np.arange(0.01, 1, 0.01)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "18ce44b4-d6c0-4950-9cd2-7a3af1095b24",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# subset to calibration period\n",
+    "y_calib_p = y_true.sel(time=CALIB_PERIOD).precipitation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a02c42e0-591c-4630-89b8-5dd8ef71a4a0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compute empirical quantiles over time\n",
+    "equantiles = y_calib_p.quantile(quantiles, dim='time')\n",
+    "equantiles = equantiles.rename({'quantile': 'q'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "966d2724-2628-4842-abc9-695711945347",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# iterate over the grid points\n",
+    "gammaq = np.ones(shape=(len(y_calib_p.q), len(y_calib_p.y), len(y_calib_p.x))) * np.nan\n",
+    "for i, _ in enumerate(y_calib_p.x):\n",
+    "    print('Rows: {}/{}'.format(i + 1, len(y_calib_p.x)))\n",
+    "    for j, _ in enumerate(y_calib_p.y):\n",
+    "    \n",
+    "        # current grid point: xarray.Dataset, dimensions=(time)\n",
+    "        point = y_calib_p.isel(x=i, y=j).values\n",
+    "        \n",
+    "        # mask missing values\n",
+    "        point = point[~np.isnan(point)]\n",
+    "        \n",
+    "        # check if the grid point is valid\n",
+    "        if point.size < 1:\n",
+    "            # move on to next grid point\n",
+    "            continue\n",
+    "            \n",
+    "        # consider only values > 0\n",
+    "        point = point[point > 0]\n",
+    "            \n",
+    "        # fit Gamma distribution to grid point\n",
+    "        alpha, loc, beta = stats.gamma.fit(point)\n",
+    "        gamma = stats.gamma(alpha, loc=loc, scale=beta)\n",
+    "        \n",
+    "        # compute theoretical quantiles of fitted gamma distribution\n",
+    "        tq = gamma.ppf(quantiles)\n",
+    "        \n",
+    "        # store theoretical quantiles for current grid point\n",
+    "        gammaq[:, j, i] = tq\n",
+    "\n",
+    "# store theoretical quantiles in xarray.DataArray\n",
+    "tquantiles = xr.DataArray(data=gammaq, dims=['q', 'y', 'x'],\n",
+    "                          coords=dict(q=quantiles, lat=y_calib_p.y, lon=y_calib_p.x),\n",
+    "                          name='precipitation')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "601de7cb-35f4-40e1-9b51-2dab23102659",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compute bias in theoretical quantiles\n",
+    "biasq = tquantiles - equantiles"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:markdown id:63805b4a-b30e-4c10-a948-bc59651ca7a6 tags:
+
+### Imports
+
+%% Cell type:code id:28982ce9-bf0c-4eb1-8b9e-bec118359966 tags:
+
+``` python
+# builtins
+import datetime
+import warnings
+import calendar
+
+# externals
+import xarray as xr
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+import scipy.stats as stats
+from mpl_toolkits.axes_grid1.inset_locator import inset_axes
+import scipy.stats as stats
+from IPython.display import Image
+from sklearn.metrics import r2_score, roc_curve, auc, classification_report
+from sklearn.model_selection import train_test_split
+
+# locals
+from climax.main.io import ERA5_PATH, OBS_PATH, TARGET_PATH, DEM_PATH
+from climax.main.config import CALIB_PERIOD, VALID_PERIOD
+from pysegcnn.core.utils import search_files
+from pysegcnn.core.graphics import plot_classification_report
+```
+
+%% Cell type:code id:2373d894-e252-4f16-826b-88731e195259 tags:
+
+``` python
+# model predictions and observations NetCDF
+y_true = xr.open_dataset(search_files(OBS_PATH.joinpath('pr'), 'OBS_pr(.*).nc$').pop())
+```
+
+%% Cell type:code id:0c2c1912-a947-4afe-84a7-895726be5cfd tags:
+
+``` python
+# subset to calibration and validation period
+y_calib = y_true.sel(time=CALIB_PERIOD).precipitation.values
+```
+
+%% Cell type:code id:ed7d1686-968e-49e9-ba34-d03658ba3b32 tags:
+
+``` python
+# mask missing values
+y_calib = y_calib[~np.isnan(y_calib)]
+```
+
+%% Cell type:code id:1b570e39-f242-49ef-8aef-eff8fbcf7c4d tags:
+
+``` python
+# only use values greater 0
+y_calib = y_calib[y_calib > 0]
+```
+
+%% Cell type:code id:5de4933a-ef9d-4afe-8af6-ff68d91860ce tags:
+
+``` python
+# fit gamma distribution to data
+alpha, loc, beta = stats.gamma.fit(y_calib, loc=0.1)
+gamma_calib = stats.gamma(alpha, loc=loc, scale=beta)
+```
+
+%% Cell type:code id:75b74f7c-c9d7-4d52-b140-e0ad9de17b69 tags:
+
+``` python
+# fit generalized pareto distribution to data
+alpha, loc, beta = stats.genpareto.fit(y_calib, loc=0.1)
+genpareto_calib = stats.genpareto(alpha, loc=loc, scale=beta)
+```
+
+%% Cell type:code id:14ade547-443a-457a-bccd-d88d049b9d81 tags:
+
+``` python
+# compute empirical quantiles
+quantiles = np.arange(0.01, 1, 0.005)
+
+# empirical quantiles and theoretical quantiles
+eq = np.quantile(y_calib, quantiles)
+tq_gamma = gamma_calib.ppf(quantiles)
+tq_genpareto = genpareto_calib.ppf(quantiles)
+
+# Q-Q plot
+RANGE = 40
+fig, ax = plt.subplots(1, 1, figsize=(6, 6))
+ax.scatter(eq, tq_gamma, color='grey', label='Gamma')
+ax.scatter(eq, tq_genpareto, color='k', label='GenPareto')
+ax.plot(np.arange(0, RANGE), np.arange(0, RANGE), '--k')
+ax.set_xlim(0, RANGE)
+ax.set_ylim(0, RANGE)
+ax.set_ylabel('Theoretical quantiles');
+ax.set_xlabel('Empirical quantiles');
+ax.legend(frameon=False, fontsize=12);
+```
+
+%% Cell type:markdown id:c0fea8ac-bac0-4096-bc81-90d799f8ab94 tags:
+
+### Empirical quantiles per grid point
+
+%% Cell type:code id:4dcb3348-5d22-4324-b840-2c305983e826 tags:
+
+``` python
+quantiles = np.arange(0.01, 1, 0.01)
+```
+
+%% Cell type:code id:18ce44b4-d6c0-4950-9cd2-7a3af1095b24 tags:
+
+``` python
+# subset to calibration period
+y_calib_p = y_true.sel(time=CALIB_PERIOD).precipitation
+```
+
+%% Cell type:code id:a02c42e0-591c-4630-89b8-5dd8ef71a4a0 tags:
+
+``` python
+# compute empirical quantiles over time
+equantiles = y_calib_p.quantile(quantiles, dim='time')
+equantiles = equantiles.rename({'quantile': 'q'})
+```
+
+%% Cell type:code id:966d2724-2628-4842-abc9-695711945347 tags:
+
+``` python
+# iterate over the grid points
+gammaq = np.ones(shape=(len(y_calib_p.q), len(y_calib_p.y), len(y_calib_p.x))) * np.nan
+for i, _ in enumerate(y_calib_p.x):
+    print('Rows: {}/{}'.format(i + 1, len(y_calib_p.x)))
+    for j, _ in enumerate(y_calib_p.y):
+
+        # current grid point: xarray.Dataset, dimensions=(time)
+        point = y_calib_p.isel(x=i, y=j).values
+
+        # mask missing values
+        point = point[~np.isnan(point)]
+
+        # check if the grid point is valid
+        if point.size < 1:
+            # move on to next grid point
+            continue
+
+        # consider only values > 0
+        point = point[point > 0]
+
+        # fit Gamma distribution to grid point
+        alpha, loc, beta = stats.gamma.fit(point)
+        gamma = stats.gamma(alpha, loc=loc, scale=beta)
+
+        # compute theoretical quantiles of fitted gamma distribution
+        tq = gamma.ppf(quantiles)
+
+        # store theoretical quantiles for current grid point
+        gammaq[:, j, i] = tq
+
+# store theoretical quantiles in xarray.DataArray
+tquantiles = xr.DataArray(data=gammaq, dims=['q', 'y', 'x'],
+                          coords=dict(q=quantiles, lat=y_calib_p.y, lon=y_calib_p.x),
+                          name='precipitation')
+```
+
+%% Cell type:code id:601de7cb-35f4-40e1-9b51-2dab23102659 tags:
+
+``` python
+# compute bias in theoretical quantiles
+biasq = tquantiles - equantiles
+```