Added evaluating of bootstrapped model training.

48d71cb4 · Frisinghelli Daniel · 9dc9f436 · 48d71cb4
Commit 48d71cb4 authored 3 years ago by Frisinghelli Daniel
--- a/Notebooks/eval_bootstrap.ipynb
+++ b/Notebooks/eval_bootstrap.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fde8874d-299f-4f48-a10a-9fb6a00b43b9",
+   "metadata": {},
+   "source": [
+    "# Evaluate bootstrapped model results"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "969d063b-5262-4324-901f-0a48630c4f27",
+   "metadata": {},
+   "source": [
+    "### Imports and constants"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8af00ae4-4aeb-4ff8-a46a-65966b28c440",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# builtins\n",
+    "import pathlib\n",
+    "\n",
+    "# externals\n",
+    "import numpy as np\n",
+    "import xarray as xr\n",
+    "\n",
+    "# locals\n",
+    "from climax.main.io import OBS_PATH, ERA5_PATH\n",
+    "from climax.main.config import VALID_PERIOD\n",
+    "from pysegcnn.core.utils import search_files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5bc74835-dc59-46ed-849b-3ff614e53eee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mapping from predictands to variable names\n",
+    "NAMES = {'tasmin': 'minimum temperature', 'tasmax': 'maximum temperature', 'pr': 'precipitation'}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8a63ef3-35ef-4ffa-b1f3-5c2986eb7eb1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# path to bootstrapped model results\n",
+    "RESULTS = pathlib.Path('/mnt/CEPH_PROJECTS/FACT_CLIMAX/ERA5_PRED/bootstrap')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7eae545b-4d8a-4689-a6c0-4aba2cb9104e",
+   "metadata": {},
+   "source": [
+    "## Search model configuration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3e856f80-14fd-405f-a44e-cc77863f8e5b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# predictand to evaluate\n",
+    "PREDICTAND = 'tasmin'\n",
+    "LOSS = 'L1Loss'\n",
+    "OPTIM = 'Adam'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "011b792d-7349-44ad-997d-11f236472a11",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# model to evaluate\n",
+    "model = 'USegNet_{}_ztuvq_500_850_p_dem_doy_{}_{}'.format(PREDICTAND, LOSS, OPTIM)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc4ca6f0-5490-4522-8661-e36bd1be11b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get bootstrapped models\n",
+    "models = sorted(search_files(RESULTS.joinpath(PREDICTAND), model + '(.*).nc$'),\n",
+    "                key=lambda x: int(x.stem.split('_')[-1]))\n",
+    "models"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e790ed9f-451c-4368-849d-06d9c50f797c",
+   "metadata": {},
+   "source": [
+    "### Load observations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0862e0c8-06df-45d6-bc1b-002ffb6e9915",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load observations\n",
+    "y_true = xr.open_dataset(OBS_PATH.joinpath(PREDICTAND, 'OBS_{}_1980_2018.nc'.format(PREDICTAND)),\n",
+    "                         chunks={'time': 365})\n",
+    "y_true = y_true.sel(time=VALID_PERIOD)  # subset to time period covered by predictions\n",
+    "y_true = y_true.rename({NAMES[PREDICTAND]: PREDICTAND}) if PREDICTAND == 'pr' else y_true"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aba38642-85d1-404a-81f3-65d23985fb7a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mask of missing values\n",
+    "missing = np.isnan(y_true[PREDICTAND])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4512ed2-d503-4bc1-ae76-84560c101a14",
+   "metadata": {},
+   "source": [
+    "### Load reference data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f90f6abf-5fd6-49c0-a1ad-f62242b3d3a0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ERA-5 reference dataset\n",
+    "if PREDICTAND == 'pr':\n",
+    "    y_refe = xr.open_dataset(search_files(ERA5_PATH.joinpath('ERA5', 'total_precipitation'), '.nc$').pop(),\n",
+    "                             chunks={'time': 365})\n",
+    "    y_refe = y_refe.rename({'tp': 'pr'})\n",
+    "else:\n",
+    "    y_refe = xr.open_dataset(search_files(ERA5_PATH.joinpath('ERA5', '2m_{}_temperature'.format(PREDICTAND.lstrip('tas'))), '.nc$').pop(),\n",
+    "                             chunks={'time': 365})\n",
+    "    y_refe = y_refe - 273.15  # convert to °C\n",
+    "    y_refe = y_refe.rename({'t2m': PREDICTAND})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea6d5f56-4f39-4e9a-976d-00ff28fce95c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# subset to time period covered by predictions\n",
+    "y_refe = y_refe.sel(time=VALID_PERIOD).drop_vars('lambert_azimuthal_equal_area')\n",
+    "y_refe = y_refe.transpose('time', 'y', 'x')  # change order of dimensions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d37702de-da5f-4306-acc1-e569471c1f12",
+   "metadata": {},
+   "source": [
+    "### Load QM-adjusted reference data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fffbd267-d08b-44f4-869c-7056c4f19c28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_refe_qm = xr.open_dataset(ERA5_PATH.joinpath('QM_ERA5_{}_day_19912010.nc'.format(PREDICTAND)), chunks={'time': 365})\n",
+    "y_refe_qm = y_refe_qm.transpose('time', 'x', 'y')  # change order of dimensions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16fa580e-27a7-4758-9164-7f607df7179d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# center hours at 00:00:00 rather than 12:00:00\n",
+    "y_refe_qm['time'] = np.asarray([t.astype('datetime64[D]') for t in y_refe_qm.time.values])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6789791f-006b-49b3-aa04-34e4ed8e1571",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# subset to time period covered by predictions\n",
+    "y_refe_qm = y_refe_qm.sel(time=VALID_PERIOD).drop_vars('lambert_azimuthal_equal_area')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b51cfb3f-caa8-413e-a12d-47bbafcef1df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# align datasets and mask missing values\n",
+    "y_true, y_refe, y_refe_qm = xr.align(y_true[PREDICTAND], y_refe[PREDICTAND], y_refe_qm[PREDICTAND], join='override')\n",
+    "y_refe = y_refe.where(~missing, other=np.nan)\n",
+    "y_refe_qm = y_refe_qm.where(~missing, other=np.nan)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b4a6c286-6b88-487d-866c-3cb633686dac",
+   "metadata": {},
+   "source": [
+    "### Load model predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ccaf0118-da51-43b0-a2b6-56ba4b252999",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_pred = [xr.open_dataset(sim, chunks={'time': 365}) for sim in models]\n",
+    "if PREDICTAND == 'pr':\n",
+    "    y_pred = [y_p.rename({NAMES[PREDICTAND]: PREDICTAND}) for y_p in y_pred]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "df3f018e-4723-4878-b72a-0586b68e6dfd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# align datasets and mask missing values\n",
+    "y_prob = []\n",
+    "for i, y_p in enumerate(y_pred):\n",
+    "    \n",
+    "    # check whether evaluating precipitation or temperatures\n",
+    "    if len(y_p.data_vars) > 1:\n",
+    "        _, _, y_p, y_p_prob = xr.align(y_true, y_refe, y_p[PREDICTAND], y_p.prob, join='override')\n",
+    "        y_p_prob = y_p_prob.where(~missing, other=np.nan)  # mask missing values\n",
+    "        y_prob.append(y_p_prob)\n",
+    "    else:\n",
+    "        _, _, y_p = xr.align(y_true, y_refe, y_p[PREDICTAND], join='override')\n",
+    "    \n",
+    "    # mask missing values\n",
+    "    y_p = y_p.where(~missing, other=np.nan)\n",
+    "    y_pred[i] = y_p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7effbf83-7977-4a47-aa6d-d57c4c4c22c6",
+   "metadata": {},
+   "source": [
+    "## Bias, MAE, and RMSE"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b8c23b7b-ccdf-412a-a30d-ac686af03d9f",
+   "metadata": {},
+   "source": [
+    "Calculate yearly average bias, MAE, and RMSE over entire reference period for model predictions, ERA-5, and QM-adjusted ERA-5."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7939a4d2-4eff-4507-86f8-dba7c0b635df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# yearly average values over validation period\n",
+    "y_pred_yearly_avg = [y_p.groupby('time.year').mean(dim='time') for y_p in y_pred]\n",
+    "y_refe_yearly_avg = y_refe.groupby('time.year').mean(dim='time')\n",
+    "y_refe_qm_yearly_avg = y_refe_qm.groupby('time.year').mean(dim='time')\n",
+    "y_true_yearly_avg = y_true.groupby('time.year').mean(dim='time')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cce7ffbf-7e16-45f1-a2b6-5bc688595ee7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# yearly average bias, mae, and rmse for model predictions\n",
+    "bias_pred = [y_p - y_true_yearly_avg for y_p in y_pred_yearly_avg]\n",
+    "mae_pred = [np.abs(y_p - y_true_yearly_avg) for y_p in y_pred_yearly_avg]\n",
+    "rmse_pred = [np.sqrt((y_p - y_true_yearly_avg) ** 2) for y_p in y_pred_yearly_avg]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64e29db7-998d-4952-84b0-1c79016ab9a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# yearly average bias, mae, and rmse for ERA-5\n",
+    "bias_refe = y_refe_yearly_avg - y_true_yearly_avg\n",
+    "mae_refe = np.abs(y_refe_yearly_avg - y_true_yearly_avg)\n",
+    "rmse_refe = np.sqrt((y_refe_yearly_avg - y_true_yearly_avg) ** 2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb7dc5c2-c518-4251-8d27-5b52e20e0397",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# yearly average bias, mae, and rmse for QM-Adjusted ERA-5\n",
+    "bias_refe_qm = y_refe_qm_yearly_avg - y_true_yearly_avg\n",
+    "mae_refe_qm = np.abs(y_refe_qm_yearly_avg - y_true_yearly_avg)\n",
+    "rmse_refe_qm = np.sqrt((y_refe_qm_yearly_avg - y_true_yearly_avg) ** 2)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:markdown id:fde8874d-299f-4f48-a10a-9fb6a00b43b9 tags:
+# Evaluate bootstrapped model results
+%% Cell type:markdown id:969d063b-5262-4324-901f-0a48630c4f27 tags:
+### Imports and constants
+%% Cell type:code id:8af00ae4-4aeb-4ff8-a46a-65966b28c440 tags:
+``` python
+# builtins
+import pathlib
+# externals
+import numpy as np
+import xarray as xr
+# locals
+from climax.main.io import OBS_PATH, ERA5_PATH
+from climax.main.config import VALID_PERIOD
+from pysegcnn.core.utils import search_files
+```
+%% Cell type:code id:5bc74835-dc59-46ed-849b-3ff614e53eee tags:
+``` python
+# mapping from predictands to variable names
+NAMES = {'tasmin': 'minimum temperature', 'tasmax': 'maximum temperature', 'pr': 'precipitation'}
+```
+%% Cell type:code id:c8a63ef3-35ef-4ffa-b1f3-5c2986eb7eb1 tags:
+``` python
+# path to bootstrapped model results
+RESULTS = pathlib.Path('/mnt/CEPH_PROJECTS/FACT_CLIMAX/ERA5_PRED/bootstrap')
+```
+%% Cell type:markdown id:7eae545b-4d8a-4689-a6c0-4aba2cb9104e tags:
+## Search model configuration
+%% Cell type:code id:3e856f80-14fd-405f-a44e-cc77863f8e5b tags:
+``` python
+# predictand to evaluate
+PREDICTAND = 'tasmin'
+LOSS = 'L1Loss'
+OPTIM = 'Adam'
+```
+%% Cell type:code id:011b792d-7349-44ad-997d-11f236472a11 tags:
+``` python
+# model to evaluate
+model = 'USegNet_{}_ztuvq_500_850_p_dem_doy_{}_{}'.format(PREDICTAND, LOSS, OPTIM)
+```
+%% Cell type:code id:dc4ca6f0-5490-4522-8661-e36bd1be11b7 tags:
+``` python
+# get bootstrapped models
+models = sorted(search_files(RESULTS.joinpath(PREDICTAND), model + '(.*).nc$'),
+                key=lambda x: int(x.stem.split('_')[-1]))
+models
+```
+%% Cell type:markdown id:e790ed9f-451c-4368-849d-06d9c50f797c tags:
+### Load observations
+%% Cell type:code id:0862e0c8-06df-45d6-bc1b-002ffb6e9915 tags:
+``` python
+# load observations
+y_true = xr.open_dataset(OBS_PATH.joinpath(PREDICTAND, 'OBS_{}_1980_2018.nc'.format(PREDICTAND)),
+                         chunks={'time': 365})
+y_true = y_true.sel(time=VALID_PERIOD)  # subset to time period covered by predictions
+y_true = y_true.rename({NAMES[PREDICTAND]: PREDICTAND}) if PREDICTAND == 'pr' else y_true
+```
+%% Cell type:code id:aba38642-85d1-404a-81f3-65d23985fb7a tags:
+``` python
+# mask of missing values
+missing = np.isnan(y_true[PREDICTAND])
+```
+%% Cell type:markdown id:d4512ed2-d503-4bc1-ae76-84560c101a14 tags:
+### Load reference data
+%% Cell type:code id:f90f6abf-5fd6-49c0-a1ad-f62242b3d3a0 tags:
+``` python
+# ERA-5 reference dataset
+if PREDICTAND == 'pr':
+    y_refe = xr.open_dataset(search_files(ERA5_PATH.joinpath('ERA5', 'total_precipitation'), '.nc$').pop(),
+                             chunks={'time': 365})
+    y_refe = y_refe.rename({'tp': 'pr'})
+else:
+    y_refe = xr.open_dataset(search_files(ERA5_PATH.joinpath('ERA5', '2m_{}_temperature'.format(PREDICTAND.lstrip('tas'))), '.nc$').pop(),
+                             chunks={'time': 365})
+    y_refe = y_refe - 273.15  # convert to °C
+    y_refe = y_refe.rename({'t2m': PREDICTAND})
+```
+%% Cell type:code id:ea6d5f56-4f39-4e9a-976d-00ff28fce95c tags:
+``` python
+# subset to time period covered by predictions
+y_refe = y_refe.sel(time=VALID_PERIOD).drop_vars('lambert_azimuthal_equal_area')
+y_refe = y_refe.transpose('time', 'y', 'x')  # change order of dimensions
+```
+%% Cell type:markdown id:d37702de-da5f-4306-acc1-e569471c1f12 tags:
+### Load QM-adjusted reference data
+%% Cell type:code id:fffbd267-d08b-44f4-869c-7056c4f19c28 tags:
+``` python
+y_refe_qm = xr.open_dataset(ERA5_PATH.joinpath('QM_ERA5_{}_day_19912010.nc'.format(PREDICTAND)), chunks={'time': 365})
+y_refe_qm = y_refe_qm.transpose('time', 'x', 'y')  # change order of dimensions
+```
+%% Cell type:code id:16fa580e-27a7-4758-9164-7f607df7179d tags:
+``` python
+# center hours at 00:00:00 rather than 12:00:00
+y_refe_qm['time'] = np.asarray([t.astype('datetime64[D]') for t in y_refe_qm.time.values])
+```
+%% Cell type:code id:6789791f-006b-49b3-aa04-34e4ed8e1571 tags:
+``` python
+# subset to time period covered by predictions
+y_refe_qm = y_refe_qm.sel(time=VALID_PERIOD).drop_vars('lambert_azimuthal_equal_area')
+```
+%% Cell type:code id:b51cfb3f-caa8-413e-a12d-47bbafcef1df tags:
+``` python
+# align datasets and mask missing values
+y_true, y_refe, y_refe_qm = xr.align(y_true[PREDICTAND], y_refe[PREDICTAND], y_refe_qm[PREDICTAND], join='override')
+y_refe = y_refe.where(~missing, other=np.nan)
+y_refe_qm = y_refe_qm.where(~missing, other=np.nan)
+```
+%% Cell type:markdown id:b4a6c286-6b88-487d-866c-3cb633686dac tags:
+### Load model predictions
+%% Cell type:code id:ccaf0118-da51-43b0-a2b6-56ba4b252999 tags:
+``` python
+y_pred = [xr.open_dataset(sim, chunks={'time': 365}) for sim in models]
+if PREDICTAND == 'pr':
+    y_pred = [y_p.rename({NAMES[PREDICTAND]: PREDICTAND}) for y_p in y_pred]
+```
+%% Cell type:code id:df3f018e-4723-4878-b72a-0586b68e6dfd tags:
+``` python
+# align datasets and mask missing values
+y_prob = []
+for i, y_p in enumerate(y_pred):
+    # check whether evaluating precipitation or temperatures
+    if len(y_p.data_vars) > 1:
+        _, _, y_p, y_p_prob = xr.align(y_true, y_refe, y_p[PREDICTAND], y_p.prob, join='override')
+        y_p_prob = y_p_prob.where(~missing, other=np.nan)  # mask missing values
+        y_prob.append(y_p_prob)
+    else:
+        _, _, y_p = xr.align(y_true, y_refe, y_p[PREDICTAND], join='override')
+    # mask missing values
+    y_p = y_p.where(~missing, other=np.nan)
+    y_pred[i] = y_p
+```
+%% Cell type:markdown id:7effbf83-7977-4a47-aa6d-d57c4c4c22c6 tags:
+## Bias, MAE, and RMSE
+%% Cell type:markdown id:b8c23b7b-ccdf-412a-a30d-ac686af03d9f tags:
+Calculate yearly average bias, MAE, and RMSE over entire reference period for model predictions, ERA-5, and QM-adjusted ERA-5.
+%% Cell type:code id:7939a4d2-4eff-4507-86f8-dba7c0b635df tags:
+``` python
+# yearly average values over validation period
+y_pred_yearly_avg = [y_p.groupby('time.year').mean(dim='time') for y_p in y_pred]
+y_refe_yearly_avg = y_refe.groupby('time.year').mean(dim='time')
+y_refe_qm_yearly_avg = y_refe_qm.groupby('time.year').mean(dim='time')
+y_true_yearly_avg = y_true.groupby('time.year').mean(dim='time')
+```
+%% Cell type:code id:cce7ffbf-7e16-45f1-a2b6-5bc688595ee7 tags:
+``` python
+# yearly average bias, mae, and rmse for model predictions
+bias_pred = [y_p - y_true_yearly_avg for y_p in y_pred_yearly_avg]
+mae_pred = [np.abs(y_p - y_true_yearly_avg) for y_p in y_pred_yearly_avg]
+rmse_pred = [np.sqrt((y_p - y_true_yearly_avg) ** 2) for y_p in y_pred_yearly_avg]
+```
+%% Cell type:code id:64e29db7-998d-4952-84b0-1c79016ab9a9 tags:
+``` python
+# yearly average bias, mae, and rmse for ERA-5
+bias_refe = y_refe_yearly_avg - y_true_yearly_avg
+mae_refe = np.abs(y_refe_yearly_avg - y_true_yearly_avg)
+rmse_refe = np.sqrt((y_refe_yearly_avg - y_true_yearly_avg) ** 2)
+```
+%% Cell type:code id:bb7dc5c2-c518-4251-8d27-5b52e20e0397 tags:
+``` python
+# yearly average bias, mae, and rmse for QM-Adjusted ERA-5
+bias_refe_qm = y_refe_qm_yearly_avg - y_true_yearly_avg
+mae_refe_qm = np.abs(y_refe_qm_yearly_avg - y_true_yearly_avg)
+rmse_refe_qm = np.sqrt((y_refe_qm_yearly_avg - y_true_yearly_avg) ** 2)
+```