Refactor.

458843ac · Frisinghelli Daniel · 1d0f412e · 458843ac
Commit 458843ac authored 3 years ago by Frisinghelli Daniel
--- a/Notebooks/pr_distribution.ipynb
+++ b/Notebooks/pr_distribution.ipynb
@@ -24,7 +24,7 @@
    "import xarray as xr\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
-    "import seequantilesn as sns\n",
+    "import seaborn as sns\n",
    "import pandas as pd\n",
    "import scipy.stats as stats\n",
    "from mpl_toolkits.axes_grid1.inset_locator import inset_axes\n",

 %% Cell type:markdown id:63805b4a-b30e-4c10-a948-bc59651ca7a6 tags:

 ### Imports

 %% Cell type:code id:28982ce9-bf0c-4eb1-8b9e-bec118359966 tags:

 ``` python
 # builtins
 import datetime
 import warnings
 import calendar

 # externals
 import xarray as xr
 import numpy as np
 import matplotlib.pyplot as plt
-import seequantilesn as sns
+import seaborn as sns
 import pandas as pd
 import scipy.stats as stats
 from mpl_toolkits.axes_grid1.inset_locator import inset_axes
 import scipy.stats as stats
 from IPython.display import Image
 from sklearn.metrics import r2_score, roc_curve, auc, classification_report
 from sklearn.model_selection import train_test_split

 # locals
 from climax.main.io import ERA5_PATH, OBS_PATH, TARGET_PATH, DEM_PATH
 from climax.main.config import CALIB_PERIOD, VALID_PERIOD
 from pysegcnn.core.utils import search_files
 from pysegcnn.core.graphics import plot_classification_report
 ```

 %% Cell type:code id:de6ae734-3a6a-477e-a5a0-8b9ec5911369 tags:

 ``` python
 # entire reference period: the calibration period followed by the validation
 # period, joined along the first axis
 REFERENCE_PERIOD = np.concatenate([CALIB_PERIOD, VALID_PERIOD], axis=0)
 ```

 %% Cell type:code id:534d9565-4b58-4959-bef3-edde969e2364 tags:

 ``` python
 # probabilities at which empirical and theoretical quantiles are compared:
 # starting at 0.01 and increasing in steps of 0.005, stopping before 1
 quantiles = np.arange(0.01, 1, 0.005)
 ```

 %% Cell type:markdown id:12382efb-1a3a-4ede-a904-7f762bfe56c7 tags:

 ### Load observations

 %% Cell type:code id:2373d894-e252-4f16-826b-88731e195259 tags:

 ``` python
 # observations NetCDF: only the observed precipitation is opened here
 # NOTE(review): search_files presumably returns a list of the paths matching
 # the pattern, of which .pop() takes the last one -- confirm against pysegcnn
 y_true = xr.open_dataset(search_files(OBS_PATH.joinpath('pr'), 'OBS_pr(.*).nc$').pop())
 ```

 %% Cell type:markdown id:5d30b543-aa3b-45f3-b8e8-90d72f4f6896 tags:

 ### Select time period

 %% Cell type:code id:f902683a-a560-48f9-b2d1-ef9c341ca69a tags:

 ``` python
 # time period to analyse: here the entire reference period
 PERIOD = REFERENCE_PERIOD
 ```

 %% Cell type:code id:0c2c1912-a947-4afe-84a7-895726be5cfd tags:

 ``` python
 # subset the observations to the time steps of the selected period
 y = y_true.sel(time=PERIOD)
 ```

 %% Cell type:markdown id:f6d01e1e-9dc2-4c31-a31a-a6c91abc7fb4 tags:

 ### Fit distributions: annually

 %% Cell type:code id:0ffce851-50fc-4795-84b9-972e4f1a5169 tags:

 ``` python
 # helper function retrieving only valid observations
 def valid(ds):
    """Return the valid precipitation observations of ``ds``.

    An observation of ``ds.precipitation`` is valid if it is not missing and
    strictly greater than zero. The valid observations are returned as a
    one-dimensional array.
    """
    pr = ds.precipitation.values
    pr = pr[np.logical_not(np.isnan(pr))]  # drop missing values
    return pr[pr > 0]  # keep only pr > 0
 ```

 %% Cell type:code id:6f68803b-4dbc-4d43-99c0-a32e482b647a tags:

 ``` python
 # valid observations of the selected period as a single flat array:
 # missing values and values <= 0 are removed
 y_valid = valid(y)
 ```

 %% Cell type:code id:5de4933a-ef9d-4afe-8af6-ff68d91860ce tags:

 ``` python
 # fit gamma distribution to data: floc=0 keeps the location fixed at zero, so
 # that only the shape (alpha) and the scale (beta) are estimated
 alpha, loc, beta = stats.gamma.fit(y_valid, floc=0)
 # frozen gamma distribution with the fitted parameters
 gamma = stats.gamma(alpha, loc=loc, scale=beta)
 ```

 %% Cell type:code id:dcd9bfeb-67dc-4b63-98fd-c86c3a07c2b0 tags:

 ``` python
 # fit lognormal distribution to data: floc=0 keeps the location fixed at zero,
 # so that only the shape (alpha) and the scale (beta) are estimated
 alpha, loc, beta = stats.lognorm.fit(y_valid, floc=0)
 # frozen lognormal distribution with the fitted parameters
 lognorm = stats.lognorm(alpha, loc=loc, scale=beta)
 ```

 %% Cell type:code id:75b74f7c-c9d7-4d52-b140-e0ad9de17b69 tags:

 ``` python
 # fit generalized pareto distribution to data: floc=0 keeps the location fixed
 # at zero, so that only the shape (alpha) and the scale (beta) are estimated
 alpha, loc, beta = stats.genpareto.fit(y_valid, floc=0)
 # frozen generalized pareto distribution with the fitted parameters
 genpareto = stats.genpareto(alpha, loc=loc, scale=beta)
 ```

 %% Cell type:code id:d489a3e7-7ece-440e-bbd9-1cfd739d822c tags:

 ``` python
 # fit exponential distribution to data: floc=0 keeps the location fixed at
 # zero, so that only the scale (beta) is estimated
 loc, beta = stats.expon.fit(y_valid, floc=0)
 # frozen exponential distribution with the fitted parameters
 expon = stats.expon(loc=loc, scale=beta)
 ```

 %% Cell type:code id:01d8c7d9-541e-481d-b0de-e8590c571ca5 tags:

 ``` python
 # fit weibull distribution to data: floc=0 keeps the location fixed at zero,
 # so that only the shape (alpha) and the scale (beta) are estimated
 alpha, loc, beta = stats.weibull_min.fit(y_valid, floc=0)
 # frozen weibull distribution with the fitted parameters
 weibull = stats.weibull_min(alpha, loc=loc, scale=beta)
 ```

 %% Cell type:code id:14ade547-443a-457a-bccd-d88d049b9d81 tags:

 ``` python
 # empirical quantiles of the valid observations and theoretical quantiles of
 # each distribution fitted to them
 eq = np.quantile(y_valid, quantiles)
 tq_gamma = gamma.ppf(quantiles)
 tq_genpareto = genpareto.ppf(quantiles)
 tq_expon = expon.ppf(quantiles)
 tq_lognorm = lognorm.ppf(quantiles)
 tq_weibull = weibull.ppf(quantiles)

 # Q-Q plot: theoretical against empirical quantiles, one marker per distribution
 RANGE = 40  # upper limit of both axes
 ticks = np.arange(0, RANGE + 5, 5)
 fig, ax = plt.subplots(1, 1, figsize=(6, 6))
 ax.scatter(eq, tq_gamma, marker='*', color='k', label='Gamma')
 ax.scatter(eq, tq_genpareto, marker='x', color='k', label='GenPareto')
 ax.scatter(eq, tq_expon, marker='o', color='k', label='Expon')
 ax.scatter(eq, tq_lognorm, marker='+', color='k', label='LogNorm')
 ax.scatter(eq, tq_weibull, marker='^', color='k', label='Weibull')

 # 1:1 reference line, drawn from its end points so that it reaches the upper
 # axis limit
 ax.plot([0, RANGE], [0, RANGE], '--k')
 ax.set_xlim(0, RANGE)
 ax.set_ylim(0, RANGE)
 ax.set_xticks(ticks)
 ax.set_yticks(ticks)
 ax.set_xticklabels([str(t) for t in ticks], fontsize=12)
 ax.set_yticklabels([str(t) for t in ticks], fontsize=12)
 ax.set_ylabel('Theoretical quantiles', fontsize=14)
 ax.set_xlabel('Empirical quantiles', fontsize=14)
 ax.legend(frameon=False, fontsize=14)
 ax.set_title('Reference period: {} - {}'.format(str(PERIOD[0]), str(PERIOD[-1])), fontsize=14)

 # save figure
 fig.savefig('./Figures/pr_distribution.png', bbox_inches='tight', dpi=300)
 ```

 %% Cell type:markdown id:5fd0e9d8-759d-45ee-bb1f-9c749ac23e8e tags:

 ### Fit distributions: monthly

 %% Cell type:code id:156e5415-4065-4887-b759-0e665d671b38 tags:

 ``` python
 # get the indices of the observations for each month: the month number is
 # mapped to the positions of its time steps along the time axis
 month_idx = y.groupby('time.month').groups
 ```

 %% Cell type:code id:092e865d-f033-4f60-8098-86ae5068e045 tags:

 ``` python
 # fit each candidate distribution to the valid observations of each month;
 # the frozen distributions are stored by month number
 month_gamma = {}
 month_genpareto = {}
 month_expon = {}
 month_lognorm = {}
 month_weibull = {}
 for month, idx in month_idx.items():
    print('Month: {}'.format(calendar.month_name[month]))

    # valid observations of the current month
    data = valid(y.isel(time=idx))

    # The fitted distributions are stored in the dictionaries right away
    # instead of being bound to the names gamma, genpareto, expon, lognorm and
    # weibull: rebinding those names would replace the fits of the entire
    # period by the fit of the last month.
    # floc=0 keeps the location of every distribution fixed at zero.

    # gamma
    alpha, loc, beta = stats.gamma.fit(data, floc=0)
    month_gamma[month] = stats.gamma(alpha, loc=loc, scale=beta)

    # genpareto
    alpha, loc, beta = stats.genpareto.fit(data, floc=0)
    month_genpareto[month] = stats.genpareto(alpha, loc=loc, scale=beta)

    # exponential
    loc, beta = stats.expon.fit(data, floc=0)
    month_expon[month] = stats.expon(loc=loc, scale=beta)

    # lognormal
    alpha, loc, beta = stats.lognorm.fit(data, floc=0)
    month_lognorm[month] = stats.lognorm(alpha, loc=loc, scale=beta)

    # weibull
    alpha, loc, beta = stats.weibull_min.fit(data, floc=0)
    month_weibull[month] = stats.weibull_min(alpha, loc=loc, scale=beta)
 ```

 %% Cell type:code id:396e5ee4-1632-4591-b93b-91fa6ac1d373 tags:

 ``` python
 # plot empirical vs. theoretical quantiles for each month: one panel per month
 fig, axes = plt.subplots(4, 3, figsize=(12, 12), sharex=True, sharey=True)
 axes = axes.flatten()

 RANGE = 40  # upper limit of both axes
 ticks = np.arange(0, RANGE + 5, 5)
 for month, idx in month_idx.items():
    # axis to plot: panels are indexed from 0, months are numbered from 1
    ax = axes[month - 1]

    # compute empirical quantiles of the valid observations of the month
    data = valid(y.isel(time=idx))
    eq = np.quantile(data, quantiles)

    # compute theoretical quantiles of the distributions fitted to the month
    tq_gamma = month_gamma[month].ppf(quantiles)
    tq_gpare = month_genpareto[month].ppf(quantiles)
    tq_expon = month_expon[month].ppf(quantiles)
    tq_lognr = month_lognorm[month].ppf(quantiles)
    tq_weibu = month_weibull[month].ppf(quantiles)

    # plot empirical vs. theoretical quantiles
    ax.scatter(eq, tq_gamma, marker='*', color='k', label='Gamma')
    ax.scatter(eq, tq_gpare, marker='x', color='k', label='GenPareto')
    ax.scatter(eq, tq_expon, marker='o', color='k', label='Expon')
    ax.scatter(eq, tq_lognr, marker='+', color='k', label='LogNorm')
    ax.scatter(eq, tq_weibu, marker='^', color='k', label='Weibull')

    # 1:1 reference line, drawn from its end points so that it reaches the
    # upper axis limit
    ax.plot([0, RANGE], [0, RANGE], '-k')
    ax.set_title(calendar.month_name[month], fontsize=14)
    ax.set_xlim(0, RANGE)
    ax.set_ylim(0, RANGE)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels([str(t) for t in ticks], fontsize=12)
    ax.set_yticklabels([str(t) for t in ticks], fontsize=12)

 # add legend
 axes[0].legend(frameon=False, fontsize=12, loc='upper left')

 # add figure title
 fig.suptitle('Reference period: {} - {}'.format(str(PERIOD[0]), str(PERIOD[-1])), fontsize=14)

 # adjust subplots
 fig.subplots_adjust(wspace=0.1)
 fig.savefig('./Figures/pr_distribution_m.png', bbox_inches='tight', dpi=300)
 ```

 %% Cell type:markdown id:c0fea8ac-bac0-4096-bc81-90d799f8ab94 tags:

 ### Empirical quantiles per grid point

 %% Cell type:code id:a02c42e0-591c-4630-89b8-5dd8ef71a4a0 tags:

 ``` python
 # compute empirical quantiles over time for each grid point
 # Only observations with pr > 0 enter the quantiles: the distributions these
 # quantiles are compared with are fitted to pr > 0 only, so including the
 # remaining time steps would compare quantiles of two different samples.
 # Masked values are NaN and are skipped by quantile() for float data.
 equantiles = y.precipitation.where(y.precipitation > 0).quantile(quantiles, dim='time')
 equantiles = equantiles.rename({'quantile': 'q'})
 ```

 %% Cell type:code id:966d2724-2628-4842-abc9-695711945347 tags:

 ``` python
 # theoretical quantiles of each fitted distribution per grid point, with
 # dimensions (q, y, x); initialised with NaN so that grid points without
 # valid observations remain missing
 qshape = (len(equantiles.q), len(equantiles.y), len(equantiles.x))
 gammaq = np.full(qshape, np.nan)
 genpaq = np.full(qshape, np.nan)
 exponq = np.full(qshape, np.nan)
 lognrq = np.full(qshape, np.nan)
 weibuq = np.full(qshape, np.nan)

 # iterate over the grid points
 for i, _ in enumerate(y.x):
    print('Rows: {}/{}'.format(i + 1, len(y.x)))
    for j, _ in enumerate(y.y):

        # valid observations of the current grid point
        point = valid(y.isel(x=i, y=j))

        # check if the grid point is valid
        if point.size < 1:
            # move on to next grid point
            continue

        # The theoretical quantiles are stored right away instead of binding
        # the fitted distributions to names such as gamma or expon: rebinding
        # those names would replace the fits of the entire period.
        # floc=0 keeps the location of every distribution fixed at zero.

        # fit Gamma distribution to grid point
        alpha, loc, beta = stats.gamma.fit(point, floc=0)
        gammaq[:, j, i] = stats.gamma(alpha, loc=loc, scale=beta).ppf(quantiles)

        # fit GenPareto distribution to grid point
        alpha, loc, beta = stats.genpareto.fit(point, floc=0)
        genpaq[:, j, i] = stats.genpareto(alpha, loc=loc, scale=beta).ppf(quantiles)

        # fit Exponential distribution to grid point
        loc, beta = stats.expon.fit(point, floc=0)
        exponq[:, j, i] = stats.expon(loc=loc, scale=beta).ppf(quantiles)

        # fit LogNormal distribution to grid point
        alpha, loc, beta = stats.lognorm.fit(point, floc=0)
        lognrq[:, j, i] = stats.lognorm(alpha, loc=loc, scale=beta).ppf(quantiles)

        # fit Weibull distribution to grid point
        alpha, loc, beta = stats.weibull_min.fit(point, floc=0)
        weibuq[:, j, i] = stats.weibull_min(alpha, loc=loc, scale=beta).ppf(quantiles)

 # store theoretical quantiles in xarray.DataArray, labelled by quantile and
 # by the grid coordinates of the observations
 qdims = ['q', 'y', 'x']
 qcoords = dict(q=quantiles, y=y.y, x=y.x)
 gammaq = xr.DataArray(data=gammaq, dims=qdims, coords=qcoords, name='precipitation')
 genpaq = xr.DataArray(data=genpaq, dims=qdims, coords=qcoords, name='precipitation')
 exponq = xr.DataArray(data=exponq, dims=qdims, coords=qcoords, name='precipitation')
 lognrq = xr.DataArray(data=lognrq, dims=qdims, coords=qcoords, name='precipitation')
 weibuq = xr.DataArray(data=weibuq, dims=qdims, coords=qcoords, name='precipitation')
 ```

 %% Cell type:code id:601de7cb-35f4-40e1-9b51-2dab23102659 tags:

 ``` python
 # compute bias in theoretical quantiles: quantiles of the fitted distribution
 # minus empirical quantiles, one array per candidate distribution
 bias_gamma = gammaq - equantiles  # predicted - observed
 bias_genpa = genpaq - equantiles
 bias_expon = exponq - equantiles
 bias_lognr = lognrq - equantiles
 bias_weibu = weibuq - equantiles
 ```

 %% Cell type:code id:23abd0d1-7c27-4f02-b7ae-9165c2dde0b6 tags:

 ``` python
 # quantile bias of each candidate distribution, keyed by a short name that
 # is used in the file name of the corresponding figure
 dists = {
    'gamma': bias_gamma,
    'genpareto': bias_genpa,
    'expon': bias_expon,
    'lognr': bias_lognr,
    'weibu': bias_weibu,
 }
 ```

 %% Cell type:code id:b8089c11-a48d-4028-9d4b-e03101ff5e55 tags:

 ``` python
 # plot spatial bias in different quantiles: one figure per distribution,
 # one panel per selected quantile
 plot_quantiles = quantiles[18::20]

 for dist, biasq in dists.items():

    # Create a new figure for every distribution: drawing all distributions
    # into one shared figure would stack the images and add another colorbar
    # axes on every iteration, so that each saved figure would still contain
    # the artists of the distributions plotted before.
    fig, axes = plt.subplots(3, 3, sharex=True, sharey=True, figsize=(12, 12))
    axes = axes.flatten()

    # iterate over quantiles to plot
    for ax, q in zip(axes, plot_quantiles):
        # colour scale limited to [0, 5]: values outside are shown in the
        # colours of the limits
        im = ax.imshow(biasq.sel(q=q).values, origin='lower', vmin=0, vmax=5, cmap='viridis_r')
        ax.set_title('P{:.0f}'.format(q * 100), fontsize=14)

    # adjust subplots
    fig.subplots_adjust(wspace=0.1, hspace=0.1)

    # add colorbar for bias: a narrow axes to the right of the third panel,
    # spanning the height of that panel
    panel = axes[2].get_position()
    cbar_ax_bias = fig.add_axes([panel.x1 + 0.01, panel.y0, 0.01, panel.y1 - panel.y0])
    cbar_bias = fig.colorbar(im, cax=cbar_ax_bias)
    cbar_bias.set_label(label='Bias (mm)', fontsize=14)
    cbar_bias.ax.tick_params(labelsize=14, pad=10)

    # save figure
    fig.savefig('./Figures/pr_distribution_{}_grid.png'.format(dist), bbox_inches='tight', dpi=300)
 ```