Example of generalized linear regression in sklearn.

46be79b3 · Frisinghelli Daniel · 3645b5e5 · 46be79b3
Commit 46be79b3 authored 3 years ago by Frisinghelli Daniel
--- a/Notebooks/glm.ipynb
+++ b/Notebooks/glm.ipynb
@@ -184,16 +184,6 @@
    "VALID_PERIOD = slice('1991-01-01', '2010-01-01')"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cefcb0d1-e00e-4025-a872-b6da7e991876",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "VALID_PERIOD.stop"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,

 %% Cell type:markdown id:0ade2427-0020-414a-957b-4ed71281b61f tags:

 ## Generalized linear regression

 %% Cell type:code id:cfd6644a-9af1-4b71-8dac-1e423786e0a8 tags:

 ``` python
 # builtins
 import pathlib

 # externals
 import numpy as np
 import pandas as pd
 import xarray as xr
 from sklearn.linear_model import TweedieRegressor

 # locals
 from climax.core.dataset import ERA5Dataset
 from climax.core.constants import ERA5_VARIABLES
 from climax.core.utils import search_files
 ```

 %% Cell type:code id:646406dd-8c0d-4cdc-b7c3-b0b55b2ec4bd tags:

 ``` python
 # path to ERA5 reanalysis data
 ERA5_PATH = pathlib.Path('/mnt/CEPH_PROJECTS/FACT_CLIMAX/REANALYSIS/ERA5')
 ```

 %% Cell type:code id:18d222d5-fe39-4858-af39-04a14171cad6 tags:

 ``` python
 # list of valid predictor variable names
 ERA5_VARIABLES
 ```

 %% Cell type:code id:1cdca239-c1c9-41c6-bce5-7929ff3b19de tags:

 ``` python
 # define the predictor variables you want to use
 ERA5_PREDICTORS = ['geopotential', 'temperature', 'mean_sea_level_pressure']  # use geopotential, temperature and pressure

 # you can change this list as you wish, e.g.:
 # ERA5_PREDICTORS = ['geopotential', 'temperature']  # use only geopotential and temperature
 # ERA5_PREDICTORS = ERA5_VARIABLES  # use all ERA5 variables as predictors

 # this checks if the variable names are correct
 assert all([p in ERA5_VARIABLES for p in ERA5_PREDICTORS])
 ```

 %% Cell type:markdown id:2fd49c6d-a7dc-49cb-87ba-c1f539ff2127 tags:

 ### Use the climax package to load ERA5 predictors

 %% Cell type:code id:72e4c65c-add4-47a8-aae0-e4ae3301301a tags:

 ``` python
 # define which pressure levels you want to use: currently only 500 and 850 are available
 PLEVELS = [500, 850]
 ```

 %% Cell type:code id:35979cb6-0a18-41bd-9eea-081b187be5ff tags:

 ``` python
 # create the xarray.Dataset of the specified predictor variables
 predictors = ERA5Dataset(ERA5_PATH, ERA5_PREDICTORS, plevels=PLEVELS)
 predictors = predictors.merge()
 ```

 %% Cell type:code id:155c0954-cb8a-40d1-99e9-6c46281ae899 tags:

 ``` python
 # check out the xarray.Dataset: you will see all the variables you specified
 predictors
 ```

 %% Cell type:markdown id:727fc0f2-c8ef-480d-a273-a07d3dff0141 tags:

 ### Load target data: observations

 %% Cell type:code id:629d12b0-e4d1-4cc8-8c02-bd62bbb67cdc tags:

 ``` python
 # path to observation data
 OBS_PATH = pathlib.Path('/mnt/CEPH_PROJECTS/FACT_CLIMAX/OBSERVATION')
 ```

 %% Cell type:code id:c589cdea-09ed-415e-8613-a5ec311fdb2b tags:

 ``` python
 # define the predictand, i.e. tasmax, tasmin or pr
 PREDICTAND = 'tasmax'
 ```

 %% Cell type:code id:c8301c21-c32b-4123-a405-bde4b4d80f48 tags:

 ``` python
 # load the observation data
 predictand = xr.open_dataset(search_files(OBS_PATH.joinpath(PREDICTAND), '.nc$').pop())
 ```

 %% Cell type:code id:4760558e-b1ed-4b16-9609-cf301aa39f55 tags:

 ``` python
 # check out the xarray.Dataset: you will see a single variable
 predictand
 ```

 %% Cell type:markdown id:756e22e2-0b53-4d8f-877c-b9aee2659e17 tags:

 ### Prepare training data

 %% Cell type:code id:8aec082a-1d6a-4932-8600-efc48f886cb3 tags:

 ``` python
 # define the training period and the validation period
 TRAIN_PERIOD = slice('1981-01-01', '1991-01-01')
 VALID_PERIOD = slice('1991-01-01', '2010-01-01')
 ```

-%% Cell type:code id:cefcb0d1-e00e-4025-a872-b6da7e991876 tags:
-
-``` python
-VALID_PERIOD.stop
-```
-
 %% Cell type:code id:0812673a-5a9d-48bf-ba87-3f96cb26ff49 tags:

 ``` python
 # select training and validation data: predictors
 predictors_train = predictors.sel(time=TRAIN_PERIOD)
 predictors_valid = predictors.sel(time=VALID_PERIOD)
 ```

 %% Cell type:code id:c326326c-5621-4ed4-af2b-a942bc678e71 tags:

 ``` python
 # select training and validation data: predictand
 predictand_train = predictand.sel(time=TRAIN_PERIOD)
 predictand_valid = predictand.sel(time=VALID_PERIOD)
 ```

 %% Cell type:markdown id:9a410594-714e-4124-b155-d2985ed5b6cb tags:

 ### Train the generalized linear regression model

 %% Cell type:code id:57da5a56-bcd7-44eb-9304-62697d779f7f tags:

 ``` python
 # instanciate the GLM
 model = TweedieRegressor(power=0 if PREDICTAND in ['tasmax', 'tasmin'] else 2)
 model
 # power = 0: Normal distribution (tasmax, tasmin)
 # power = 1: Poisson distribution
 # power = (1, 2): Compound Poisson Gamma distribution
 # power = 2: Gamma distribution (pr)
 # power = 3: Inverse Gaussian
 ```

 %% Cell type:code id:802169db-8a22-42ad-8d4f-e29f676a49e9 tags:

 ``` python
 # function to normalize predictors to [0, 1]
 def normalize(predictors):
    predictors -= predictors.min(axis=1, keepdims=True)
    predictors /= predictors.max(axis=1, keepdims=True)
    return predictors
 ```

 %% Cell type:code id:5382f429-d8e7-4636-8f47-922ae20a91cc tags:

 ``` python
 # iterate over the grid points
 prediction = np.ones(shape=(len(predictors_valid.time), len(predictors_valid.y), len(predictors_valid.x))) * np.nan
 for i, _ in enumerate(predictors_train.x):
    for j, _ in enumerate(predictors_train.y):

        # current grid point: xarray.Dataset, dimensions=(time)
        point_predictors = predictors_train.isel(x=i, y=j)
        point_predictand = predictand_train.isel(x=i, y=j)

        # convert xarray.Dataset to numpy.array: shape=(time, predictors)
        point_predictors = point_predictors.to_array().values.swapaxes(0, 1)
        point_predictand = point_predictand.to_array().values.squeeze()

        # check if the grid point is valid
        if np.isnan(point_predictors).any() or np.isnan(point_predictand).any():
            # move on to next grid point
            continue

        # normalize each predictor variable to [0, 1]
        point_predictors = normalize(point_predictors)

        # instanciate the model for the current grid point
        model = TweedieRegressor(power=0 if PREDICTAND in ['tasmax', 'tasmin'] else 2)

        # train model on training data
        model.fit(point_predictors, point_predictand)
        print('Processing grid point: ({:d}, {:d}), score: {:.2f}'.format(j, i, model.score(point_predictors, point_predictand)))

        # prepare predictors of validation period
        point_validation = predictors_valid.isel(x=i, y=j).to_array().values.swapaxes(0, 1)
        point_validation = normalize(point_validation)

        # predict validation period
        pred = model.predict(point_validation)

        # store predictions for current grid point
        prediction[:, j, i] = pred

 # store predictions in xarray.Dataset
 predictions = xr.DataArray(data=prediction, dims=['time', 'y', 'x'],
                           coords=dict(time=pd.date_range(VALID_PERIOD.start, VALID_PERIOD.stop, freq='D'),
                                       lat=predictand_valid.y, lon=predictand_valid.x))
 predictions = predictions.to_dataset(name=PREDICTAND)
 ```

 %% Cell type:markdown id:595ee11d-570d-42ed-8c40-226d724aa9fb tags:

 ### Save predictions as NetCDF

 %% Cell type:code id:fb708c40-c0e3-4c62-bc4e-9d0b6bfc52d4 tags:

 ``` python
 # specify the output path, filename: PREDICTAND.nc
 OUTPUT_PATH = pathlib.Path('~/{}'.format(PREDICTAND + '.nc'))

 # save to NetCDF
 predictions.to_netcdf(OUTPUT_PATH, engine='h5netcdf')
 ```

 %% Cell type:code id:fc9aba1a-fde1-48c6-85e7-02cb5ebdcfa7 tags:

 ``` python
 # Enjoy and have fun!
 ```