sf_runoff_lt.py

import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde

from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV,TimeSeriesSplit
from sklearn.metrics import mean_squared_error


import matplotlib.pyplot as plt

import os

import pdb

def shift_series_(s, shift_range,t_unit):

    s_shifts = [s.shift(-t_unit * shift, freq='D').rename(f'{s.name}_{shift}') for shift in range(*shift_range)]
    return pd.concat(s_shifts, axis=1)


def create_it_matrix_lt(daily_input, t_length,t_unit, lead_t):

    # This function takes as input the daily temperature, precipitation and runoff and generates the input-target matrix

    # Read the daily input and extract runoff, evaporation, temperature and precipitation dataframe
    if isinstance(daily_input, str):
        daily_input = pd.read_csv(daily_input, index_col=0, parse_dates=True)
    runoff = daily_input[['Q']]
    temp = daily_input[[c for c in daily_input.columns if c[0] == 'T']]
    prec = daily_input[[c for c in daily_input.columns if c[0] == 'P']]
    evap = daily_input[[c for c in daily_input.columns if c[0] == 'E']]


    # Compute the t_unit days average runoff
    runoff_t_unit = runoff.rolling(t_unit, min_periods=t_unit).mean()
    #runoff_t_unit_shifted = runoff_t_unit.shift(t_unit * lead_t,freq='D')
    #pdb.set_trace()
    
    # Compute the t_unit days average temperature
    if not temp.empty:
        temp_t_unit = temp.rolling(t_unit, min_periods=t_unit).mean()
        temp_t_unit = pd.concat([shift_series_(temp_t_unit.loc[:, col], (-t_length + 1, 1-lead_t),t_unit) for col in temp_t_unit], axis=1)

    # Compute the t_unit days sum precipitation
    if not prec.empty:
        prec_t_unit = prec.rolling(t_unit, min_periods=t_unit).sum()
        prec_t_unit = pd.concat([shift_series_(prec_t_unit.loc[:, col], (-t_length + 1, 1-lead_t),t_unit) for col in prec_t_unit], axis=1)
    
    # Compute the t_unit days sum evapotranspiration
    if not evap.empty:
        evap_t_unit = evap.rolling(t_unit, min_periods=t_unit).sum()
        evap_t_unit = pd.concat([shift_series_(evap_t_unit.loc[:, col], (-t_length + 1, 1-lead_t),t_unit) for col in evap_t_unit], axis=1)

    result = pd.concat([runoff_t_unit, temp_t_unit, prec_t_unit, evap_t_unit], axis=1).dropna()
    #result = result.shift(-t_unit * lead_t,freq='D')
    # Create the input-target matrix
    return result


"""
def sf_input_matrix(input_matrix, seasonal_forecast, lead_time, member):

    X = input_matrix.copy()
    X = X.loc[X.index.is_month_end, :]

    for offset in range(0, -lead_time, -1):
        sf_dates = X.index + pd.offsets.MonthBegin(offset - 1)
        X_columns = [c for c in X.columns if int(c.split('_')[1]) == offset]
        sf_columns = [f'SF_{c.split("_")[0]}_lt{lead_time + offset}_m{member}' for c in X_columns]
        X.loc[:, X_columns] = seasonal_forecast.loc[sf_dates, sf_columns].values

    return X
"""


def svr_gridSearch(daily_input, t_length,t_unit, C_range=np.logspace(-3, 1, 5), epsilon_range=np.logspace(-5, 0, 5),
                   plot=False, n_splits=8):

    # svr_gridSearch run the grid search on C and epsilon svr parameters.
    
    # Read the input-target matrix 
    it_matrix = create_it_matrix(daily_input, t_length)
    X = it_matrix.drop(columns='Q')
    y = it_matrix['Q']
    
    # Set up the splits respecting of the time-series nature of the dataset

    tscv = TimeSeriesSplit(gap=t_unit,n_splits=n_splits, test_size=365)
    tscv.split(X)
    
    #pdb.set_trace()
     
    # Set up the grid search parameters
    svr_estimator = SVR(kernel='rbf', gamma='scale', cache_size=1000)
    svr_estimator = make_pipeline(StandardScaler(),
                                  TransformedTargetRegressor(regressor=svr_estimator, transformer=StandardScaler()))
    parameters = {'transformedtargetregressor__regressor__C': C_range,
                  'transformedtargetregressor__regressor__epsilon': epsilon_range}
    svr_model = GridSearchCV(svr_estimator, cv=tscv, param_grid=parameters, n_jobs=-1, verbose=1, refit=True)


    # execute the grid search
    svr_model.fit(X, y)
    
    
    # Select the best C and epsilon
    best_C = svr_model.best_params_['transformedtargetregressor__regressor__C']
    best_epsilon = svr_model.best_params_['transformedtargetregressor__regressor__epsilon']
    print(f'For {t_length} months of data input: Best estimator: C={best_C}, epsilon={best_epsilon}')

    # Check if the best C (or epsion) is in the border of the grid
    if best_C == max(C_range) or best_C == min(C_range):
        print(f'Warning: best C found on the grid limit: C = {best_C}')
    if best_epsilon == max(epsilon_range) or best_epsilon == min(epsilon_range):
        print(f'Warning: best epsilon found on the grid limit: epsilon = {best_epsilon}')
    print()
    
    if plot:

        # Gridsearch plot
        plt.figure()
        scores = svr_model.cv_results_['mean_test_score'].reshape(
            len(svr_model.param_grid['transformedtargetregressor__regressor__C']),
            len(svr_model.param_grid['transformedtargetregressor__regressor__epsilon']))
        plt.imshow(scores)
        plt.colorbar()
        plt.xticks(np.arange(len(svr_model.param_grid['transformedtargetregressor__regressor__epsilon'])),
                   ['%.3f' % a for a in svr_model.param_grid['transformedtargetregressor__regressor__epsilon']], rotation=45)
        plt.yticks(np.arange(len(svr_model.param_grid['transformedtargetregressor__regressor__C'])),
                   ['%.3f' % a for a in svr_model.param_grid['transformedtargetregressor__regressor__C']])
        plt.xlabel('epsilon')
        plt.ylabel('C')
        plt.title(f'Heatmap for {t_length} months of input data')
        plt.tight_layout()

        # Scatterplot true vs. estimated training
        plt.figure()
        true_values = y
        est_values = svr_model.predict(X)
        xy = np.vstack([true_values, est_values])
        z = gaussian_kde(xy)(xy)
        idx = z.argsort()
        x, y, z = true_values[idx], est_values[idx], z[idx]
        plt.scatter(x, y, c=z)
        plt.plot([min(true_values), max(true_values)], [min(true_values), max(true_values)], color='black')
        plt.xlabel('True values')
        plt.ylabel('Estimated values')
        plt.title(f'Heatmap for {t_length} months of input data')
        plt.gca().set_aspect('equal', adjustable='box')
        plt.draw()
    return {'C': best_C, 'epsilon': best_epsilon, 'score': svr_model.best_score_,
            'best_estimator': svr_model.best_estimator_}


def feature_sel(daily_input):

    # Find the best t_length and temperature/precipitation stations. Return score for each combination

    gridSearch_results = []
    for t_len in range(1, 12):
        gridSearch_results.append(svr_gridSearch(daily_input, 
                                                 t_len,
                                                 C_range=np.logspace(-2, 2, 6), 
                                                 epsilon_range=np.logspace(-3, 0, 6),
                                                 plot=True))

    return pd.DataFrame(data=gridSearch_results, index=range(1, 12))

    # if isinstance(daily_input, str):
    #     daily_input = pd.read_csv('/home/mcallegari@eurac.edu/SECLI-FIRM/Mattia/SF_runoff/Zoccolo/inputs/daily_input.csv', index_col=0, parse_dates=True)
    #
    # temp_col = [c for c in daily_input.columns if c[0] == 'T']
    # prec_col = [c for c in daily_input.columns if c[0] == 'P']
    #
    # for feat in itertools.product(temp_col, prec_col):
    #     for t_len in range(1, 12):
    #         svr_gridSearch(daily_input.loc[:, ('Q',) + feat], t_len, plot=False)


def training(daily_input, t_length=None, svr_C=None, svr_epsilon=None):

    # This function takes as input the daily temperature, precipitation and runoff and train a model to predict the
    # monthly mean runoff using a time series of temperature and precipitation of length t_length as input features

    # Execute the feature selection if t_length is None
    if t_length is None:

        print('Finding the best input time series length...')
        fs = feature_sel(daily_input)
        fs_best = fs.loc[fs['score'].idxmax()]
        svr_C, svr_epsilon = fs_best['C'], fs_best['epsilon']
        t_length = fs_best.name
        print(f'Best time series length: {t_length}. '
              f'Best C={svr_C}, epsilon={svr_epsilon} (score={fs_best["score"]})')
        svr_model = fs_best['best_estimator']

    else:

        it_matrix = create_it_matrix(daily_input, t_length)
        X = it_matrix.drop(columns='Q')
        y = it_matrix['Q']

        # Fit the SVR with standardized input and target
        svr_estimator = SVR(kernel='rbf', C=svr_C, epsilon=svr_epsilon, gamma='scale', cache_size=1000)
        svr_model = make_pipeline(StandardScaler(),
                                  TransformedTargetRegressor(regressor=svr_estimator, transformer=StandardScaler()))
        svr_model.fit(X, y)

    return svr_model


def monthly_climatology(daily_input,t_unit):

    if isinstance(daily_input, str):
        daily_input = pd.read_csv(daily_input, index_col=0, parse_dates=True)

    monthly_mean_columns = [c for c in daily_input.columns if c[0] in ['Q', 'T']]
    monthly_mean = daily_input.loc[:, monthly_mean_columns].groupby(by=daily_input.index.month).mean()
    #remember to add ['E', 'P']
    monthly_sum_columns = [c for c in daily_input.columns if c[0] in ['P','E']]
    monthly_sum = daily_input.loc[:, monthly_sum_columns].groupby(by=daily_input.index.month).mean() * t_unit
    #pdb.set_trace()
    return pd.concat([monthly_mean, monthly_sum], axis=1)#[monthly_mean_columns,monthly_sum_columns]

def daily_climatology(daily_input,t_unit):
    
    runoff = daily_input[['Q']]
    temp = daily_input[[c for c in daily_input.columns if c[0] == 'T']]
    prec = daily_input[[c for c in daily_input.columns if c[0] == 'P']]
    evap = daily_input[[c for c in daily_input.columns if c[0] == 'E']]


    # Compute the t_unit days average runoff
    runoff_t_unit = runoff.rolling(t_unit, min_periods=t_unit).mean()

    
    # Compute the t_unit days average temperature
    if not temp.empty:
        temp_t_unit = temp.rolling(t_unit, min_periods=t_unit).mean()
        #temp_t_unit = pd.concat([shift_series_t_unitdays(temp_t_unit.loc[:, col], (-t_length + 1, 1)) for col in temp_t_unit], axis=1)

    # Compute the t_unit days sum precipitation
    if not prec.empty:
        prec_t_unit = prec.rolling(t_unit, min_periods=t_unit).sum()
        #prec_t_unit = pd.concat([shift_series_t_unitdays(prec_t_unit.loc[:, col], (-t_length + 1, 1)) for col in prec_t_unit], axis=1)
    
    # Compute the t_unit days sum evapotranspiration
    if not evap.empty:
        evap_t_unit = evap.rolling(t_unit, min_periods=t_unit).sum()
        #evap_t_unit = pd.concat([shift_series_t_unitdays(evap_t_unit.loc[:, col], (-t_length + 1, 1)) for col in evap_t_unit], axis=1)

    daily_t_unit = pd.concat([runoff_t_unit, temp_t_unit, prec_t_unit, evap_t_unit], axis=1)
    daily_mean = daily_t_unit.groupby(by=daily_t_unit.index.day_of_year).mean()

    #pdb.set_trace()
    return daily_mean


def loyo_cv_lc_nofor(daily_input,output_folder, t_length=None, svr_C=None, svr_epsilon=None,
               lead_time=range(1, 8)):

    # Compute the climatology for all the inputs
    if isinstance(daily_input, str):
        daily_input = pd.read_csv(daily_input, index_col=0, parse_dates=True)
    monthly_clim = monthly_climatology(daily_input)


    # Create the total input-target matrix and select the start and end year for the loo-cv
    it_matrix = create_it_matrix(daily_input, t_length)
    year_start = min(it_matrix.index[(it_matrix.index.month == 1) & (it_matrix.index.day == 31)].year)
    year_end = max(it_matrix.index[(it_matrix.index.month == 12) & (it_matrix.index.day == 31)].year)

    # For each training length and each year train a different model and test with different configurations
    for Nyears_training in range(1, year_end-year_start+1):
        prediction = []
        for year in range(year_start, year_end+1):
            print(f'Testing on year {year}; Training years: {Nyears_training} ...')

            # Drop the years that should not be considered in the training and train the model
            daily_input_loo = daily_input.copy()
            training_years = (list(range(year+1, year_end+1)) + list(range(year_start, year)))[-Nyears_training:]
            daily_input_loo.loc[[i for i in daily_input.index if i.year not in training_years], 'Q'] = np.nan
            # daily_input_loo.loc[daily_input.index.year == year, 'Q'] = np.nan
            svr_model = training(daily_input_loo, t_length, svr_C, svr_epsilon)

            # Select the dates on which to execute the prediction
            test_dates = daily_input.index[daily_input.index.is_month_end & (daily_input.index.year == year)]

            # Save the true runoff values
            y = {}
            y['true_runoff'] = daily_input.loc[daily_input.index.year == year, 'Q'].resample("M").mean().values

            # Compute runoff monthly climatology considering the subset used for training the svr
            y['runoff_clim'] = [daily_input_loo.loc[daily_input_loo.index.month == m, 'Q'].mean() for m in range(1, 13)]

            # Predict using true temperature and precipitation (no forecast)
            X_trueTP = it_matrix.loc[test_dates, :].drop(columns='Q')
            y['trueTP'] = svr_model.predict(X_trueTP)

             # Predict using temperature and precipitation climatology (no forecast) for all lead times
            X_climTP = X_trueTP.copy()
            for lt in lead_time:
                change_dest = [c for c in X_climTP.columns if c.split('_')[1] == str(-lt + 1)]
                change_source = [c.split('_')[0] for c in change_dest]
                X_climTP.loc[:, change_dest] = monthly_clim.loc[(test_dates - pd.offsets.MonthBegin(lt)).month, change_source].values
                y[f'climTP_lt{lt}'] = svr_model.predict(X_climTP)
                
            '''
            # Predict using seasonal forecast with different lead times
            for lt in lead_time:
                for m in range(1, 26):
                    X_sf = sf_input_matrix(X_trueTP, seasonal_forecast, lt, m)
                    y[f'sfTP_m{m}_lt{lt}'] = svr_model.predict(X_sf)
            '''
            # Store the results in a dataframe
            prediction.append(pd.DataFrame(data=y, index=test_dates-pd.offsets.MonthBegin()))
            
        pd.concat(prediction, axis=0).to_csv(
            os.path.join(output_folder, f'Runoff_forecast_{Nyears_training}_trainingyears.csv'))

def loyo_cv_lc(daily_input, seasonal_forecast, output_folder, t_length=None, svr_C=None, svr_epsilon=None,
               lead_time=range(1, 8)):

    # Compute the climatology for all the inputs
    if isinstance(daily_input, str):
        daily_input = pd.read_csv(daily_input, index_col=0, parse_dates=True)
    monthly_clim = monthly_climatology(daily_input)

    # Read the seasonal forecast of temperature and precipitation
    if isinstance(seasonal_forecast, str):
        seasonal_forecast = pd.read_csv(seasonal_forecast, index_col=0, parse_dates=True)

    # Create the total input-target matrix and select the start and end year for the loo-cv
    it_matrix = create_it_matrix(daily_input, t_length)
    year_start = min(it_matrix.index[(it_matrix.index.month == 1) & (it_matrix.index.day == 31)].year)
    year_end = max(it_matrix.index[(it_matrix.index.month == 12) & (it_matrix.index.day == 31)].year)

    # For each training length and each year train a different model and test with different configurations
    for Nyears_training in range(1, year_end-year_start+1):
        prediction = []
        for year in range(year_start, year_end+1):
            print(f'Testing on year {year}; Training years: {Nyears_training} ...')

            # Drop the years that should not be considered in the training and train the model
            daily_input_loo = daily_input.copy()
            training_years = (list(range(year+1, year_end+1)) + list(range(year_start, year)))[-Nyears_training:]
            daily_input_loo.loc[[i for i in daily_input.index if i.year not in training_years], 'Q'] = np.nan
            # daily_input_loo.loc[daily_input.index.year == year, 'Q'] = np.nan
            svr_model = training(daily_input_loo, t_length, svr_C, svr_epsilon)

            # Select the dates on which to execute the prediction
            test_dates = daily_input.index[daily_input.index.is_month_end & (daily_input.index.year == year)]

            # Save the true runoff values
            y = {}
            y['true_runoff'] = daily_input.loc[daily_input.index.year == year, 'Q'].resample("M").mean().values

            # Compute runoff monthly climatology considering the subset used for training the svr
            y['runoff_clim'] = [daily_input_loo.loc[daily_input_loo.index.month == m, 'Q'].mean() for m in range(1, 13)]

            # Predict using true temperature and precipitation (no forecast)
            X_trueTP = it_matrix.loc[test_dates, :].drop(columns='Q')
            y['trueTP'] = svr_model.predict(X_trueTP)

            # Predict using temperature and precipitation climatology (no forecast) for all lead times
            X_climTP = X_trueTP.copy()
            for lt in lead_time:
                change_dest = [c for c in X_climTP.columns if c.split('_')[1] == str(-lt + 1)]
                change_source = [c.split('_')[0] for c in change_dest]
                X_climTP.loc[:, change_dest] = monthly_clim.loc[(test_dates - pd.offsets.MonthBegin(lt)).month, change_source].values
                y[f'climTP_lt{lt}'] = svr_model.predict(X_climTP)
                

            # Predict using seasonal forecast with different lead times
            for lt in lead_time:
                for m in range(1, 26):
                    X_sf = sf_input_matrix(X_trueTP, seasonal_forecast, lt, m)
                    y[f'sfTP_m{m}_lt{lt}'] = svr_model.predict(X_sf)

            # Store the results in a dataframe
            prediction.append(pd.DataFrame(data=y, index=test_dates-pd.offsets.MonthBegin()))

        pd.concat(prediction, axis=0).to_csv(
            os.path.join(output_folder, f'Runoff_forecast_{Nyears_training}_trainingyears.csv'))


# -----------------------------------------------------------------------

# Read and plot results


def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))


def monthly_rmse(fileName, plot=True):

    # Open the result file
    runoff = pd.read_csv(fileName, index_col=0, parse_dates=True)

    # Create the ensamble mean
    for lt in range(1, 8):
        columns = [c for c in runoff.columns if 'sfTP_m' in c and f'lt{lt}' in c]
        runoff[f'sfTP_em_lt{lt}'] = runoff.loc[:, columns].mean(axis=1)

    # Compute the RMSE for each month
    runoff_error = []
    for m in range(1, 12):
        runoff_m = runoff.loc[runoff.index.month == m, :]
        runoff_error.append(
            runoff_m.apply(lambda y_pred: root_mean_squared_error(runoff_m['true_runoff'], y_pred), axis=0)
        )

    runoff_error = pd.DataFrame(data=runoff_error, index=range(1, 12))

    if plot:
        plt.figure()
        for lt in range(1, 8):
            runoff_error.loc[:, f'sfTP_em_lt{lt}'].plot(marker='o', label=f'lead_time={lt}')
        runoff_error.loc[:, 'runoff_clim'].plot(marker='o', color='black', label='climatology', linewidth=3)
        runoff_error.loc[:, 'trueTP'].plot(marker='o', color='red', label='era5', linewidth=3)
        plt.legend()
        plt.ylabel('RMSE ($m^3/s$)')
        plt.xlabel('Month')


def plot_it_matrix(daily_input, var, common_ylim=True):

    # ## Plot each variable of the input-target matrix

    # Create the input-target matrix from the daily_input
    it_matrix = create_it_matrix(daily_input, 1).rename(columns=lambda c: c[:2])

    # Create the ylabel dictionary
    plt_ylabel = {'P': 'Total precipitation (m)', 'T': 'Mean temperature (K)', 'Q': 'Runoff ($m^3/s$)'}

    # Set the ylim
    if common_ylim:
        selected_vars = it_matrix.loc[:, [c for c in it_matrix.columns if c[0] == var[0]]].values
        b = (selected_vars.max() - selected_vars.min()) * 0.03
        plt_ylim = (selected_vars.min()-b, selected_vars.max()+b)

    # Plot each year with a different color using the day of the year as x axis
    plt.figure()
    for y in range(it_matrix.index.year.min(), it_matrix.index.year.max() + 1):
        curr = it_matrix[it_matrix.index.year == y]
        curr.set_index(curr.index.dayofyear, inplace=True)
        curr[var].plot(label='_nolegend_')

    # Plot the daily climatology
    clim = it_matrix[var].groupby(it_matrix.index.dayofyear).mean().loc[1:365]
    clim.plot(label='Mean', color='black', linewidth=5)

    # Set the figure properties
    if common_ylim:
        plt.ylim(plt_ylim)
    plt.xlabel('Day')
    plt.ylabel(plt_ylabel[var[0]])
    plt.legend()


def lead_time_rmse(fileName):

    # fileName = '/home/mcallegari@eurac.edu/SECLI-FIRM/Mattia/SF_runoff/Zoccolo/Results/Learning_curve/Runoff_forecast_26_trainingyears.csv'

    # Open the result file
    runoff = pd.read_csv(fileName, index_col=0, parse_dates=True)

    # Create the ensamble mean
    for lt in range(1, 8):
        columns = [c for c in runoff.columns if 'sfTP_m' in c and f'lt{lt}' in c]
        runoff[f'sfTP_em_lt{lt}'] = runoff.loc[:, columns].mean(axis=1)

    # Compute the RMSE
    runoff_error = runoff.apply(lambda y_pred: root_mean_squared_error(runoff['true_runoff'], y_pred), axis=0)

    plt.figure()
    plt.plot([1, 7], [runoff_error['runoff_clim']]*2, color='black', label='runoff climatology')
    plt.plot([1, 7], [runoff_error['trueTP']] * 2, color='red', label='era5')
    plt.plot(range(1, 8), runoff_error[[f'climTP_lt{lt}' for lt in range(1, 8)]], marker='o', label='era5 climatology')
    plt.plot(range(1, 8), runoff_error[[f'sfTP_em_lt{lt}' for lt in range(1, 8)]], marker='o', label='SEAS5 ensamble mean')
    plt.plot(range(1, 8), runoff_error[[f'sfTP_m1_lt{lt}' for lt in range(1, 8)]], color='C1', alpha=0.5, label='SEAS5 members')
    for m in range(2, 26):
        plt.plot(range(1, 8), runoff_error[[f'sfTP_m{m}_lt{lt}' for lt in range(1, 8)]], color='C1', alpha=0.5)
    plt.legend()
    plt.xlabel('Lead time (months)')
    plt.ylabel('RMSE ($m^3/s$)')

    
def spatial_avg_daily_input(daily_input):
    t_columns = [c for c in daily_input.columns if c[0] =='T']
    daily_input['T'] = daily_input[t_columns].mean(axis=1)
    daily_input=daily_input.drop(columns = t_columns)

    e_columns = [c for c in daily_input.columns if c[0] =='E']
    daily_input['E'] = daily_input[e_columns].mean(axis=1)
    daily_input=daily_input.drop(columns = e_columns)

    p_columns = [c for c in daily_input.columns if c[0] =='P']
    daily_input['P'] = daily_input[p_columns].mean(axis=1)
    daily_input=daily_input.drop(columns = p_columns)
    
    return daily_input;


def spatial_stats_daily_input(daily_input):
    
    new_daily_input=pd.DataFrame(daily_input.Q)
    
    t_columns = [c for c in daily_input.columns if c[0] =='T']
    t_vars=daily_input[t_columns]
    new_daily_input.loc[:,'T'] = t_vars.mean(axis=1)
    new_daily_input.loc[:,'T5']=t_vars.quantile(q=0.05,axis=1)
    new_daily_input.loc[:,'T25']=t_vars.quantile(q=0.25,axis=1)
    new_daily_input.loc[:,'T75']=t_vars.quantile(q=0.75,axis=1)
    new_daily_input.insert(loc=5,column='T95',value=t_vars.quantile(q=0.95,axis=1))
    
    e_columns = [c for c in daily_input.columns if c[0] =='E']
    e_vars=daily_input[e_columns]
    new_daily_input.loc[:,'E'] = e_vars.mean(axis=1)
    new_daily_input.loc[:,'E5']=e_vars.quantile(q=0.05,axis=1)
    new_daily_input.loc[:,'E25']=e_vars.quantile(q=0.25,axis=1)
    new_daily_input.loc[:,'E75']=e_vars.quantile(q=0.75,axis=1)
    new_daily_input.loc[:,'E95']=e_vars.quantile(q=0.95,axis=1)
    
    p_columns = [c for c in daily_input.columns if c[0] =='P']
    p_vars=daily_input[p_columns]
    new_daily_input.loc[:,'P'] = p_vars.mean(axis=1)
    new_daily_input.loc[:,'P5']=p_vars.quantile(q=0.05,axis=1)
    new_daily_input.loc[:,'P25']=p_vars.quantile(q=0.25,axis=1)
    new_daily_input.loc[:,'P75']=p_vars.quantile(q=0.75,axis=1)
    new_daily_input.loc[:,'P95']=p_vars.quantile(q=0.95,axis=1)
    
    return new_daily_input


def create_gap(train_index,test_index,gap):
    right=((train_index+1 == test_index[0]).sum()==1) and ((train_index-1 == test_index[-1]).sum()==0)
    centre=((train_index+1 == test_index[0]).sum()==1) and ((train_index-1 == test_index[-1]).sum()==1)
    left = ((train_index+1 == test_index[0]).sum()==0) and ((train_index-1 == test_index[-1]).sum()==1)
    if right:
        train_index=train_index[0:-gap]

    if left:
        train_index=train_index[gap:]

    if centre:
        pos = np.where(train_index+1 == test_index[0])[0][0]
        train_index=np.concatenate((train_index[:pos-gap],train_index[pos+gap:]),axis=0)
        
    return train_index;


def compute_anomalies(climatologies,pred):
    
    #compute real climatology
    anomalies=pd.DataFrame(pred.true_runoff-pred.runoff_clim,columns=['true_runoff'])
    
    #get the climatology of prediction on the wanted days
    clim_on_test_dates = pd.DataFrame(climatologies.loc[anomalies.index.day_of_year])
    
    #create an array with the proper shape
    repeated_clim=np.repeat((np.array(clim_on_test_dates.prediction)[...,np.newaxis]),
                    pred.shape[1]-2,
                    axis=1)

    #subtract clim to predictions
    anomalies_pred= pred.iloc[:,2:]- repeated_clim

    #put together real and predicted anomalies
    anomalies=pd.concat([anomalies,anomalies_pred],axis=1)

    return anomalies