diff --git a/climax/main/preprocess_ERA5.py b/climax/main/preprocess_ERA5.py
index f71eb9a9e86a80799a89ac12604194153e9b69d7..c12efba53a9d139b0d57586c9ae653797b1cdd8c 100644
--- a/climax/main/preprocess_ERA5.py
+++ b/climax/main/preprocess_ERA5.py
@@ -81,24 +81,38 @@ if __name__ == '__main__':
             continue
 
         # create filenames for reprojected and resampled files
-        target = [args.target.joinpath(f.name) for f in source]
+        target = [args.target.joinpath(var, f.name) for f in source]
+        dlyavg = [args.target.joinpath(var, f.name.replace('.nc', '_d.nc'))
+                  for f in source]
+
+        # aggregate hourly data to daily data: resample in case of missing
+        # days
+        LOGGER.info('Computing daily averages ...')
+        for src, tmp in zip(source, dlyavg):
+            ds = xr.open_dataset(src)
+
+            # compute daily averages
+            ds = ds.resample(time='D').mean(dim='time')
+
+            # save intermediate file for resampling and reprojection to
+            # target grid
+            ds.to_netcdf(tmp, engine='h5netcdf')
 
         # reproject and resample to target grid in parallel
         target = Parallel(n_jobs=-1, verbose=51)(
-            delayed(reproject_cdo)(args.grid, src, trg, args.mode,
+            delayed(reproject_cdo)(args.grid, tmp, trg, args.mode,
                                    args.overwrite)
-            for src, trg in zip(source, target))
+            for tmp, trg in zip(dlyavg, target))
+
+        # remove temporary daily averages
+        for avg in dlyavg:
+            avg.unlink()
 
         # aggregate files for different years into a single file using
         # xarray and dask
         LOGGER.info('Reading ERA5 data ...')
         ds = xr.open_mfdataset(target, parallel=True).compute()
 
-        # aggregate hourly data to daily data: resample in case of missing
-        # days
-        LOGGER.info('Computing daily averages ...')
-        ds = ds.resample(time='D').mean(dim='time')
-
         # set NetCDF file compression for each variable
         for _, var in ds.data_vars.items():
             var.encoding['zlib'] = True
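
Note on the resampling step: xarray's Dataset.resample(time='D') re-indexes
onto a complete daily calendar, so days that are missing from the hourly
record appear as NaN rows rather than being dropped silently; this is the
behaviour the "resample in case of missing days" comment relies on. Below is
a minimal sketch demonstrating it on hypothetical toy data; only the
.resample(time='D').mean(dim='time') call is taken from the patch, everything
else is made up for illustration:

    import numpy as np
    import pandas as pd
    import xarray as xr

    # hourly timestamps for Jan 1 and Jan 3; Jan 2 is missing entirely
    time = pd.date_range('2000-01-01', periods=24, freq='h').append(
        pd.date_range('2000-01-03', periods=24, freq='h'))
    ds = xr.Dataset({'t2m': ('time', np.arange(48.0))},
                    coords={'time': time})

    # aggregate hourly data to daily means, as in the patch
    daily = ds.resample(time='D').mean(dim='time')
    print(daily.t2m.values)  # [11.5  nan  35.5]; Jan 2 is kept as NaN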