Commit e4345740 authored by Michael Langguth

Final update to get_metrics_joint_dom.ipynb.

%% Cell type:code id:written-productivity tags:
``` python
import os, sys
import glob
sys.path.append("../utils/")
import xarray as xr
import numpy as np
import pandas as pd
from statistical_evaluation import perform_block_bootstrap_metric, avg_metrics, calculate_cond_quantiles, Scores
```
%% Cell type:markdown id:english-conducting tags:
# Evaluation over a smaller (joint) domain
In this Jupyter Notebook, evaluation will be performed on a subregion that is nested into the target region of the evaluated trained video prediction models. The resulting netCDF-files can be used in the meta-postprocessing to perform an evaluation on a joint region such as in Figure 10a of the [manuscript](https://doi.org/10.5194/gmd-2021-430). <br>

### Approach
In the following cells, all forecast files under `indir` will first be merged into a single netCDF-file.<br>
Then the data is sliced to the domain defined by `lonlatbox`, and all subsequent evaluation is performed on this smaller domain.<br>
The evaluation metrics are then saved to a file under `indir` named `evaluation_metrics_<nlon>x<nlat>.nc`, where `nlon` and `nlat` denote the number of grid points/pixels in longitude and latitude direction of the smaller domain, respectively. <br>
Thus, let's first define the basic parameters:
%% Cell type:code id:metropolitan-mailman tags:
``` python
indir = "<path_to_trained_model/with_larger_target_domain>"
model = "savp"
# define domain; [3., 24.3, 40.2, 53.1] corresponds to the smallest domain tested in the GMD paper (72x44 grid points)
lonlatbox = [3., 24.3, 40.2, 53.1]
```
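%% Cell type:markdown id:grid-size-note tags:
As a plausibility check, the chosen `lonlatbox` determines the name of the output file. The following minimal sketch assumes the regular 0.3° grid spacing of the ERA5 data used in the GMD paper (`dx` is an assumption here; adapt it to your dataset), under which the default box translates into a 72x44 grid:
``` python
# Sketch: estimate the expected grid size from lonlatbox (assumes 0.3° spacing).
dx = 0.3  # assumed grid spacing in degrees
nlon = round((lonlatbox[1] - lonlatbox[0]) / dx) + 1  # -> 72
nlat = round((lonlatbox[3] - lonlatbox[2]) / dx) + 1  # -> 44
print("Expected output file: evaluation_metrics_{0:d}x{1:d}.nc".format(nlon, nlat))
```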
%% Cell type:markdown id:turned-player tags:
Next, we perform a first merging step. For computational efficiency, we merge at most 1000 files in this step.<br>
Since the data is not sorted by the dimension `init_time` when querying along the sample index, we sort it before saving to intermediate files.<br>
If the merging step has already been performed, no further processing is required.<br>
Otherwise, we start with the sample indices between 0 and 999:
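The helper `get_fname_ndigits` in the next cell assembles glob patterns with one to `n` digits. For illustration, with `n=3` it matches the following patterns and hence sample indices 0 to 999:
``` python
# Illustration only: patterns produced by get_fname_ndigits(indir, "vfp_date_*sample_ind_", ".nc", 3)
# "vfp_date_*sample_ind_[0-9].nc"            -> sample indices 0-9
# "vfp_date_*sample_ind_[0-9][0-9].nc"       -> sample indices 10-99
# "vfp_date_*sample_ind_[0-9][0-9][0-9].nc"  -> sample indices 100-999
```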
%% Cell type:code id:aging-radio tags:
``` python
def get_fname_ndigits(indir, prefix, suffix, n, patt="[0-9]"):
    """Collect all files under indir whose names match prefix + 1..n digits + suffix."""
    flist = []
    for i in range(1, n+1):
        # repeat the digit pattern i times, e.g. "[0-9][0-9]" for two-digit indices
        fn_search = os.path.join(indir, "{0}{1}{2}".format(prefix, i*patt, suffix))
        flist = flist + glob.glob(fn_search)
    if len(flist) == 0:
        raise FileNotFoundError("Could not find any file under '{0}' with prefix '{1}' and suffix '{2}' containing digits.".format(indir, prefix, suffix))
    return flist

# get list of files with sample index between 0 and 999
vfp_list = get_fname_ndigits(indir, "vfp_date_*sample_ind_", ".nc", 3)
outfile = os.path.join(indir, "vfp_{0}_forecasts_sample_ind_0_999.nc".format(model))

if not os.path.isfile(outfile):
    print("File '{0}' does not exist. \n Start reading data with sample index between 0 and 999 from '{1}'...".format(outfile, indir))
    data_all = xr.open_mfdataset(vfp_list, concat_dim="init_time", combine="nested", decode_cf=True).load()
    data_all = data_all.sortby("init_time")
    print("Data loaded successfully. Save merged data to '{0}'.".format(outfile))
    data_all.to_netcdf(outfile, encoding={'init_time': {'units': "seconds since 1900-01-01 00:00:00"}})
```
%% Cell type:markdown id:controversial-picking tags:
Then, we proceed with the remaining sample indices (1000 to 8999).
%% Cell type:code id:basic-corps tags:
``` python
for i in np.arange(1, 9):
    outfile = os.path.join(indir, "vfp_{0}_forecasts_sample_ind_{1:d}000_{1:d}999.nc".format(model, i))
    if not os.path.isfile(outfile):
        print("File '{0}' does not exist. Start reading data with sample index between {1:d}000 and {1:d}999 from '{2}'...".format(outfile, i, indir))
        data_all = xr.open_mfdataset(os.path.join(indir, "vfp_date_*sample_ind_{0}???.nc".format(i)), concat_dim="init_time", combine="nested", decode_cf=True).load()
        data_all = data_all.sortby("init_time")
        print("Data loaded successfully. Save merged data to '{0}'.".format(outfile))
        data_all.to_netcdf(outfile, encoding={'init_time': {'units': "seconds since 1900-01-01 00:00:00"}})
```
%% Cell type:markdown id:severe-satisfaction tags:
Still, xarray's `open_mfdataset`-method would not be able to concatenate all data, since the `init_time`-dimension is not monotonically increasing/decreasing when looping through the files. <br>
Thus, we have to merge the data manually.
The merged dataset is then saved to a separate data file for later computation.
If the data has already been merged, we simply read it from the corresponding netCDF-file.
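For illustration, the problem can be verified with a quick check like the following (a minimal sketch; it assumes the precursor files from the previous cells exist):
``` python
# Sketch: init_time is not monotonic across the alphabetically sorted precursor
# files, which is why open_mfdataset alone does not yield a correctly ordered dataset.
files = sorted(glob.glob(os.path.join(indir, "vfp_{0}_forecasts_sample_ind_*.nc".format(model))))
ds_check = xr.open_mfdataset(files, concat_dim="init_time", combine="nested")
print(ds_check.indexes["init_time"].is_monotonic_increasing)  # expected: False
```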
%% Cell type:code id:illegal-headset tags:
``` python
outfile_all = os.path.join(indir, "vfp_{0}_forecasts_all.nc".format(model))

if not os.path.isfile(outfile_all):
    print("netCDF-file with all forecasts '{0}' does not exist yet. Start merging and sorting all precursor files.".format(outfile_all))
    all_files = sorted(glob.glob(os.path.join(indir, "vfp_{0}_forecasts_sample_ind_*.nc".format(model))))
    if len(all_files) == 0:
        raise FileNotFoundError("Could not find any precursor files.")

    for i, f in enumerate(all_files):
        print("Processing file '{0}'".format(f))
        tmp = xr.open_dataset(os.path.join(indir, f)).load()
        if i == 0:
            all_fcst = tmp.copy()
        else:
            print("Start merging")
            all_fcst = xr.merge([all_fcst, tmp])
    # sort by init_time-dimension...
    all_fcst = all_fcst.sortby("init_time")
    # ... and save to file
    print("Finally, write all merged and sorted data to '{0}'.".format(outfile_all))
    all_fcst.to_netcdf(outfile_all)
else:
    all_fcst = xr.open_dataset(outfile_all).load()
```
%% Cell type:markdown id:fresh-favor tags:
Now, we slice the dataset to the domain of interest (defined by `lonlatbox`).
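Note that the latitude bounds are passed in reversed order (`lonlatbox[3]` before `lonlatbox[2]`), since the latitude coordinate of the ERA5 data runs from north to south.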
%% Cell type:code id:initial-campbell tags:
``` python
all_fcst_sl = all_fcst.sel({"lon": slice(lonlatbox[0], lonlatbox[1]), "lat": slice(lonlatbox[3], lonlatbox[2])})
print(all_fcst_sl)
```
%% Cell type:markdown id:optimum-drunk tags:
Next, we initialize the function for calculating the MSE and call it to evaluate the video prediction model and the persistence forecasts. <br>
If you require further evaluation metrics, just expand the cell accordingly, e.g. add the following lines <br>
```
ssim_func = Scores("ssim", ["lat", "lon"]).score_func
ssim_model_all = ssim_func(data_fcst=all_fcst_sl[varname_fcst], data_ref=all_fcst_sl[varname_ref])
ssim_per_all = ssim_func(data_fcst=all_fcst_sl[varname_per], data_ref=all_fcst_sl[varname_ref])
```
in case you want to evaluate the SSIM as well.
%% Cell type:code id:sorted-adams tags:
``` python
mse_func = Scores("mse", ["lat", "lon"]).score_func
varname_ref, varname_fcst, varname_per = "2t_ref", "2t_{0}_fcst".format(model), "2t_persistence_fcst"

mse_model_all = mse_func(data_fcst=all_fcst_sl[varname_fcst], data_ref=all_fcst_sl[varname_ref])
mse_per_all = mse_func(data_fcst=all_fcst_sl[varname_per], data_ref=all_fcst_sl[varname_ref])
```
%% Cell type:markdown id:demonstrated-eligibility tags:
Then, we initialize the data arrays to store the desired evaluation metrics...
%% Cell type:code id:protective-battle tags:
``` python
fcst_hours = all_fcst_sl["fcst_hour"]
nhours = len(fcst_hours)
nboots = 1000

mse_model_fcst = xr.DataArray(np.empty(nhours, dtype=object), coords={"fcst_hour": fcst_hours}, dims=["fcst_hour"])
mse_model_fcst_boot = xr.DataArray(np.empty((nhours, nboots), dtype=object),
                                   coords={"fcst_hour": fcst_hours, "iboot": np.arange(nboots)},
                                   dims=["fcst_hour", "iboot"])
mse_per_fcst = xr.DataArray(np.empty(nhours, dtype=object), coords={"fcst_hour": fcst_hours}, dims=["fcst_hour"])
mse_per_fcst_boot = xr.DataArray(np.empty((nhours, nboots), dtype=object),
                                 coords={"fcst_hour": fcst_hours, "iboot": np.arange(nboots)},
                                 dims=["fcst_hour", "iboot"])
```
%% Cell type:markdown id:norman-swedish tags:
... and populate them by looping over all forecast hours.
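The call to `perform_block_bootstrap_metric` in the loop below resamples contiguous blocks of 24*7 time steps along `init_time` (one-week blocks for hourly initialization times) to respect temporal autocorrelation. Conceptually, it does something like the following minimal sketch (an illustration only, not the actual implementation in `statistical_evaluation`):
``` python
def block_bootstrap_mean_sketch(da, dim="init_time", block_len=24*7, nboots=1000, seed=42):
    """Illustrative block bootstrap: resample whole blocks along `dim` with
    replacement and return nboots realizations of the mean metric."""
    rng = np.random.default_rng(seed)
    nblocks = da.sizes[dim] // block_len
    vals = da.isel({dim: slice(0, nblocks * block_len)}).values  # drop incomplete tail block
    blocks = vals.reshape(nblocks, block_len)
    iblocks = rng.integers(0, nblocks, size=(nboots, nblocks))   # draw blocks with replacement
    return blocks[iblocks].mean(axis=(1, 2))                     # shape: (nboots,)
```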
%% Cell type:code id:extra-bible tags:
``` python
for i, fh in enumerate(fcst_hours):
    mse_model_curr = mse_model_all.sel(fcst_hour=fh)
    mse_per_curr = mse_per_all.sel(fcst_hour=fh)
    # use the positional index i throughout (consistent with the bootstrap arrays)
    mse_model_fcst[i], mse_per_fcst[i] = mse_model_curr.mean(), mse_per_curr.mean()
    mse_model_fcst_boot[i, :] = perform_block_bootstrap_metric(mse_model_curr, "init_time", 24*7)
    mse_per_fcst_boot[i, :] = perform_block_bootstrap_metric(mse_per_curr, "init_time", 24*7)
```
%% Cell type:markdown id:limiting-corpus tags:
Finally, we put the data arrays into a joint dataset and save the results to the netCDF-file.
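The resulting file can then be loaded in the meta-postprocessing mentioned at the top, e.g. as follows (a minimal sketch; the file name assumes the 72x44 example domain):
``` python
# Sketch: read the saved metrics back for meta-postprocessing.
ds_metrics = xr.open_dataset(os.path.join(indir, "evaluation_metrics_72x44.nc"))
print(ds_metrics["2t_{0}_mse_avg".format(model)])
```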
%% Cell type:code id:reported-traffic tags:
``` python
# create Dataset and save to netCDF-file
ds_mse = xr.Dataset({"2t_{0}_mse_avg".format(model): mse_model_fcst, "2t_{0}_mse_bootstrapped".format(model): mse_model_fcst_boot,
                     "2t_persistence_mse_avg": mse_per_fcst, "2t_persistence_mse_bootstrapped": mse_per_fcst_boot})

outfile = os.path.join(indir, "evaluation_metrics_{0:d}x{1:d}.nc".format(len(all_fcst_sl["lon"]), len(all_fcst_sl["lat"])))
print("Save evaluation metrics to '{0}'".format(outfile))
print(ds_mse)
ds_mse.to_netcdf(outfile)
```
%% Cell type:markdown id:correct-bulgarian tags:
## Done!