diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py
index 5af011db9133a22d3b7dbe30197ec16f60c7e5db..7633a2a9c1842219d7af7b9c7b2b4f23a034cbdf 100644
--- a/mlair/helpers/statistics.py
+++ b/mlair/helpers/statistics.py
@@ -11,6 +11,8 @@ import pandas as pd
 from typing import Union, Tuple, Dict, List
 import itertools
 from collections import OrderedDict
+from mlair.helpers import to_list
+
 
 
 Data = Union[xr.DataArray, pd.DataFrame]
@@ -569,7 +571,6 @@ def create_single_bootstrap_realization(data: xr.DataArray, dim_name_time: str)
     :param dim_name_time: name of time dimension
     :return: bootstrapped realization of data
     """
-
     num_of_blocks = data.coords[dim_name_time].shape[0]
     boot_idx = np.random.choice(num_of_blocks, size=num_of_blocks, replace=True)
     return data.isel({dim_name_time: boot_idx})
@@ -585,7 +586,7 @@ def calculate_average(data: xr.DataArray, **kwargs) -> xr.DataArray:
 
 
 def create_n_bootstrap_realizations(data: xr.DataArray, dim_name_time: str, dim_name_model: str, n_boots: int = 1000,
-                                    dim_name_boots: str = 'boots') -> xr.DataArray:
+                                    dim_name_boots: str = 'boots', seasons: List = None) -> Dict[str, xr.DataArray]:
     """
     Create n bootstrap realizations and calculate averages across realizations
 
@@ -594,26 +595,23 @@ def create_n_bootstrap_realizations(data: xr.DataArray, dim_name_time: str, dim_
     :param dim_name_model: name of model dimension
     :param n_boots: number of bootstap realizations
     :param dim_name_boots: name of bootstap dimension
+    :param seasons: additionally calculate errors for the given seasons (default None)
     :return:
     """
+    seasons = [] if seasons is None else to_list(seasons)  # ensure seasons is an empty list if None
     res_dims = [dim_name_boots]
     dims = list(data.dims)
     other_dims = [v for v in dims if v in set(dims).difference([dim_name_time])]
     coords = {dim_name_boots: range(n_boots), **{dim_name: data.coords[dim_name] for dim_name in other_dims}}
     if len(dims) > 1:
         res_dims = res_dims + other_dims
-    res = xr.DataArray(np.nan, dims=res_dims, coords=coords)
+    realizations = {k: xr.DataArray(np.nan, dims=res_dims, coords=coords) for k in seasons + [""]}
     for boot in range(n_boots):
-        res[boot] = (calculate_average(
-            create_single_bootstrap_realization(data, dim_name_time=dim_name_time),
-            dim=dim_name_time, skipna=True))
-    return res
-
-
-
-
-
-
-
-
-
+        shuffled = create_single_bootstrap_realization(data, dim_name_time=dim_name_time)
+        realizations[""][boot] = calculate_average(shuffled, dim=dim_name_time, skipna=True)
+        for season in seasons:
+            assert season in ["DJF", "MAM", "JJA", "SON"]
+            sel = shuffled[dim_name_time].dt.season == season
+            realizations[season][boot] = calculate_average(shuffled.sel({dim_name_time: sel}),
+                                                           dim=dim_name_time, skipna=True)
+    return realizations
diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py
index bd2012c3fe9f53e7e07bfb4bfc4cde096c2dc891..299febfe7db6f3237b3390ec6c0563506d61a98c 100644
--- a/mlair/plotting/postprocessing_plotting.py
+++ b/mlair/plotting/postprocessing_plotting.py
@@ -1095,7 +1095,7 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass):  # pragma: no cover
     def __init__(self, data: xr.DataArray, plot_folder: str = ".", model_type_dim: str = "type",
                  error_measure: str = "mse", error_unit: str = None, dim_name_boots: str = 'boots',
                  block_length: str = None, model_name: str = "NN", model_indicator: str = "nn",
-                 ahead_dim: str = "ahead", sampling: Union[str, Tuple[str]] = ""):
+                 ahead_dim: str = "ahead", sampling: Union[str, Tuple[str]] = "", season_annotation: str = None):
         super().__init__(plot_folder, "sample_uncertainty_from_bootstrap")
         self.default_plot_name = self.plot_name
         self.model_type_dim = model_type_dim
@@ -1105,6 +1105,7 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass):  # pragma: no cover
         self.error_unit = error_unit
         self.block_length = block_length
         self.model_name = model_name
+        _season = season_annotation or ""
         self.sampling = {"daily": "d", "hourly": "H"}.get(sampling[1] if isinstance(sampling, tuple) else sampling, "")
         data = self.rename_model_indicator(data, model_name, model_indicator)
         self.prepare_data(data)
@@ -1114,12 +1115,12 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass):  # pragma: no cover
 
         # plot raw metric (mse)
         for orientation, utest, agg_type in variants:
-            self._plot(orientation=orientation, apply_u_test=utest, agg_type=agg_type)
+            self._plot(orientation=orientation, apply_u_test=utest, agg_type=agg_type, season=_season)
 
         # plot root of metric (rmse)
         self._apply_root()
         for orientation, utest, agg_type in variants:
-            self._plot(orientation=orientation, apply_u_test=utest, agg_type=agg_type, tag="_sqrt")
+            self._plot(orientation=orientation, apply_u_test=utest, agg_type=agg_type, tag="_sqrt", season=_season)
 
         self._data_table = None
         self._n_boots = None
@@ -1148,9 +1149,10 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass):  # pragma: no cover
             self.error_measure = f"root {self.error_measure}"
             self.error_unit = self.error_unit.replace("$^2$", "")
 
-    def _plot(self, orientation: str = "v", apply_u_test: bool = False, agg_type="single", tag=""):
+    def _plot(self, orientation: str = "v", apply_u_test: bool = False, agg_type="single", tag="", season=""):
         self.plot_name = self.default_plot_name + {"v": "_vertical", "h": "_horizontal"}[orientation] + \
-                         {True: "_u_test", False: ""}[apply_u_test] + "_" + agg_type + tag
+                         {True: "_u_test", False: ""}[apply_u_test] + "_" + agg_type + tag + \
+                         {"": ""}.get(season, f"_{season}")
         if apply_u_test is True and agg_type == "multi":
             return  # not implemented
         data_table = self._data_table
@@ -1198,10 +1200,13 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass):  # pragma: no cover
             ax.set_xlabel(f"{self.error_measure} (in {self.error_unit})")
             xlims = list(ax.get_xlim())
             ax.set_xlim([xlims[0], xlims[1] * 1.015])
-
         else:
             raise ValueError(f"orientation must be `v' or `h' but is: {orientation}")
-        text = f"n={n_boots}" if self.block_length is None else f"{self.block_length}, n={n_boots}"
+        text = f"n={n_boots}"
+        if self.block_length is not None:
+            text = f"{self.block_length}, {text}"
+        if len(season) > 0:
+            text = f"{season}, {text}"
         loc = "lower left"
         text_box = AnchoredText(text, frameon=True, loc=loc, pad=0.5, bbox_to_anchor=(0., 1.0),
                                 bbox_transform=ax.transAxes)
diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py
index 7a31f83fdf09ceed649aa02d4eecb74a9165eba0..eb0d72becc4570c7081f02e14bf0549de1623c47 100644
--- a/mlair/run_modules/post_processing.py
+++ b/mlair/run_modules/post_processing.py
@@ -87,6 +87,7 @@ class PostProcessing(RunEnvironment):
         self.skill_scores = None
         self.feature_importance_skill_scores = None
         self.uncertainty_estimate = None
+        self.uncertainty_estimate_seasons = {}
         self.block_mse_per_station = None
         self.competitor_path = self.data_store.get("competitor_path")
         self.competitors = to_list(self.data_store.get_default("competitors", default=[]))
@@ -154,14 +155,16 @@
                                                                     separate_ahead=separate_ahead,
                                                                     block_length=block_length)
         self.block_mse_per_station = block_mse_per_station
-        self.uncertainty_estimate = statistics.create_n_bootstrap_realizations(
+        estimate = statistics.create_n_bootstrap_realizations(
             block_mse, dim_name_time=self.index_dim, dim_name_model=self.model_type_dim,
-            dim_name_boots=self.uncertainty_estimate_boot_dim, n_boots=n_boots)
+            dim_name_boots=self.uncertainty_estimate_boot_dim, n_boots=n_boots, seasons=["DJF", "MAM", "JJA", "SON"])
+        self.uncertainty_estimate = estimate.pop("")
+        self.uncertainty_estimate_seasons = estimate
         self.report_sample_uncertainty()
 
     def report_sample_uncertainty(self, percentiles: list = None):
         """
-        Store raw results of uncertainty estimate and calculate aggregate statistcs and store as raw data but also as
+        Store raw results of uncertainty estimate and calculate aggregate statistics and store as raw data but also as
         markdown and latex.
         """
         report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report")
@@ -170,31 +173,37 @@
         # store raw results as nc
         file_name = os.path.join(report_path, "uncertainty_estimate_raw_results.nc")
         self.uncertainty_estimate.to_netcdf(path=file_name)
+        for season in self.uncertainty_estimate_seasons.keys():
+            file_name = os.path.join(report_path, f"uncertainty_estimate_raw_results_{season}.nc")
+            self.uncertainty_estimate_seasons[season].to_netcdf(path=file_name)
 
         # store statistics
         if percentiles is None:
             percentiles = [.05, .1, .25, .5, .75, .9, .95]
 
-        for ahead_steps in ["single", "multi"]:
-            if ahead_steps == "single":
-                try:
-                    df_descr = self.uncertainty_estimate.to_pandas().describe(percentiles=percentiles).astype("float32")
-                except ValueError:
-                    df_descr = self.uncertainty_estimate.mean(self.ahead_dim).to_pandas().describe(percentiles=percentiles).astype("float32")
-            else:
-                if self.ahead_dim not in self.uncertainty_estimate.dims:
-                    continue
-                df_descr = self.uncertainty_estimate.to_dataframe(self.model_type_dim).unstack().groupby(level=self.ahead_dim).describe(
-                    percentiles=percentiles).astype("float32")
-                df_descr = df_descr.stack(-1)
-                df_descr = df_descr.reorder_levels(df_descr.index.names[::-1])
-            df_sorter = ["count", "mean", "std", "min", *[f"{round(p * 100)}%" for p in percentiles], "max"]
-            df_descr = df_descr.loc[df_sorter]
-            column_format = tables.create_column_format_for_tex(df_descr)
-            file_name = os.path.join(report_path, f"uncertainty_estimate_statistics_{ahead_steps}.%s")
-            tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df_descr)
-            tables.save_to_md(report_path, file_name % "md", df=df_descr)
-            df_descr.to_csv(file_name % "csv", sep=";")
+        for season in [None] + list(self.uncertainty_estimate_seasons.keys()):
+            estimate = self.uncertainty_estimate if season is None else self.uncertainty_estimate_seasons[season]
+            affix = "" if season is None else f"_{season}"
+            for ahead_steps in ["single", "multi"]:
+                if ahead_steps == "single":
+                    try:
+                        df_descr = estimate.to_pandas().describe(percentiles=percentiles).astype("float32")
+                    except ValueError:
+                        df_descr = estimate.mean(self.ahead_dim).to_pandas().describe(percentiles=percentiles).astype("float32")
+                else:
+                    if self.ahead_dim not in estimate.dims:
+                        continue
+                    df_descr = estimate.to_dataframe(self.model_type_dim).unstack().groupby(level=self.ahead_dim).describe(
+                        percentiles=percentiles).astype("float32")
+                    df_descr = df_descr.stack(-1)
+                    df_descr = df_descr.reorder_levels(df_descr.index.names[::-1])
+                df_sorter = ["count", "mean", "std", "min", *[f"{round(p * 100)}%" for p in percentiles], "max"]
+                df_descr = df_descr.loc[df_sorter]
+                column_format = tables.create_column_format_for_tex(df_descr)
+                file_name = os.path.join(report_path, f"uncertainty_estimate_statistics_{ahead_steps}{affix}.%s")
+                tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df_descr)
+                tables.save_to_md(report_path, file_name % "md", df=df_descr)
+                df_descr.to_csv(file_name % "csv", sep=";")
 
     def calculate_block_mse(self, evaluate_competitors=True, separate_ahead=False, block_length="1m"):
         """
@@ -589,11 +598,13 @@
         try:
             if "PlotSampleUncertaintyFromBootstrap" in plot_list and self.uncertainty_estimate is not None:
                 block_length = self.data_store.get_default("block_length", default="1m", scope="uncertainty_estimate")
-                PlotSampleUncertaintyFromBootstrap(
-                    data=self.uncertainty_estimate, plot_folder=self.plot_path, model_type_dim=self.model_type_dim,
-                    dim_name_boots=self.uncertainty_estimate_boot_dim, error_measure="mean squared error",
-                    error_unit=r"ppb$^2$", block_length=block_length, model_name=self.model_display_name,
-                    model_indicator=self.forecast_indicator, sampling=self._sampling)
+                for season in [None] + list(self.uncertainty_estimate_seasons.keys()):
+                    estimate = self.uncertainty_estimate if season is None else self.uncertainty_estimate_seasons[season]
+                    PlotSampleUncertaintyFromBootstrap(
+                        data=estimate, plot_folder=self.plot_path, model_type_dim=self.model_type_dim,
+                        dim_name_boots=self.uncertainty_estimate_boot_dim, error_measure="mean squared error",
+                        error_unit=r"ppb$^2$", block_length=block_length, model_name=self.model_display_name,
+                        model_indicator=self.forecast_indicator, sampling=self._sampling, season_annotation=season)
         except Exception as e:
             logging.error(f"Could not create plot PlotSampleUncertaintyFromBootstrap due to the following error: {e}"
                           f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}")
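
Reviewer note, not part of the patch: a minimal usage sketch of the extended create_n_bootstrap_realizations API introduced above. The toy data, dimension names, and n_boots value are assumptions chosen for illustration only.

    import numpy as np
    import pandas as pd
    import xarray as xr

    from mlair.helpers import statistics

    # One year of daily toy "block MSE" values for two hypothetical model types.
    time = pd.date_range("2020-01-01", periods=366, freq="D")
    data = xr.DataArray(np.random.rand(366, 2), dims=("index", "type"),
                        coords={"index": time, "type": ["nn", "ols"]})

    # With seasons given, the function now returns a dict instead of a single
    # DataArray; the empty-string key holds the all-season estimate, which is
    # why PostProcessing.estimate_sample_uncertainty pops it afterwards.
    estimate = statistics.create_n_bootstrap_realizations(
        data, dim_name_time="index", dim_name_model="type", n_boots=50,
        seasons=["DJF", "MAM", "JJA", "SON"])

    overall = estimate.pop("")  # bootstrap averages over all samples, dims: (boots, type)
    winter = estimate["DJF"]    # bootstrap averages over DJF samples only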
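
One more note on the _plot naming change: the added expression {"": ""}.get(season, f"_{season}") yields an empty suffix for the all-season case and "_<season>" otherwise, so previously generated plot names stay unchanged while seasonal variants get their own files. A two-line sketch of that behavior:

    for season in ["", "DJF"]:
        print(repr({"": ""}.get(season, f"_{season}")))  # prints '' then '_DJF'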