diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index b5495e4d91fe53a2d3ecf6a038f53e56c971652a..4a4f550b4505fcd15411f9cbb0a294142f05952d 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -30,6 +30,7 @@ DEFAULT_EARLY_STOPPING_EPOCHS = np.inf DEFAULT_RESTORE_BEST_MODEL_WEIGHTS = True DEFAULT_TARGET_VAR = "o3" DEFAULT_TARGET_DIM = "variables" +DEFAULT_TARGET_VAR_UNIT = "ppb" DEFAULT_WINDOW_LEAD_TIME = 3 DEFAULT_WINDOW_DIM = "window" DEFAULT_TIME_DIM = "datetime" diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 9843728c62e8b745a2cf1e941af0801d6e9ab0d7..7034c513328d6587e10925fe5782a17d3bd033e8 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -1312,6 +1312,45 @@ class PlotTimeEvolutionMetric(AbstractPlotClass): self._save() +class PlotRankHistogram(AbstractPlotClass): + """ + Plot rank histograms of ensemble forecasts, one PDF page per lead time and one PDF per subset. + """ + + def __init__(self, data: dict, plot_folder: str = ".", ahead_dim: str = "ahead", target_unit: str = None, + target_var: str = None): + super().__init__(plot_folder, "rank_histogram_plot") + self.data = data + self.ahead_dim = ahead_dim + self.target_unit = target_unit + self.target_var = target_var + self._plot() + + def _plot(self): + for key, value in self.data.items(): + plot_name = f"{self.plot_name}_{key}_plot.pdf" + plot_path = os.path.join(os.path.abspath(self.plot_folder), plot_name) + pdf_pages = matplotlib.backends.backend_pdf.PdfPages(plot_path) + for ah in value[self.ahead_dim]: + fig, ax = plt.subplots() + ax.bar(value.sel({"rank_hist_type": "bins", self.ahead_dim: ah.values}), + value.sel({"rank_hist_type": "freq", self.ahead_dim: ah.values}), + align='center') + ax.set_xlabel(f"{self.target_var} (in {self.target_unit})") + ax.set_ylabel("freq") + plt.title(f"{ah.values}") + pdf_pages.savefig() + plt.close('all') + # close all open figures / plots + pdf_pages.close() + # plt.close('all') + 
+ + def _load_data(self, subset): + pass + + + if __name__ == "__main__": stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] path = "../../testrun_network/forecasts" diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index 0f88fe7395e2e72e72232585ac7a4540f8cdcd4d..4f220cf5416ca5559dd593bfd39f2023d4da2d06 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -25,7 +25,7 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH, DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS, \ DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS, DEFAULT_DO_UNCERTAINTY_ESTIMATE, DEFAULT_CREATE_SNAPSHOT, \ DEFAULT_EARLY_STOPPING_EPOCHS, DEFAULT_RESTORE_BEST_MODEL_WEIGHTS, DEFAULT_COMPETITORS, DEFAULT_NUMBER_OF_REALIZATIONS, \ - DEFAULT_ENS_MOMENT_DIM, DEFAULT_ENS_REALIZ_DIM + DEFAULT_ENS_MOMENT_DIM, DEFAULT_ENS_REALIZ_DIM, DEFAULT_TARGET_VAR_UNIT from mlair.data_handler import DefaultDataHandler from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel @@ -100,6 +100,7 @@ class ExperimentSetup(RunEnvironment): :param target_var: target variable to predict by model, currently only a single target variable is supported. Because this framework was originally designed to predict ozone, default is `"o3"`. :param target_dim: dimension of target variable (default `"variables"`). + :param target_var_unit: unit of target variable (e.g. used in plots) :param window_lead_time: number of time steps to predict by model (default 3). Time steps `t_0+1` to `t_0+w` are predicted. 
:param dimensions: @@ -250,7 +251,7 @@ class ExperimentSetup(RunEnvironment): do_uncertainty_estimate: bool = None, model_display_name: str = None, transformation_file: str = None, calculate_fresh_transformation: bool = None, snapshot_load_path: str = None, create_snapshot: bool = None, snapshot_path: str = None, num_realizations: int = None, - ens_moment_dim: str = None, ens_realization_dim: str = None, **kwargs): + ens_moment_dim: str = None, ens_realization_dim: str = None, target_var_unit: str = None, **kwargs): # create run framework super().__init__() @@ -360,6 +361,7 @@ class ExperimentSetup(RunEnvironment): self._set_param("target_var", target_var, default=DEFAULT_TARGET_VAR) self._set_param("target_dim", target_dim, default=DEFAULT_TARGET_DIM) self._set_param("window_lead_time", window_lead_time, default=DEFAULT_WINDOW_LEAD_TIME) + self._set_param("target_var_unit", target_var_unit, default=DEFAULT_TARGET_VAR_UNIT) # interpolation self._set_param("dimensions", dimensions, default=DEFAULT_DIMENSIONS) diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 95120868d009bf111e333ce68c9c1405b0b96587..b2fb4a9f8d0a195bfd5cf9e1dbd4f37e9b8765ef 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -29,7 +29,7 @@ from mlair.model_modules.linear_model import OrdinaryLeastSquaredModel from mlair.model_modules import AbstractModelClass from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotClimatologicalSkillScore, \ PlotCompetitiveSkillScore, PlotTimeSeries, PlotFeatureImportanceSkillScore, PlotConditionalQuantiles, \ - PlotSeparationOfScales, PlotSampleUncertaintyFromBootstrap, PlotTimeEvolutionMetric + PlotSeparationOfScales, PlotSampleUncertaintyFromBootstrap, PlotTimeEvolutionMetric, PlotRankHistogram from mlair.plotting.data_insight_plotting import PlotStationMap, PlotAvailability, PlotAvailabilityHistogram, \ PlotPeriodogram, PlotDataHistogram from 
mlair.run_modules.run_environment import RunEnvironment @@ -88,6 +88,7 @@ class PostProcessing(RunEnvironment): self.forecast_path = self.data_store.get("forecast_path") self.plot_path: str = self.data_store.get("plot_path") self.target_var = self.data_store.get("target_var") + self.target_var_unit = self.data_store.get("target_var_unit") self._sampling = self.data_store.get("sampling") self.num_realizations = self.data_store.get("num_realizations", "postprocessing") self.ens_realization_dim = self.data_store.get("ens_realization_dim", "postprocessing") @@ -99,6 +100,7 @@ class PostProcessing(RunEnvironment): self.uncertainty_estimate = None self.uncertainty_estimate_seasons = {} self.block_mse_per_station = None + self.rank_hist = {} self.competitor_path = self.data_store.get("competitor_path") self.competitors = to_list(self.data_store.get_default("competitors", default=[])) self.forecast_indicator = "nn" @@ -124,7 +126,8 @@ class PostProcessing(RunEnvironment): # calc/report ens scores if self.num_realizations is not None: - self.report_crps("test") + for subset in ["test", "train_val"]: + self.report_crps(subset) # sample uncertainty @@ -194,6 +197,7 @@ class PostProcessing(RunEnvironment): self.store_crps_reports(df_tot, report_path, subset, station=False) self.store_crps_reports(df_stations, report_path, subset, station=True) + @TimeTrackingWrapper def report_crps(self, subset): """ Calculate CRPS for all lead times @@ -208,14 +212,15 @@ class PostProcessing(RunEnvironment): collector_ens = [] collector_obs = [] - - - # crps[f"{i}{get_sampling(self._sampling)}"] = ensverif.crps.crps( # ens.values.reshape(-1, self.num_realizations), obs.values.reshape(-1), distribution="emp") crps_stations = {} + rank_stations = {} idx_counter = 0 - for station in self.test_data.keys(): + generators = {"train": self.train_data, "val": self.val_data, + "test": self.test_data, "train_val": self.train_val_data} + # for subset, generator in generators.items(): + for station in 
generators[subset].keys(): station_based_file_name = os.path.join(self.forecast_path, f"forecasts_{station}_ens_{subset}_values.nc") ds = xr.open_mfdataset(station_based_file_name) ens_station = ds["ens"].sel({self.iter_dim: station, self.ens_moment_dim: "ens_dist_mean", @@ -237,9 +242,10 @@ class PostProcessing(RunEnvironment): collector_obs.append(obs_reindex) idx_counter = new_index[-1] - crps_times = self._calc_crps_for_lead_times(ens_station, obs_station) + crps_times, rank_times = self._calc_crps_for_lead_times(ens_station, obs_station) crps_stations[station] = crps_times + rank_stations[station] = rank_times df_stations = pd.DataFrame(crps_stations) @@ -250,22 +256,44 @@ class PostProcessing(RunEnvironment): try: full_ens = xr.concat(collector_ens, dim=self.index_dim) full_obs = xr.concat(collector_obs, dim=self.index_dim) - df_tot = pd.DataFrame(self._calc_crps_for_lead_times(full_ens, full_obs), index=to_list(subset)) - self.store_crps_reports(df_tot, report_path, subset, station=False) + crps, ranks = self._calc_crps_for_lead_times(full_ens, full_obs) + df_crps_tot = pd.DataFrame(crps, index=to_list(subset)) + + self.store_crps_reports(df_crps_tot, report_path, subset, station=False) + rh = self._get_rank_hist_from_dict(ranks) + self.rank_hist[subset] = rh + self._save_rank_hist(rh, subset) except Exception as e: logging.info(f"Can't calc crps for all stations together due to: {e}") - @TimeTrackingWrapper + def _get_rank_hist_from_dict(self, data): + + d = np.stack([np.stack([np.insert(v[0].astype(np.float32), 0, np.nan), v[1]]) for v in data.values()]) + dxr = xr.DataArray(d, dims=[self.ahead_dim, "rank_hist_type", "idx"], + coords={self.ahead_dim: list(data.keys()), "rank_hist_type": ["freq", "bins"], + "idx": range(d.shape[-1])}) + return dxr + def _save_rank_hist(self, data, subset): + report_path = os.path.join(self.data_store.get("experiment_path"), "data") + data.to_netcdf(os.path.join(report_path, f"ens_rank_hist_{subset}_data.nc")) + # return 
data + + + + + # @TimeTrackingWrapper def _calc_crps_for_lead_times(self, ens, obs): crps_collector = {} + rank_collector = {} for i in range(1, self.window_lead_time + 1): ens_res = ens.sel({self.ahead_dim: i, }) obs_res = obs.sel({self.ahead_dim: i, }) - crps_collector[f"{i}{get_sampling(self._sampling)}"] = ensverif.crps.crps(ens_res, obs_res, - distribution="emp") + collector_key = f"{i}{get_sampling(self._sampling)}" + crps_collector[collector_key] = ensverif.crps.crps(ens_res, obs_res, distribution="emp") + rank_collector[collector_key] = ensverif.rankhist.rankhist(ens_res, obs_res) - return crps_collector + return crps_collector, rank_collector @staticmethod def store_crps_reports(df, report_path, subset, station=False): @@ -820,6 +848,14 @@ class PostProcessing(RunEnvironment): logging.error(f"Could not create plot PlotTimeEvolutionMetric due to the following error: {e}" f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") + try: + if "PlotRankHistogram" in plot_list: + PlotRankHistogram(self.rank_hist, plot_folder=self.plot_path, target_unit=self.target_var_unit, + target_var=self.target_var) + except Exception as e: + logging.error(f"Could not create plot PlotRankHistogram due to the following error: {e}" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") + @TimeTrackingWrapper def calculate_test_score(self): """Evaluate test score of model and save locally."""