diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py
index b7744941d43476d7da7a1e4e1fa91c032307c4ff..22593d8b10333875384106a886740170d0ce913a 100644
--- a/mlair/run_modules/post_processing.py
+++ b/mlair/run_modules/post_processing.py
@@ -135,8 +135,11 @@ class PostProcessing(RunEnvironment):
         self.plot()
 
     def estimate_sample_uncertainty(self, separate_ahead=False):
-        #todo: visualize
-        #todo: write results on disk
+        """
+        Estimate sample uncertainty using a bootstrap approach. Forecasts are split into individual blocks along the
+        time axis and randomly drawn with replacement. The distribution of the resulting errors indicates the
+        robustness of each analyzed model and helps to quantify which model might be superior to the others.
+        """
         n_boots = self.data_store.get_default("n_boots", default=1000, scope="uncertainty_estimate")
         block_length = self.data_store.get_default("block_length", default="1m")
         evaluate_competitors = self.data_store.get_default("evaluate_competitors", default=True)
@@ -145,8 +148,37 @@ class PostProcessing(RunEnvironment):
         self.uncertainty_estimate = statistics.create_n_bootstrap_realizations(
             block_mse, dim_name_time=self.index_dim, dim_name_model=self.model_type_dim,
             dim_name_boots=self.uncertainty_estimate_boot_dim, n_boots=n_boots)
+        self.report_sample_uncertainty()
+
+    def report_sample_uncertainty(self, percentiles: list = None):
+        """
+        Store the raw results of the uncertainty estimate and calculate aggregate statistics, which are stored as
+        raw data (csv) as well as markdown and latex tables.
+        """
+        report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report")
+        path_config.check_path_and_create(report_path)
+
+        # store raw results as netcdf
+        file_name = os.path.join(report_path, "uncertainty_estimate_raw_results.nc")
+        self.uncertainty_estimate.to_netcdf(path=file_name)
+
+        # store statistics
+        if percentiles is None:
+            percentiles = [.05, .1, .25, .5, .75, .9, .95]
+        df_descr = self.uncertainty_estimate.to_pandas().describe(percentiles=percentiles).astype("float32")
+        column_format = tables.create_column_format_for_tex(df_descr)
+        file_name = os.path.join(report_path, "uncertainty_estimate_statistics.%s")
+        tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df_descr)
+        tables.save_to_md(report_path, file_name % "md", df=df_descr)
+        df_descr.to_csv(file_name % "csv", sep=";")
 
     def calculate_block_mse(self, evaluate_competitors=True, separate_ahead=False, block_length="1m"):
+        """
+        Transform data into blocks along the time axis. The block length can be any frequency like '1m' or '7d'.
+        Data are only split along the time axis, which means that a single block can vary strongly in the number of
+        stations and the amount of data it contains. This is intended to analyze robustness not only over time but
+        also with respect to the number of observations and the diversity of stations.
+        """
         path = self.data_store.get("forecast_path")
         all_stations = self.data_store.get("stations")
         start = self.data_store.get("start", "test")
@@ -155,13 +187,11 @@ class PostProcessing(RunEnvironment):
         coll_dim = "station"
         collector = []
         for station in all_stations:
-            external_data = self._get_external_data(station, path)  # test data
-
-            # test errors
+            # test data
+            external_data = self._get_external_data(station, path)
             if external_data is not None:
                 pass
-
-            # load competitors
+            # competitors
             if evaluate_competitors is True:
                 competitor = self.load_competitors(station)
                 combined = self._combine_forecasts(external_data, competitor, dim=self.model_type_dim)
@@ -185,9 +215,7 @@ class PostProcessing(RunEnvironment):
         return mse_blocks
 
     def create_error_array(self, data):
-        """
-        Calculate squared error of all given time series in relation to observation.
-        """
+        """Calculate squared error of all given time series in relation to observation."""
         errors = data.drop_sel({self.model_type_dim: self.observation_indicator})
         errors1 = errors - data.sel({self.model_type_dim: self.observation_indicator})
         errors2 = errors1 ** 2