Commit 8d343f1b authored by Felix Kleinert's avatar Felix Kleinert

Merge branch 'issue333_feat_test-set-sample-uncertainty' of gitlab.jsc.fz-juelich.de:esde/machine-learning/mlair into issue333_feat_test-set-sample-uncertainty
parents 441714a6 4fa3bf86
7 merge requests: !353 add developments to release v1.5.0, !352 Resolve "release v1.5.0", !351 Lukas issue337 bug ci pipeline fails for docs, !350 Resolve "upgrade code to TensorFlow V2", !342 Include sample-uncertainty to wrf workflow, !337 Resolve "Test Set Sample Uncertainty in PostProcessing", !259 Draft: Resolve "WRF-Datahandler should inherit from SingleStationDatahandler"
Pipeline #80974 passed
@@ -135,8 +135,11 @@ class PostProcessing(RunEnvironment):
self.plot()
def estimate_sample_uncertainty(self, separate_ahead=False):
#todo: visualize
#todo: write results on disk
"""
Estimate sample uncertainty by using a bootstrap approach. Forecasts are split into individual blocks along
the time axis and randomly drawn with replacement. The resulting distribution of the error indicates the
robustness of each analyzed model and can be used to assess which model might be superior to others.
"""
n_boots = self.data_store.get_default("n_boots", default=1000, scope="uncertainty_estimate")
block_length = self.data_store.get_default("block_length", default="1m")
evaluate_competitors = self.data_store.get_default("evaluate_competitors", default=True)
@@ -145,8 +148,37 @@ class PostProcessing(RunEnvironment):
self.uncertainty_estimate = statistics.create_n_bootstrap_realizations(
block_mse, dim_name_time=self.index_dim, dim_name_model=self.model_type_dim,
dim_name_boots=self.uncertainty_estimate_boot_dim, n_boots=n_boots)
self.report_sample_uncertainty()
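# Illustrative sketch (not part of this commit) of the block-bootstrap idea used above, assuming the
# per-block errors are available as a plain numpy array; the actual workflow delegates this step to
# statistics.create_n_bootstrap_realizations, and the names below are assumptions for illustration only.
import numpy as np

def bootstrap_error_sketch(block_mse: np.ndarray, n_boots: int = 1000, seed: int = 0) -> np.ndarray:
    """Return n_boots bootstrap realizations of the mean block error."""
    rng = np.random.default_rng(seed)
    n_blocks = block_mse.shape[0]
    realizations = np.empty(n_boots)
    for i in range(n_boots):
        # draw n_blocks blocks with replacement and average their errors
        idx = rng.integers(0, n_blocks, size=n_blocks)
        realizations[i] = block_mse[idx].mean()
    return realizations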
def report_sample_uncertainty(self, percentiles: list = None):
"""
Store raw results of the uncertainty estimate, calculate aggregate statistics, and store them as raw data as
well as markdown and latex.
"""
report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report")
path_config.check_path_and_create(report_path)
# store raw results as nc
file_name = os.path.join(report_path, "uncertainty_estimate_raw_results.nc")
self.uncertainty_estimate.to_netcdf(path=file_name)
# store statistics
if percentiles is None:
percentiles = [.05, .1, .25, .5, .75, .9, .95]
df_descr = self.uncertainty_estimate.to_pandas().describe(percentiles=percentiles).astype("float32")
column_format = tables.create_column_format_for_tex(df_descr)
file_name = os.path.join(report_path, "uncertainty_estimate_statistics.%s")
tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df_descr)
tables.save_to_md(report_path, file_name % "md", df=df_descr)
df_descr.to_csv(file_name % "csv", sep=";")
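# Sketch (not part of this commit) of how the aggregate statistics above are derived for a plain
# DataFrame of bootstrap realizations; the real method applies pandas describe to
# self.uncertainty_estimate and writes the output via the tables helpers (save_to_tex / save_to_md).
import numpy as np
import pandas as pd

boots = pd.DataFrame({"model": np.random.rand(1000), "competitor": np.random.rand(1000)})
percentiles = [.05, .1, .25, .5, .75, .9, .95]
df_descr = boots.describe(percentiles=percentiles).astype("float32")
df_descr.to_csv("uncertainty_estimate_statistics.csv", sep=";")  # raw csv, analogous to the code above
print(df_descr.to_latex())  # latex view, analogous to tables.save_to_tex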
def calculate_block_mse(self, evaluate_competitors=True, separate_ahead=False, block_length="1m"):
"""
Transform data into blocks along the time axis. Block length can be any frequency like '1m' or '7d'. Data are
only split along the time axis, which means that a single block can be very diverse regarding the number of
stations or the amount of actual data contained. This is intended to analyze the robustness not only over time
but also against the number of observations and the diversity of stations.
"""
path = self.data_store.get("forecast_path")
all_stations = self.data_store.get("stations")
start = self.data_store.get("start", "test")
@@ -155,13 +187,11 @@ class PostProcessing(RunEnvironment):
coll_dim = "station"
collector = []
for station in all_stations:
# test data
external_data = self._get_external_data(station, path)
if external_data is not None:
pass
# load competitors
if evaluate_competitors is True:
competitor = self.load_competitors(station)
combined = self._combine_forecasts(external_data, competitor, dim=self.model_type_dim)
@@ -185,9 +215,7 @@ class PostProcessing(RunEnvironment):
return mse_blocks
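# Sketch (not part of this commit) of the block-wise error aggregation described in the docstring:
# squared errors are grouped into time blocks of a given frequency and averaged per block. The
# dimension name "index" and the toy data are assumptions for illustration only.
import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range("2010-01-01", periods=365, freq="D")
squared_errors = xr.DataArray(np.random.rand(365), coords={"index": time}, dims="index")
block_mse_sketch = squared_errors.resample(index="7D").mean()  # one mean error value per 7-day block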
def create_error_array(self, data):
"""
Calculate squared error of all given time series in relation to observation.
"""
"""Calculate squared error of all given time series in relation to observation."""
errors = data.drop_sel({self.model_type_dim: self.observation_indicator})
errors1 = errors - data.sel({self.model_type_dim: self.observation_indicator})
errors2 = errors1 ** 2
......
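# Minimal usage sketch (not part of this commit) of the squared-error computation in create_error_array
# on toy data; the label names "type" and "obs" are assumptions standing in for self.model_type_dim and
# self.observation_indicator.
import numpy as np
import xarray as xr

data = xr.DataArray(np.random.rand(3, 10),
                    coords={"type": ["obs", "model", "competitor"]},
                    dims=("type", "index"))
errors = data.drop_sel({"type": "obs"})             # keep only the forecast-like entries
errors = (errors - data.sel({"type": "obs"})) ** 2  # squared error of each forecast vs. observation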