From 8ad74c53051a9e27870c58ef97c5cf5e1c4d51f1 Mon Sep 17 00:00:00 2001
From: "v.gramlich1" <v.gramlichfz-juelich.de>
Date: Mon, 2 Aug 2021 16:30:23 +0200
Subject: [PATCH] Made PlotOversamplingContingency plot competitors

---
 mlair/plotting/data_insight_plotting.py   |  11 +-
 mlair/plotting/postprocessing_plotting.py | 123 ++++++++++++++--------
 mlair/run_modules/post_processing.py      |   4 +-
 3 files changed, 82 insertions(+), 56 deletions(-)

diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py
index c4c1f4af..0a3b28ce 100644
--- a/mlair/plotting/data_insight_plotting.py
+++ b/mlair/plotting/data_insight_plotting.py
@@ -23,8 +23,6 @@ from mlair.plotting.abstract_plot_class import AbstractPlotClass
 @TimeTrackingWrapper
 class PlotOversampling(AbstractPlotClass):
 
-    #Todo: Build histograms correctly
-
     def __init__(self, data, bin_edges, oversampling_rates, plot_folder: str = ".",
                  plot_names=["oversampling_histogram", "oversampling_density_histogram", "oversampling_rates",
                             "oversampling_rates_deviation"]):
@@ -59,13 +57,8 @@ class PlotOversampling(AbstractPlotClass):
 
     def _plot_oversampling_histogram(self, Y_hist, Y_extreme_hist, bin_edges):
         fig, ax = plt.subplots(1, 1)
-        ax.hist(bin_edges[:-1], bin_edges, weights=Y_hist, label="Before oversampling")
-        ax.hist(bin_edges[:-1], bin_edges, weights=Y_extreme_hist, label="After oversampling")
-        #ax.plot(bin_edges[:-1] + 0.5 * interval_width, weights, label=f"{subset}", c=colors[subset])
-        #ax.step(bin_edges, np.append(0,Y_hist), label="Before oversampling")
-        #ax.step(bin_edges, np.append(0,Y_extreme_hist), label="After oversampling")
-        #ax.stairs(Y_hist_dens, bin_edges, label="Before oversampling")
-        #ax.stairs(Y_extreme_hist_dens, bin_edges, label="After oversampling")
+        ax.hist(bin_edges[:-1], bin_edges, weights=Y_hist, label="Before oversampling", histtype="step")
+        ax.hist(bin_edges[:-1], bin_edges, weights=Y_extreme_hist, label="After oversampling", histtype="step")
         ax.set_title(f"Density Histogram before-after oversampling")
         ax.legend()
 
diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py
index 29ed4054..a0d54c18 100644
--- a/mlair/plotting/postprocessing_plotting.py
+++ b/mlair/plotting/postprocessing_plotting.py
@@ -30,8 +30,7 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
 @TimeTrackingWrapper
 class PlotOversamplingContingency(AbstractPlotClass):
-    #Todo: 1. Make competitors flexible
-    #       2. Get min and max_label
+    #Todo: Get min and max_label
 
     def __init__(self, station_names, file_path, comp_path, file_name, plot_folder: str = ".", model_name: str = "nn",
                  obs_name: str = "obs", comp_names: str = "IntelliO3",
@@ -43,31 +42,40 @@ class PlotOversamplingContingency(AbstractPlotClass):
         self._file_path = file_path
         self._comp_path = comp_path
         self._file_name = file_name
-        self._model_name = model_name
         self._obs_name = obs_name
+        self._model_name = model_name
         self._comp_names = to_list(comp_names)
-        true_above, false_above, false_below, true_below, borders = self._calculate_contingencies()
-        ts, h, f = self._calculate_scores(true_above, false_above, false_below, true_below)
-        min_label = borders[0]
-        max_label = borders[1]
-        plt.plot(range(min_label, max_label), ts, label="threat score")
-        plt.legend()
+        self._all_names = [self._model_name]
+        self._all_names.extend(self._comp_names)
+        self._plot_names = plot_names
+        contingency_array, borders = self._calculate_contingencies()
+        self._scores = ["ts", "h", "f"]
+        score_array = self._calculate_all_scores(contingency_array)
+        self._min_label = borders[0]
+        self._max_label = borders[1]
+        self._plot_counter = 0
+
+        self._plot(score_array, "ts")
         self._save()
-        self.plot_name = plot_names[1]
-        plt.plot(range(min_label, max_label), h, label="hit rate")
-        plt.legend()
+        self._plot(score_array, "h")
         self._save()
-        self.plot_name = plot_names[2]
-        plt.plot(range(min_label, max_label), f, label="false alarm rate")
-        plt.legend()
+        self._plot(score_array, "f")
         self._save()
-        self.plot_name = plot_names[3]
-        plt.plot(range(min_label, max_label), ts, label="threat score")
-        plt.plot(range(min_label, max_label), h, label="hit rate")
-        plt.plot(range(min_label, max_label), f, label="false alarm rate")
-        plt.legend()
+        self._plot(score_array, "all_scores")
         self._save()
 
+    def _plot(self, data, score):
+        if score == "all_scores":
+            for score_name in data.scores.values.tolist():
+                plt.plot(range(self._min_label, self._max_label), data.loc[dict(type="nn", scores=score_name)], label=score_name)
+        else:
+            for type in data.type.values.tolist():
+                plt.plot(range(self._min_label, self._max_label), data.loc[dict(type=type, scores=score)], label=type)
+        plt.legend()
+        self.plot_name = self._plot_names[self._plot_counter]
+        self._plot_counter = self._plot_counter + 1
+
+
     def _create_competitor_forecast(self, station_name: str, competitor_name: str) -> xr.DataArray:
         """
         Load and format the competing forecast of a distinct model indicated by `competitor_name` for a distinct station
@@ -109,27 +117,29 @@ class PlotOversamplingContingency(AbstractPlotClass):
         return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None
 
     def _calculate_contingencies(self):
+        min_label = 0
+        max_label = 100
+        borders = [min_label, max_label]
+        thresholds = np.arange(min_label, max_label)
+        contingency_cell = ["ta", "fa", "fb", "tb"]
+        contingency_array = xr.DataArray(dims=["thresholds", "contingency_cell", "type"],
+                                         coords=[thresholds, contingency_cell, self._all_names])
+        contingency_array = contingency_array.fillna(0)
         for station in self._stations:
             file = os.path.join(self._file_path, self._file_name % station)
             forecast_file = xr.open_dataarray(file)
             obs = forecast_file.sel(type=self._obs_name)
-            model = forecast_file.sel(type=self._model_name)
+            predictions = [forecast_file.sel(type=self._model_name)]
             competitors = [self._load_competitors(station, [comp]).sel(type=comp) for comp in self._comp_names]
-            min_label = 0
-            max_label = 100
-            borders = [min_label, max_label]
-            true_above = []
-            false_above = []
-            false_below = []
-            true_below = []
+            predictions.extend(competitors)
             for threshold in range(min_label, max_label):
-                ta, fa, fb, tb = self._single_contingency(obs, model, threshold)
-                true_above.append(ta)
-                false_above.append(fa)
-                false_below.append(fb)
-                true_below.append(tb)
-            return np.array(true_above), np.array(false_above), np.array(false_below), np.array(true_below), borders
-
+                for pred in predictions:
+                    ta, fa, fb, tb = self._single_contingency(obs, pred, threshold)
+                    contingency_array.loc[dict(thresholds=threshold, contingency_cell="ta", type=pred.type.values)] = ta
+                    contingency_array.loc[dict(thresholds=threshold, contingency_cell="fa", type=pred.type.values)] = fa
+                    contingency_array.loc[dict(thresholds=threshold, contingency_cell="fb", type=pred.type.values)] = fb
+                    contingency_array.loc[dict(thresholds=threshold, contingency_cell="tb", type=pred.type.values)] = tb
+        return contingency_array, borders
 
     def _single_contingency(self, obs, pred, threshold):
         ta = 0
@@ -151,16 +161,39 @@ class PlotOversamplingContingency(AbstractPlotClass):
                     tb += 1
         return ta, fa, fb, tb
 
-    def _calculate_scores(self, true_above, false_above, false_below, true_below):
-        np.seterr(divide="ignore")
-        np.seterr(divide="ignore")
-        ts = true_above/(true_above + false_above + false_below)
-        h = true_above/(true_above + false_below)
-        f = false_above/(false_above + true_below)
-        np.nan_to_num(ts, copy=False)
-        np.nan_to_num(h, copy=False)
-        np.nan_to_num(f, copy=False)
-        return ts, h, f
+    def _calculate_all_scores(self, contingency_array):
+        score_array = xr.DataArray(dims=["scores", "thresholds", "type"],
+                                   coords=[self._scores, contingency_array.thresholds.values,
+                                           contingency_array.type.values])
+        for type in score_array.type.values.tolist():
+            for threshold in score_array.thresholds.values.tolist():
+                for score in score_array.scores.values.tolist():
+                    score_value = self._calculate_scores(contingency_array.loc[dict(type=type,
+                                                                                    thresholds=threshold)].values, score)
+                    score_array.loc[dict(type=type, thresholds=threshold, scores=score)] = score_value
+        return score_array
+
+    def _calculate_scores(self, contingency, score):
+        true_above = contingency[0]
+        false_above = contingency[1]
+        false_below = contingency[2]
+        true_below = contingency[3]
+        if score == "ts":
+            if (true_above + false_above + false_below) == 0:
+                score_value = 1
+            else:
+                score_value = true_above/(true_above + false_above + false_below)
+        elif score == "h":
+            if (true_above + false_below) == 0:
+                score_value = 1
+            else:
+                score_value = true_above/(true_above + false_below)
+        elif score == "f":
+            if (false_above + true_below) == 0:
+                score_value = 1
+            else:
+                score_value = false_above/(false_above + true_below)
+        return score_value
 
 
 
diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py
index cef2c651..80d40a66 100644
--- a/mlair/run_modules/post_processing.py
+++ b/mlair/run_modules/post_processing.py
@@ -333,8 +333,8 @@ class PostProcessing(RunEnvironment):
         try:
             if (self.data_store.get('oversampling_method')=='bin_oversampling') and (
                     "PlotOversamplingContingency" in plot_list):
-                PlotOversamplingContingency(station_names=self.test_data.keys(), file_path=path, comp_path=self.competitor_path,
-                                            comp_names=self.competitors,
+                PlotOversamplingContingency(station_names=self.test_data.keys(), file_path=path,
+                                            comp_path=self.competitor_path, comp_names=self.competitors,
                                             file_name=r"forecasts_%s_test.nc", plot_folder=self.plot_path)
         except Exception as e:
             logging.error(f"Could not create plot OversamplingContingencyPlots due to the following error: {e}")
-- 
GitLab