From d0c8f3ce284d999031b51d3989c30c5a07be3d8a Mon Sep 17 00:00:00 2001
From: Felix Kleinert <f.kleinert@fz-juelich.de>
Date: Wed, 21 Oct 2020 09:56:16 +0200
Subject: [PATCH] update data avail plots and docu

---
 src/plotting/postprocessing_plotting.py | 98 ++++++++++++++++---------
 1 file changed, 63 insertions(+), 35 deletions(-)

diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py
index 3a70e14f..a4e7173b 100644
--- a/src/plotting/postprocessing_plotting.py
+++ b/src/plotting/postprocessing_plotting.py
@@ -861,45 +861,70 @@ class PlotAvailability(AbstractPlotClass):
 @TimeTrackingWrapper
 class PlotAvailabilityHistogram(AbstractPlotClass):
     """
+    Create data availability plots as histogram.
 
+    Each entry of each generator is checked for `notnull()` values along the entire datetime axis (boolean).
+    Calling this class creates two different types of histograms, with each generator (subset) drawn in its own color:
+
+    1) data_availability_histogram_hist: datetime (xaxis) vs. number of stations with available data (yaxis)
+    2) data_availability_histogram_hist_cum: number of samples (xaxis) vs. number of stations having at least that
+       number of samples (yaxis)
 
     """
 
-    def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily"):
+    def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily",
+                 subset_dim: str = 'DataSet', temporal_dim: str = 'datetime', history_dim: str = 'window',
+                 station_dim: str = 'Stations'):
 
         super().__init__(plot_folder, "data_availability_histogram")
         self.freq = self._get_sampling(sampling)
+        self.subset_dim = subset_dim
+        self.temporal_dim = temporal_dim
+        self.history_dim = history_dim
+        self.station_dim = station_dim
         self._prepare_data(generators)
-        self._plot(plt_type='hist')
-        self._save()
-        self.plot_name += '_cumulative'
-        self._plot(plt_type='hist_cum')
-        self._save()
+
+        for plt_type in self.allowed_plot_types:
+            plot_name_tmp = self.plot_name
+            self.plot_name += '_' + plt_type
+            self._plot(plt_type=plt_type)
+            self._save()
+            self.plot_name = plot_name_tmp
+
+    @property
+    def allowed_plot_types(self):
+        plot_types = ['hist', 'hist_cum']
+        return plot_types
 
     def _prepare_data(self, generators: Dict[str, DataGenerator]):
+        """
+        Prepares data to be used by plot methods.
+
+        Creates xarrays which are sums of valid data (boolean sums) across i) station_dim and ii) temporal_dim
+        """
         avail_data_time_sum = {}
         avail_data_station_sum = {}
-        dataset_time_interval={}
+        dataset_time_interval = {}
         for subset, generator in generators.items():
             avail_list = []
             for station in generator.stations:
-                station_data_X, _ = generator[station]
-                station_data_X = station_data_X.loc[{'window': 0,  # select recent window frame
+                station_data_x, _ = generator[station]
+                station_data_x = station_data_x.loc[{self.history_dim: 0,  # select recent window frame
                                                      generator.target_dim: generator.variables[0]}]
-                avail_list.append(station_data_X.notnull())
-            avail_data = xr.concat(avail_list, dim='Stations').notnull()
-            avail_data_time_sum[subset] = avail_data.sum(dim='Stations')
-            avail_data_station_sum[subset] = avail_data.sum(dim='datetime')
+                avail_list.append(station_data_x.notnull())
+            avail_data = xr.concat(avail_list, dim=self.station_dim).notnull()
+            avail_data_time_sum[subset] = avail_data.sum(dim=self.station_dim)
+            avail_data_station_sum[subset] = avail_data.sum(dim=self.temporal_dim)
             dataset_time_interval[subset] = self._get_first_and_last_indexelement_from_xarray(
-                avail_data_time_sum[subset], dim_name='datetime', return_type='as_dict'
+                avail_data_time_sum[subset], dim_name=self.temporal_dim, return_type='as_dict'
             )
         avail_data_amount = xr.concat(avail_data_time_sum.values(), pd.Index(avail_data_time_sum.keys(),
-                                                                             name='DataSet')
+                                                                             name=self.subset_dim)
                                       )
-        full_time_index = self._make_full_time_index(avail_data_amount.coords['datetime'].values, freq=self.freq)
+        full_time_index = self._make_full_time_index(avail_data_amount.coords[self.temporal_dim].values, freq=self.freq)
         self.avail_data_cum_sum = xr.concat(avail_data_station_sum.values(), pd.Index(avail_data_station_sum.keys(),
-                                                                                      name='DataSet'))
-        self.avail_data_amount = avail_data_amount.reindex({'datetime': full_time_index})
+                                                                                      name=self.subset_dim))
+        self.avail_data_amount = avail_data_amount.reindex({self.temporal_dim: full_time_index})
         self.dataset_time_interval = dataset_time_interval
 
     @staticmethod
@@ -930,19 +955,19 @@ class PlotAvailabilityHistogram(AbstractPlotClass):
             raise ValueError(f"plt_type mus be 'hist' or 'hist_cum', but is {type}")
 
     def _plot_hist(self, *args):
-        # for dataset in
         colors = self.get_dataset_colors()
         fig, axes = plt.subplots(figsize=(10, 3))
         for i, subset in enumerate(self.dataset_time_interval.keys()):
-            plot_dataset = self.avail_data_amount.sel({'DataSet': subset,
-                                                       'datetime': slice(self.dataset_time_interval[subset]['first'],
-                                                                         self.dataset_time_interval[subset]['last']
-                                                                         )
+            plot_dataset = self.avail_data_amount.sel({self.subset_dim: subset,
+                                                       self.temporal_dim: slice(
+                                                           self.dataset_time_interval[subset]['first'],
+                                                           self.dataset_time_interval[subset]['last']
+                                                       )
                                                        }
                                                       )
 
             plot_dataset.plot.step(color=colors[subset], ax=axes, label=subset)
-            plt.fill_between(plot_dataset.coords['datetime'].values, plot_dataset.values, color=colors[subset])
+            plt.fill_between(plot_dataset.coords[self.temporal_dim].values, plot_dataset.values, color=colors[subset])
 
         lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval))
         for lgd_line in lgd.get_lines():
@@ -957,25 +982,28 @@ class PlotAvailabilityHistogram(AbstractPlotClass):
         fig, axes = plt.subplots(figsize=(10, 3))
         n_bins = int(self.avail_data_cum_sum.max().values)
         bins = np.arange(0, n_bins+1)
-        descending_subsets = self.avail_data_cum_sum.max(dim='Stations').sortby(
-            self.avail_data_cum_sum.max(dim='Stations'), ascending=False
-        ).coords['DataSet'].values
+        descending_subsets = self.avail_data_cum_sum.max(dim=self.station_dim).sortby(
+            self.avail_data_cum_sum.max(dim=self.station_dim), ascending=False
+        ).coords[self.subset_dim].values
 
         for subset in descending_subsets:
-            self.avail_data_cum_sum.sel({'DataSet': subset}).plot.hist(ax=axes,
-                                                                       bins=bins,
-                                                                       label=subset,
-                                                                       cumulative=-1,
-                                                                       color=colors[subset],
-                                                                       alpha=.6)
-
-        lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval))
+            self.avail_data_cum_sum.sel({self.subset_dim: subset}).plot.hist(ax=axes,
+                                                                             bins=bins,
+                                                                             label=subset,
+                                                                             cumulative=-1,
+                                                                             color=colors[subset],
+                                                                             # alpha=.5
+                                                                             )
+
+        lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval),
+                         facecolor='white', framealpha=1, edgecolor='black')
         plt.title('')
         plt.ylabel('Number of stations')
         plt.xlabel('Number of samples')
         plt.xlim((bins[0], bins[-1]))
         plt.tight_layout()
 
+
 if __name__ == "__main__":
     stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087']
     path = "../../testrun_network/forecasts"
-- 
GitLab