From 1043377b7fcd3a23682503dcbe6637b7f0f69fae Mon Sep 17 00:00:00 2001
From: Felix Kleinert <f.kleinert@fz-juelich.de>
Date: Thu, 15 Oct 2020 23:36:05 +0200
Subject: [PATCH] Include num_samples vs. num_stations plot #192 by using
 extraction method from #191

---
 src/plotting/postprocessing_plotting.py | 61 ++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 12 deletions(-)

diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py
index a62b44bb..3a70e14f 100644
--- a/src/plotting/postprocessing_plotting.py
+++ b/src/plotting/postprocessing_plotting.py
@@ -865,17 +865,20 @@ class PlotAvailabilityHistogram(AbstractPlotClass):
 
     """
 
-    def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily",
-                 summary_name="data availability"):
+    def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily"):
 
         super().__init__(plot_folder, "data_availability_histogram")
         self.freq = self._get_sampling(sampling)
         self._prepare_data(generators)
-        self._plot()
+        self._plot(plt_type='hist')
+        self._save()
+        self.plot_name += '_cumulative'
+        self._plot(plt_type='hist_cum')
         self._save()
 
     def _prepare_data(self, generators: Dict[str, DataGenerator]):
-        avail_dict = {}
+        avail_data_time_sum = {}
+        avail_data_station_sum = {}
         dataset_time_interval={}
         for subset, generator in generators.items():
             avail_list = []
@@ -884,13 +887,19 @@ class PlotAvailabilityHistogram(AbstractPlotClass):
                 station_data_X = station_data_X.loc[{'window': 0,  # select recent window frame
                                                      generator.target_dim: generator.variables[0]}]
                 avail_list.append(station_data_X.notnull())
-            avail_dict[subset] = xr.concat(avail_list, dim='Stations').notnull().sum(dim='Stations')
+            avail_data = xr.concat(avail_list, dim='Stations').notnull()
+            avail_data_time_sum[subset] = avail_data.sum(dim='Stations')
+            avail_data_station_sum[subset] = avail_data.sum(dim='datetime')
             dataset_time_interval[subset] = self._get_first_and_last_indexelement_from_xarray(
-                avail_dict[subset], dim_name='datetime', return_type='as_dict'
+                avail_data_time_sum[subset], dim_name='datetime', return_type='as_dict'
             )
-        avail_data_amount = xr.concat(avail_dict.values(), pd.Index(avail_dict.keys(), name='DataSet'))
+        avail_data_amount = xr.concat(avail_data_time_sum.values(), pd.Index(avail_data_time_sum.keys(),
+                                                                             name='DataSet')
+                                      )
         full_time_index = self._make_full_time_index(avail_data_amount.coords['datetime'].values, freq=self.freq)
-        self.avail_data_amount = avail_data_amount.reindex({'datetime': full_time_index}, fill_value=0.)
+        self.avail_data_cum_sum = xr.concat(avail_data_station_sum.values(), pd.Index(avail_data_station_sum.keys(),
+                                                                                      name='DataSet'))
+        self.avail_data_amount = avail_data_amount.reindex({'datetime': full_time_index})
         self.dataset_time_interval = dataset_time_interval
 
     @staticmethod
@@ -912,10 +921,18 @@ class PlotAvailabilityHistogram(AbstractPlotClass):
         full_time_index = pd.date_range(start=irregular_time_index[0], end=irregular_time_index[-1], freq=freq)
         return full_time_index
 
-    def _plot(self, *args):
+    def _plot(self, plt_type='hist', *args):  # dispatch on plt_type; *args is unused here
+        if plt_type == 'hist':
+            self._plot_hist()
+        elif plt_type == 'hist_cum':
+            self._plot_hist_cum()
+        else:  # report the offending argument value (not the builtin `type`)
+            raise ValueError(f"plt_type must be 'hist' or 'hist_cum', but is {plt_type!r}")
+
+    def _plot_hist(self, *args):
         # for dataset in
         colors = self.get_dataset_colors()
-        fig, axes = plt.subplots(figsize=(10,3))
+        fig, axes = plt.subplots(figsize=(10, 3))
         for i, subset in enumerate(self.dataset_time_interval.keys()):
             plot_dataset = self.avail_data_amount.sel({'DataSet': subset,
                                                        'datetime': slice(self.dataset_time_interval[subset]['first'],
@@ -935,9 +952,29 @@ class PlotAvailabilityHistogram(AbstractPlotClass):
         plt.ylabel('Number of samples')
         plt.tight_layout()
 
+    def _plot_hist_cum(self, *args):
+        colors = self.get_dataset_colors()
+        fig, axes = plt.subplots(figsize=(10, 3))
+        n_bins = int(self.avail_data_cum_sum.max().values)
+        bins = np.arange(0, n_bins+1)
+        descending_subsets = self.avail_data_cum_sum.max(dim='Stations').sortby(
+            self.avail_data_cum_sum.max(dim='Stations'), ascending=False
+        ).coords['DataSet'].values
+
+        for subset in descending_subsets:
+            self.avail_data_cum_sum.sel({'DataSet': subset}).plot.hist(ax=axes,
+                                                                       bins=bins,
+                                                                       label=subset,
+                                                                       cumulative=-1,
+                                                                       color=colors[subset],
+                                                                       alpha=.6)
 
-
-
+        lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval))
+        plt.title('')
+        plt.ylabel('Number of stations')
+        plt.xlabel('Number of samples')
+        plt.xlim((bins[0], bins[-1]))
+        plt.tight_layout()
 
 if __name__ == "__main__":
     stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087']
-- 
GitLab