From 1043377b7fcd3a23682503dcbe6637b7f0f69fae Mon Sep 17 00:00:00 2001 From: Felix Kleinert <f.kleinert@fz-juelich.de> Date: Thu, 15 Oct 2020 23:36:05 +0200 Subject: [PATCH] Include num_samples vs. num_stations plot #192 by using extraction method from #191 --- src/plotting/postprocessing_plotting.py | 61 ++++++++++++++++++++----- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py index a62b44bb..3a70e14f 100644 --- a/src/plotting/postprocessing_plotting.py +++ b/src/plotting/postprocessing_plotting.py @@ -865,17 +865,20 @@ class PlotAvailabilityHistogram(AbstractPlotClass): """ - def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily", - summary_name="data availability"): + def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily"): super().__init__(plot_folder, "data_availability_histogram") self.freq = self._get_sampling(sampling) self._prepare_data(generators) - self._plot() + self._plot(plt_type='hist') + self._save() + self.plot_name += '_cumulative' + self._plot(plt_type='hist_cum') self._save() def _prepare_data(self, generators: Dict[str, DataGenerator]): - avail_dict = {} + avail_data_time_sum = {} + avail_data_station_sum = {} dataset_time_interval={} for subset, generator in generators.items(): avail_list = [] @@ -884,13 +887,19 @@ class PlotAvailabilityHistogram(AbstractPlotClass): station_data_X = station_data_X.loc[{'window': 0, # select recent window frame generator.target_dim: generator.variables[0]}] avail_list.append(station_data_X.notnull()) - avail_dict[subset] = xr.concat(avail_list, dim='Stations').notnull().sum(dim='Stations') + avail_data = xr.concat(avail_list, dim='Stations').notnull() + avail_data_time_sum[subset] = avail_data.sum(dim='Stations') + avail_data_station_sum[subset] = avail_data.sum(dim='datetime') dataset_time_interval[subset] = 
self._get_first_and_last_indexelement_from_xarray( - avail_dict[subset], dim_name='datetime', return_type='as_dict' + avail_data_time_sum[subset], dim_name='datetime', return_type='as_dict' ) - avail_data_amount = xr.concat(avail_dict.values(), pd.Index(avail_dict.keys(), name='DataSet')) + avail_data_amount = xr.concat(avail_data_time_sum.values(), pd.Index(avail_data_time_sum.keys(), + name='DataSet') + ) full_time_index = self._make_full_time_index(avail_data_amount.coords['datetime'].values, freq=self.freq) - self.avail_data_amount = avail_data_amount.reindex({'datetime': full_time_index}, fill_value=0.) + self.avail_data_cum_sum = xr.concat(avail_data_station_sum.values(), pd.Index(avail_data_station_sum.keys(), + name='DataSet')) + self.avail_data_amount = avail_data_amount.reindex({'datetime': full_time_index}) self.dataset_time_interval = dataset_time_interval @staticmethod @@ -912,10 +921,18 @@ class PlotAvailabilityHistogram(AbstractPlotClass): full_time_index = pd.date_range(start=irregular_time_index[0], end=irregular_time_index[-1], freq=freq) return full_time_index - def _plot(self, *args): + def _plot(self, plt_type='hist', *args): + if plt_type == 'hist': + self._plot_hist() + elif plt_type == 'hist_cum': + self._plot_hist_cum() + else: + raise ValueError(f"plt_type must be 'hist' or 'hist_cum', but is {plt_type}") + + def _plot_hist(self, *args): # for dataset in colors = self.get_dataset_colors() - fig, axes = plt.subplots(figsize=(10,3)) + fig, axes = plt.subplots(figsize=(10, 3)) for i, subset in enumerate(self.dataset_time_interval.keys()): plot_dataset = self.avail_data_amount.sel({'DataSet': subset, 'datetime': slice(self.dataset_time_interval[subset]['first'], @@ -935,9 +952,29 @@ class PlotAvailabilityHistogram(AbstractPlotClass): plt.ylabel('Number of samples') plt.tight_layout() + def _plot_hist_cum(self, *args): + colors = self.get_dataset_colors() + fig, axes = plt.subplots(figsize=(10, 3)) + n_bins = 
int(self.avail_data_cum_sum.max().values) + bins = np.arange(0, n_bins+1) + descending_subsets = self.avail_data_cum_sum.max(dim='Stations').sortby( + self.avail_data_cum_sum.max(dim='Stations'), ascending=False + ).coords['DataSet'].values + + for subset in descending_subsets: + self.avail_data_cum_sum.sel({'DataSet': subset}).plot.hist(ax=axes, + bins=bins, + label=subset, + cumulative=-1, + color=colors[subset], + alpha=.6) - - + lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval)) + plt.title('') + plt.ylabel('Number of stations') + plt.xlabel('Number of samples') + plt.xlim((bins[0], bins[-1])) + plt.tight_layout() if __name__ == "__main__": stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] -- GitLab