diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py index 3a70e14f69de4929c55e688f3cc326525c546e36..a4e7173bebc7d25bc276251b66cd8beae2d6d2bd 100644 --- a/src/plotting/postprocessing_plotting.py +++ b/src/plotting/postprocessing_plotting.py @@ -861,45 +861,70 @@ class PlotAvailability(AbstractPlotClass): @TimeTrackingWrapper class PlotAvailabilityHistogram(AbstractPlotClass): """ + Create data availability plots as histogram. + Each entry of each generator is checked for `notnull()` values along the entire datetime axis (boolean). + Calling this class creates two different types of histograms, one labelled series per generator: + + 1) data_availability_histogram_hist: datetime (xaxis) vs. number of stations with available data (yaxis) + 2) data_availability_histogram_hist_cum: number of samples (xaxis) vs. number of stations having at least that + number of samples (yaxis) """ - def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily"): + def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily", + subset_dim: str = 'DataSet', temporal_dim: str = 'datetime', history_dim: str = 'window', + station_dim: str = 'Stations'): super().__init__(plot_folder, "data_availability_histogram") self.freq = self._get_sampling(sampling) + self.subset_dim = subset_dim + self.temporal_dim = temporal_dim + self.history_dim = history_dim + self.station_dim = station_dim self._prepare_data(generators) - self._plot(plt_type='hist') - self._save() - self.plot_name += '_cumulative' - self._plot(plt_type='hist_cum') - self._save() + + for plt_type in self.allowed_plot_types: + plot_name_tmp = self.plot_name + self.plot_name += '_' + plt_type + self._plot(plt_type=plt_type) + self._save() + self.plot_name = plot_name_tmp + + @property + def allowed_plot_types(self): + plot_types = ['hist', 'hist_cum'] + return plot_types def _prepare_data(self, generators: Dict[str, DataGenerator]): + """ + 
Prepares data to be used by plot methods. + + Creates xarrays which are sums of valid data (boolean sums) across i) station_dim and ii) temporal_dim + """ avail_data_time_sum = {} avail_data_station_sum = {} - dataset_time_interval={} + dataset_time_interval = {} for subset, generator in generators.items(): avail_list = [] for station in generator.stations: - station_data_X, _ = generator[station] - station_data_X = station_data_X.loc[{'window': 0, # select recent window frame + station_data_x, _ = generator[station] + station_data_x = station_data_x.loc[{self.history_dim: 0, # select recent window frame generator.target_dim: generator.variables[0]}] - avail_list.append(station_data_X.notnull()) - avail_data = xr.concat(avail_list, dim='Stations').notnull() - avail_data_time_sum[subset] = avail_data.sum(dim='Stations') - avail_data_station_sum[subset] = avail_data.sum(dim='datetime') + avail_list.append(station_data_x.notnull()) + avail_data = xr.concat(avail_list, dim=self.station_dim).notnull() + avail_data_time_sum[subset] = avail_data.sum(dim=self.station_dim) + avail_data_station_sum[subset] = avail_data.sum(dim=self.temporal_dim) dataset_time_interval[subset] = self._get_first_and_last_indexelement_from_xarray( - avail_data_time_sum[subset], dim_name='datetime', return_type='as_dict' + avail_data_time_sum[subset], dim_name=self.temporal_dim, return_type='as_dict' ) avail_data_amount = xr.concat(avail_data_time_sum.values(), pd.Index(avail_data_time_sum.keys(), - name='DataSet') + name=self.subset_dim) ) - full_time_index = self._make_full_time_index(avail_data_amount.coords['datetime'].values, freq=self.freq) + full_time_index = self._make_full_time_index(avail_data_amount.coords[self.temporal_dim].values, freq=self.freq) self.avail_data_cum_sum = xr.concat(avail_data_station_sum.values(), pd.Index(avail_data_station_sum.keys(), - name='DataSet')) - self.avail_data_amount = avail_data_amount.reindex({'datetime': full_time_index}) + name=self.subset_dim)) + 
self.avail_data_amount = avail_data_amount.reindex({self.temporal_dim: full_time_index}) self.dataset_time_interval = dataset_time_interval @staticmethod @@ -930,19 +955,19 @@ class PlotAvailabilityHistogram(AbstractPlotClass): raise ValueError(f"plt_type mus be 'hist' or 'hist_cum', but is {type}") def _plot_hist(self, *args): - # for dataset in colors = self.get_dataset_colors() fig, axes = plt.subplots(figsize=(10, 3)) for i, subset in enumerate(self.dataset_time_interval.keys()): - plot_dataset = self.avail_data_amount.sel({'DataSet': subset, - 'datetime': slice(self.dataset_time_interval[subset]['first'], - self.dataset_time_interval[subset]['last'] - ) + plot_dataset = self.avail_data_amount.sel({self.subset_dim: subset, + self.temporal_dim: slice( + self.dataset_time_interval[subset]['first'], + self.dataset_time_interval[subset]['last'] + ) } ) plot_dataset.plot.step(color=colors[subset], ax=axes, label=subset) - plt.fill_between(plot_dataset.coords['datetime'].values, plot_dataset.values, color=colors[subset]) + plt.fill_between(plot_dataset.coords[self.temporal_dim].values, plot_dataset.values, color=colors[subset]) lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval)) for lgd_line in lgd.get_lines(): @@ -957,25 +982,28 @@ class PlotAvailabilityHistogram(AbstractPlotClass): fig, axes = plt.subplots(figsize=(10, 3)) n_bins = int(self.avail_data_cum_sum.max().values) bins = np.arange(0, n_bins+1) - descending_subsets = self.avail_data_cum_sum.max(dim='Stations').sortby( - self.avail_data_cum_sum.max(dim='Stations'), ascending=False - ).coords['DataSet'].values + descending_subsets = self.avail_data_cum_sum.max(dim=self.station_dim).sortby( + self.avail_data_cum_sum.max(dim=self.station_dim), ascending=False + ).coords[self.subset_dim].values for subset in descending_subsets: - self.avail_data_cum_sum.sel({'DataSet': subset}).plot.hist(ax=axes, - bins=bins, - label=subset, - cumulative=-1, - color=colors[subset], - alpha=.6) - - lgd = 
fig.legend(loc="upper right", ncol=len(self.dataset_time_interval)) + self.avail_data_cum_sum.sel({self.subset_dim: subset}).plot.hist(ax=axes, + bins=bins, + label=subset, + cumulative=-1, + color=colors[subset], + # alpha=.5 + ) + + lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval), + facecolor='white', framealpha=1, edgecolor='black') plt.title('') plt.ylabel('Number of stations') plt.xlabel('Number of samples') plt.xlim((bins[0], bins[-1])) plt.tight_layout() + if __name__ == "__main__": stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] path = "../../testrun_network/forecasts"