Skip to content
Snippets Groups Projects
Commit d0c8f3ce authored by Felix Kleinert's avatar Felix Kleinert
Browse files

update data avail plots and docu

parent 1043377b
Branches
Tags
2 merge requests!171Develop intelli o3 ts,!166Resolve "Add additional data plot (histogram)"
This commit is part of merge request !166. Comments created here will be created in the context of that merge request.
...@@ -861,45 +861,70 @@ class PlotAvailability(AbstractPlotClass): ...@@ -861,45 +861,70 @@ class PlotAvailability(AbstractPlotClass):
@TimeTrackingWrapper @TimeTrackingWrapper
class PlotAvailabilityHistogram(AbstractPlotClass): class PlotAvailabilityHistogram(AbstractPlotClass):
""" """
Create data availability plots as histogram.
Each entry of each generator is checked for `notnull()` values along the entire datetime axis (boolean).
Calling this class creates two different types of histograms, where each generator (subset) is drawn as its own
color-coded series:
1) data_availability_histogram: datetime (x-axis) vs. number of stations with available data (y-axis)
2) data_availability_histogram_cumulative: number of samples (x-axis) vs. number of stations having at least that
number of samples (y-axis)
""" """
def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily"): def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily",
subset_dim: str = 'DataSet', temporal_dim: str = 'datetime', history_dim: str = 'window',
station_dim: str = 'Stations'):
super().__init__(plot_folder, "data_availability_histogram") super().__init__(plot_folder, "data_availability_histogram")
self.freq = self._get_sampling(sampling) self.freq = self._get_sampling(sampling)
self.subset_dim = subset_dim
self.temporal_dim = temporal_dim
self.history_dim = history_dim
self.station_dim = station_dim
self._prepare_data(generators) self._prepare_data(generators)
self._plot(plt_type='hist')
self._save() for plt_type in self.allowed_plot_types:
self.plot_name += '_cumulative' plot_name_tmp = self.plot_name
self._plot(plt_type='hist_cum') self.plot_name += '_' + plt_type
self._plot(plt_type=plt_type)
self._save() self._save()
self.plot_name = plot_name_tmp
@property
def allowed_plot_types(self):
plot_types = ['hist', 'hist_cum']
return plot_types
def _prepare_data(self, generators: Dict[str, DataGenerator]): def _prepare_data(self, generators: Dict[str, DataGenerator]):
"""
Prepares data to be used by plot methods.
Creates xarrays which are sums of valid data (boolean sums) across i) station_dim and ii) temporal_dim
"""
avail_data_time_sum = {} avail_data_time_sum = {}
avail_data_station_sum = {} avail_data_station_sum = {}
dataset_time_interval = {} dataset_time_interval = {}
for subset, generator in generators.items(): for subset, generator in generators.items():
avail_list = [] avail_list = []
for station in generator.stations: for station in generator.stations:
station_data_X, _ = generator[station] station_data_x, _ = generator[station]
station_data_X = station_data_X.loc[{'window': 0, # select recent window frame station_data_x = station_data_x.loc[{self.history_dim: 0, # select recent window frame
generator.target_dim: generator.variables[0]}] generator.target_dim: generator.variables[0]}]
avail_list.append(station_data_X.notnull()) avail_list.append(station_data_x.notnull())
avail_data = xr.concat(avail_list, dim='Stations').notnull() avail_data = xr.concat(avail_list, dim=self.station_dim).notnull()
avail_data_time_sum[subset] = avail_data.sum(dim='Stations') avail_data_time_sum[subset] = avail_data.sum(dim=self.station_dim)
avail_data_station_sum[subset] = avail_data.sum(dim='datetime') avail_data_station_sum[subset] = avail_data.sum(dim=self.temporal_dim)
dataset_time_interval[subset] = self._get_first_and_last_indexelement_from_xarray( dataset_time_interval[subset] = self._get_first_and_last_indexelement_from_xarray(
avail_data_time_sum[subset], dim_name='datetime', return_type='as_dict' avail_data_time_sum[subset], dim_name=self.temporal_dim, return_type='as_dict'
) )
avail_data_amount = xr.concat(avail_data_time_sum.values(), pd.Index(avail_data_time_sum.keys(), avail_data_amount = xr.concat(avail_data_time_sum.values(), pd.Index(avail_data_time_sum.keys(),
name='DataSet') name=self.subset_dim)
) )
full_time_index = self._make_full_time_index(avail_data_amount.coords['datetime'].values, freq=self.freq) full_time_index = self._make_full_time_index(avail_data_amount.coords[self.temporal_dim].values, freq=self.freq)
self.avail_data_cum_sum = xr.concat(avail_data_station_sum.values(), pd.Index(avail_data_station_sum.keys(), self.avail_data_cum_sum = xr.concat(avail_data_station_sum.values(), pd.Index(avail_data_station_sum.keys(),
name='DataSet')) name=self.subset_dim))
self.avail_data_amount = avail_data_amount.reindex({'datetime': full_time_index}) self.avail_data_amount = avail_data_amount.reindex({self.temporal_dim: full_time_index})
self.dataset_time_interval = dataset_time_interval self.dataset_time_interval = dataset_time_interval
@staticmethod @staticmethod
...@@ -930,19 +955,19 @@ class PlotAvailabilityHistogram(AbstractPlotClass): ...@@ -930,19 +955,19 @@ class PlotAvailabilityHistogram(AbstractPlotClass):
raise ValueError(f"plt_type mus be 'hist' or 'hist_cum', but is {type}") raise ValueError(f"plt_type mus be 'hist' or 'hist_cum', but is {type}")
def _plot_hist(self, *args): def _plot_hist(self, *args):
# for dataset in
colors = self.get_dataset_colors() colors = self.get_dataset_colors()
fig, axes = plt.subplots(figsize=(10, 3)) fig, axes = plt.subplots(figsize=(10, 3))
for i, subset in enumerate(self.dataset_time_interval.keys()): for i, subset in enumerate(self.dataset_time_interval.keys()):
plot_dataset = self.avail_data_amount.sel({'DataSet': subset, plot_dataset = self.avail_data_amount.sel({self.subset_dim: subset,
'datetime': slice(self.dataset_time_interval[subset]['first'], self.temporal_dim: slice(
self.dataset_time_interval[subset]['first'],
self.dataset_time_interval[subset]['last'] self.dataset_time_interval[subset]['last']
) )
} }
) )
plot_dataset.plot.step(color=colors[subset], ax=axes, label=subset) plot_dataset.plot.step(color=colors[subset], ax=axes, label=subset)
plt.fill_between(plot_dataset.coords['datetime'].values, plot_dataset.values, color=colors[subset]) plt.fill_between(plot_dataset.coords[self.temporal_dim].values, plot_dataset.values, color=colors[subset])
lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval)) lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval))
for lgd_line in lgd.get_lines(): for lgd_line in lgd.get_lines():
...@@ -957,25 +982,28 @@ class PlotAvailabilityHistogram(AbstractPlotClass): ...@@ -957,25 +982,28 @@ class PlotAvailabilityHistogram(AbstractPlotClass):
fig, axes = plt.subplots(figsize=(10, 3)) fig, axes = plt.subplots(figsize=(10, 3))
n_bins = int(self.avail_data_cum_sum.max().values) n_bins = int(self.avail_data_cum_sum.max().values)
bins = np.arange(0, n_bins+1) bins = np.arange(0, n_bins+1)
descending_subsets = self.avail_data_cum_sum.max(dim='Stations').sortby( descending_subsets = self.avail_data_cum_sum.max(dim=self.station_dim).sortby(
self.avail_data_cum_sum.max(dim='Stations'), ascending=False self.avail_data_cum_sum.max(dim=self.station_dim), ascending=False
).coords['DataSet'].values ).coords[self.subset_dim].values
for subset in descending_subsets: for subset in descending_subsets:
self.avail_data_cum_sum.sel({'DataSet': subset}).plot.hist(ax=axes, self.avail_data_cum_sum.sel({self.subset_dim: subset}).plot.hist(ax=axes,
bins=bins, bins=bins,
label=subset, label=subset,
cumulative=-1, cumulative=-1,
color=colors[subset], color=colors[subset],
alpha=.6) # alpha=.5
)
lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval)) lgd = fig.legend(loc="upper right", ncol=len(self.dataset_time_interval),
facecolor='white', framealpha=1, edgecolor='black')
plt.title('') plt.title('')
plt.ylabel('Number of stations') plt.ylabel('Number of stations')
plt.xlabel('Number of samples') plt.xlabel('Number of samples')
plt.xlim((bins[0], bins[-1])) plt.xlim((bins[0], bins[-1]))
plt.tight_layout() plt.tight_layout()
if __name__ == "__main__": if __name__ == "__main__":
stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087']
path = "../../testrun_network/forecasts" path = "../../testrun_network/forecasts"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment