diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py index 48606d4f8531812672391304b41885555608473b..14e3074a7d8f09bd597fb2fbf53a298d83ab6556 100644 --- a/src/plotting/postprocessing_plotting.py +++ b/src/plotting/postprocessing_plotting.py @@ -16,6 +16,7 @@ import pandas as pd import seaborn as sns import xarray as xr from matplotlib.backends.backend_pdf import PdfPages +import matplotlib.patches as mpatches from src import helpers from src.helpers import TimeTracking, TimeTrackingWrapper @@ -33,14 +34,14 @@ class AbstractPlotClass: def _plot(self, *args): raise NotImplementedError - - def _save(self): + + def _save(self, **kwargs): """ Standard save method to store plot locally. Name of and path to plot need to be set on initialisation """ plot_name = os.path.join(os.path.abspath(self.plot_folder), f"{self.plot_name}.pdf") logging.debug(f"... save plot to {plot_name}") - plt.savefig(plot_name, dpi=self.resolution) + plt.savefig(plot_name, dpi=self.resolution, **kwargs) plt.close('all') @@ -626,12 +627,24 @@ class PlotTimeSeries: @TimeTrackingWrapper class PlotAvailability(AbstractPlotClass): - def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily"): + def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily", + summary_name="data availability"): + # create standard Gantt plot for all stations (currently in single pdf file with single page) super().__init__(plot_folder, "data_availability") self.sampling = self._get_sampling(sampling) plot_dict = self._prepare_data(generators) - self._plot(plot_dict) - self._save() + lgd = self._plot(plot_dict) + self._save(bbox_extra_artists=(lgd, ), bbox_inches="tight") + # create summary Gantt plot (is data in at least one station available) + self.plot_name += "_summary" + plot_dict_summary = self._summarise_data(generators, summary_name) + lgd = self._plot(plot_dict_summary) + self._save(bbox_extra_artists=(lgd, ), bbox_inches="tight") + # combination of station and summary plot, last element is summary broken bar + self.plot_name = "data_availability_combined" + plot_dict_summary.update(plot_dict) + lgd = self._plot(plot_dict_summary) + self._save(bbox_extra_artists=(lgd, ), bbox_inches="tight") @staticmethod def _get_sampling(sampling): @@ -659,12 +672,37 @@ class PlotAvailability(AbstractPlotClass): plt_dict[station].update({subset: t2}) return plt_dict + def _summarise_data(self, generators: Dict[str, DataGenerator], summary_name: str): + plt_dict = {} + for subset, generator in generators.items(): + all_data = None + stations = generator.stations + for station in stations: + station_data = generator.get_data_generator(station) + labels = station_data.get_transposed_label().resample(datetime=self.sampling, skipna=True).mean() + labels_bool = labels.sel(window=1).notnull() + if all_data is None: + all_data = labels_bool + else: + tmp = all_data.combine_first(labels_bool) # expand dims to merged datetime coords + all_data = np.logical_or(tmp, labels_bool).combine_first(all_data) # apply logical on merge and fill missing with all_data + + group = (all_data != all_data.shift(datetime=1)).cumsum() + plot_data = pd.DataFrame({"avail": all_data.values, "group": group.values}, index=all_data.datetime.values) + t = plot_data.groupby("group").apply(lambda x: (x["avail"].head(1)[0], x.index[0], x.shape[0])) + t2 = [i[1:] for i in t if i[0]] + if plt_dict.get(summary_name) is None: + plt_dict[summary_name] = {subset: t2} + else: + plt_dict[summary_name].update({subset: t2}) + return plt_dict + + def _plot(self, plt_dict): - # colors = {"train": "orange", "val": "skyblue", "test": "blueishgreen"} - colors = {"train": "#e69f00", "val": "#56b4e9", "test": "#009e73"} - # colors = {"train": (230, 159, 0), "val": (86, 180, 233), "test": (0, 158, 115)} + # colors = {"train": "orange", "val": "blueishgreen", "test": "skyblue"} # color names + colors = {"train": "#e69f00", "val": "#009e73", "test": "#56b4e9"} # hex code + # colors = {"train": (230, 159, 0), "val": (0, 158, 115), "test": (86, 180, 233)} # in rgb but as abs values pos = 0 - count = 0 height = 0.8 # should be <= 1 yticklabels = [] number_of_stations = len(plt_dict.keys()) @@ -681,4 +719,6 @@ class PlotAvailability(AbstractPlotClass): ax.set_ylim([height, number_of_stations + 1]) ax.set_yticks(np.arange(len(plt_dict.keys()))+1+height/2) ax.set_yticklabels(yticklabels) - plt.tight_layout() + handles = [mpatches.Patch(color=c, label=k) for k, c in colors.items()] + lgd = plt.legend(handles=handles, bbox_to_anchor=(0, 1, 1, 0.2), loc="lower center", ncol=len(handles)) + return lgd diff --git a/src/run_modules/experiment_setup.py b/src/run_modules/experiment_setup.py index 95bd5056febbe06babfd59191332c1f4cb8078d4..150399cb2e4997a6b9adfb30dfa3ff89de73d4ac 100644 --- a/src/run_modules/experiment_setup.py +++ b/src/run_modules/experiment_setup.py @@ -21,7 +21,8 @@ DEFAULT_VAR_ALL_DICT = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'max 'pblheight': 'maximum'} DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise", "mean": "estimate"} DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", - "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "plot_conditional_quantiles"] + "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "plot_conditional_quantiles", + "PlotAvailability"] class ExperimentSetup(RunEnvironment): diff --git a/src/run_modules/post_processing.py b/src/run_modules/post_processing.py index 1361dab63e93ea813c3b92394822fb683c7621c1..8a962888ec0b789a14a24b20c97148e7a8315b30 100644 --- a/src/run_modules/post_processing.py +++ b/src/run_modules/post_processing.py @@ -214,8 +214,9 @@ class PostProcessing(RunEnvironment): if "PlotTimeSeries" in plot_list: PlotTimeSeries(self.test_data.stations, path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, sampling=self._sampling) - avail_data = {"train": self.train_data, "val": self.val_data, "test": self.test_data} - PlotAvailability(avail_data, plot_folder=self.plot_path) + if "PlotAvailability" in plot_list: + avail_data = {"train": self.train_data, "val": self.val_data, "test": self.test_data} + PlotAvailability(avail_data, plot_folder=self.plot_path) def calculate_test_score(self): test_score = self.model.evaluate_generator(generator=self.test_data_distributed.distribute_on_batches(),