diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index d33f3abb2e2cd04399a2d98f34bf2ed06acfcb6b..ee5aea4a437a1be0754f173077b264e38faa98ad 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -13,6 +13,7 @@ import sys import numpy as np import pandas as pd import xarray as xr +import seaborn as sns import matplotlib # matplotlib.use("Agg") from matplotlib import lines as mlines, pyplot as plt, patches as mpatches, dates as mdates @@ -447,6 +448,71 @@ class PlotAvailabilityHistogram(AbstractPlotClass): # pragma: no cover plt.tight_layout() +@TimeTrackingWrapper +class PlotDataMonthlyDistribution(AbstractPlotClass): # pragma: no cover + + def __init__(self, generators: Dict[str, DataCollection], plot_folder: str = ".", variables_dim="variables", + time_dim="datetime", window_dim="window", target_var: str = "", target_var_unit: str = "ppb"): + """Set attributes and create plot.""" + super().__init__(plot_folder, "monthly_data_distribution") + self.variables_dim = variables_dim + self.time_dim = time_dim + self.window_dim = window_dim + self.coll_dim = "coll" + self.subset_dim = "subset" + self._data = self._prepare_data(generators) + self._plot(target_var, target_var_unit) + self._save() + + def _prepare_data(self, generators) -> List[xr.DataArray]: + """ + Pre.process data required to plot. + + :param generator: data + :return: The entire data set, flagged with the corresponding month. + """ + f = lambda x: x.get_observation() + forecasts = [] + for set_type, generator in generators.items(): + forecasts_set = None + forecasts_monthly = {} + for i, gen in enumerate(generator): + data = f(gen) + data = gen.apply_transformation(data, inverse=True) + data = data.clip(min=0).reset_coords(drop=True) + new_index = data.coords[self.time_dim].values.astype("datetime64[M]").astype(int) % 12 + 1 + data = data.assign_coords({self.time_dim: new_index}) + forecasts_set = xr.concat([forecasts_set, data], self.time_dim) if forecasts_set is not None else data + for month in set(forecasts_set.coords[self.time_dim].values): + monthly_values = forecasts_set.sel({self.time_dim: month}).values + forecasts_monthly[month] = np.concatenate((forecasts_monthly.get(month, []), monthly_values)) + forecasts_monthly = pd.DataFrame.from_dict(forecasts_monthly, orient="index")#.transpose() + forecasts_monthly[self.coll_dim] = set_type + forecasts.append(forecasts_monthly.set_index(self.coll_dim, append=True)) + forecasts = pd.concat(forecasts).stack().rename_axis(["month", "subset", "index"]) + forecasts = forecasts.to_frame(name="values").reset_index(level=[0, 1]) + return forecasts + + @staticmethod + def _spell_out_chemical_concentrations(short_name: str, add_concentration: bool = False): + short2long = {'o3': 'ozone', 'no': 'nitrogen oxide', 'no2': 'nitrogen dioxide', 'nox': 'nitrogen dioxides'} + _suffix = "" if add_concentration is False else " concentration" + return f"{short2long[short_name]}{_suffix}" + + def _plot(self, target_var: str, target_var_unit: str): + """ + Create a monthly grouped box plot over all stations but with separate boxes for each lead time step. + + :param target_var: display name of the target variable on plot's axis + """ + ax = sns.boxplot(data=self._data, x="month", y="values", hue="subset", whis=1.5, + palette=self.get_dataset_colors(), flierprops={'marker': '.', 'markersize': 1}, showmeans=True, + meanprops={'markersize': 1, 'markeredgecolor': 'k'}) + ylabel = self._spell_out_chemical_concentrations(target_var) + ax.set(xlabel='month', ylabel=f'dma8 {ylabel} (in {target_var_unit})') + plt.tight_layout() + + @TimeTrackingWrapper class PlotDataHistogram(AbstractPlotClass): # pragma: no cover """ diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index e4cc34f66db7426876209fece0dc902341ac6582..4e7225af8ad840f545d35714c21d43c4dcd1317b 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -27,7 +27,7 @@ from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotClima PlotSeparationOfScales, PlotSampleUncertaintyFromBootstrap, PlotTimeEvolutionMetric, PlotSeasonalMSEStack, \ PlotErrorsOnMap from mlair.plotting.data_insight_plotting import PlotStationMap, PlotAvailability, PlotAvailabilityHistogram, \ - PlotPeriodogram, PlotDataHistogram + PlotPeriodogram, PlotDataHistogram, PlotDataMonthlyDistribution from mlair.run_modules.run_environment import RunEnvironment @@ -691,6 +691,16 @@ class PostProcessing(RunEnvironment): logging.error(f"Could not create plot PlotAvailabilityHistogram due to the following error: {e}" f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") + try: + if "PlotDataMonthlyDistribution" in plot_list: + avail_data = {"train": self.train_data, "val": self.val_data, "test": self.test_data} + PlotDataMonthlyDistribution(avail_data, plot_folder=self.plot_path, time_dim=time_dim, + variables_dim=target_dim, window_dim=window_dim, target_var=self.target_var, + ) + except Exception as e: + logging.error(f"Could not create plot PlotDataMonthlyDistribution due to the following error:" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") + try: if "PlotDataHistogram" in plot_list: upsampling = self.data_store.get_default("upsampling", scope="train", default=False)