class PlotDataHistogram(AbstractPlotClass):  # pragma: no cover
    """
    Plot histograms of all input and target variables, accumulated over all elements of a data collection.

    For each element of the collection, the data of the window position closest to zero is selected and binned by
    :func:`f_proc_hist`. Counts are summed per variable over all elements. Two multi-page PDF files are written to
    the plot folder, `<plot_name>_input.pdf` and `<plot_name>_target.pdf`, each holding one page per variable.

    NOTE(review): binning is delegated to `f_proc_hist`, which uses a fixed value range; this presumably expects
    transformed (normalized) data - confirm against the data handler in use.

    :param generator: collection of data handlers; it is indexed with `[0]` and iterated, and each element must
        provide `get_X(as_numpy=False)` (first entry is used) and `get_Y(as_numpy=False)`
    :param plot_folder: directory the PDF files are written to
    :param plot_name: file name prefix of the created PDF files
    :param variables_dim: name of the dimension holding the variables
    :param time_dim: name of the time dimension (stored, not used by this plot)
    :param window_dim: name of the window dimension
    """

    def __init__(self, generator: Dict[str, DataCollection], plot_folder: str = ".", plot_name="histogram",
                 variables_dim="variables", time_dim="datetime", window_dim="window"):
        super().__init__(plot_folder, plot_name)
        self.variables_dim = variables_dim
        self.time_dim = time_dim
        self.window_dim = window_dim
        # variable names are taken from the first element of the collection only
        self.inputs = to_list(generator[0].get_X(as_numpy=False)[0].coords[self.variables_dim].values.tolist())
        self.targets = to_list(generator[0].get_Y(as_numpy=False).coords[self.variables_dim].values.tolist())

        # inputs and targets share self.bins, therefore each histogram is plotted before the next is calculated
        self._calculate_hist(generator, self.inputs, input_data=True)
        self._plot(add_name="input")
        self._calculate_hist(generator, self.targets, input_data=False)
        self._plot(add_name="target")

    def _calculate_hist(self, generator, variables, input_data=True):
        """
        Accumulate histogram counts per variable over all elements of the generator.

        Results are stored in `self.bins` (dict of variable -> counts), `self.interval_width` and `self.bin_edges`.
        The latter two stay `None` if the generator is empty.

        :param generator: iterable of data handlers
        :param variables: names of the variables to bin
        :param input_data: use `get_X(...)[0]` if True, `get_Y(...)` otherwise
        """
        n_bins = 100
        bins = {}
        interval_width = None
        bin_edges = None
        for gen in generator:
            # load the data only once per element
            data = gen.get_X(as_numpy=False)[0] if input_data is True else gen.get_Y(as_numpy=False)
            window = data.coords[self.window_dim].values
            w = min(abs(window))
            if w not in window:
                # min(abs(.)) yields a magnitude; if only the negative counterpart is a coordinate, select that one
                w = -w
            data = data.sel({self.window_dim: w})
            res, interval_width, bin_edges = f_proc_hist(data, variables, n_bins, self.variables_dim)
            for var in variables:
                bins[var] = bins.get(var, np.zeros(n_bins)) + res[var]
        self.bins = bins
        self.interval_width = interval_width
        self.bin_edges = bin_edges

    def _plot(self, add_name):
        """
        Write one histogram page per variable in `self.bins` to `<plot_name>_<add_name>.pdf`.

        Counts are scaled to a probability density (counts / (bin width * total count)).

        :param add_name: suffix appended to the plot name
        """
        plot_path = os.path.join(os.path.abspath(self.plot_folder), f"{self.plot_name}_{add_name}.pdf")
        try:
            # the context manager closes the PDF file also if plotting fails
            with matplotlib.backends.backend_pdf.PdfPages(plot_path) as pdf_pages:
                for var, hist_var in self.bins.items():
                    fig, ax = plt.subplots()
                    n_var = hist_var.sum()
                    if n_var > 0:
                        weights = hist_var / (self.interval_width * n_var)
                    else:
                        # no value fell into any bin: draw an empty histogram instead of dividing by zero
                        weights = np.zeros_like(hist_var)
                    # bins are already counted: pass the left edges as samples and the densities as weights
                    ax.hist(self.bin_edges[:-1], self.bin_edges, weights=weights)
                    ax.set_ylabel("probability density")
                    ax.set_xlabel(f"{var}")
                    ax.set_title(f"Histogram (n={int(n_var)})")
                    pdf_pages.savefig(fig)
        finally:
            # close all open figures / plots
            plt.close('all')
def f_proc_hist(data, variables, n_bins, variables_dim, hist_range=(-4, 4)):
    """
    Calculate a histogram with equally sized bins for each given variable.

    Values outside of `hist_range` are not counted. Bin edges and interval width are also returned if `variables`
    is empty.

    :param data: data to bin; if it has more than one dimension, each variable is selected along `variables_dim`
        and squeezed, otherwise the data is used as is for every variable. Must provide `.shape` and `.values`.
    :param variables: names of the variables to calculate a histogram for
    :param n_bins: number of equally sized bins
    :param variables_dim: name of the dimension holding the variables
    :param hist_range: lower and upper limit of the binned value range, defaults to (-4, 4)
    :return: dict of variable -> counts per bin, width of a single bin, and the `n_bins + 1` bin edges
    """
    # edges only depend on n_bins and hist_range, so they are defined even without any variable
    bin_edges = np.linspace(hist_range[0], hist_range[1], n_bins + 1)
    res = {}
    for var in variables:
        d = data.sel({variables_dim: var}).squeeze() if len(data.shape) > 1 else data
        res[var], bin_edges = np.histogram(d.values, n_bins, range=hist_range)
    interval_width = bin_edges[1] - bin_edges[0]
    return res, interval_width, bin_edges