diff --git a/docs/_source/_plots/datahistogram.png b/docs/_source/_plots/datahistogram.png new file mode 100644 index 0000000000000000000000000000000000000000..bda9896f22583a38a3f52e2a927e276199ec742c Binary files /dev/null and b/docs/_source/_plots/datahistogram.png differ diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index a874611a42cbfb4ce4e663f3acad6fc4eed04607..785aab88992e84a84ab4144040597922a48e5134 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -48,7 +48,7 @@ DEFAULT_CREATE_NEW_BOOTSTRAPS = False DEFAULT_NUMBER_OF_BOOTSTRAPS = 20 DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", - "PlotAvailability", "PlotAvailabilityHistogram", "PlotSeparationOfScales"] + "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram"] DEFAULT_SAMPLING = "daily" DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA", "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", "no": "", "no2": "", "o3": "", diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index 1176621a71f09e6efff4ac21a69e4f466e6dfbd4..aca5e3c53eca42b3bcb2266afbb481829eb90a62 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -444,6 +444,132 @@ class PlotAvailabilityHistogram(AbstractPlotClass): # pragma: no cover plt.tight_layout() +class PlotDataHistogram(AbstractPlotClass): # pragma: no cover + """ + Plot histogram on transformed input and target data. This data is the same that the model sees during training. No + plots are create for the original values space (raw / unformatted data). This plot method will create a histogram + for input and target each comparing the subsets train, val and test, as well as a distinct one for the three + subsets. + + .. image:: ../../../../../_source/_plots/datahistogram.png + :width: 400 + + """ + + def __init__(self, generators: Dict[str, DataCollection], plot_folder: str = ".", plot_name="histogram", + variables_dim="variables", time_dim="datetime", window_dim="window"): + super().__init__(plot_folder, plot_name) + self.variables_dim = variables_dim + self.time_dim = time_dim + self.window_dim = window_dim + self.inputs, self.targets = self._get_inputs_targets(generators, self.variables_dim) + self.bins = {} + self.interval_width = {} + self.bin_edges = {} + + # input plots + self._calculate_hist(generators, self.inputs, input_data=True) + for subset in generators.keys(): + self._plot(add_name="input", subset=subset) + self._plot_combined(add_name="input") + + # target plots + self._calculate_hist(generators, self.targets, input_data=False) + for subset in generators.keys(): + self._plot(add_name="target", subset=subset) + self._plot_combined(add_name="target") + + @staticmethod + def _get_inputs_targets(gens, dim): + k = list(gens.keys())[0] + gen = gens[k][0] + inputs = to_list(gen.get_X(as_numpy=False)[0].coords[dim].values.tolist()) + targets = to_list(gen.get_Y(as_numpy=False).coords[dim].values.tolist()) + return inputs, targets + + def _calculate_hist(self, generators, variables, input_data=True): + n_bins = 100 + for set_type, generator in generators.items(): + tmp_bins = {} + tmp_edges = {} + end = {} + start = {} + f = lambda x: x.get_X(as_numpy=False)[0] if input_data is True else x.get_Y(as_numpy=False) + for gen in generator: + w = min(abs(f(gen).coords[self.window_dim].values)) + data = f(gen).sel({self.window_dim: w}) + res, _, g_edges = f_proc_hist(data, variables, n_bins, self.variables_dim) + for var in variables: + b = tmp_bins.get(var, []) + b.append(res[var]) + tmp_bins[var] = b + e = tmp_edges.get(var, []) + e.append(g_edges[var]) + tmp_edges[var] = e + end[var] = max([end.get(var, g_edges[var].max()), g_edges[var].max()]) + start[var] = min([start.get(var, g_edges[var].min()), g_edges[var].min()]) + # interpolate and aggregate + bins = {} + edges = {} + interval_width = {} + for var in variables: + bin_edges = np.linspace(start[var], end[var], n_bins + 1) + interval_width[var] = bin_edges[1] - bin_edges[0] + for i, e in enumerate(tmp_bins[var]): + bins_interp = np.interp(bin_edges[:-1], tmp_edges[var][i][:-1], e, left=0, right=0) + bins[var] = bins.get(var, np.zeros(n_bins)) + bins_interp + edges[var] = bin_edges + + self.bins[set_type] = bins + self.interval_width[set_type] = interval_width + self.bin_edges[set_type] = edges + + def _plot(self, add_name, subset): + plot_path = os.path.join(os.path.abspath(self.plot_folder), f"{self.plot_name}_{subset}_{add_name}.pdf") + pdf_pages = matplotlib.backends.backend_pdf.PdfPages(plot_path) + bins = self.bins[subset] + bin_edges = self.bin_edges[subset] + interval_width = self.interval_width[subset] + colors = self.get_dataset_colors() + for var in bins.keys(): + fig, ax = plt.subplots() + hist_var = bins[var] + n_var = sum(hist_var) + weights = hist_var / (interval_width[var] * n_var) + ax.hist(bin_edges[var][:-1], bin_edges[var], weights=weights, color=colors[subset]) + ax.set_ylabel("probability density") + ax.set_xlabel(f"values") + ax.set_title(f"histogram {var} ({subset}, n={int(n_var)})") + pdf_pages.savefig() + # close all open figures / plots + pdf_pages.close() + plt.close('all') + + def _plot_combined(self, add_name): + plot_path = os.path.join(os.path.abspath(self.plot_folder), f"{self.plot_name}_{add_name}.pdf") + pdf_pages = matplotlib.backends.backend_pdf.PdfPages(plot_path) + variables = self.bins[list(self.bins.keys())[0]].keys() + colors = self.get_dataset_colors() + for var in variables: + fig, ax = plt.subplots() + for subset in self.bins.keys(): + hist_var = self.bins[subset][var] + interval_width = self.interval_width[subset][var] + bin_edges = self.bin_edges[subset][var] + n_var = sum(hist_var) + weights = hist_var / (interval_width * n_var) + ax.plot(bin_edges[:-1] + 0.5 * interval_width, weights, label=f"{subset}", + c=colors[subset]) + ax.set_ylabel("probability density") + ax.set_xlabel("values") + ax.legend(loc="upper right") + ax.set_title(f"histogram {var}") + pdf_pages.savefig() + # close all open figures / plots + pdf_pages.close() + plt.close('all') + + class PlotPeriodogram(AbstractPlotClass): # pragma: no cover """ Create Lomb-Scargle periodogram in raw input and target data. The Lomb-Scargle version can deal with missing values. @@ -698,14 +824,14 @@ class PlotPeriodogram(AbstractPlotClass): # pragma: no cover plt.close('all') -def f_proc(var, d_var): +def f_proc(var, d_var): # pragma: no cover var_str = str(var) t = (d_var.datetime - d_var.datetime[0]).astype("timedelta64[h]").values / np.timedelta64(1, "D") f, pgram = LombScargle(t, d_var.values.flatten(), nterms=1).autopower() return var_str, f, pgram -def f_proc_2(g, m, pos, variables_dim, time_dim): +def f_proc_2(g, m, pos, variables_dim, time_dim): # pragma: no cover raw_data_single = dict() if m == 0: d = g.id_class._data @@ -719,3 +845,14 @@ def f_proc_2(g, m, pos, variables_dim, time_dim): var_str, f, pgram = f_proc(var, d_var) raw_data_single[var_str] = [(f, pgram)] return raw_data_single + + +def f_proc_hist(data, variables, n_bins, variables_dim): # pragma: no cover + res = {} + bin_edges = {} + interval_width = {} + for var in variables: + d = data.sel({variables_dim: var}).squeeze() if len(data.shape) > 1 else data + res[var], bin_edges[var] = np.histogram(d.values, n_bins) + interval_width[var] = bin_edges[var][1] - bin_edges[var][0] + return res, interval_width, bin_edges diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 23d26fc1e5c866657d28b11f275d76df5a8cc300..89a6f205d03892c57c55a66399a43c9ba2987b42 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -22,7 +22,7 @@ from mlair.model_modules import AbstractModelClass from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotClimatologicalSkillScore, \ PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotConditionalQuantiles, PlotSeparationOfScales from mlair.plotting.data_insight_plotting import PlotStationMap, PlotAvailability, PlotAvailabilityHistogram, \ - PlotPeriodogram + PlotPeriodogram, PlotDataHistogram from mlair.run_modules.run_environment import RunEnvironment @@ -398,6 +398,13 @@ class PostProcessing(RunEnvironment): except Exception as e: logging.error(f"Could not create plot PlotPeriodogram due to the following error: {e}") + try: + if "PlotDataHistogram" in plot_list: + gens = {"train": self.train_data, "val": self.val_data, "test": self.test_data} + PlotDataHistogram(gens, plot_folder=self.plot_path, time_dim=time_dim, variables_dim=target_dim) + except Exception as e: + logging.error(f"Could not create plot PlotDataHistogram due to the following error: {e}") + def calculate_test_score(self): """Evaluate test score of model and save locally.""" diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py index 90227ed21e544feb90b3b426edc07e0283624177..16606d8f72c36b0d7eee852ef233b4373436e2f8 100644 --- a/test/test_configuration/test_defaults.py +++ b/test/test_configuration/test_defaults.py @@ -68,4 +68,4 @@ class TestAllDefaults: assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram", - "PlotSeparationOfScales"] + "PlotDataHistogram"]