diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index fec4ef6ec5e5fb89c3fedd46fc1fd6fd845ff1ea..5a00b62546b2a4b2df1bb6b32569220079e35693 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -454,6 +454,8 @@ class PlotDataHistogram(AbstractPlotClass): # pragma: no cover self.window_dim = window_dim self.inputs, self.targets = self._get_inputs_targets(generators, self.variables_dim) self.bins = {} + self.interval_width = {} + self.bin_edges = {} # input plots self._calculate_hist(generators, self.inputs, input_data=True) @@ -476,38 +478,58 @@ class PlotDataHistogram(AbstractPlotClass): # pragma: no cover return inputs, targets def _calculate_hist(self, generators, variables, input_data=True): + n_bins = 100 for set_type, generator in generators.items(): - bins = {} - n_bins = 100 - interval_width = None - bin_edges = None + tmp_bins = {} + tmp_edges = {} + end = {} + start = {} f = lambda x: x.get_X(as_numpy=False)[0] if input_data is True else x.get_Y(as_numpy=False) for gen in generator: w = min(abs(f(gen).coords[self.window_dim].values)) data = f(gen).sel({self.window_dim: w}) - res, interval_width, bin_edges = f_proc_hist(data, variables, n_bins, self.variables_dim) + res, _, g_edges = f_proc_hist(data, variables, n_bins, self.variables_dim) for var in variables: - n_var = bins.get(var, np.zeros(n_bins)) - n_var += res[var] - bins[var] = n_var + b = tmp_bins.get(var, []) + b.append(res[var]) + tmp_bins[var] = b + e = tmp_edges.get(var, []) + e.append(g_edges[var]) + tmp_edges[var] = e + end[var] = max([end.get(var, g_edges[var].max()), g_edges[var].max()]) + start[var] = min([start.get(var, g_edges[var].min()), g_edges[var].min()]) + # interpolate and aggregate + bins = {} + edges = {} + interval_width = {} + for var in variables: + bin_edges = np.linspace(start[var], end[var], n_bins + 1) + interval_width[var] = bin_edges[1] - bin_edges[0] + for i, e in enumerate(tmp_bins[var]): + bins_interp = np.interp(bin_edges[:-1], tmp_edges[var][i][:-1], e, left=0, right=0) + bins[var] = bins.get(var, np.zeros(n_bins)) + bins_interp + edges[var] = bin_edges + self.bins[set_type] = bins - self.interval_width = interval_width - self.bin_edges = bin_edges + self.interval_width[set_type] = interval_width + self.bin_edges[set_type] = edges def _plot(self, add_name, subset): plot_path = os.path.join(os.path.abspath(self.plot_folder), f"{self.plot_name}_{subset}_{add_name}.pdf") pdf_pages = matplotlib.backends.backend_pdf.PdfPages(plot_path) bins = self.bins[subset] + bin_edges = self.bin_edges[subset] + interval_width = self.interval_width[subset] colors = self.get_dataset_colors() for var in bins.keys(): fig, ax = plt.subplots() hist_var = bins[var] n_var = sum(hist_var) - weights = hist_var / (self.interval_width * n_var) - ax.hist(self.bin_edges[:-1], self.bin_edges, weights=weights, color=colors[subset]) + weights = hist_var / (interval_width[var] * n_var) + ax.hist(bin_edges[var][:-1], bin_edges[var], weights=weights, color=colors[subset]) ax.set_ylabel("probability density") - ax.set_xlabel(f"values ({subset})") - ax.set_title(f"Histogram ({var}, n={int(n_var)})") + ax.set_xlabel(f"values") + ax.set_title(f"histogram {var} ({subset}, n={int(n_var)})") pdf_pages.savefig() # close all open figures / plots pdf_pages.close() @@ -522,14 +544,16 @@ class PlotDataHistogram(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots() for subset in self.bins.keys(): hist_var = self.bins[subset][var] + interval_width = self.interval_width[subset][var] + bin_edges = self.bin_edges[subset][var] n_var = sum(hist_var) - weights = hist_var / (self.interval_width * n_var) - ax.plot(self.bin_edges[:-1] + 0.5 * self.interval_width, weights, label=f"{subset}", + weights = hist_var / (interval_width * n_var) + ax.plot(bin_edges[:-1] + 0.5 * interval_width, weights, label=f"{subset}", c=colors[subset]) ax.set_ylabel("probability density") - ax.set_xlabel(f"{var}") + ax.set_xlabel("values") ax.legend(loc="upper right") - ax.set_title(f"Histogram") + ax.set_title(f"histogram {var}") pdf_pages.savefig() # close all open figures / plots pdf_pages.close() @@ -815,9 +839,10 @@ def f_proc_2(g, m, pos, variables_dim, time_dim): def f_proc_hist(data, variables, n_bins, variables_dim): res = {} + bin_edges = {} + interval_width = {} for var in variables: d = data.sel({variables_dim: var}).squeeze() if len(data.shape) > 1 else data - hist, bin_edges = np.histogram(d.values, n_bins, range=(-4, 4)) - interval_width = (bin_edges[1] - bin_edges[0]) - res[var] = hist + res[var], bin_edges[var] = np.histogram(d.values, n_bins) + interval_width[var] = bin_edges[var][1] - bin_edges[var][0] return res, interval_width, bin_edges