diff --git a/mlair/data_handler/input_bootstraps.py b/mlair/data_handler/input_bootstraps.py index ab4c71f1c77ab12b0e751f6991fbf20cd55aa8ae..187f09050bb39a953ac58c2b7fca54b6a207aed1 100644 --- a/mlair/data_handler/input_bootstraps.py +++ b/mlair/data_handler/input_bootstraps.py @@ -28,12 +28,13 @@ class BootstrapIterator(Iterator): _position: int = None - def __init__(self, data: "Bootstraps", method): + def __init__(self, data: "Bootstraps", method, return_reshaped=False): assert isinstance(data, Bootstraps) self._data = data self._dimension = data.bootstrap_dimension self.boot_dim = "boots" self._method = method + self._return_reshaped = return_reshaped self._collection = self.create_collection(self._data.data, self._dimension) self._position = 0 @@ -46,12 +47,15 @@ class BootstrapIterator(Iterator): raise NotImplementedError def _reshape(self, d): - if isinstance(d, list): - return list(map(lambda x: self._reshape(x), d)) - # return list(map(lambda x: np.rollaxis(x, -1, 0).reshape(x.shape[0] * x.shape[-1], *x.shape[1:-1]), d)) + if self._return_reshaped: + if isinstance(d, list): + return list(map(lambda x: self._reshape(x), d)) + # return list(map(lambda x: np.rollaxis(x, -1, 0).reshape(x.shape[0] * x.shape[-1], *x.shape[1:-1]), d)) + else: + shape = d.shape + return np.rollaxis(d, -1, 0).reshape(shape[0] * shape[-1], *shape[1:-1]) else: - shape = d.shape - return np.rollaxis(d, -1, 0).reshape(shape[0] * shape[-1], *shape[1:-1]) + return d def _to_numpy(self, d): if isinstance(d, list): @@ -75,8 +79,8 @@ class BootstrapIterator(Iterator): class BootstrapIteratorSingleInput(BootstrapIterator): _position: int = None - def __init__(self, *args): - super().__init__(*args) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def __next__(self): """Return next element or stop iteration.""" @@ -107,8 +111,8 @@ class BootstrapIteratorSingleInput(BootstrapIterator): class BootstrapIteratorVariable(BootstrapIterator): - def __init__(self, *args): - super().__init__(*args) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def __next__(self): """Return next element or stop iteration.""" @@ -140,8 +144,8 @@ class BootstrapIteratorVariable(BootstrapIterator): class BootstrapIteratorBranch(BootstrapIterator): - def __init__(self, *args): - super().__init__(*args) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def __next__(self): try: diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index aa89da0ea66e263d076af9abd578ba125c260bec..af7975f3a042163a885f590c6624076fe91f03aa 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -361,7 +361,7 @@ class SkillScores: **kwargs) def general_skill_score(self, data: Data, forecast_name: str, reference_name: str, - observation_name: str = None) -> np.ndarray: + observation_name: str = None, dim: str = "index") -> np.ndarray: r""" Calculate general skill score based on mean squared error. @@ -374,12 +374,12 @@ class SkillScores: """ if observation_name is None: observation_name = self.observation_name - data = data.dropna("index") + data = data.dropna(dim) observation = data.sel(type=observation_name) forecast = data.sel(type=forecast_name) reference = data.sel(type=reference_name) mse = mean_squared_error - skill_score = 1 - mse(observation, forecast) / mse(observation, reference) + skill_score = 1 - mse(observation, forecast, dim=dim) / mse(observation, reference, dim=dim) return skill_score.values def get_count(self, data: Data, forecast_name: str, reference_name: str, diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 2afb5f08ec6401fa2e693c924cf263204ffbb75d..c95afea68ae8f979d78a001b13aa9d8a1580c792 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -129,7 +129,7 @@ class PlotMonthlySummary(AbstractPlotClass): # pragma: no cover logging.debug("... start plotting") color_palette = [matplotlib.colors.cnames["green"]] + sns.color_palette("Blues_d", self._window_lead_time).as_hex() - ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1., palette=color_palette, + ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1.5, palette=color_palette, flierprops={'marker': '.', 'markersize': 1}, showmeans=True, meanprops={'markersize': 1, 'markeredgecolor': 'k'}) ylabel = self._spell_out_chemical_concentrations(target_var) @@ -449,7 +449,7 @@ class PlotClimatologicalSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots() if not score_only: fig.set_size_inches(11.7, 8.27) - sns.boxplot(x="terms", y="data", hue="ahead", data=self._data, ax=ax, whis=1., palette="Blues_d", + sns.boxplot(x="terms", y="data", hue="ahead", data=self._data, ax=ax, whis=1.5, palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) ax.axhline(y=0, color="grey", linewidth=.5) ax.set(ylabel=f"{self._label_add(score_only)}skill score", xlabel="", title="summary of all stations", @@ -539,7 +539,7 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots(figsize=(size, size * 0.8)) data = self._filter_comparisons(self._data) if single_model_comparison is True else self._data order = self._create_pseudo_order(data) - sns.boxplot(x="comparison", y="data", hue="ahead", data=data, whis=1., ax=ax, palette="Blues_d", + sns.boxplot(x="comparison", y="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_d", showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, order=order) ax.axhline(y=0, color="grey", linewidth=.5) @@ -554,7 +554,7 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots() data = self._filter_comparisons(self._data) if single_model_comparison is True else self._data order = self._create_pseudo_order(data) - sns.boxplot(y="comparison", x="data", hue="ahead", data=data, whis=1., ax=ax, palette="Blues_d", + sns.boxplot(y="comparison", x="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_d", showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, order=order) ax.axvline(x=0, color="grey", linewidth=.5) @@ -613,7 +613,8 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = "", separate_vars: List = None, sampling: str = "daily", ahead_dim: str = "ahead", bootstrap_type: str = None, - bootstrap_method: str = None): + bootstrap_method: str = None, boot_dim: str = "boots", model_name: str = "NN", + branch_names: list = None, ylim: tuple = None): """ Set attributes and create plot. @@ -632,15 +633,20 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover self._labels = None self._x_name = "boot_var" self._ahead_dim = ahead_dim + self._boot_dim = boot_dim self._boot_type = self._set_bootstrap_type(bootstrap_type) self._boot_method = self._set_bootstrap_method(bootstrap_method) + self._branches_names = branch_names + self._ylim = ylim - self._title = f"Bootstrap analysis ({self._boot_method}, {self._boot_type})" + title_d = {"single input": "Single Inputs", "branch": "Input Branches", "variable": "Variables"} + self._title = f"{model_name}\nImportance of {title_d[self._boot_type]}" self._data = self._prepare_data(data, sampling) if "branch" in self._data.columns: plot_name = self.plot_name for branch in self._data["branch"].unique(): - self._title = f"Bootstrap analysis ({self._boot_method}, {self._boot_type}, {branch})" + branch_name = self._branches_names[branch] if self._branches_names is not None else branch + self._title = f"{model_name}\nImportance of {title_d[self._boot_type]} ({branch_name})" self._plot(branch=branch) self.plot_name = f"{plot_name}_{branch}" self._save() @@ -676,10 +682,11 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover number_tags = self._get_number_tag(data.coords[self._x_name].values, split_by='_') new_boot_coords = self._return_vars_without_number_tag(data.coords[self._x_name].values, split_by='_', keep=1, as_unique=True) - values = data.values.reshape((data.shape[0], len(new_boot_coords), len(number_tags), data.shape[-1])) - data = xr.DataArray(values, coords={station_dim: data.coords["station"], self._x_name: new_boot_coords, - "branch": number_tags, self._ahead_dim: data.coords[self._ahead_dim]}, - dims=[station_dim, self._x_name, "branch", self._ahead_dim]) + values = data.values.reshape((*data.shape[:3], len(number_tags), len(new_boot_coords))) + data = xr.DataArray(values, coords={station_dim: data.coords[station_dim], self._x_name: new_boot_coords, + "branch": number_tags, self._ahead_dim: data.coords[self._ahead_dim], + self._boot_dim: data.coords[self._boot_dim]}, + dims=[station_dim, self._ahead_dim, self._boot_dim, "branch", self._x_name]) else: try: new_boot_coords = self._return_vars_without_number_tag(data.coords[self._x_name].values, split_by='_', @@ -754,14 +761,14 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover if len(separate_vars) > 1: first_box_width = .8 else: - first_box_width = 2. + first_box_width = .8 - sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_first, ax=ax[0], whis=1., + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_first, ax=ax[0], whis=1.5, palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}, width=first_box_width) ax[0].set(ylabel=f"skill score", xlabel="") - sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_second, ax=ax[1], whis=1., + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_second, ax=ax[1], whis=1.5, palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) ax[1].set(ylabel="", xlabel="") @@ -773,7 +780,8 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover plt.xticks(rotation=45, ha='right') sax.legend_.remove() - fig.legend(handles, self._labels, loc='upper center', ncol=len(handles) + 1, ) + # fig.legend(handles, self._labels, loc='upper center', ncol=len(handles) + 1, ) + ax[1].legend(handles, self._labels, loc='lower center', ncol=len(handles) + 1, fontsize="medium") def align_yaxis(ax1, ax2): """ @@ -798,6 +806,7 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover align_yaxis(ax[0], ax[1]) align_yaxis(ax[0], ax[1]) + plt.subplots_adjust(right=0.8) plt.title(self._title) @staticmethod @@ -827,12 +836,25 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover """ """ - fig, ax = plt.subplots() plot_data = self._data if branch is None else self._data[self._data["branch"] == str(branch)] - sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1., palette="Blues_d", - showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) + if self._boot_type == "branch": + fig, ax = plt.subplots(figsize=(0.5 + 2 / len(plot_data[self._x_name].unique()) + len(plot_data[self._x_name].unique()),4)) + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1., palette="Blues_d", + showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}, + width=0.8) + else: + fig, ax = plt.subplots() + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1.5, palette="Blues_d", + showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) ax.axhline(y=0, color="grey", linewidth=.5) - plt.xticks(rotation=45) + + if self._ylim is not None: + ax.set(ylim=self._ylim) + + if self._boot_type == "branch": + plt.xticks() + else: + plt.xticks(rotation=45) ax.set(ylabel=f"skill score", xlabel="", title=self._title) handles, _ = ax.get_legend_handles_labels() ax.legend(handles, self._labels) @@ -1051,7 +1073,7 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass): # pragma: no cover else: raise ValueError(f"orientation must be `v' or `h' but is: {orientation}") fig, ax = plt.subplots(figsize=figsize) - sns.boxplot(data=data_table, ax=ax, whis=1., color="white", + sns.boxplot(data=data_table, ax=ax, whis=1.5, color="white", showmeans=True, meanprops={"markersize": 6, "markeredgecolor": "k"}, flierprops={"marker": "o", "markerfacecolor": "black", "markeredgecolor": "none", "markersize": 3}, boxprops={'facecolor': 'none', 'edgecolor': 'k'}, diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 1de27067fa829db44aad8fff4fa3b4fe3288109c..0c516242e352aecd5e8741656263355407a4127e 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -298,26 +298,34 @@ class PostProcessing(RunEnvironment): These forecasts are saved in bootstrap_path with the names `bootstraps_{var}_{station}.nc` and `bootstraps_labels_{station}.nc`. """ + + def _reshape(d, pos): + if isinstance(d, list): + return list(map(lambda x: _reshape(x, pos), d)) + else: + return d[..., pos] + # forecast with TimeTracking(name=f"{inspect.stack()[0].function} ({bootstrap_type}, {bootstrap_method})"): # extract all requirements from data store forecast_path = self.data_store.get("forecast_path") number_of_bootstraps = self.data_store.get("n_boots", "feature_importance") - dims = [self.index_dim, self.ahead_dim, self.model_type_dim] + dims = [self.uncertainty_estimate_boot_dim, self.index_dim, self.ahead_dim, self.model_type_dim] for station in self.test_data: X, Y = None, None bootstraps = Bootstraps(station, number_of_bootstraps, bootstrap_type=bootstrap_type, bootstrap_method=bootstrap_method) + number_of_bootstraps = bootstraps.number_of_bootstraps for boot in bootstraps: X, Y, (index, dimension) = boot # make bootstrap predictions - bootstrap_predictions = self.model.predict(X) - if isinstance(bootstrap_predictions, list): # if model is branched model - bootstrap_predictions = bootstrap_predictions[-1] + bootstrap_predictions = [self.model.predict(_reshape(X, pos)) for pos in range(number_of_bootstraps)] + if isinstance(bootstrap_predictions[0], list): # if model is branched model + bootstrap_predictions = list(map(lambda x: x[-1], bootstrap_predictions)) # save bootstrap predictions separately for each station and variable combination - bootstrap_predictions = np.expand_dims(bootstrap_predictions, axis=-1) - shape = bootstrap_predictions.shape - coords = (range(shape[0]), range(1, shape[1] + 1)) + bootstrap_predictions = list(map(lambda x: np.expand_dims(x, axis=-1), bootstrap_predictions)) + shape = bootstrap_predictions[0].shape + coords = (range(number_of_bootstraps), range(shape[0]), range(1, shape[1] + 1)) var = f"{index}_{dimension}" if index is not None else str(dimension) tmp = xr.DataArray(bootstrap_predictions, coords=(*coords, [var]), dims=dims) file_name = os.path.join(forecast_path, @@ -325,9 +333,9 @@ class PostProcessing(RunEnvironment): tmp.to_netcdf(file_name) else: # store also true labels for each station - labels = np.expand_dims(Y, axis=-1) + labels = np.expand_dims(Y[..., 0], axis=-1) file_name = os.path.join(forecast_path, f"bootstraps_{station}_{bootstrap_method}_labels.nc") - labels = xr.DataArray(labels, coords=(*coords, [self.observation_indicator]), dims=dims) + labels = xr.DataArray(labels, coords=(*coords[1:], [self.observation_indicator]), dims=dims[1:]) labels.to_netcdf(file_name) def calculate_feature_importance_skill_scores(self, bootstrap_type, bootstrap_method) -> Dict[str, xr.DataArray]: @@ -358,16 +366,13 @@ class PostProcessing(RunEnvironment): file_name = os.path.join(forecast_path, f"bootstraps_{str(station)}_{bootstrap_method}_labels.nc") with xr.open_dataarray(file_name) as da: labels = da.load() - shape = labels.shape # get original forecasts - orig = self.get_orig_prediction(forecast_path, forecast_file % str(station), number_of_bootstraps) - orig = orig.reshape(shape) - coords = (range(shape[0]), range(1, shape[1] + 1), [reference_name]) - orig = xr.DataArray(orig, coords=coords, dims=[self.index_dim, self.ahead_dim, self.model_type_dim]) + orig = self.get_orig_prediction(forecast_path, forecast_file % str(station), reference_name=reference_name) + orig.coords[self.index_dim] = labels.coords[self.index_dim] # calculate skill scores for each variable - skill = pd.DataFrame(columns=range(1, self.window_lead_time + 1)) + skill = [] for boot_set in bootstrap_iter: boot_var = f"{boot_set[0]}_{boot_set[1]}" if isinstance(boot_set, tuple) else str(boot_set) file_name = os.path.join(forecast_path, @@ -380,27 +385,32 @@ class PostProcessing(RunEnvironment): data = boot_data.sel({self.ahead_dim: ahead}) boot_scores.append( skill_scores.general_skill_score(data, forecast_name=boot_var, - reference_name=reference_name)) - skill.loc[boot_var] = np.array(boot_scores) + reference_name=reference_name, dim=self.index_dim)) + tmp = xr.DataArray(np.expand_dims(np.array(boot_scores), axis=-1), + coords=[range(1, self.window_lead_time + 1), range(number_of_bootstraps), + [boot_var]], + dims=[self.ahead_dim, self.uncertainty_estimate_boot_dim, self.boot_var_dim]) + skill.append(tmp) # collect all results in single dictionary - score[str(station)] = xr.DataArray(skill, dims=[self.boot_var_dim, self.ahead_dim]) + score[str(station)] = xr.concat(skill, dim=self.boot_var_dim) return score - def get_orig_prediction(self, path, file_name, number_of_bootstraps, prediction_name=None): + def get_orig_prediction(self, path, file_name, prediction_name=None, reference_name=None): if prediction_name is None: prediction_name = self.forecast_indicator file = os.path.join(path, file_name) with xr.open_dataarray(file) as da: - prediction = da.load().sel(type=prediction_name).squeeze() - return self.repeat_data(prediction, number_of_bootstraps) + prediction = da.load().sel({self.model_type_dim: [prediction_name]}) + if reference_name is not None: + prediction.coords[self.model_type_dim] = [reference_name] + return prediction.dropna(dim=self.index_dim) @staticmethod def repeat_data(data, number_of_repetition): if isinstance(data, xr.DataArray): data = data.data - vals = np.tile(data, (number_of_repetition, 1)) - return vals[~np.isnan(vals).any(axis=1), :] + return np.repeat(np.expand_dims(data, axis=-1), number_of_repetition, axis=-1) def _get_model_name(self): """Return model name without path information.""" @@ -465,6 +475,8 @@ class PostProcessing(RunEnvironment): for boot_type, boot_data in self.feature_importance_skill_scores.items(): for boot_method, boot_skill_score in boot_data.items(): try: + #todo rename plot name + #todo check if plot works properly PlotBootstrapSkillScore(boot_skill_score, plot_folder=self.plot_path, model_setup=self.forecast_indicator, sampling=self._sampling, ahead_dim=self.ahead_dim, separate_vars=to_list(self.target_var), @@ -962,15 +974,17 @@ class PostProcessing(RunEnvironment): """Create a csv file containing all results from feature importance.""" report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report") path_config.check_path_and_create(report_path) - res = [[self.model_type_dim, "method", "station", self.boot_var_dim, self.ahead_dim, "vals"]] + res = [] for boot_type, d0 in results.items(): for boot_method, d1 in d0.items(): for station_name, vals in d1.items(): for boot_var in vals.coords[self.boot_var_dim].values.tolist(): for ahead in vals.coords[self.ahead_dim].values.tolist(): res.append([boot_type, boot_method, station_name, boot_var, ahead, - float(vals.sel({self.boot_var_dim: boot_var, self.ahead_dim: ahead}))]) - col_names = res.pop(0) + *vals.sel({self.boot_var_dim: boot_var, + self.ahead_dim: ahead}).values.round(5).tolist()]) + col_names = [self.model_type_dim, "method", "station", self.boot_var_dim, self.ahead_dim, + *list(range(len(res[0]) - 5))] df = pd.DataFrame(res, columns=col_names) file_name = "feature_importance_skill_score_report_raw.csv" df.to_csv(os.path.join(report_path, file_name), sep=";")