diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 9b79dc6ac815ce6e7f55f67062fc1e6172544512..ca569720dc41d95621d0613a2170cc4d9d46c082 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -54,7 +54,7 @@ DEFAULT_FEATURE_IMPORTANCE_N_BOOTS = 20 DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE = "singleinput" DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD = "shuffle" DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", - "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", + "PlotCompetitiveSkillScore", "PlotFeatureImportanceSkillScore", "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram", "PlotSampleUncertaintyFromBootstrap"] DEFAULT_SAMPLING = "daily" diff --git a/mlair/data_handler/input_bootstraps.py b/mlair/data_handler/input_bootstraps.py index ab4c71f1c77ab12b0e751f6991fbf20cd55aa8ae..187f09050bb39a953ac58c2b7fca54b6a207aed1 100644 --- a/mlair/data_handler/input_bootstraps.py +++ b/mlair/data_handler/input_bootstraps.py @@ -28,12 +28,13 @@ class BootstrapIterator(Iterator): _position: int = None - def __init__(self, data: "Bootstraps", method): + def __init__(self, data: "Bootstraps", method, return_reshaped=False): assert isinstance(data, Bootstraps) self._data = data self._dimension = data.bootstrap_dimension self.boot_dim = "boots" self._method = method + self._return_reshaped = return_reshaped self._collection = self.create_collection(self._data.data, self._dimension) self._position = 0 @@ -46,12 +47,15 @@ class BootstrapIterator(Iterator): raise NotImplementedError def _reshape(self, d): - if isinstance(d, list): - return list(map(lambda x: self._reshape(x), d)) - # return list(map(lambda x: np.rollaxis(x, -1, 0).reshape(x.shape[0] * x.shape[-1], *x.shape[1:-1]), d)) + if self._return_reshaped: + if isinstance(d, list): + return list(map(lambda x: self._reshape(x), d)) + # return list(map(lambda x: np.rollaxis(x, -1, 0).reshape(x.shape[0] * x.shape[-1], *x.shape[1:-1]), d)) + else: + shape = d.shape + return np.rollaxis(d, -1, 0).reshape(shape[0] * shape[-1], *shape[1:-1]) else: - shape = d.shape - return np.rollaxis(d, -1, 0).reshape(shape[0] * shape[-1], *shape[1:-1]) + return d def _to_numpy(self, d): if isinstance(d, list): @@ -75,8 +79,8 @@ class BootstrapIterator(Iterator): class BootstrapIteratorSingleInput(BootstrapIterator): _position: int = None - def __init__(self, *args): - super().__init__(*args) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def __next__(self): """Return next element or stop iteration.""" @@ -107,8 +111,8 @@ class BootstrapIteratorSingleInput(BootstrapIterator): class BootstrapIteratorVariable(BootstrapIterator): - def __init__(self, *args): - super().__init__(*args) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def __next__(self): """Return next element or stop iteration.""" @@ -140,8 +144,8 @@ class BootstrapIteratorVariable(BootstrapIterator): class BootstrapIteratorBranch(BootstrapIterator): - def __init__(self, *args): - super().__init__(*args) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def __next__(self): try: diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index aa89da0ea66e263d076af9abd578ba125c260bec..af7975f3a042163a885f590c6624076fe91f03aa 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -361,7 +361,7 @@ class SkillScores: **kwargs) def general_skill_score(self, data: Data, forecast_name: str, reference_name: str, - observation_name: str = None) -> np.ndarray: + observation_name: str = None, dim: str = "index") -> np.ndarray: r""" Calculate general skill score based on mean squared error. @@ -374,12 +374,12 @@ class SkillScores: """ if observation_name is None: observation_name = self.observation_name - data = data.dropna("index") + data = data.dropna(dim) observation = data.sel(type=observation_name) forecast = data.sel(type=forecast_name) reference = data.sel(type=reference_name) mse = mean_squared_error - skill_score = 1 - mse(observation, forecast) / mse(observation, reference) + skill_score = 1 - mse(observation, forecast, dim=dim) / mse(observation, reference, dim=dim) return skill_score.values def get_count(self, data: Data, forecast_name: str, reference_name: str, diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 2afb5f08ec6401fa2e693c924cf263204ffbb75d..43f1864f7354c1f711bb886f4f97eda56439ab89 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -129,7 +129,7 @@ class PlotMonthlySummary(AbstractPlotClass): # pragma: no cover logging.debug("... start plotting") color_palette = [matplotlib.colors.cnames["green"]] + sns.color_palette("Blues_d", self._window_lead_time).as_hex() - ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1., palette=color_palette, + ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1.5, palette=color_palette, flierprops={'marker': '.', 'markersize': 1}, showmeans=True, meanprops={'markersize': 1, 'markeredgecolor': 'k'}) ylabel = self._spell_out_chemical_concentrations(target_var) @@ -449,7 +449,7 @@ class PlotClimatologicalSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots() if not score_only: fig.set_size_inches(11.7, 8.27) - sns.boxplot(x="terms", y="data", hue="ahead", data=self._data, ax=ax, whis=1., palette="Blues_d", + sns.boxplot(x="terms", y="data", hue="ahead", data=self._data, ax=ax, whis=1.5, palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) ax.axhline(y=0, color="grey", linewidth=.5) ax.set(ylabel=f"{self._label_add(score_only)}skill score", xlabel="", title="summary of all stations", @@ -539,7 +539,7 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots(figsize=(size, size * 0.8)) data = self._filter_comparisons(self._data) if single_model_comparison is True else self._data order = self._create_pseudo_order(data) - sns.boxplot(x="comparison", y="data", hue="ahead", data=data, whis=1., ax=ax, palette="Blues_d", + sns.boxplot(x="comparison", y="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_d", showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, order=order) ax.axhline(y=0, color="grey", linewidth=.5) @@ -554,7 +554,7 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): # pragma: no cover fig, ax = plt.subplots() data = self._filter_comparisons(self._data) if single_model_comparison is True else self._data order = self._create_pseudo_order(data) - sns.boxplot(y="comparison", x="data", hue="ahead", data=data, whis=1., ax=ax, palette="Blues_d", + sns.boxplot(y="comparison", x="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_d", showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, order=order) ax.axvline(x=0, color="grey", linewidth=.5) @@ -591,14 +591,9 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): # pragma: no cover @TimeTrackingWrapper -class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover +class PlotFeatureImportanceSkillScore(AbstractPlotClass): # pragma: no cover """ - Create plot of climatological skill score after Murphy (1988) as box plot over all stations. - - A forecast time step (called "ahead") is separately shown to highlight the differences for each prediction time - step. Either each single term is plotted (score_only=False) or only the resulting scores CASE I to IV are displayed - (score_only=True, default). Y-axis is adjusted following the data and not hard coded. The plot is saved under - plot_folder path with name skill_score_clim_{extra_name_tag}{model_setup}.pdf and resolution of 500dpi. + Create plot of feature importance analysis. By passing a list `separate_vars` containing variable names, a second plot is created showing the `separate_vars` and the remaining variables side by side with different scaling. @@ -613,7 +608,8 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = "", separate_vars: List = None, sampling: str = "daily", ahead_dim: str = "ahead", bootstrap_type: str = None, - bootstrap_method: str = None): + bootstrap_method: str = None, boot_dim: str = "boots", model_name: str = "NN", + branch_names: list = None, ylim: tuple = None): """ Set attributes and create plot. @@ -626,24 +622,32 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover :param bootstrap_annotation: additional information to use in the file name (default: None) """ annotation = ["_".join([s for s in ["", bootstrap_type, bootstrap_method] if s is not None])][0] - super().__init__(plot_folder, f"skill_score_bootstrap_{model_setup}{annotation}") + super().__init__(plot_folder, f"feature_importance_{model_setup}{annotation}") if separate_vars is None: separate_vars = ['o3'] self._labels = None self._x_name = "boot_var" self._ahead_dim = ahead_dim + self._boot_dim = boot_dim self._boot_type = self._set_bootstrap_type(bootstrap_type) self._boot_method = self._set_bootstrap_method(bootstrap_method) + self._number_of_bootstraps = 0 + self._branches_names = branch_names + self._ylim = ylim - self._title = f"Bootstrap analysis ({self._boot_method}, {self._boot_type})" self._data = self._prepare_data(data, sampling) + self._set_title(model_name) if "branch" in self._data.columns: plot_name = self.plot_name for branch in self._data["branch"].unique(): - self._title = f"Bootstrap analysis ({self._boot_method}, {self._boot_type}, {branch})" + self._set_title(model_name, branch) self._plot(branch=branch) self.plot_name = f"{plot_name}_{branch}" self._save() + if len(set(separate_vars).intersection(self._data[self._x_name].unique())) > 0: + self.plot_name += '_separated' + self._plot(branch=branch, separate_vars=separate_vars) + self._save(bbox_inches='tight') else: self._plot() self._save() @@ -656,6 +660,21 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover def _set_bootstrap_type(boot_type): return {"singleinput": "single input"}.get(boot_type, boot_type) + def _set_title(self, model_name, branch=None): + title_d = {"single input": "Single Inputs", "branch": "Input Branches", "variable": "Variables"} + base_title = f"{model_name}\nImportance of {title_d[self._boot_type]}" + + additional = [] + if branch is not None: + branch_name = self._branches_names[branch] if self._branches_names is not None else branch + additional.append(branch_name) + if self._number_of_bootstraps > 1: + additional.append(f"n={self._number_of_bootstraps}") + additional_title = ", ".join(additional) + if len(additional_title) > 0: + additional_title = f" ({additional_title})" + self._title = base_title + additional_title + @staticmethod def _set_bootstrap_method(boot_method): return {"zero_mean": "zero mean", "shuffle": "shuffled"}.get(boot_method, boot_method) @@ -672,14 +691,16 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover """ station_dim = "station" data = helpers.dict_to_xarray(data, station_dim).sortby(self._x_name) + data = data.transpose(station_dim, self._ahead_dim, self._boot_dim, self._x_name) if self._boot_type == "single input": number_tags = self._get_number_tag(data.coords[self._x_name].values, split_by='_') new_boot_coords = self._return_vars_without_number_tag(data.coords[self._x_name].values, split_by='_', keep=1, as_unique=True) - values = data.values.reshape((data.shape[0], len(new_boot_coords), len(number_tags), data.shape[-1])) - data = xr.DataArray(values, coords={station_dim: data.coords["station"], self._x_name: new_boot_coords, - "branch": number_tags, self._ahead_dim: data.coords[self._ahead_dim]}, - dims=[station_dim, self._x_name, "branch", self._ahead_dim]) + values = data.values.reshape((*data.shape[:3], len(number_tags), len(new_boot_coords))) + data = xr.DataArray(values, coords={station_dim: data.coords[station_dim], self._x_name: new_boot_coords, + "branch": number_tags, self._ahead_dim: data.coords[self._ahead_dim], + self._boot_dim: data.coords[self._boot_dim]}, + dims=[station_dim, self._ahead_dim, self._boot_dim, "branch", self._x_name]) else: try: new_boot_coords = self._return_vars_without_number_tag(data.coords[self._x_name].values, split_by='_', @@ -691,6 +712,7 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover self._labels = [str(i) + sampling_letter for i in data.coords[self._ahead_dim].values] if station_dim not in data.dims: data = data.expand_dims(station_dim) + self._number_of_bootstraps = np.unique(data.coords[self._boot_dim].values).shape[0] return data.to_dataframe("data").reset_index(level=np.arange(len(data.dims)).tolist()) @staticmethod @@ -739,10 +761,10 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover if separate_vars is None: self._plot_all_variables(branch) else: - self._plot_selected_variables(separate_vars) + self._plot_selected_variables(separate_vars, branch) - def _plot_selected_variables(self, separate_vars: List): - data = self._data + def _plot_selected_variables(self, separate_vars: List, branch=None): + data = self._data if branch is None else self._data[self._data["branch"] == str(branch)] self.raise_error_if_separate_vars_do_not_exist(data, separate_vars, self._x_name) all_variables = self._get_unique_values_from_column_of_df(data, self._x_name) remaining_vars = helpers.remove_items(all_variables, separate_vars) @@ -750,22 +772,30 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover data_second = self._select_data(df=data, variables=remaining_vars, column_name=self._x_name) fig, ax = plt.subplots(nrows=1, ncols=2, gridspec_kw={'width_ratios': [len(separate_vars), - len(remaining_vars)]}) + len(remaining_vars)]}, + figsize=(len(remaining_vars),len(remaining_vars)/2.)) if len(separate_vars) > 1: first_box_width = .8 else: - first_box_width = 2. + first_box_width = .8 - sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_first, ax=ax[0], whis=1., + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_first, ax=ax[0], whis=1.5, palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, - flierprops={"marker": "."}, width=first_box_width) + showfliers=False, width=first_box_width) ax[0].set(ylabel=f"skill score", xlabel="") + if self._ylim is not None: + _ylim = self._ylim if isinstance(self._ylim, tuple) else self._ylim[0] + ax[0].set(ylim=_ylim) - sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_second, ax=ax[1], whis=1., + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_second, ax=ax[1], whis=1.5, palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, - flierprops={"marker": "."}) + showfliers=False) ax[1].set(ylabel="", xlabel="") ax[1].yaxis.tick_right() + if self._ylim is not None and isinstance(self._ylim, list): + _ylim = self._ylim[1] + ax[1].set(ylim=_ylim) + handles, _ = ax[1].get_legend_handles_labels() for sax in ax: matplotlib.pyplot.sca(sax) @@ -773,7 +803,8 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover plt.xticks(rotation=45, ha='right') sax.legend_.remove() - fig.legend(handles, self._labels, loc='upper center', ncol=len(handles) + 1, ) + # fig.legend(handles, self._labels, loc='upper center', ncol=len(handles) + 1, ) + ax[1].legend(handles, self._labels, loc='lower center', ncol=len(handles) + 1, fontsize="medium") def align_yaxis(ax1, ax2): """ @@ -798,6 +829,7 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover align_yaxis(ax[0], ax[1]) align_yaxis(ax[0], ax[1]) + plt.subplots_adjust(right=0.8) plt.title(self._title) @staticmethod @@ -827,12 +859,29 @@ class PlotBootstrapSkillScore(AbstractPlotClass): # pragma: no cover """ """ - fig, ax = plt.subplots() plot_data = self._data if branch is None else self._data[self._data["branch"] == str(branch)] - sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1., palette="Blues_d", - showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) + if self._boot_type == "branch": + fig, ax = plt.subplots(figsize=(0.5 + 2 / len(plot_data[self._x_name].unique()) + len(plot_data[self._x_name].unique()),4)) + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1., + palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, + showfliers=False, width=0.8) + else: + fig, ax = plt.subplots() + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1.5, palette="Blues_d", + showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, showfliers=False) ax.axhline(y=0, color="grey", linewidth=.5) - plt.xticks(rotation=45) + + if self._ylim is not None: + if isinstance(self._ylim, tuple): + _ylim = self._ylim + else: + _ylim = (min(self._ylim[0][0], self._ylim[1][0]), max(self._ylim[0][1], self._ylim[1][1])) + ax.set(ylim=_ylim) + + if self._boot_type == "branch": + plt.xticks() + else: + plt.xticks(rotation=45) ax.set(ylabel=f"skill score", xlabel="", title=self._title) handles, _ = ax.get_legend_handles_labels() ax.legend(handles, self._labels) @@ -1051,7 +1100,7 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass): # pragma: no cover else: raise ValueError(f"orientation must be `v' or `h' but is: {orientation}") fig, ax = plt.subplots(figsize=figsize) - sns.boxplot(data=data_table, ax=ax, whis=1., color="white", + sns.boxplot(data=data_table, ax=ax, whis=1.5, color="white", showmeans=True, meanprops={"markersize": 6, "markeredgecolor": "k"}, flierprops={"marker": "o", "markerfacecolor": "black", "markeredgecolor": "none", "markersize": 3}, boxprops={'facecolor': 'none', 'edgecolor': 'k'}, diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 1de27067fa829db44aad8fff4fa3b4fe3288109c..e3aa2154559622fdd699d430bc4d386499f5114d 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -22,7 +22,7 @@ from mlair.helpers import TimeTracking, statistics, extract_value, remove_items, from mlair.model_modules.linear_model import OrdinaryLeastSquaredModel from mlair.model_modules import AbstractModelClass from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotClimatologicalSkillScore, \ - PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotConditionalQuantiles, \ + PlotCompetitiveSkillScore, PlotTimeSeries, PlotFeatureImportanceSkillScore, PlotConditionalQuantiles, \ PlotSeparationOfScales, PlotSampleUncertaintyFromBootstrap from mlair.plotting.data_insight_plotting import PlotStationMap, PlotAvailability, PlotAvailabilityHistogram, \ PlotPeriodogram, PlotDataHistogram @@ -272,7 +272,8 @@ class PostProcessing(RunEnvironment): if _iter == 0: self.feature_importance_skill_scores = {} for boot_type in to_list(bootstrap_type): - self.feature_importance_skill_scores[boot_type] = {} + if _iter == 0: + self.feature_importance_skill_scores[boot_type] = {} for boot_method in to_list(bootstrap_method): try: if create_new_bootstraps: @@ -281,7 +282,7 @@ class PostProcessing(RunEnvironment): boot_skill_score = self.calculate_feature_importance_skill_scores(bootstrap_type=boot_type, bootstrap_method=boot_method) self.feature_importance_skill_scores[boot_type][boot_method] = boot_skill_score - except FileNotFoundError: + except (FileNotFoundError, ValueError): if _iter != 0: raise RuntimeError(f"calculate_feature_importance ({boot_type}, {boot_type}) was called for the " f"2nd time. This means, that something internally goes wrong. Please check " @@ -298,26 +299,34 @@ class PostProcessing(RunEnvironment): These forecasts are saved in bootstrap_path with the names `bootstraps_{var}_{station}.nc` and `bootstraps_labels_{station}.nc`. """ + + def _reshape(d, pos): + if isinstance(d, list): + return list(map(lambda x: _reshape(x, pos), d)) + else: + return d[..., pos] + # forecast with TimeTracking(name=f"{inspect.stack()[0].function} ({bootstrap_type}, {bootstrap_method})"): # extract all requirements from data store forecast_path = self.data_store.get("forecast_path") number_of_bootstraps = self.data_store.get("n_boots", "feature_importance") - dims = [self.index_dim, self.ahead_dim, self.model_type_dim] + dims = [self.uncertainty_estimate_boot_dim, self.index_dim, self.ahead_dim, self.model_type_dim] for station in self.test_data: X, Y = None, None bootstraps = Bootstraps(station, number_of_bootstraps, bootstrap_type=bootstrap_type, bootstrap_method=bootstrap_method) + number_of_bootstraps = bootstraps.number_of_bootstraps for boot in bootstraps: X, Y, (index, dimension) = boot # make bootstrap predictions - bootstrap_predictions = self.model.predict(X) - if isinstance(bootstrap_predictions, list): # if model is branched model - bootstrap_predictions = bootstrap_predictions[-1] + bootstrap_predictions = [self.model.predict(_reshape(X, pos)) for pos in range(number_of_bootstraps)] + if isinstance(bootstrap_predictions[0], list): # if model is branched model + bootstrap_predictions = list(map(lambda x: x[-1], bootstrap_predictions)) # save bootstrap predictions separately for each station and variable combination - bootstrap_predictions = np.expand_dims(bootstrap_predictions, axis=-1) - shape = bootstrap_predictions.shape - coords = (range(shape[0]), range(1, shape[1] + 1)) + bootstrap_predictions = list(map(lambda x: np.expand_dims(x, axis=-1), bootstrap_predictions)) + shape = bootstrap_predictions[0].shape + coords = (range(number_of_bootstraps), range(shape[0]), range(1, shape[1] + 1)) var = f"{index}_{dimension}" if index is not None else str(dimension) tmp = xr.DataArray(bootstrap_predictions, coords=(*coords, [var]), dims=dims) file_name = os.path.join(forecast_path, @@ -325,9 +334,9 @@ class PostProcessing(RunEnvironment): tmp.to_netcdf(file_name) else: # store also true labels for each station - labels = np.expand_dims(Y, axis=-1) + labels = np.expand_dims(Y[..., 0], axis=-1) file_name = os.path.join(forecast_path, f"bootstraps_{station}_{bootstrap_method}_labels.nc") - labels = xr.DataArray(labels, coords=(*coords, [self.observation_indicator]), dims=dims) + labels = xr.DataArray(labels, coords=(*coords[1:], [self.observation_indicator]), dims=dims[1:]) labels.to_netcdf(file_name) def calculate_feature_importance_skill_scores(self, bootstrap_type, bootstrap_method) -> Dict[str, xr.DataArray]: @@ -358,16 +367,13 @@ class PostProcessing(RunEnvironment): file_name = os.path.join(forecast_path, f"bootstraps_{str(station)}_{bootstrap_method}_labels.nc") with xr.open_dataarray(file_name) as da: labels = da.load() - shape = labels.shape # get original forecasts - orig = self.get_orig_prediction(forecast_path, forecast_file % str(station), number_of_bootstraps) - orig = orig.reshape(shape) - coords = (range(shape[0]), range(1, shape[1] + 1), [reference_name]) - orig = xr.DataArray(orig, coords=coords, dims=[self.index_dim, self.ahead_dim, self.model_type_dim]) + orig = self.get_orig_prediction(forecast_path, forecast_file % str(station), reference_name=reference_name) + orig.coords[self.index_dim] = labels.coords[self.index_dim] # calculate skill scores for each variable - skill = pd.DataFrame(columns=range(1, self.window_lead_time + 1)) + skill = [] for boot_set in bootstrap_iter: boot_var = f"{boot_set[0]}_{boot_set[1]}" if isinstance(boot_set, tuple) else str(boot_set) file_name = os.path.join(forecast_path, @@ -380,27 +386,33 @@ class PostProcessing(RunEnvironment): data = boot_data.sel({self.ahead_dim: ahead}) boot_scores.append( skill_scores.general_skill_score(data, forecast_name=boot_var, - reference_name=reference_name)) - skill.loc[boot_var] = np.array(boot_scores) + reference_name=reference_name, dim=self.index_dim)) + tmp = xr.DataArray(np.expand_dims(np.array(boot_scores), axis=-1), + coords={self.ahead_dim: range(1, self.window_lead_time + 1), + self.uncertainty_estimate_boot_dim: range(number_of_bootstraps), + self.boot_var_dim: [boot_var]}, + dims=[self.ahead_dim, self.uncertainty_estimate_boot_dim, self.boot_var_dim]) + skill.append(tmp) # collect all results in single dictionary - score[str(station)] = xr.DataArray(skill, dims=[self.boot_var_dim, self.ahead_dim]) + score[str(station)] = xr.concat(skill, dim=self.boot_var_dim) return score - def get_orig_prediction(self, path, file_name, number_of_bootstraps, prediction_name=None): + def get_orig_prediction(self, path, file_name, prediction_name=None, reference_name=None): if prediction_name is None: prediction_name = self.forecast_indicator file = os.path.join(path, file_name) with xr.open_dataarray(file) as da: - prediction = da.load().sel(type=prediction_name).squeeze() - return self.repeat_data(prediction, number_of_bootstraps) + prediction = da.load().sel({self.model_type_dim: [prediction_name]}) + if reference_name is not None: + prediction.coords[self.model_type_dim] = [reference_name] + return prediction.dropna(dim=self.index_dim) @staticmethod def repeat_data(data, number_of_repetition): if isinstance(data, xr.DataArray): data = data.data - vals = np.tile(data, (number_of_repetition, 1)) - return vals[~np.isnan(vals).any(axis=1), :] + return np.repeat(np.expand_dims(data, axis=-1), number_of_repetition, axis=-1) def _get_model_name(self): """Return model name without path information.""" @@ -461,20 +473,21 @@ class PostProcessing(RunEnvironment): f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") try: - if (self.feature_importance_skill_scores is not None) and ("PlotBootstrapSkillScore" in plot_list): + if (self.feature_importance_skill_scores is not None) and ("PlotFeatureImportanceSkillScore" in plot_list): for boot_type, boot_data in self.feature_importance_skill_scores.items(): for boot_method, boot_skill_score in boot_data.items(): try: - PlotBootstrapSkillScore(boot_skill_score, plot_folder=self.plot_path, - model_setup=self.forecast_indicator, sampling=self._sampling, - ahead_dim=self.ahead_dim, separate_vars=to_list(self.target_var), - bootstrap_type=boot_type, bootstrap_method=boot_method) + PlotFeatureImportanceSkillScore( + boot_skill_score, plot_folder=self.plot_path, model_setup=self.forecast_indicator, + sampling=self._sampling, ahead_dim=self.ahead_dim, + separate_vars=to_list(self.target_var), bootstrap_type=boot_type, + bootstrap_method=boot_method) except Exception as e: - logging.error(f"Could not create plot PlotBootstrapSkillScore ({boot_type}, {boot_method}) " - f"due to the following error:\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n" - f"{sys.exc_info()[2]}") + logging.error(f"Could not create plot PlotFeatureImportanceSkillScore ({boot_type}, " + f"{boot_method}) due to the following error:\n{sys.exc_info()[0]}\n" + f"{sys.exc_info()[1]}\n{sys.exc_info()[2]}") except Exception as e: - logging.error(f"Could not create plot PlotBootstrapSkillScore due to the following error: {e}") + logging.error(f"Could not create plot PlotFeatureImportanceSkillScore due to the following error: {e}") try: if "PlotConditionalQuantiles" in plot_list: @@ -574,7 +587,7 @@ class PostProcessing(RunEnvironment): try: if "PlotSampleUncertaintyFromBootstrap" in plot_list and self.uncertainty_estimate is not None: - block_length= self.data_store.get_default("block_length", default="1m", scope="uncertainty_estimate") + block_length = self.data_store.get_default("block_length", default="1m", scope="uncertainty_estimate") PlotSampleUncertaintyFromBootstrap( data=self.uncertainty_estimate, plot_folder=self.plot_path, model_type_dim=self.model_type_dim, dim_name_boots=self.uncertainty_estimate_boot_dim, error_measure="mean squared error", @@ -962,15 +975,17 @@ class PostProcessing(RunEnvironment): """Create a csv file containing all results from feature importance.""" report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report") path_config.check_path_and_create(report_path) - res = [[self.model_type_dim, "method", "station", self.boot_var_dim, self.ahead_dim, "vals"]] + res = [] for boot_type, d0 in results.items(): for boot_method, d1 in d0.items(): for station_name, vals in d1.items(): for boot_var in vals.coords[self.boot_var_dim].values.tolist(): for ahead in vals.coords[self.ahead_dim].values.tolist(): res.append([boot_type, boot_method, station_name, boot_var, ahead, - float(vals.sel({self.boot_var_dim: boot_var, self.ahead_dim: ahead}))]) - col_names = res.pop(0) + *vals.sel({self.boot_var_dim: boot_var, + self.ahead_dim: ahead}).values.round(5).tolist()]) + col_names = [self.model_type_dim, "method", "station", self.boot_var_dim, self.ahead_dim, + *list(range(len(res[0]) - 5))] df = pd.DataFrame(res, columns=col_names) file_name = "feature_importance_skill_score_report_raw.csv" df.to_csv(os.path.join(report_path, file_name), sep=";") diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py index 8644181185203186bb6c8549e8faa99e75a31a81..f6bc6d24724c2620083602d3864bcbca0a709681 100644 --- a/test/test_configuration/test_defaults.py +++ b/test/test_configuration/test_defaults.py @@ -72,8 +72,6 @@ class TestAllDefaults: assert DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE == "singleinput" assert DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD == "shuffle" assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", - "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", + "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotFeatureImportanceSkillScore", "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram", "PlotSampleUncertaintyFromBootstrap"] - -