class PlotBootstrapSkillScore(RunEnvironment):
    """
    Create a box plot of the bootstrap skill score over all stations. Each forecast
    lead time step (called "ahead") is shown separately to highlight the differences
    between prediction horizons. The y-axis is adjusted following the data and is
    not hard coded. The plot is saved under the plot_folder path with the name
    skill_score_bootstrap_{model_setup}.pdf and a resolution of 500 dpi.
    """

    def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = ""):
        """
        Set attributes and create the plot.
        :param data: dictionary with station names as keys and 2D xarrays as values,
            consisting of the axes "boot_var" and "ahead"
        :param plot_folder: path to save the plot (default: current directory)
        :param model_setup: architecture tag used in the plot file name (default: "")
        """
        super().__init__()
        self._labels = None
        self._data = self._prepare_data(data)
        self._plot(plot_folder, model_setup)

    def _prepare_data(self, data: Dict) -> pd.DataFrame:
        """
        Transform the given data into a plot friendly (long) format and set the plot
        labels depending on the lead time dimension.
        :param data: dictionary with station names as keys and 2D xarrays as values
        :return: pre-processed data set
        """
        data = helpers.dict_to_xarray(data, "station")
        # one legend entry per lead time step, e.g. "1d", "2d", ...
        self._labels = [str(i) + "d" for i in data.coords["ahead"].values]
        return data.to_dataframe("data").reset_index(level=[0, 1, 2])

    def _label_add(self, score_only: bool):
        """
        Return the phrase "terms and " if score_only is disabled, otherwise an empty
        string. NOTE(review): currently unused in this class; kept for consistency
        with the other skill score plot classes.
        :param score_only: if false all terms are relevant, otherwise only the scores
        :return: additional label
        """
        return "" if score_only else "terms and "

    def _plot(self, plot_folder, model_setup):
        """
        Main plot function to plot the bootstrap skill score.
        :param plot_folder: path to save the plot
        :param model_setup: architecture tag used in the plot file name
        """
        fig, ax = plt.subplots()
        sns.boxplot(x="boot_var", y="data", hue="ahead", data=self._data, ax=ax, whis=1., palette="Blues_d",
                    showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."})
        # zero line separates skill gain from skill loss
        ax.axhline(y=0, color="grey", linewidth=.5)
        ax.set(ylabel="skill score", xlabel="", title="summary of all stations")
        handles, _ = ax.get_legend_handles_labels()
        ax.legend(handles, self._labels)
        plt.tight_layout()
        self._save(plot_folder, model_setup)

    @staticmethod
    def _save(plot_folder, model_setup):
        """
        Standard save method to store the plot locally. The name of this plot is
        dynamic and includes the model setup tag like 'CNN'.
        :param plot_folder: path to save the plot
        :param model_setup: architecture tag used in the plot file name
        """
        plot_name = os.path.join(plot_folder, f"skill_score_bootstrap_{model_setup}.pdf")
        logging.debug(f"... save plot to {plot_name}")
        plt.savefig(plot_name, dpi=500)
        plt.close('all')
If this increases a lot, one should think to " "skip make_prediction() whenever it is possible to save time.") - # self.skill_scores = self.calculate_skill_scores() - # self.plot() - self.create_boot_straps() + self.bootstrap_skill_scores = self.create_boot_straps() + self.skill_scores = self.calculate_skill_scores() + self.plot() def create_boot_straps(self): @@ -60,7 +61,7 @@ class PostProcessing(RunEnvironment): bootstrap_path = self.data_store.get("bootstrap_path", "general") forecast_path = self.data_store.get("forecast_path", "general") window_lead_time = self.data_store.get("window_lead_time", "general") - bootstraps = BootStraps(self.test_data, bootstrap_path, 2) + bootstraps = BootStraps(self.test_data, bootstrap_path, 20) with TimeTracking(name="boot predictions"): bootstrap_predictions = self.model.predict_generator(generator=bootstraps.boot_strap_generator(), steps=bootstraps.get_boot_strap_generator_length()) @@ -72,18 +73,19 @@ class PostProcessing(RunEnvironment): ind = np.all(bootstrap_meta == [boot, station], axis=1) length = sum(ind) sel = bootstrap_predictions[ind].reshape((length, window_lead_time, 1)) - coords = (range(length), range(window_lead_time)) - tmp = xr.DataArray(sel, coords=(*coords, [boot]), dims=["index", "window", "type"]) + coords = (range(length), range(1, window_lead_time + 1)) + tmp = xr.DataArray(sel, coords=(*coords, [boot]), dims=["index", "ahead", "type"]) file_name = os.path.join(forecast_path, f"bootstraps_{boot}_{station}.nc") tmp.to_netcdf(file_name) labels = bootstraps.get_labels(station).reshape((length, window_lead_time, 1)) file_name = os.path.join(forecast_path, f"bootstraps_labels_{station}.nc") - labels = xr.DataArray(labels, coords=(*coords, ["obs"]), dims=["index", "window", "type"]) + labels = xr.DataArray(labels, coords=(*coords, ["obs"]), dims=["index", "ahead", "type"]) labels.to_netcdf(file_name) # file_name = os.path.join(forecast_path, f"bootstraps_orig.nc") # orig = xr.open_dataarray(file_name) + # 
calc skill scores skill_scores = statistics.SkillScores(None) score = {} @@ -92,17 +94,20 @@ class PostProcessing(RunEnvironment): labels = xr.open_dataarray(file_name) shape = labels.shape orig = bootstraps.get_orig_prediction(forecast_path, f"forecasts_norm_{station}_test.nc").reshape(shape) - orig = xr.DataArray(orig, coords=(range(shape[0]), range(shape[1]), ["orig"]), dims=["index", "window", "type"]) - score[station] = {} + orig = xr.DataArray(orig, coords=(range(shape[0]), range(1, shape[1] + 1), ["orig"]), dims=["index", "ahead", "type"]) + skill = pd.DataFrame(columns=range(1, window_lead_time + 1)) for boot in variables: file_name = os.path.join(forecast_path, f"bootstraps_{boot}_{station}.nc") boot_data = xr.open_dataarray(file_name) boot_data = boot_data.combine_first(labels) boot_data = boot_data.combine_first(orig) - score[station][boot] = skill_scores.general_skill_score(boot_data, forecast_name=boot, reference_name="orig") - - # plot - + boot_scores = [] + for iahead in range(window_lead_time): + data = boot_data.sel(ahead=iahead + 1) + boot_scores.append(skill_scores.general_skill_score(data, forecast_name=boot, reference_name="orig")) + skill.loc[boot] = np.array(boot_scores) + score[station] = xr.DataArray(skill, dims=["boot_var", "ahead"]) + return score def _load_model(self): try: @@ -128,6 +133,7 @@ class PostProcessing(RunEnvironment): PlotClimatologicalSkillScore(self.skill_scores[1], plot_folder=self.plot_path, score_only=False, extra_name_tag="all_terms_", model_setup="CNN") PlotCompetitiveSkillScore(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN") + PlotBootstrapSkillScore(self.bootstrap_skill_scores, plot_folder=self.plot_path, model_setup="CNN") PlotTimeSeries(self.test_data.stations, path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, sampling=self._sampling) def calculate_test_score(self):