Skip to content
Snippets Groups Projects
Commit b9092b37 authored by lukas leufen's avatar lukas leufen
Browse files

use bootstrap skill score plot

parents 82a25fc8 8cdf9429
Branches
Tags
2 merge requests!59Develop,!52implemented bootstraps
Pipeline #30621 passed
...@@ -479,6 +479,76 @@ class PlotCompetitiveSkillScore(RunEnvironment): ...@@ -479,6 +479,76 @@ class PlotCompetitiveSkillScore(RunEnvironment):
plt.close() plt.close()
class PlotBootstrapSkillScore(RunEnvironment):
"""
Create plot of climatological skill score after Murphy (1988) as box plot over all stations. A forecast time step
(called "ahead") is separately shown to highlight the differences for each prediction time step. Either each single
term is plotted (score_only=False) or only the resulting scores CASE I to IV are displayed (score_only=True,
default). Y-axis is adjusted following the data and not hard coded. The plot is saved under plot_folder path with
name skill_score_clim_{extra_name_tag}{model_setup}.pdf and resolution of 500dpi.
"""
def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = ""):
"""
Sets attributes and create plot
:param data: dictionary with station names as keys and 2D xarrays as values, consist on axis ahead and terms.
:param plot_folder: path to save the plot (default: current directory)
:param model_setup: architecture type to specify plot name (default "CNN")
"""
super().__init__()
self._labels = None
self._data = self._prepare_data(data)
self._plot(plot_folder, model_setup)
def _prepare_data(self, data: Dict) -> pd.DataFrame:
"""
Shrink given data, if only scores are relevant. In any case, transform data to a plot friendly format. Also set
plot labels depending on the lead time dimensions.
:param data: dictionary with station names as keys and 2D xarrays as values
:return: pre-processed data set
"""
data = helpers.dict_to_xarray(data, "station")
self._labels = [str(i) + "d" for i in data.coords["ahead"].values]
return data.to_dataframe("data").reset_index(level=[0, 1, 2])
def _label_add(self, score_only: bool):
"""
Adds the phrase "terms and " if score_only is disabled or empty string (if score_only=True).
:param score_only: if false all terms are relevant, otherwise only CASE I to IV
:return: additional label
"""
return "" if score_only else "terms and "
def _plot(self, plot_folder, model_setup):
"""
Main plot function to plot climatological skill score.
:param plot_folder: path to save the plot
:param model_setup: architecture type to specify plot name
"""
fig, ax = plt.subplots()
sns.boxplot(x="boot_var", y="data", hue="ahead", data=self._data, ax=ax, whis=1., palette="Blues_d",
showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."})
ax.axhline(y=0, color="grey", linewidth=.5)
ax.set(ylabel=f"skill score", xlabel="", title="summary of all stations")
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, self._labels)
plt.tight_layout()
self._save(plot_folder, model_setup)
@staticmethod
def _save(plot_folder, model_setup):
"""
Standard save method to store plot locally. The name of this plot is dynamic. It includes the model setup like
'CNN' and can additionally be adjusted using an extra name tag.
:param plot_folder: path to save the plot
:param model_setup: architecture type to specify plot name
"""
plot_name = os.path.join(plot_folder, f"skill_score_bootstrap_{model_setup}.pdf")
logging.debug(f"... save plot to {plot_name}")
plt.savefig(plot_name, dpi=500)
plt.close('all')
class PlotTimeSeries(RunEnvironment): class PlotTimeSeries(RunEnvironment):
def __init__(self, stations: List, data_path: str, name: str, window_lead_time: int = None, plot_folder: str = ".", def __init__(self, stations: List, data_path: str, name: str, window_lead_time: int = None, plot_folder: str = ".",
......
...@@ -18,7 +18,7 @@ from src.datastore import NameNotFoundInDataStore ...@@ -18,7 +18,7 @@ from src.datastore import NameNotFoundInDataStore
from src.helpers import TimeTracking from src.helpers import TimeTracking
from src.model_modules.linear_model import OrdinaryLeastSquaredModel from src.model_modules.linear_model import OrdinaryLeastSquaredModel
from src.plotting.postprocessing_plotting import PlotMonthlySummary, PlotStationMap, PlotClimatologicalSkillScore, \ from src.plotting.postprocessing_plotting import PlotMonthlySummary, PlotStationMap, PlotClimatologicalSkillScore, \
PlotCompetitiveSkillScore, PlotTimeSeries PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore
from src.plotting.postprocessing_plotting import plot_conditional_quantiles from src.plotting.postprocessing_plotting import plot_conditional_quantiles
from src.run_modules.run_environment import RunEnvironment from src.run_modules.run_environment import RunEnvironment
...@@ -38,6 +38,7 @@ class PostProcessing(RunEnvironment): ...@@ -38,6 +38,7 @@ class PostProcessing(RunEnvironment):
self.target_var = self.data_store.get("target_var", "general") self.target_var = self.data_store.get("target_var", "general")
self._sampling = self.data_store.get("sampling", "general") self._sampling = self.data_store.get("sampling", "general")
self.skill_scores = None self.skill_scores = None
self.bootstrap_skill_scores = None
self._run() self._run()
def _run(self): def _run(self):
...@@ -49,9 +50,9 @@ class PostProcessing(RunEnvironment): ...@@ -49,9 +50,9 @@ class PostProcessing(RunEnvironment):
self.make_prediction() self.make_prediction()
logging.info("take a look on the next reported time measure. If this increases a lot, one should think to " logging.info("take a look on the next reported time measure. If this increases a lot, one should think to "
"skip make_prediction() whenever it is possible to save time.") "skip make_prediction() whenever it is possible to save time.")
# self.skill_scores = self.calculate_skill_scores() self.bootstrap_skill_scores = self.create_boot_straps()
# self.plot() self.skill_scores = self.calculate_skill_scores()
self.create_boot_straps() self.plot()
def create_boot_straps(self): def create_boot_straps(self):
...@@ -60,7 +61,7 @@ class PostProcessing(RunEnvironment): ...@@ -60,7 +61,7 @@ class PostProcessing(RunEnvironment):
bootstrap_path = self.data_store.get("bootstrap_path", "general") bootstrap_path = self.data_store.get("bootstrap_path", "general")
forecast_path = self.data_store.get("forecast_path", "general") forecast_path = self.data_store.get("forecast_path", "general")
window_lead_time = self.data_store.get("window_lead_time", "general") window_lead_time = self.data_store.get("window_lead_time", "general")
bootstraps = BootStraps(self.test_data, bootstrap_path, 2) bootstraps = BootStraps(self.test_data, bootstrap_path, 20)
with TimeTracking(name="boot predictions"): with TimeTracking(name="boot predictions"):
bootstrap_predictions = self.model.predict_generator(generator=bootstraps.boot_strap_generator(), bootstrap_predictions = self.model.predict_generator(generator=bootstraps.boot_strap_generator(),
steps=bootstraps.get_boot_strap_generator_length()) steps=bootstraps.get_boot_strap_generator_length())
...@@ -72,18 +73,19 @@ class PostProcessing(RunEnvironment): ...@@ -72,18 +73,19 @@ class PostProcessing(RunEnvironment):
ind = np.all(bootstrap_meta == [boot, station], axis=1) ind = np.all(bootstrap_meta == [boot, station], axis=1)
length = sum(ind) length = sum(ind)
sel = bootstrap_predictions[ind].reshape((length, window_lead_time, 1)) sel = bootstrap_predictions[ind].reshape((length, window_lead_time, 1))
coords = (range(length), range(window_lead_time)) coords = (range(length), range(1, window_lead_time + 1))
tmp = xr.DataArray(sel, coords=(*coords, [boot]), dims=["index", "window", "type"]) tmp = xr.DataArray(sel, coords=(*coords, [boot]), dims=["index", "ahead", "type"])
file_name = os.path.join(forecast_path, f"bootstraps_{boot}_{station}.nc") file_name = os.path.join(forecast_path, f"bootstraps_{boot}_{station}.nc")
tmp.to_netcdf(file_name) tmp.to_netcdf(file_name)
labels = bootstraps.get_labels(station).reshape((length, window_lead_time, 1)) labels = bootstraps.get_labels(station).reshape((length, window_lead_time, 1))
file_name = os.path.join(forecast_path, f"bootstraps_labels_{station}.nc") file_name = os.path.join(forecast_path, f"bootstraps_labels_{station}.nc")
labels = xr.DataArray(labels, coords=(*coords, ["obs"]), dims=["index", "window", "type"]) labels = xr.DataArray(labels, coords=(*coords, ["obs"]), dims=["index", "ahead", "type"])
labels.to_netcdf(file_name) labels.to_netcdf(file_name)
# file_name = os.path.join(forecast_path, f"bootstraps_orig.nc") # file_name = os.path.join(forecast_path, f"bootstraps_orig.nc")
# orig = xr.open_dataarray(file_name) # orig = xr.open_dataarray(file_name)
# calc skill scores # calc skill scores
skill_scores = statistics.SkillScores(None) skill_scores = statistics.SkillScores(None)
score = {} score = {}
...@@ -92,17 +94,20 @@ class PostProcessing(RunEnvironment): ...@@ -92,17 +94,20 @@ class PostProcessing(RunEnvironment):
labels = xr.open_dataarray(file_name) labels = xr.open_dataarray(file_name)
shape = labels.shape shape = labels.shape
orig = bootstraps.get_orig_prediction(forecast_path, f"forecasts_norm_{station}_test.nc").reshape(shape) orig = bootstraps.get_orig_prediction(forecast_path, f"forecasts_norm_{station}_test.nc").reshape(shape)
orig = xr.DataArray(orig, coords=(range(shape[0]), range(shape[1]), ["orig"]), dims=["index", "window", "type"]) orig = xr.DataArray(orig, coords=(range(shape[0]), range(1, shape[1] + 1), ["orig"]), dims=["index", "ahead", "type"])
score[station] = {} skill = pd.DataFrame(columns=range(1, window_lead_time + 1))
for boot in variables: for boot in variables:
file_name = os.path.join(forecast_path, f"bootstraps_{boot}_{station}.nc") file_name = os.path.join(forecast_path, f"bootstraps_{boot}_{station}.nc")
boot_data = xr.open_dataarray(file_name) boot_data = xr.open_dataarray(file_name)
boot_data = boot_data.combine_first(labels) boot_data = boot_data.combine_first(labels)
boot_data = boot_data.combine_first(orig) boot_data = boot_data.combine_first(orig)
score[station][boot] = skill_scores.general_skill_score(boot_data, forecast_name=boot, reference_name="orig") boot_scores = []
for iahead in range(window_lead_time):
# plot data = boot_data.sel(ahead=iahead + 1)
boot_scores.append(skill_scores.general_skill_score(data, forecast_name=boot, reference_name="orig"))
skill.loc[boot] = np.array(boot_scores)
score[station] = xr.DataArray(skill, dims=["boot_var", "ahead"])
return score
def _load_model(self): def _load_model(self):
try: try:
...@@ -128,6 +133,7 @@ class PostProcessing(RunEnvironment): ...@@ -128,6 +133,7 @@ class PostProcessing(RunEnvironment):
PlotClimatologicalSkillScore(self.skill_scores[1], plot_folder=self.plot_path, score_only=False, PlotClimatologicalSkillScore(self.skill_scores[1], plot_folder=self.plot_path, score_only=False,
extra_name_tag="all_terms_", model_setup="CNN") extra_name_tag="all_terms_", model_setup="CNN")
PlotCompetitiveSkillScore(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN") PlotCompetitiveSkillScore(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN")
PlotBootstrapSkillScore(self.bootstrap_skill_scores, plot_folder=self.plot_path, model_setup="CNN")
PlotTimeSeries(self.test_data.stations, path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, sampling=self._sampling) PlotTimeSeries(self.test_data.stations, path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, sampling=self._sampling)
def calculate_test_score(self): def calculate_test_score(self):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment