From d45b8d7472b6002b67d2cf39f91b4cadec2245f8 Mon Sep 17 00:00:00 2001 From: lukas leufen <l.leufen@fz-juelich.de> Date: Wed, 29 Jan 2020 13:45:34 +0100 Subject: [PATCH] added some docs, new plot routine plot_competitive_skill_score, skill score plots are now included in postprocessing, many changes and bugfixes in skill score calculation --- src/plotting/postprocessing_plotting.py | 123 +++++++++++++++++++----- src/run_modules/post_processing.py | 88 +++++++++-------- 2 files changed, 147 insertions(+), 64 deletions(-) diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py index 09402f04..469b00d7 100644 --- a/src/plotting/postprocessing_plotting.py +++ b/src/plotting/postprocessing_plotting.py @@ -5,6 +5,7 @@ import os import logging import math import warnings +from src import helpers from src.helpers import TimeTracking import numpy as np @@ -18,10 +19,24 @@ import cartopy.crs as ccrs import cartopy.feature as cfeature from matplotlib.backends.backend_pdf import PdfPages +from typing import Dict, List + logging.getLogger('matplotlib').setLevel(logging.WARNING) -def plot_monthly_summary(stations, data_path, name: str, window_lead_time, target_var, plot_folder="."): +def plot_monthly_summary(stations: List, data_path: str, name: str, target_var: str, window_lead_time: int = None, + plot_folder: str = "."): + """ + Show a monthly summary over all stations for each lead time ("ahead") as box and whiskers plot. The plot is saved + in data_path with name monthly_summary_box_plot.pdf and 500dpi resolution. + :param stations: all stations to plot + :param data_path: path, where the data is located + :param name: full name of the local files with a % as placeholder for the station name + :param target_var: display name of the target variable on plot's axis + :param window_lead_time: lead time to plot, if window_lead_time is higher than the available lead time or not given + the maximum lead time from data is used. (default None -> use maximum lead time from data). + :param plot_folder: path to save the plot (default: current directory) + """ logging.debug("run plot_monthly_summary()") forecasts = None @@ -44,6 +59,11 @@ def plot_monthly_summary(stations, data_path, name: str, window_lead_time, targe forecasts = xr.concat([forecasts, data_concat], 'index') if forecasts is not None else data_concat + ahead_steps = len(forecasts.ahead) + if window_lead_time is None: + window_lead_time = ahead_steps + window_lead_time = min(ahead_steps, window_lead_time) + forecasts = forecasts.to_dataset(name='values').to_dask_dataframe() logging.debug("... start plotting") ax = sns.boxplot(x='index', y='values', hue='ahead', data=forecasts.compute(), whis=1., @@ -53,17 +73,22 @@ def plot_monthly_summary(stations, data_path, name: str, window_lead_time, targe meanprops={'markersize': 1, 'markeredgecolor': 'k'}) ax.set(xlabel='month', ylabel=f'{target_var}') plt.tight_layout() - plot_path = os.path.join(os.path.abspath(plot_folder), 'test_monthly_box.pdf') - logging.debug(f"... save plot to {plot_path}") - plt.savefig(plot_path) + plot_name = os.path.join(os.path.abspath(plot_folder), 'monthly_summary_box_plot.pdf') + logging.debug(f"... save plot to {plot_name}") + plt.savefig(plot_name, dpi=500) plt.close('all') -def plot_climsum_boxplot(): - return - - -def plot_station_map(generators, plot_folder="."): +def plot_station_map(generators: Dict, plot_folder: str = "."): + """ + Plot geographical overview of all used stations. Different data sets can be colorised by its key in the input + dictionary generators. The key represents the color to plot on the map. Currently, there is only a white background, + but this can be adjusted by loading locally stored topography data (not implemented yet). The plot is saved under + plot_path with the name station_map.pdf + :param generators: dictionary with the plot color of each data set as key and the generator containing all stations + as value. + :param plot_folder: path to save the plot (default: current directory) + """ logging.debug("run station_map()") fig = plt.figure(figsize=(10, 5)) ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree()) @@ -78,13 +103,14 @@ def plot_station_map(generators, plot_folder="."): for color, gen in generators.items(): for k, v in enumerate(gen): station_coords = gen.get_data_generator(k).meta.loc[['station_lon', 'station_lat']] - station_names = gen.get_data_generator(k).meta.loc[['station_id']] + # station_names = gen.get_data_generator(k).meta.loc[['station_id']] IDx, IDy = float(station_coords.loc['station_lon'].values), float( station_coords.loc['station_lat'].values) ax.plot(IDx, IDy, mfc=color, mec='k', marker='s', markersize=6, transform=ccrs.PlateCarree()) - plot_path = os.path.join(os.path.abspath(plot_folder), 'test_map_plot.pdf') - plt.savefig(plot_path) + plot_name = os.path.join(os.path.abspath(plot_folder), 'station_map.pdf') + logging.debug(f"... save plot to {plot_name}") + plt.savefig(plot_name, dpi=500) plt.close('all') @@ -223,25 +249,76 @@ def plot_conditional_quantiles(stations: list, plot_folder: str = ".", rolling_w logging.info(f"plot_conditional_quantiles() finished after {time}") -def plot_climatological_skill_score(d: xr.DataArray, plot_folder=".", score_only=True, extra_nametag="", modelsetup=""): - labels = [str(i) + 'd' for i in d.coords['ahead'].values] +def plot_climatological_skill_score(data: Dict, plot_folder: str = ".", score_only: bool = True, + extra_name_tag: str = "", model_setup: str = ""): + """ + Create plot of climatological skill score after Murphy (1988) as box plot over all stations. A forecast time step + (called "ahead") is separately shown to highlight the differences for each prediction time step. Either each single + term is plotted (score_only=False) or only the resulting scores CASE I to IV are displayed (score_only=True, + default). Y-axis is adjusted following the data and not hard coded. The plot is saved under plot_folder path with + name skill_score_clim_{extra_name_tag}{model_setup}.pdf and resolution of 500dpi. + :param data: dictionary with station names as keys and 2D xarrays as values, consist on axis ahead and terms. + :param plot_folder: path to save the plot (default: current directory) + :param score_only: if true plot only scores of CASE I to IV, otherwise plot all single terms (default True) + :param extra_name_tag: additional tag that can be included in the plot name (default "") + :param model_setup: architecture type (default "CNN") + """ + logging.debug("run plot_climatological_skill_score()") + data = helpers.dict_to_xarray(data, "station") + labels = [str(i) + "d" for i in data.coords["ahead"].values] fig, ax = plt.subplots() if score_only: - d = d.loc[:, ['CASE I', 'CASE II', 'CASE III', 'CASE IV'], :] + data = data.loc[:, ["CASE I", "CASE II", "CASE III", "CASE IV"], :] lab_add = '' else: fig.set_size_inches(11.7, 8.27) - lab_add = 'terms and ' - d = d.to_dataframe('data').reset_index(level=[0, 1, 2]) - sns.boxplot(x='terms', y='data', hue='ahead', data=d, ax=ax, whis=1., palette="Blues_d", showmeans=True, - meanprops={'markersize': 1,' markeredgecolor': 'k'}, flierprops={'marker': '.'}) - ax.axhline(y=0, color='grey', linewidth=.5) - ax.set(ylabel=lab_add+'skill score', xlabel='', title='summary of all stations') + lab_add = "terms and " + data = data.to_dataframe("data").reset_index(level=[0, 1, 2]) + sns.boxplot(x="terms", y="data", hue="ahead", data=data, ax=ax, whis=1., palette="Blues_d", showmeans=True, + meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) + ax.axhline(y=0, color="grey", linewidth=.5) + ax.set(ylabel=f"{lab_add}skill score", xlabel="", title="summary of all stations") handles, _ = ax.get_legend_handles_labels() ax.legend(handles, labels) plt.tight_layout() - plt.savefig(plot_folder+'SS_Clim_summary_' + extra_nametag + modelsetup + '.pdf', dpi=500) + plot_name = os.path.join(plot_folder, f"skill_score_clim_{extra_name_tag}{model_setup}.pdf") + logging.debug(f"... save plot to {plot_name}") + plt.savefig(plot_name, dpi=500) plt.close('all') - return d +def plot_competitive_skill_score(data: pd.DataFrame, plot_folder=".", model_setup="CNN"): + """ + Create competitive skill score for the given model setup and the reference models ordinary least squared ("ols") and + the persistence forecast ("persi") for all lead times ("ahead"). The plot is saved under plot_folder with the name + skill_score_competitive_{model_setup}.pdf and resolution of 500dpi. + :param data: data frame with index=['cnn-persi', 'ols-persi', 'cnn-ols'] and columns "ahead" containing the pre- + calculated comparisons for cnn, persistence and ols. + :param plot_folder: path to save the plot (default: current directory) + :param model_setup: architecture type (default "CNN") + """ + logging.debug("run plot_general_skill_score()") + + data = pd.concat(data, axis=0) + data = xr.DataArray(data, dims=["stations", "ahead"]).unstack("stations") + data = data.rename({"stations_level_0": "stations", "stations_level_1": "comparison"}) + data = data.to_dataframe("data").unstack(level=1).swaplevel() + data.columns = data.columns.levels[1] + + labels = [str(i) + "d" for i in data.index.levels[1].values] + data = data.stack(level=0).reset_index(level=2, drop=True).reset_index(name="data") + + fig, ax = plt.subplots() + sns.boxplot(x="comparison", y="data", hue="ahead", data=data, whis=1., ax=ax, palette="Blues_d", showmeans=True, + meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, + order=["cnn-persi", "ols-persi", "cnn-ols"]) + ax.axhline(y=0, color="grey", linewidth=.5) + ax.set(ylabel="skill score", xlabel="competing models", title="summary of all stations", + ylim=(np.min([0, helpers.float_round(data.min()[2], 2) - 0.1]), helpers.float_round(data.max()[2], 2) + 0.1)) + handles, _ = ax.get_legend_handles_labels() + ax.legend(handles, labels) + plt.tight_layout() + plot_name = os.path.join(plot_folder, f"skill_score_competitive_{model_setup}.pdf") + logging.debug(f"... save plot to {plot_name}") + plt.savefig(plot_name, dpi=500) + plt.close() diff --git a/src/run_modules/post_processing.py b/src/run_modules/post_processing.py index a95869e0..fa163072 100644 --- a/src/run_modules/post_processing.py +++ b/src/run_modules/post_processing.py @@ -19,7 +19,7 @@ from src.model_modules.linear_model import OrdinaryLeastSquaredModel from src import statistics from src import helpers from src.helpers import TimeTracking -from src.plotting.postprocessing_plotting import plot_monthly_summary, plot_climsum_boxplot, plot_station_map, plot_conditional_quantiles +from src.plotting.postprocessing_plotting import plot_monthly_summary, plot_station_map, plot_conditional_quantiles, plot_climatological_skill_score, plot_competitive_skill_score from src.datastore import NameNotFoundInDataStore @@ -36,7 +36,7 @@ class PostProcessing(RunEnvironment): self.train_val_data: DataGenerator = self.data_store.get("generator", "general.train_val") self.plot_path: str = self.data_store.get("plot_path", "general") self.skill_scores = None - # self._run() + self._run() def _run(self): self.train_ols_model() @@ -58,21 +58,20 @@ class PostProcessing(RunEnvironment): def plot(self): logging.debug("Run plotting routines...") path = self.data_store.get("forecast_path", "general") - window_lead_time = self.data_store.get("window_lead_time", "general") target_var = self.data_store.get("target_var", "general") plot_conditional_quantiles(self.test_data.stations, plot_folder=self.plot_path, pred_name="CNN", ref_name="orig", - forecast_path=self.data_store.get("forecast_path", "general"), plot_name_affix="cali-ref") + forecast_path=path, plot_name_affix="cali-ref") plot_conditional_quantiles(self.test_data.stations, plot_folder=self.plot_path, pred_name="orig", ref_name="CNN", - forecast_path=self.data_store.get("forecast_path", "general"), plot_name_affix="like-bas") + forecast_path=path, plot_name_affix="like-bas") plot_station_map(generators={'b': self.test_data}, plot_folder=self.plot_path) - plot_monthly_summary(self.test_data.stations, path, r"forecasts_%s_test.nc", window_lead_time, target_var, + plot_monthly_summary(self.test_data.stations, path, r"forecasts_%s_test.nc", target_var, plot_folder=self.plot_path) - # plot_climsum_boxplot() + # + plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, model_setup="CNN") + plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, score_only=False, extra_name_tag="all_terms_", model_setup="CNN") + plot_competitive_skill_score(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN") - # FKf.ss_climsum_boxplot(data=ss_clim_dic, modelsetup=modelsetup, score_only=True, **kwargs) - # FKf.ss_climsum_boxplot(data=ss_clim_dic, modelsetup=modelsetup, score_only=False, extra_nametag='_all_terms', **kwargs) - # FKf.ss_sum_boxplot(ss_dic, plot_path, modelsetup) def calculate_test_score(self): test_score = self.model.evaluate_generator(generator=self.test_data_distributed.distribute_on_batches(), @@ -133,7 +132,7 @@ class PostProcessing(RunEnvironment): @staticmethod def _create_orig_forecast(data, _, mean, std, transformation_method): - return statistics.apply_inverse_transformation(data.label, mean, std, transformation_method) + return statistics.apply_inverse_transformation(data.label.copy(), mean, std, transformation_method) def _create_ols_forecast(self, input_data, ols_prediction, mean, std, transformation_method): tmp_ols = self.ols_model.predict(input_data) @@ -157,7 +156,7 @@ class PostProcessing(RunEnvironment): @staticmethod def _create_empty_prediction_arrays(generator, count=1): - return [generator.label.copy()] * count + return [generator.label.copy() for _ in range(count)] @staticmethod def create_fullindex(df, freq): @@ -225,7 +224,6 @@ class PostProcessing(RunEnvironment): external_data = self._get_external_data(station) skill_score_general[station] = ss.skill_scores(window_lead_time) skill_score_climatological[station] = ss.climatological_skill_scores(external_data, window_lead_time) - return skill_score_general, skill_score_climatological @@ -240,9 +238,9 @@ class SkillScores(RunEnvironment): skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols']) for iahead in ahead_names: data = self.internal_data.sel(ahead=iahead) - skill_score[iahead] = [self.general_skill_score(data), - self.general_skill_score(data, forecast_name="OLS"), - self.general_skill_score(data, reference_name="OLS")] + skill_score[iahead] = [self.general_skill_score(data, forecast_name="CNN", reference_name="persi"), + self.general_skill_score(data, forecast_name="OLS", reference_name="persi"), + self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")] return skill_score def climatological_skill_scores(self, external_data, window_lead_time): @@ -250,8 +248,8 @@ class SkillScores(RunEnvironment): all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III', 'CASE IV'] - skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan), coords=[all_terms], - dims=['terms', 'ahead']) + skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan), coords=[all_terms, ahead_names], + dims=['terms', 'ahead']) for iahead in ahead_names: @@ -281,12 +279,12 @@ class SkillScores(RunEnvironment): @staticmethod def general_skill_score(data, observation_name="orig", forecast_name="CNN", reference_name="persi"): data = data.dropna("index") - observation = data.loc[..., observation_name] - forecast = data.loc[..., forecast_name] - reference = data.loc[..., reference_name] + observation = data.sel(type=observation_name) + forecast = data.sel(type=forecast_name) + reference = data.sel(type=reference_name) mse = statistics.mean_squared_error skill_score = 1 - mse(observation, forecast) / mse(observation, reference) - return skill_score + return skill_score.values @staticmethod def skill_score_pre_calculations(data, observation_name, forecast_name): @@ -295,15 +293,16 @@ class SkillScores(RunEnvironment): data = data.dropna("index") mean = data.mean("index") - var = data.var("index") - r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]]) + sigma = np.sqrt(data.var("index")) + # r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]]) + r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name]) AI = np.array(r ** 2) - BI = ((r - var.loc[..., forecast_name] / var.loc[..., observation_name]) ** 2).values - CI = (((mean.loc[..., forecast_name] - mean.loc[..., observation_name]) / var.loc[ + BI = ((r - (sigma.loc[..., forecast_name] / sigma.loc[..., observation_name])) ** 2).values + CI = (((mean.loc[..., forecast_name] - mean.loc[..., observation_name]) / sigma.loc[ ..., observation_name]) ** 2).values - suffix = {"mean": mean, "var": var, "r": r, "p": p} + suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p} return AI, BI, CI, data, suffix def skill_score_mu_case_1(self, data, observation_name="orig", forecast_name="CNN"): @@ -315,41 +314,48 @@ class SkillScores(RunEnvironment): AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) monthly_mean = self.create_monthly_mean_from_daily_data(data) data = xr.concat([data, monthly_mean], dim="type") - var = data.var("index") - r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]]) + sigma = suffix["sigma"] + sigma_monthly = np.sqrt(monthly_mean.var()) + # r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]]) + r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"]) AII = np.array(r ** 2) - BII = ((r - var.loc[observation_name + 'X'] / var.loc[observation_name]) ** 2).values + BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII)) return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array() def skill_score_mu_case_3(self, data, observation_name="orig", forecast_name="CNN", external_data=None): AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) - mean, var = suffix["mean"], suffix["var"] - AIII = (((external_data.mean().values - mean.loc[observation_name]) / var.loc[observation_name])**2).values + mean, sigma = suffix["mean"], suffix["sigma"] + AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values skill_score = np.array((AI - BI - CI + AIII) / 1 + AIII) return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array() def skill_score_mu_case_4(self, data, observation_name="orig", forecast_name="CNN", external_data=None): AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) - monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values) + monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values, index=data.index) data = xr.concat([data, monthly_mean_external], dim="type") - mean = data.mean("index") - var = data.var("index") + mean, sigma = suffix["mean"], suffix["sigma"] + monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values) + mean_external = monthly_mean_external.mean() + sigma_external = np.sqrt(monthly_mean_external.var()) - r_mu, p_mu = stats.spearmanr(data.loc[..., [observation_name, observation_name+'X']]) + # r_mu, p_mu = stats.spearmanr(data.loc[..., [observation_name, observation_name+'X']]) + r_mu, p_mu = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"]) AIV = np.array(r_mu**2) - BIV = ((r_mu - var.loc[observation_name + 'X'] / var.loc[observation_name])**2).values - CIV = (((mean.loc[observation_name + 'X'] - mean.loc[observation_name]) / var.loc[observation_name])**2).values + BIV = ((r_mu - sigma_external / sigma.loc[observation_name])**2).values + CIV = (((mean_external - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV)) return pd.DataFrame({"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": CIV}).to_xarray().to_array() @staticmethod - def create_monthly_mean_from_daily_data(data, columns=None): + def create_monthly_mean_from_daily_data(data, columns=None, index=None): if columns is None: columns = data.type.values - coordinates = [data.index, [v + "X" for v in list(columns)]] # TODO - empty_data = np.full((len(data.index), len(columns)), np.nan) + if index is None: + index = data.index + coordinates = [index, [v + "X" for v in list(columns)]] # TODO + empty_data = np.full((len(index), len(columns)), np.nan) monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=["index", "type"]) mu = data.groupby("index.month").mean() -- GitLab