From d45b8d7472b6002b67d2cf39f91b4cadec2245f8 Mon Sep 17 00:00:00 2001
From: lukas leufen <l.leufen@fz-juelich.de>
Date: Wed, 29 Jan 2020 13:45:34 +0100
Subject: [PATCH] added some docs, new plot routine
 plot_competitive_skill_score, skill score plots are now included in
 postprocessing, many changes and bugfixes in skill score calculation

---
 src/plotting/postprocessing_plotting.py | 123 +++++++++++++++++++-----
 src/run_modules/post_processing.py      |  88 +++++++++--------
 2 files changed, 147 insertions(+), 64 deletions(-)

diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py
index 09402f04..469b00d7 100644
--- a/src/plotting/postprocessing_plotting.py
+++ b/src/plotting/postprocessing_plotting.py
@@ -5,6 +5,7 @@ import os
 import logging
 import math
 import warnings
+from src import helpers
 from src.helpers import TimeTracking
 
 import numpy as np
@@ -18,10 +19,24 @@ import cartopy.crs as ccrs
 import cartopy.feature as cfeature
 from matplotlib.backends.backend_pdf import PdfPages
 
+from typing import Dict, List
+
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
 
-def plot_monthly_summary(stations, data_path, name: str, window_lead_time, target_var, plot_folder="."):
+def plot_monthly_summary(stations: List, data_path: str, name: str, target_var: str, window_lead_time: int = None,
+                         plot_folder: str = "."):
+    """
+    Show a monthly summary over all stations for each lead time ("ahead") as box and whiskers plot. The plot is saved
+    in plot_folder with name monthly_summary_box_plot.pdf and 500dpi resolution.
+    :param stations: all stations to plot
+    :param data_path: path, where the data is located
+    :param name: full name of the local files with a % as placeholder for the station name
+    :param target_var: display name of the target variable on plot's axis
+    :param window_lead_time: lead time to plot; if window_lead_time is higher than the available lead time or not given,
+        the maximum lead time from the data is used (default None -> use maximum lead time from data).
+    :param plot_folder: path to save the plot (default: current directory)
+    """
     logging.debug("run plot_monthly_summary()")
     forecasts = None
 
@@ -44,6 +59,11 @@ def plot_monthly_summary(stations, data_path, name: str, window_lead_time, targe
 
         forecasts = xr.concat([forecasts, data_concat], 'index') if forecasts is not None else data_concat
 
+    ahead_steps = len(forecasts.ahead)
+    if window_lead_time is None:
+        window_lead_time = ahead_steps
+    window_lead_time = min(ahead_steps, window_lead_time)
+
     forecasts = forecasts.to_dataset(name='values').to_dask_dataframe()
     logging.debug("... start plotting")
     ax = sns.boxplot(x='index', y='values', hue='ahead', data=forecasts.compute(), whis=1.,
@@ -53,17 +73,22 @@ def plot_monthly_summary(stations, data_path, name: str, window_lead_time, targe
                      meanprops={'markersize': 1, 'markeredgecolor': 'k'})
     ax.set(xlabel='month', ylabel=f'{target_var}')
     plt.tight_layout()
-    plot_path = os.path.join(os.path.abspath(plot_folder), 'test_monthly_box.pdf')
-    logging.debug(f"... save plot to {plot_path}")
-    plt.savefig(plot_path)
+    plot_name = os.path.join(os.path.abspath(plot_folder), 'monthly_summary_box_plot.pdf')
+    logging.debug(f"... save plot to {plot_name}")
+    plt.savefig(plot_name, dpi=500)
     plt.close('all')
 
 
-def plot_climsum_boxplot():
-    return
-
-
-def plot_station_map(generators, plot_folder="."):
+def plot_station_map(generators: Dict, plot_folder: str = "."):
+    """
+    Plot geographical overview of all used stations. Different data sets can be colorised by its key in the input
+    dictionary generators. The key represents the color to plot on the map. Currently, there is only a white background,
+    but this can be adjusted by loading locally stored topography data (not implemented yet). The plot is saved under
+    plot_folder with the name station_map.pdf
+    :param generators: dictionary with the plot color of each data set as key and the generator containing all stations
+        as value.
+    :param plot_folder: path to save the plot (default: current directory)
+    """
     logging.debug("run station_map()")
     fig = plt.figure(figsize=(10, 5))
     ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
@@ -78,13 +103,14 @@ def plot_station_map(generators, plot_folder="."):
         for color, gen in generators.items():
             for k, v in enumerate(gen):
                 station_coords = gen.get_data_generator(k).meta.loc[['station_lon', 'station_lat']]
-                station_names = gen.get_data_generator(k).meta.loc[['station_id']]
+                # station_names = gen.get_data_generator(k).meta.loc[['station_id']]
                 IDx, IDy = float(station_coords.loc['station_lon'].values), float(
                     station_coords.loc['station_lat'].values)
                 ax.plot(IDx, IDy, mfc=color, mec='k', marker='s', markersize=6, transform=ccrs.PlateCarree())
 
-    plot_path = os.path.join(os.path.abspath(plot_folder), 'test_map_plot.pdf')
-    plt.savefig(plot_path)
+    plot_name = os.path.join(os.path.abspath(plot_folder), 'station_map.pdf')
+    logging.debug(f"... save plot to {plot_name}")
+    plt.savefig(plot_name, dpi=500)
     plt.close('all')
 
 
@@ -223,25 +249,76 @@ def plot_conditional_quantiles(stations: list, plot_folder: str = ".", rolling_w
     logging.info(f"plot_conditional_quantiles() finished after {time}")
 
 
-def plot_climatological_skill_score(d: xr.DataArray, plot_folder=".", score_only=True, extra_nametag="", modelsetup=""):
-    labels = [str(i) + 'd' for i in d.coords['ahead'].values]
+def plot_climatological_skill_score(data: Dict, plot_folder: str = ".", score_only: bool = True,
+                                    extra_name_tag: str = "", model_setup: str = ""):
+    """
+    Create plot of climatological skill score after Murphy (1988) as box plot over all stations. A forecast time step
+    (called "ahead") is separately shown to highlight the differences for each prediction time step. Either each single
+    term is plotted (score_only=False) or only the resulting scores CASE I to IV are displayed (score_only=True,
+    default). Y-axis is adjusted following the data and not hard coded. The plot is saved under plot_folder path with
+    name skill_score_clim_{extra_name_tag}{model_setup}.pdf and resolution of 500dpi.
+    :param data: dictionary with station names as keys and 2D xarrays as values, consisting of the axes ahead and terms.
+    :param plot_folder: path to save the plot (default: current directory)
+    :param score_only: if true plot only scores of CASE I to IV, otherwise plot all single terms (default True)
+    :param extra_name_tag: additional tag that can be included in the plot name (default "")
+    :param model_setup: architecture type to be included in the plot name (default "")
+    """
+    logging.debug("run plot_climatological_skill_score()")
+    data = helpers.dict_to_xarray(data, "station")
+    labels = [str(i) + "d" for i in data.coords["ahead"].values]
     fig, ax = plt.subplots()
     if score_only:
-        d = d.loc[:, ['CASE I', 'CASE II', 'CASE III', 'CASE IV'], :]
+        data = data.loc[:, ["CASE I", "CASE II", "CASE III", "CASE IV"], :]
         lab_add = ''
     else:
         fig.set_size_inches(11.7, 8.27)
-        lab_add = 'terms and '
-    d = d.to_dataframe('data').reset_index(level=[0, 1, 2])
-    sns.boxplot(x='terms', y='data', hue='ahead', data=d, ax=ax, whis=1., palette="Blues_d", showmeans=True,
-                meanprops={'markersize': 1,' markeredgecolor': 'k'}, flierprops={'marker': '.'})
-    ax.axhline(y=0, color='grey', linewidth=.5)
-    ax.set(ylabel=lab_add+'skill score', xlabel='', title='summary of all stations')
+        lab_add = "terms and "
+    data = data.to_dataframe("data").reset_index(level=[0, 1, 2])
+    sns.boxplot(x="terms", y="data", hue="ahead", data=data, ax=ax, whis=1., palette="Blues_d", showmeans=True,
+                meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."})
+    ax.axhline(y=0, color="grey", linewidth=.5)
+    ax.set(ylabel=f"{lab_add}skill score", xlabel="", title="summary of all stations")
     handles, _ = ax.get_legend_handles_labels()
     ax.legend(handles, labels)
     plt.tight_layout()
-    plt.savefig(plot_folder+'SS_Clim_summary_' + extra_nametag + modelsetup + '.pdf', dpi=500)
+    plot_name = os.path.join(plot_folder, f"skill_score_clim_{extra_name_tag}{model_setup}.pdf")
+    logging.debug(f"... save plot to {plot_name}")
+    plt.savefig(plot_name, dpi=500)
     plt.close('all')
 
-    return d
 
+def plot_competitive_skill_score(data: pd.DataFrame, plot_folder=".", model_setup="CNN"):
+    """
+    Create competitive skill score for the given model setup and the reference models ordinary least squared ("ols") and
+    the persistence forecast ("persi") for all lead times ("ahead"). The plot is saved under plot_folder with the name
+    skill_score_competitive_{model_setup}.pdf and resolution of 500dpi.
+    :param data: data frame with index=['cnn-persi', 'ols-persi', 'cnn-ols'] and columns "ahead" containing the pre-
+        calculated comparisons for cnn, persistence and ols.
+    :param plot_folder: path to save the plot (default: current directory)
+    :param model_setup: architecture type (default "CNN")
+    """
+    logging.debug("run plot_general_skill_score()")
+
+    data = pd.concat(data, axis=0)
+    data = xr.DataArray(data, dims=["stations", "ahead"]).unstack("stations")
+    data = data.rename({"stations_level_0": "stations", "stations_level_1": "comparison"})
+    data = data.to_dataframe("data").unstack(level=1).swaplevel()
+    data.columns = data.columns.levels[1]
+
+    labels = [str(i) + "d" for i in data.index.levels[1].values]
+    data = data.stack(level=0).reset_index(level=2, drop=True).reset_index(name="data")
+
+    fig, ax = plt.subplots()
+    sns.boxplot(x="comparison", y="data", hue="ahead", data=data, whis=1., ax=ax, palette="Blues_d", showmeans=True,
+                meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."},
+                order=["cnn-persi", "ols-persi", "cnn-ols"])
+    ax.axhline(y=0, color="grey", linewidth=.5)
+    ax.set(ylabel="skill score", xlabel="competing models", title="summary of all stations",
+           ylim=(np.min([0, helpers.float_round(data.min()[2], 2) - 0.1]), helpers.float_round(data.max()[2], 2) + 0.1))
+    handles, _ = ax.get_legend_handles_labels()
+    ax.legend(handles, labels)
+    plt.tight_layout()
+    plot_name = os.path.join(plot_folder, f"skill_score_competitive_{model_setup}.pdf")
+    logging.debug(f"... save plot to {plot_name}")
+    plt.savefig(plot_name, dpi=500)
+    plt.close()
diff --git a/src/run_modules/post_processing.py b/src/run_modules/post_processing.py
index a95869e0..fa163072 100644
--- a/src/run_modules/post_processing.py
+++ b/src/run_modules/post_processing.py
@@ -19,7 +19,7 @@ from src.model_modules.linear_model import OrdinaryLeastSquaredModel
 from src import statistics
 from src import helpers
 from src.helpers import TimeTracking
-from src.plotting.postprocessing_plotting import plot_monthly_summary, plot_climsum_boxplot, plot_station_map, plot_conditional_quantiles
+from src.plotting.postprocessing_plotting import plot_monthly_summary, plot_station_map, plot_conditional_quantiles, plot_climatological_skill_score, plot_competitive_skill_score
 from src.datastore import NameNotFoundInDataStore
 
 
@@ -36,7 +36,7 @@ class PostProcessing(RunEnvironment):
         self.train_val_data: DataGenerator = self.data_store.get("generator", "general.train_val")
         self.plot_path: str = self.data_store.get("plot_path", "general")
         self.skill_scores = None
-        # self._run()
+        self._run()
 
     def _run(self):
         self.train_ols_model()
@@ -58,21 +58,20 @@ class PostProcessing(RunEnvironment):
     def plot(self):
         logging.debug("Run plotting routines...")
         path = self.data_store.get("forecast_path", "general")
-        window_lead_time = self.data_store.get("window_lead_time", "general")
         target_var = self.data_store.get("target_var", "general")
 
         plot_conditional_quantiles(self.test_data.stations, plot_folder=self.plot_path, pred_name="CNN", ref_name="orig",
-                                   forecast_path=self.data_store.get("forecast_path", "general"), plot_name_affix="cali-ref")
+                                   forecast_path=path, plot_name_affix="cali-ref")
         plot_conditional_quantiles(self.test_data.stations, plot_folder=self.plot_path, pred_name="orig", ref_name="CNN",
-                                   forecast_path=self.data_store.get("forecast_path", "general"), plot_name_affix="like-bas")
+                                   forecast_path=path, plot_name_affix="like-bas")
         plot_station_map(generators={'b': self.test_data}, plot_folder=self.plot_path)
-        plot_monthly_summary(self.test_data.stations, path, r"forecasts_%s_test.nc", window_lead_time, target_var,
+        plot_monthly_summary(self.test_data.stations, path, r"forecasts_%s_test.nc", target_var,
                              plot_folder=self.plot_path)
-        # plot_climsum_boxplot()
+        #
+        plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, model_setup="CNN")
+        plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, score_only=False, extra_name_tag="all_terms_", model_setup="CNN")
+        plot_competitive_skill_score(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN")
 
-        # FKf.ss_climsum_boxplot(data=ss_clim_dic, modelsetup=modelsetup, score_only=True, **kwargs)
-        # FKf.ss_climsum_boxplot(data=ss_clim_dic, modelsetup=modelsetup, score_only=False, extra_nametag='_all_terms', **kwargs)
-        # FKf.ss_sum_boxplot(ss_dic, plot_path, modelsetup)
 
     def calculate_test_score(self):
         test_score = self.model.evaluate_generator(generator=self.test_data_distributed.distribute_on_batches(),
@@ -133,7 +132,7 @@ class PostProcessing(RunEnvironment):
 
     @staticmethod
     def _create_orig_forecast(data, _, mean, std, transformation_method):
-        return statistics.apply_inverse_transformation(data.label, mean, std, transformation_method)
+        return statistics.apply_inverse_transformation(data.label.copy(), mean, std, transformation_method)
 
     def _create_ols_forecast(self, input_data, ols_prediction, mean, std, transformation_method):
         tmp_ols = self.ols_model.predict(input_data)
@@ -157,7 +156,7 @@ class PostProcessing(RunEnvironment):
 
     @staticmethod
     def _create_empty_prediction_arrays(generator, count=1):
-        return [generator.label.copy()] * count
+        return [generator.label.copy() for _ in range(count)]
 
     @staticmethod
     def create_fullindex(df, freq):
@@ -225,7 +224,6 @@ class PostProcessing(RunEnvironment):
             external_data = self._get_external_data(station)
             skill_score_general[station] = ss.skill_scores(window_lead_time)
             skill_score_climatological[station] = ss.climatological_skill_scores(external_data, window_lead_time)
-
         return skill_score_general, skill_score_climatological
 
 
@@ -240,9 +238,9 @@ class SkillScores(RunEnvironment):
         skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols'])
         for iahead in ahead_names:
             data = self.internal_data.sel(ahead=iahead)
-            skill_score[iahead] = [self.general_skill_score(data),
-                                   self.general_skill_score(data, forecast_name="OLS"),
-                                   self.general_skill_score(data, reference_name="OLS")]
+            skill_score[iahead] = [self.general_skill_score(data, forecast_name="CNN", reference_name="persi"),
+                                   self.general_skill_score(data, forecast_name="OLS", reference_name="persi"),
+                                   self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")]
         return skill_score
 
     def climatological_skill_scores(self, external_data, window_lead_time):
@@ -250,8 +248,8 @@ class SkillScores(RunEnvironment):
 
         all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III',
                      'CASE IV']
-        skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan), coords=[all_terms],
-                                        dims=['terms', 'ahead'])
+        skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan), coords=[all_terms, ahead_names],
+                                   dims=['terms', 'ahead'])
 
         for iahead in ahead_names:
 
@@ -281,12 +279,12 @@ class SkillScores(RunEnvironment):
     @staticmethod
     def general_skill_score(data, observation_name="orig", forecast_name="CNN", reference_name="persi"):
         data = data.dropna("index")
-        observation = data.loc[..., observation_name]
-        forecast = data.loc[..., forecast_name]
-        reference = data.loc[..., reference_name]
+        observation = data.sel(type=observation_name)
+        forecast = data.sel(type=forecast_name)
+        reference = data.sel(type=reference_name)
         mse = statistics.mean_squared_error
         skill_score = 1 - mse(observation, forecast) / mse(observation, reference)
-        return skill_score
+        return skill_score.values
 
     @staticmethod
     def skill_score_pre_calculations(data, observation_name, forecast_name):
@@ -295,15 +293,16 @@ class SkillScores(RunEnvironment):
         data = data.dropna("index")
 
         mean = data.mean("index")
-        var = data.var("index")
-        r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]])
+        sigma = np.sqrt(data.var("index"))
+        # r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]])
+        r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name])
 
         AI = np.array(r ** 2)
-        BI = ((r - var.loc[..., forecast_name] / var.loc[..., observation_name]) ** 2).values
-        CI = (((mean.loc[..., forecast_name] - mean.loc[..., observation_name]) / var.loc[
+        BI = ((r - (sigma.loc[..., forecast_name] / sigma.loc[..., observation_name])) ** 2).values
+        CI = (((mean.loc[..., forecast_name] - mean.loc[..., observation_name]) / sigma.loc[
             ..., observation_name]) ** 2).values
 
-        suffix = {"mean": mean, "var": var, "r": r, "p": p}
+        suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p}
         return AI, BI, CI, data, suffix
 
     def skill_score_mu_case_1(self, data, observation_name="orig", forecast_name="CNN"):
@@ -315,41 +314,48 @@ class SkillScores(RunEnvironment):
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean = self.create_monthly_mean_from_daily_data(data)
         data = xr.concat([data, monthly_mean], dim="type")
-        var = data.var("index")
-        r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]])
+        sigma = suffix["sigma"]
+        sigma_monthly = np.sqrt(monthly_mean.var())
+        # r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]])
+        r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
         AII = np.array(r ** 2)
-        BII = ((r - var.loc[observation_name + 'X'] / var.loc[observation_name]) ** 2).values
+        BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values
         skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII))
         return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()
 
     def skill_score_mu_case_3(self, data, observation_name="orig", forecast_name="CNN", external_data=None):
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
-        mean, var = suffix["mean"], suffix["var"]
-        AIII = (((external_data.mean().values - mean.loc[observation_name]) / var.loc[observation_name])**2).values
+        mean, sigma = suffix["mean"], suffix["sigma"]
+        AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values
         skill_score = np.array((AI - BI - CI + AIII) / 1 + AIII)
         return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()
 
     def skill_score_mu_case_4(self, data, observation_name="orig", forecast_name="CNN", external_data=None):
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
-        monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values)
+        monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values, index=data.index)
         data = xr.concat([data, monthly_mean_external], dim="type")
-        mean = data.mean("index")
-        var = data.var("index")
+        mean, sigma = suffix["mean"], suffix["sigma"]
+        monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values)
+        mean_external = monthly_mean_external.mean()
+        sigma_external = np.sqrt(monthly_mean_external.var())
 
-        r_mu, p_mu = stats.spearmanr(data.loc[..., [observation_name, observation_name+'X']])
+        # r_mu, p_mu = stats.spearmanr(data.loc[..., [observation_name, observation_name+'X']])
+        r_mu, p_mu = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
 
         AIV = np.array(r_mu**2)
-        BIV = ((r_mu - var.loc[observation_name + 'X'] / var.loc[observation_name])**2).values
-        CIV = (((mean.loc[observation_name + 'X'] - mean.loc[observation_name]) / var.loc[observation_name])**2).values
+        BIV = ((r_mu - sigma_external / sigma.loc[observation_name])**2).values
+        CIV = (((mean_external - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values
         skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV))
         return pd.DataFrame({"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": CIV}).to_xarray().to_array()
 
     @staticmethod
-    def create_monthly_mean_from_daily_data(data, columns=None):
+    def create_monthly_mean_from_daily_data(data, columns=None, index=None):
         if columns is None:
             columns = data.type.values
-        coordinates = [data.index, [v + "X" for v in list(columns)]]  # TODO
-        empty_data = np.full((len(data.index), len(columns)), np.nan)
+        if index is None:
+            index = data.index
+        coordinates = [index, [v + "X" for v in list(columns)]]  # TODO
+        empty_data = np.full((len(index), len(columns)), np.nan)
         monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=["index", "type"])
         mu = data.groupby("index.month").mean()
 
-- 
GitLab