diff --git a/src/run_modules/post_processing.py b/src/run_modules/post_processing.py
index fa16307298bbc64b51e6532a139085ecd8b1e7b0..9627d7a5b0db34a7bc1621727a419808c4f2f323 100644
--- a/src/run_modules/post_processing.py
+++ b/src/run_modules/post_processing.py
@@ -8,18 +8,15 @@ import os
 import numpy as np
 import pandas as pd
 import xarray as xr
-import statsmodels.api as sm
 import keras
-from scipy import stats
 
 from src.run_modules.run_environment import RunEnvironment
 from src.data_handling.data_distributor import Distributor
 from src.data_handling.data_generator import DataGenerator
 from src.model_modules.linear_model import OrdinaryLeastSquaredModel
 from src import statistics
-from src import helpers
-from src.helpers import TimeTracking
-from src.plotting.postprocessing_plotting import plot_monthly_summary, plot_station_map, plot_conditional_quantiles, plot_climatological_skill_score, plot_competitive_skill_score
+from src.plotting.postprocessing_plotting import plot_monthly_summary, plot_station_map, plot_conditional_quantiles, \
+    plot_climatological_skill_score, plot_competitive_skill_score
 from src.datastore import NameNotFoundInDataStore
 
 
@@ -41,7 +38,7 @@ class PostProcessing(RunEnvironment):
     def _run(self):
         self.train_ols_model()
         preds_for_all_stations = self.make_prediction()
-        self.skill_scores = self.calculate_skill_scores()  #TODO: stopped here, continue with plot routine. Think about if skill score should be saved locally and loaded for plotting or if self.skill_scores should be used
+        self.skill_scores = self.calculate_skill_scores()
         self.plot()
 
     def _load_model(self):
@@ -60,19 +57,19 @@ class PostProcessing(RunEnvironment):
         path = self.data_store.get("forecast_path", "general")
         target_var = self.data_store.get("target_var", "general")
 
-        plot_conditional_quantiles(self.test_data.stations, plot_folder=self.plot_path, pred_name="CNN", ref_name="orig",
-                                   forecast_path=path, plot_name_affix="cali-ref")
-        plot_conditional_quantiles(self.test_data.stations, plot_folder=self.plot_path, pred_name="orig", ref_name="CNN",
-                                   forecast_path=path, plot_name_affix="like-bas")
+        plot_conditional_quantiles(self.test_data.stations, pred_name="CNN", ref_name="orig",
+                                   forecast_path=path, plot_name_affix="cali-ref", plot_folder=self.plot_path)
+        plot_conditional_quantiles(self.test_data.stations, pred_name="orig", ref_name="CNN",
+                                   forecast_path=path, plot_name_affix="like-bas", plot_folder=self.plot_path)
         plot_station_map(generators={'b': self.test_data}, plot_folder=self.plot_path)
         plot_monthly_summary(self.test_data.stations, path, r"forecasts_%s_test.nc", target_var,
                              plot_folder=self.plot_path)
         # plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, model_setup="CNN")
-        plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, score_only=False, extra_name_tag="all_terms_", model_setup="CNN")
+        plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, score_only=False,
+                                        extra_name_tag="all_terms_", model_setup="CNN")
         plot_competitive_skill_score(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN")
 
 
-
     def calculate_test_score(self):
         test_score = self.model.evaluate_generator(generator=self.test_data_distributed.distribute_on_batches(),
                                                    use_multiprocessing=False, verbose=0, steps=1)
@@ -170,7 +167,7 @@ class PostProcessing(RunEnvironment):
         elif isinstance(df, xr.DataArray):
             earliest = df.index[0].values
            latest = df.index[-1].values
-        elif isinstance(df, pd.core.indexes.datetimes.DatetimeIndex):
+        elif isinstance(df, pd.DatetimeIndex):
             earliest = df[0]
             latest = df[-1]
         else:
@@ -212,154 +209,17 @@ class PostProcessing(RunEnvironment):
         except KeyError:
             return None
 
-    def calculate_skill_scores(self, threshold=60):
+    def calculate_skill_scores(self):
         path = self.data_store.get("forecast_path", "general")
         window_lead_time = self.data_store.get("window_lead_time", "general")
         skill_score_general = {}
         skill_score_climatological = {}
-        for station in self.test_data.stations:  # TODO: replace this by a more general approach to also calculate on train/val
+        for station in self.test_data.stations:
             file = os.path.join(path, f"forecasts_{station}_test.nc")
             data = xr.open_dataarray(file)
-            ss = SkillScores(data)
+            skill_score = statistics.SkillScores(data)
             external_data = self._get_external_data(station)
-            skill_score_general[station] = ss.skill_scores(window_lead_time)
-            skill_score_climatological[station] = ss.climatological_skill_scores(external_data, window_lead_time)
+            skill_score_general[station] = skill_score.skill_scores(window_lead_time)
+            skill_score_climatological[station] = skill_score.climatological_skill_scores(external_data,
+                                                                                          window_lead_time)
         return skill_score_general, skill_score_climatological
-
-
-class SkillScores(RunEnvironment):
-
-    def __init__(self, internal_data):
-        super().__init__()
-        self.internal_data = internal_data
-
-    def skill_scores(self, window_lead_time):
-        ahead_names = list(range(1, window_lead_time + 1))
-        skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols'])
-        for iahead in ahead_names:
-            data = self.internal_data.sel(ahead=iahead)
-            skill_score[iahead] = [self.general_skill_score(data, forecast_name="CNN", reference_name="persi"),
-                                   self.general_skill_score(data, forecast_name="OLS", reference_name="persi"),
-                                   self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")]
-        return skill_score
-
-    def climatological_skill_scores(self, external_data, window_lead_time):
-        ahead_names = list(range(1, window_lead_time + 1))
-
-        all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III',
-                     'CASE IV']
-        skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan), coords=[all_terms, ahead_names],
-                                   dims=['terms', 'ahead'])
-
-        for iahead in ahead_names:
-
-            data = self.internal_data.sel(ahead=iahead)
-
-            skill_score.loc[["CASE I", "AI", "BI", "CI"], iahead] = np.stack(self._climatological_skill_score(
-                data, mu_type=1, forecast_name="CNN").values.flatten())
-
-            skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score(
-                data, mu_type=2, forecast_name="CNN").values.flatten())
-
-            if external_data is not None:
-                skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score(
-                    data, mu_type=3, forecast_name="CNN",
-                    external_data=external_data).values.flatten())
-
-                skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack(self._climatological_skill_score(
-                    data, mu_type=4, forecast_name="CNN",
-                    external_data=external_data).values.flatten())
-
-        return skill_score
-
-    def _climatological_skill_score(self, data, mu_type=1, observation_name="orig", forecast_name="CNN", external_data=None):
-        kwargs = {"external_data": external_data} if external_data is not None else {}
-        return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs)
-
-    @staticmethod
-    def general_skill_score(data, observation_name="orig", forecast_name="CNN", reference_name="persi"):
reference_name="persi"): - data = data.dropna("index") - observation = data.sel(type=observation_name) - forecast = data.sel(type=forecast_name) - reference = data.sel(type=reference_name) - mse = statistics.mean_squared_error - skill_score = 1 - mse(observation, forecast) / mse(observation, reference) - return skill_score.values - - @staticmethod - def skill_score_pre_calculations(data, observation_name, forecast_name): - - data = data.loc[..., [observation_name, forecast_name]].drop("ahead") - data = data.dropna("index") - - mean = data.mean("index") - sigma = np.sqrt(data.var("index")) - # r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]]) - r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name]) - - AI = np.array(r ** 2) - BI = ((r - (sigma.loc[..., forecast_name] / sigma.loc[..., observation_name])) ** 2).values - CI = (((mean.loc[..., forecast_name] - mean.loc[..., observation_name]) / sigma.loc[ - ..., observation_name]) ** 2).values - - suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p} - return AI, BI, CI, data, suffix - - def skill_score_mu_case_1(self, data, observation_name="orig", forecast_name="CNN"): - AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name) - skill_score = np.array(AI - BI - CI) - return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array() - - def skill_score_mu_case_2(self, data, observation_name="orig", forecast_name="CNN"): - AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) - monthly_mean = self.create_monthly_mean_from_daily_data(data) - data = xr.concat([data, monthly_mean], dim="type") - sigma = suffix["sigma"] - sigma_monthly = np.sqrt(monthly_mean.var()) - # r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]]) - r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"]) - AII = np.array(r ** 2) - BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values - skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII)) - return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array() - - def skill_score_mu_case_3(self, data, observation_name="orig", forecast_name="CNN", external_data=None): - AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) - mean, sigma = suffix["mean"], suffix["sigma"] - AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values - skill_score = np.array((AI - BI - CI + AIII) / 1 + AIII) - return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array() - - def skill_score_mu_case_4(self, data, observation_name="orig", forecast_name="CNN", external_data=None): - AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) - monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values, index=data.index) - data = xr.concat([data, monthly_mean_external], dim="type") - mean, sigma = suffix["mean"], suffix["sigma"] - monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values) - mean_external = monthly_mean_external.mean() - sigma_external = np.sqrt(monthly_mean_external.var()) - - # r_mu, p_mu = stats.spearmanr(data.loc[..., [observation_name, observation_name+'X']]) - r_mu, 
-
-        AIV = np.array(r_mu**2)
-        BIV = ((r_mu - sigma_external / sigma.loc[observation_name])**2).values
-        CIV = (((mean_external - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values
-        skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV))
-        return pd.DataFrame({"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": CIV}).to_xarray().to_array()
-
-    @staticmethod
-    def create_monthly_mean_from_daily_data(data, columns=None, index=None):
-        if columns is None:
-            columns = data.type.values
-        if index is None:
-            index = data.index
-        coordinates = [index, [v + "X" for v in list(columns)]]  # TODO
-        empty_data = np.full((len(index), len(columns)), np.nan)
-        monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=["index", "type"])
-        mu = data.groupby("index.month").mean()
-
-        for month in mu.month:
-            monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values
-
-        return monthly_mean
diff --git a/src/statistics.py b/src/statistics.py
index 7c3caf91fbdf41eaedf7712b6e2cee21492c4ee7..fd8491748f7ebd30d7056af9cc1ce162c5743881 100644
--- a/src/statistics.py
+++ b/src/statistics.py
@@ -1,4 +1,8 @@
-__author__ = 'Lukas Leufen'
+from scipy import stats
+
+from src.run_modules.run_environment import RunEnvironment
+
+__author__ = 'Lukas Leufen, Felix Kleinert'
 __date__ = '2019-10-23'
 
 import numpy as np
@@ -75,3 +79,141 @@ def centre_inverse(data: Data, mean: Data) -> Data:
 
 def mean_squared_error(a, b):
     return np.square(a - b).mean()
+
+
+class SkillScores(RunEnvironment):
+
+    def __init__(self, internal_data):
+        super().__init__()
+        self.internal_data = internal_data
+
+    def skill_scores(self, window_lead_time):
+        ahead_names = list(range(1, window_lead_time + 1))
+        skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols'])
+        for iahead in ahead_names:
+            data = self.internal_data.sel(ahead=iahead)
+            skill_score[iahead] = [self.general_skill_score(data, forecast_name="CNN", reference_name="persi"),
+                                   self.general_skill_score(data, forecast_name="OLS", reference_name="persi"),
+                                   self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")]
+        return skill_score
+
+    def climatological_skill_scores(self, external_data, window_lead_time):
+        ahead_names = list(range(1, window_lead_time + 1))
+
+        all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III',
+                     'CASE IV']
+        skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan), coords=[all_terms, ahead_names],
+                                   dims=['terms', 'ahead'])
+
+        for iahead in ahead_names:
+
+            data = self.internal_data.sel(ahead=iahead)
+
+            skill_score.loc[["CASE I", "AI", "BI", "CI"], iahead] = np.stack(self._climatological_skill_score(
+                data, mu_type=1, forecast_name="CNN").values.flatten())
+
+            skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score(
+                data, mu_type=2, forecast_name="CNN").values.flatten())
+
+            if external_data is not None:
+                skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score(
+                    data, mu_type=3, forecast_name="CNN",
+                    external_data=external_data).values.flatten())
+
+                skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack(self._climatological_skill_score(
+                    data, mu_type=4, forecast_name="CNN",
+                    external_data=external_data).values.flatten())
+
+        return skill_score
+
+    def _climatological_skill_score(self, data, mu_type=1, observation_name="orig", forecast_name="CNN", external_data=None):
+        kwargs = {"external_data": external_data} if external_data is not None else {}
+        return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs)
+
+    @staticmethod
+    def general_skill_score(data, observation_name="orig", forecast_name="CNN", reference_name="persi"):
+        data = data.dropna("index")
+        observation = data.sel(type=observation_name)
+        forecast = data.sel(type=forecast_name)
+        reference = data.sel(type=reference_name)
+        mse = mean_squared_error  # module-level helper defined above in this file
+        skill_score = 1 - mse(observation, forecast) / mse(observation, reference)
+        return skill_score.values
+
+    @staticmethod
+    def skill_score_pre_calculations(data, observation_name, forecast_name):
+
+        data = data.loc[..., [observation_name, forecast_name]].drop("ahead")
+        data = data.dropna("index")
+
+        mean = data.mean("index")
+        sigma = np.sqrt(data.var("index"))
+        # r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]])
+        r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name])
+
+        AI = np.array(r ** 2)
+        BI = ((r - (sigma.loc[..., forecast_name] / sigma.loc[..., observation_name])) ** 2).values
+        CI = (((mean.loc[..., forecast_name] - mean.loc[..., observation_name]) / sigma.loc[
+            ..., observation_name]) ** 2).values
+
+        suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p}
+        return AI, BI, CI, data, suffix
+
+    def skill_score_mu_case_1(self, data, observation_name="orig", forecast_name="CNN"):
+        AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name)
+        skill_score = np.array(AI - BI - CI)
+        return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array()
+
+    def skill_score_mu_case_2(self, data, observation_name="orig", forecast_name="CNN"):
+        AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
+        monthly_mean = self.create_monthly_mean_from_daily_data(data)
+        data = xr.concat([data, monthly_mean], dim="type")
+        sigma = suffix["sigma"]
+        sigma_monthly = np.sqrt(monthly_mean.var())
+        # r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]])
+        r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
+        AII = np.array(r ** 2)
+        BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values
+        skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII))
+        return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()
+
+    def skill_score_mu_case_3(self, data, observation_name="orig", forecast_name="CNN", external_data=None):
+        AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
+        mean, sigma = suffix["mean"], suffix["sigma"]
+        AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values
+        skill_score = np.array((AI - BI - CI + AIII) / (1 + AIII))
+        return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()
+
+    def skill_score_mu_case_4(self, data, observation_name="orig", forecast_name="CNN", external_data=None):
+        AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
+        monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values, index=data.index)
+        data = xr.concat([data, monthly_mean_external], dim="type")
+        mean, sigma = suffix["mean"], suffix["sigma"]
+        monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values)
+        mean_external = monthly_mean_external.mean()
+        sigma_external = np.sqrt(monthly_mean_external.var())
+
+        # r_mu, p_mu = stats.spearmanr(data.loc[..., [observation_name, observation_name+'X']])
+        r_mu, p_mu = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
+
+        AIV = np.array(r_mu**2)
+        BIV = ((r_mu - sigma_external / sigma.loc[observation_name])**2).values
+        CIV = (((mean_external - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values
+        skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV))
+        return pd.DataFrame({"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": [CIV]}).to_xarray().to_array()
+
+    @staticmethod
+    def create_monthly_mean_from_daily_data(data, columns=None, index=None):
+        if columns is None:
+            columns = data.type.values
+        if index is None:
+            index = data.index
+        coordinates = [index, [v + "X" for v in list(columns)]]
+        empty_data = np.full((len(index), len(columns)), np.nan)
+        monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=["index", "type"])
+        mu = data.groupby("index.month").mean()
+
+        for month in mu.month:
+            monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values
+
+        return monthly_mean
\ No newline at end of file
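
Usage note (illustrative sketch, not part of the patch): the relocated statistics.SkillScores works on the per-station forecast files opened in calculate_skill_scores, i.e. an xarray DataArray with a time dimension "index" and a "type" dimension holding the observation ("orig") alongside the competing forecasts ("CNN", "OLS", "persi"). The standalone script below reproduces the competitive skill score SS = 1 - MSE(obs, forecast) / MSE(obs, reference) that general_skill_score returns, on a synthetic array with that assumed layout; the helper function and the toy data are assumptions made for the example only.

import numpy as np
import pandas as pd
import xarray as xr


def mean_squared_error(a, b):
    # same definition as src.statistics.mean_squared_error
    return np.square(a - b).mean()


def general_skill_score(data, observation_name="orig", forecast_name="CNN", reference_name="persi"):
    # mirrors SkillScores.general_skill_score: skill of `forecast` relative to `reference`
    data = data.dropna("index")
    observation = data.sel(type=observation_name)
    forecast = data.sel(type=forecast_name)
    reference = data.sel(type=reference_name)
    return 1 - mean_squared_error(observation, forecast) / mean_squared_error(observation, reference)


if __name__ == "__main__":
    # toy forecast array: 30 daily values, CNN closer to the observation than persistence
    index = pd.date_range("2019-01-01", periods=30, freq="D")
    rng = np.random.default_rng(0)
    obs = rng.normal(50, 10, len(index))
    toy = xr.DataArray(np.stack([obs,
                                 obs + rng.normal(0, 2, len(index)),
                                 obs + rng.normal(0, 6, len(index))], axis=1),
                       coords=[index, ["orig", "CNN", "persi"]], dims=["index", "type"])
    print(float(general_skill_score(toy)))  # close to 1 -> CNN clearly beats the persistence reference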