Skip to content
Snippets Groups Projects
Commit 5b86317b authored by lukas leufen's avatar lukas leufen
Browse files

moved skill score to statistics module (still no test implemented)

parent d45b8d74
Branches
Tags
2 merge requests!37include new development,!27Lukas issue032 feat plotting postprocessing
Pipeline #28716 passed
...@@ -8,18 +8,15 @@ import os ...@@ -8,18 +8,15 @@ import os
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import xarray as xr import xarray as xr
import statsmodels.api as sm
import keras import keras
from scipy import stats
from src.run_modules.run_environment import RunEnvironment from src.run_modules.run_environment import RunEnvironment
from src.data_handling.data_distributor import Distributor from src.data_handling.data_distributor import Distributor
from src.data_handling.data_generator import DataGenerator from src.data_handling.data_generator import DataGenerator
from src.model_modules.linear_model import OrdinaryLeastSquaredModel from src.model_modules.linear_model import OrdinaryLeastSquaredModel
from src import statistics from src import statistics
from src import helpers from src.plotting.postprocessing_plotting import plot_monthly_summary, plot_station_map, plot_conditional_quantiles, \
from src.helpers import TimeTracking plot_climatological_skill_score, plot_competitive_skill_score
from src.plotting.postprocessing_plotting import plot_monthly_summary, plot_station_map, plot_conditional_quantiles, plot_climatological_skill_score, plot_competitive_skill_score
from src.datastore import NameNotFoundInDataStore from src.datastore import NameNotFoundInDataStore
...@@ -41,7 +38,7 @@ class PostProcessing(RunEnvironment): ...@@ -41,7 +38,7 @@ class PostProcessing(RunEnvironment):
def _run(self): def _run(self):
self.train_ols_model() self.train_ols_model()
preds_for_all_stations = self.make_prediction() preds_for_all_stations = self.make_prediction()
self.skill_scores = self.calculate_skill_scores() #TODO: stopped here, continue with plot routine. Think about if skill score should be saved locally and loaded for plotting or if self.skill_scores should be used self.skill_scores = self.calculate_skill_scores()
self.plot() self.plot()
def _load_model(self): def _load_model(self):
...@@ -60,19 +57,19 @@ class PostProcessing(RunEnvironment): ...@@ -60,19 +57,19 @@ class PostProcessing(RunEnvironment):
path = self.data_store.get("forecast_path", "general") path = self.data_store.get("forecast_path", "general")
target_var = self.data_store.get("target_var", "general") target_var = self.data_store.get("target_var", "general")
plot_conditional_quantiles(self.test_data.stations, plot_folder=self.plot_path, pred_name="CNN", ref_name="orig", plot_conditional_quantiles(self.test_data.stations, pred_name="CNN", ref_name="orig",
forecast_path=path, plot_name_affix="cali-ref") forecast_path=path, plot_name_affix="cali-ref", plot_folder=self.plot_path)
plot_conditional_quantiles(self.test_data.stations, plot_folder=self.plot_path, pred_name="orig", ref_name="CNN", plot_conditional_quantiles(self.test_data.stations, pred_name="orig", ref_name="CNN",
forecast_path=path, plot_name_affix="like-bas") forecast_path=path, plot_name_affix="like-bas", plot_folder=self.plot_path)
plot_station_map(generators={'b': self.test_data}, plot_folder=self.plot_path) plot_station_map(generators={'b': self.test_data}, plot_folder=self.plot_path)
plot_monthly_summary(self.test_data.stations, path, r"forecasts_%s_test.nc", target_var, plot_monthly_summary(self.test_data.stations, path, r"forecasts_%s_test.nc", target_var,
plot_folder=self.plot_path) plot_folder=self.plot_path)
# #
plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, model_setup="CNN") plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, model_setup="CNN")
plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, score_only=False, extra_name_tag="all_terms_", model_setup="CNN") plot_climatological_skill_score(self.skill_scores[1], plot_folder=self.plot_path, score_only=False,
extra_name_tag="all_terms_", model_setup="CNN")
plot_competitive_skill_score(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN") plot_competitive_skill_score(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN")
def calculate_test_score(self): def calculate_test_score(self):
test_score = self.model.evaluate_generator(generator=self.test_data_distributed.distribute_on_batches(), test_score = self.model.evaluate_generator(generator=self.test_data_distributed.distribute_on_batches(),
use_multiprocessing=False, verbose=0, steps=1) use_multiprocessing=False, verbose=0, steps=1)
...@@ -170,7 +167,7 @@ class PostProcessing(RunEnvironment): ...@@ -170,7 +167,7 @@ class PostProcessing(RunEnvironment):
elif isinstance(df, xr.DataArray): elif isinstance(df, xr.DataArray):
earliest = df.index[0].values earliest = df.index[0].values
latest = df.index[-1].values latest = df.index[-1].values
elif isinstance(df, pd.core.indexes.datetimes.DatetimeIndex): elif isinstance(df, pd.DatetimeIndex):
earliest = df[0] earliest = df[0]
latest = df[-1] latest = df[-1]
else: else:
...@@ -212,154 +209,17 @@ class PostProcessing(RunEnvironment): ...@@ -212,154 +209,17 @@ class PostProcessing(RunEnvironment):
except KeyError: except KeyError:
return None return None
def calculate_skill_scores(self, threshold=60): def calculate_skill_scores(self):
path = self.data_store.get("forecast_path", "general") path = self.data_store.get("forecast_path", "general")
window_lead_time = self.data_store.get("window_lead_time", "general") window_lead_time = self.data_store.get("window_lead_time", "general")
skill_score_general = {} skill_score_general = {}
skill_score_climatological = {} skill_score_climatological = {}
for station in self.test_data.stations: # TODO: replace this by a more general approach to also calculate on train/val for station in self.test_data.stations:
file = os.path.join(path, f"forecasts_{station}_test.nc") file = os.path.join(path, f"forecasts_{station}_test.nc")
data = xr.open_dataarray(file) data = xr.open_dataarray(file)
ss = SkillScores(data) skill_score = statistics.SkillScores(data)
external_data = self._get_external_data(station) external_data = self._get_external_data(station)
skill_score_general[station] = ss.skill_scores(window_lead_time) skill_score_general[station] = skill_score.skill_scores(window_lead_time)
skill_score_climatological[station] = ss.climatological_skill_scores(external_data, window_lead_time) skill_score_climatological[station] = skill_score.climatological_skill_scores(external_data,
window_lead_time)
return skill_score_general, skill_score_climatological return skill_score_general, skill_score_climatological
class SkillScores(RunEnvironment):
def __init__(self, internal_data):
super().__init__()
self.internal_data = internal_data
def skill_scores(self, window_lead_time):
ahead_names = list(range(1, window_lead_time + 1))
skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols'])
for iahead in ahead_names:
data = self.internal_data.sel(ahead=iahead)
skill_score[iahead] = [self.general_skill_score(data, forecast_name="CNN", reference_name="persi"),
self.general_skill_score(data, forecast_name="OLS", reference_name="persi"),
self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")]
return skill_score
def climatological_skill_scores(self, external_data, window_lead_time):
ahead_names = list(range(1, window_lead_time + 1))
all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III',
'CASE IV']
skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan), coords=[all_terms, ahead_names],
dims=['terms', 'ahead'])
for iahead in ahead_names:
data = self.internal_data.sel(ahead=iahead)
skill_score.loc[["CASE I", "AI", "BI", "CI"], iahead] = np.stack(self._climatological_skill_score(
data, mu_type=1, forecast_name="CNN").values.flatten())
skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score(
data, mu_type=2, forecast_name="CNN").values.flatten())
if external_data is not None:
skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score(
data, mu_type=3, forecast_name="CNN",
external_data=external_data).values.flatten())
skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack(self._climatological_skill_score(
data, mu_type=4, forecast_name="CNN",
external_data=external_data).values.flatten())
return skill_score
def _climatological_skill_score(self, data, mu_type=1, observation_name="orig", forecast_name="CNN", external_data=None):
kwargs = {"external_data": external_data} if external_data is not None else {}
return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs)
@staticmethod
def general_skill_score(data, observation_name="orig", forecast_name="CNN", reference_name="persi"):
data = data.dropna("index")
observation = data.sel(type=observation_name)
forecast = data.sel(type=forecast_name)
reference = data.sel(type=reference_name)
mse = statistics.mean_squared_error
skill_score = 1 - mse(observation, forecast) / mse(observation, reference)
return skill_score.values
@staticmethod
def skill_score_pre_calculations(data, observation_name, forecast_name):
data = data.loc[..., [observation_name, forecast_name]].drop("ahead")
data = data.dropna("index")
mean = data.mean("index")
sigma = np.sqrt(data.var("index"))
# r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]])
r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name])
AI = np.array(r ** 2)
BI = ((r - (sigma.loc[..., forecast_name] / sigma.loc[..., observation_name])) ** 2).values
CI = (((mean.loc[..., forecast_name] - mean.loc[..., observation_name]) / sigma.loc[
..., observation_name]) ** 2).values
suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p}
return AI, BI, CI, data, suffix
def skill_score_mu_case_1(self, data, observation_name="orig", forecast_name="CNN"):
AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name)
skill_score = np.array(AI - BI - CI)
return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array()
def skill_score_mu_case_2(self, data, observation_name="orig", forecast_name="CNN"):
AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
monthly_mean = self.create_monthly_mean_from_daily_data(data)
data = xr.concat([data, monthly_mean], dim="type")
sigma = suffix["sigma"]
sigma_monthly = np.sqrt(monthly_mean.var())
# r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]])
r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
AII = np.array(r ** 2)
BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values
skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII))
return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()
def skill_score_mu_case_3(self, data, observation_name="orig", forecast_name="CNN", external_data=None):
AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
mean, sigma = suffix["mean"], suffix["sigma"]
AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values
skill_score = np.array((AI - BI - CI + AIII) / 1 + AIII)
return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()
def skill_score_mu_case_4(self, data, observation_name="orig", forecast_name="CNN", external_data=None):
AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values, index=data.index)
data = xr.concat([data, monthly_mean_external], dim="type")
mean, sigma = suffix["mean"], suffix["sigma"]
monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values)
mean_external = monthly_mean_external.mean()
sigma_external = np.sqrt(monthly_mean_external.var())
# r_mu, p_mu = stats.spearmanr(data.loc[..., [observation_name, observation_name+'X']])
r_mu, p_mu = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
AIV = np.array(r_mu**2)
BIV = ((r_mu - sigma_external / sigma.loc[observation_name])**2).values
CIV = (((mean_external - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values
skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV))
return pd.DataFrame({"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": CIV}).to_xarray().to_array()
@staticmethod
def create_monthly_mean_from_daily_data(data, columns=None, index=None):
if columns is None:
columns = data.type.values
if index is None:
index = data.index
coordinates = [index, [v + "X" for v in list(columns)]] # TODO
empty_data = np.full((len(index), len(columns)), np.nan)
monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=["index", "type"])
mu = data.groupby("index.month").mean()
for month in mu.month:
monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values
return monthly_mean
__author__ = 'Lukas Leufen' from scipy import stats
from src.run_modules.run_environment import RunEnvironment
__author__ = 'Lukas Leufen, Felix Kleinert'
__date__ = '2019-10-23' __date__ = '2019-10-23'
import numpy as np import numpy as np
...@@ -75,3 +79,141 @@ def centre_inverse(data: Data, mean: Data) -> Data: ...@@ -75,3 +79,141 @@ def centre_inverse(data: Data, mean: Data) -> Data:
def mean_squared_error(a, b): def mean_squared_error(a, b):
return np.square(a - b).mean() return np.square(a - b).mean()
class SkillScores(RunEnvironment):
def __init__(self, internal_data):
super().__init__()
self.internal_data = internal_data
def skill_scores(self, window_lead_time):
ahead_names = list(range(1, window_lead_time + 1))
skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols'])
for iahead in ahead_names:
data = self.internal_data.sel(ahead=iahead)
skill_score[iahead] = [self.general_skill_score(data, forecast_name="CNN", reference_name="persi"),
self.general_skill_score(data, forecast_name="OLS", reference_name="persi"),
self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")]
return skill_score
def climatological_skill_scores(self, external_data, window_lead_time):
ahead_names = list(range(1, window_lead_time + 1))
all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III',
'CASE IV']
skill_score = xr.DataArray(np.full((len(all_terms), len(ahead_names)), np.nan), coords=[all_terms, ahead_names],
dims=['terms', 'ahead'])
for iahead in ahead_names:
data = self.internal_data.sel(ahead=iahead)
skill_score.loc[["CASE I", "AI", "BI", "CI"], iahead] = np.stack(self._climatological_skill_score(
data, mu_type=1, forecast_name="CNN").values.flatten())
skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score(
data, mu_type=2, forecast_name="CNN").values.flatten())
if external_data is not None:
skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score(
data, mu_type=3, forecast_name="CNN",
external_data=external_data).values.flatten())
skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack(self._climatological_skill_score(
data, mu_type=4, forecast_name="CNN",
external_data=external_data).values.flatten())
return skill_score
def _climatological_skill_score(self, data, mu_type=1, observation_name="orig", forecast_name="CNN", external_data=None):
kwargs = {"external_data": external_data} if external_data is not None else {}
return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs)
@staticmethod
def general_skill_score(data, observation_name="orig", forecast_name="CNN", reference_name="persi"):
data = data.dropna("index")
observation = data.sel(type=observation_name)
forecast = data.sel(type=forecast_name)
reference = data.sel(type=reference_name)
mse = statistics.mean_squared_error
skill_score = 1 - mse(observation, forecast) / mse(observation, reference)
return skill_score.values
@staticmethod
def skill_score_pre_calculations(data, observation_name, forecast_name):
data = data.loc[..., [observation_name, forecast_name]].drop("ahead")
data = data.dropna("index")
mean = data.mean("index")
sigma = np.sqrt(data.var("index"))
# r, p = stats.spearmanr(data.loc[..., [forecast_name, observation_name]])
r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name])
AI = np.array(r ** 2)
BI = ((r - (sigma.loc[..., forecast_name] / sigma.loc[..., observation_name])) ** 2).values
CI = (((mean.loc[..., forecast_name] - mean.loc[..., observation_name]) / sigma.loc[
..., observation_name]) ** 2).values
suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p}
return AI, BI, CI, data, suffix
def skill_score_mu_case_1(self, data, observation_name="orig", forecast_name="CNN"):
AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name)
skill_score = np.array(AI - BI - CI)
return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array()
def skill_score_mu_case_2(self, data, observation_name="orig", forecast_name="CNN"):
AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
monthly_mean = self.create_monthly_mean_from_daily_data(data)
data = xr.concat([data, monthly_mean], dim="type")
sigma = suffix["sigma"]
sigma_monthly = np.sqrt(monthly_mean.var())
# r, p = stats.spearmanr(data.loc[..., [observation_name, observation_name + "X"]])
r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
AII = np.array(r ** 2)
BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values
skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII))
return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()
def skill_score_mu_case_3(self, data, observation_name="orig", forecast_name="CNN", external_data=None):
AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
mean, sigma = suffix["mean"], suffix["sigma"]
AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values
skill_score = np.array((AI - BI - CI + AIII) / 1 + AIII)
return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()
def skill_score_mu_case_4(self, data, observation_name="orig", forecast_name="CNN", external_data=None):
AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values, index=data.index)
data = xr.concat([data, monthly_mean_external], dim="type")
mean, sigma = suffix["mean"], suffix["sigma"]
monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values)
mean_external = monthly_mean_external.mean()
sigma_external = np.sqrt(monthly_mean_external.var())
# r_mu, p_mu = stats.spearmanr(data.loc[..., [observation_name, observation_name+'X']])
r_mu, p_mu = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
AIV = np.array(r_mu**2)
BIV = ((r_mu - sigma_external / sigma.loc[observation_name])**2).values
CIV = (((mean_external - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values
skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV))
return pd.DataFrame({"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": CIV}).to_xarray().to_array()
@staticmethod
def create_monthly_mean_from_daily_data(data, columns=None, index=None):
if columns is None:
columns = data.type.values
if index is None:
index = data.index
coordinates = [index, [v + "X" for v in list(columns)]]
empty_data = np.full((len(index), len(columns)), np.nan)
monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=["index", "type"])
mu = data.groupby("index.month").mean()
for month in mu.month:
monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values
return monthly_mean
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment