diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index b99d212b5616e16ff635aa98a9257833d31956fa..0251f3eab1101ce2c23433ac5f63d8a87dd71a9a 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -8,8 +8,9 @@ __date__ = '2019-10-23' import numpy as np import xarray as xr import pandas as pd -from typing import Union, Tuple, Dict +from typing import Union, Tuple, Dict, List from matplotlib import pyplot as plt +import itertools from mlair.helpers import to_list, remove_items @@ -178,10 +179,33 @@ class SkillScores: skill_scores_clim = skill_scores_class.climatological_skill_scores(external_data, window_lead_time=3) """ + models_default = ["cnn", "persi", "ols"] - def __init__(self, internal_data: Data): + def __init__(self, internal_data: Data, models=None, observation_name="obs"): """Set internal data.""" self.internal_data = internal_data + self.models = self.set_model_names(models) + self.observation_name = observation_name + + def set_model_names(self, models: List[str]) -> List[str]: + """Either use given models or use defaults.""" + if models is None: + models = self.models_default + return self._reorder(models) + + @staticmethod + def _reorder(model_list: List[str]) -> List[str]: + """Set elements persi and obs at the very end of given list.""" + for element in ["persi", "obs"]: + if element in model_list: + model_list.append(model_list.pop(model_list.index(element))) + return model_list + + def get_model_name_combinations(self): + """Return all combinations of two models as tuple and string.""" + combinations = list(itertools.combinations(self.models, 2)) + combination_strings = [f"{first}-{second}" for (first, second) in combinations] + return combinations, combination_strings def skill_scores(self, window_lead_time: int) -> pd.DataFrame: """ @@ -192,16 +216,19 @@ class SkillScores: :return: skill score for each comparison and forecast step """ ahead_names = list(range(1, window_lead_time + 1)) - skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols', 'cnn-competitor']) + combinations, combination_strings = self.get_model_name_combinations() + skill_score = pd.DataFrame(index=combination_strings) for iahead in ahead_names: data = self.internal_data.sel(ahead=iahead) - skill_score[iahead] = [self.general_skill_score(data, forecast_name="CNN", reference_name="persi"), - self.general_skill_score(data, forecast_name="OLS", reference_name="persi"), - self.general_skill_score(data, forecast_name="CNN", reference_name="OLS"), - self.general_skill_score(data, forecast_name="CNN", reference_name="competitor")] + skill_score[iahead] = [self.general_skill_score(data, + forecast_name=first, + reference_name=second, + observation_name=self.observation_name) + for (first, second) in combinations] return skill_score - def climatological_skill_scores(self, external_data: Data, window_lead_time: int) -> xr.DataArray: + def climatological_skill_scores(self, external_data: Data, window_lead_time: int, + forecast_name: str = "cnn") -> xr.DataArray: """ Calculate climatological skill scores according to Murphy (1988). @@ -210,6 +237,7 @@ class SkillScores: :param external_data: external data :param window_lead_time: interested time step of forecast horizon to select data + :param forecast_name: name of the forecast to use for this calculation (must be available in `data`) :return: all CASES as well as all terms """ @@ -225,30 +253,28 @@ class SkillScores: data = self.internal_data.sel(ahead=iahead) skill_score.loc[["CASE I", "AI", "BI", "CI"], iahead] = np.stack(self._climatological_skill_score( - data, mu_type=1, forecast_name="CNN").values.flatten()) + data, mu_type=1, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten()) skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score( - data, mu_type=2, forecast_name="CNN").values.flatten()) + data, mu_type=2, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten()) if external_data is not None: skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score( - data, mu_type=3, forecast_name="CNN", + data, mu_type=3, forecast_name=forecast_name, observation_name=self.observation_name, external_data=external_data).values.flatten()) skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack(self._climatological_skill_score( - data, mu_type=4, forecast_name="CNN", + data, mu_type=4, forecast_name=forecast_name, observation_name=self.observation_name, external_data=external_data).values.flatten()) return skill_score - def _climatological_skill_score(self, data, mu_type=1, observation_name="obs", forecast_name="CNN", - external_data=None): + def _climatological_skill_score(self, data, observation_name, forecast_name, mu_type=1, external_data=None): kwargs = {"external_data": external_data} if external_data is not None else {} return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs) @staticmethod - def general_skill_score(data: Data, observation_name: str = "obs", forecast_name: str = "CNN", - reference_name: str = "persi") -> np.ndarray: + def general_skill_score(data: Data, observation_name: str, forecast_name: str, reference_name: str) -> np.ndarray: r""" Calculate general skill score based on mean squared error. @@ -285,49 +311,51 @@ class SkillScores: :returns: Terms AI, BI, and CI, internal data without nans and mean, variance, correlation and its p-value """ - data = data.loc[..., [observation_name, forecast_name]].drop("ahead") + data = data.sel(type=[observation_name, forecast_name]).drop("ahead") data = data.dropna("index") mean = data.mean("index") sigma = np.sqrt(data.var("index")) - r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name]) + r, p = stats.pearsonr(*[data.sel(type=n).values.flatten() for n in [forecast_name, observation_name]]) AI = np.array(r ** 2) - BI = ((r - (sigma.loc[..., forecast_name] / sigma.loc[..., observation_name])) ** 2).values - CI = (((mean.loc[..., forecast_name] - mean.loc[..., observation_name]) / sigma.loc[ - ..., observation_name]) ** 2).values + BI = ((r - (sigma.sel(type=forecast_name, drop=True) / sigma.sel(type=observation_name, + drop=True))) ** 2).values + CI = (((mean.sel(type=forecast_name, drop=True) - mean.sel(type=observation_name, drop=True)) / sigma.sel( + type=observation_name, drop=True)) ** 2).values suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p} return AI, BI, CI, data, suffix - def skill_score_mu_case_1(self, data, observation_name="obs", forecast_name="CNN"): + def skill_score_mu_case_1(self, data, observation_name, forecast_name): """Calculate CASE I.""" AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name) skill_score = np.array(AI - BI - CI) return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array() - def skill_score_mu_case_2(self, data, observation_name="obs", forecast_name="CNN"): + def skill_score_mu_case_2(self, data, observation_name, forecast_name): """Calculate CASE II.""" AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) monthly_mean = self.create_monthly_mean_from_daily_data(data) data = xr.concat([data, monthly_mean], dim="type") sigma = suffix["sigma"] sigma_monthly = np.sqrt(monthly_mean.var()) - r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"]) + r, p = stats.pearsonr(*[data.sel(type=n).values.flatten() for n in [observation_name, observation_name + "X"]]) AII = np.array(r ** 2) - BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values + BII = ((r - sigma_monthly / sigma.sel(type=observation_name, drop=True)) ** 2).values skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII)) return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array() - def skill_score_mu_case_3(self, data, observation_name="obs", forecast_name="CNN", external_data=None): + def skill_score_mu_case_3(self, data, observation_name, forecast_name, external_data=None): """Calculate CASE III.""" AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) mean, sigma = suffix["mean"], suffix["sigma"] - AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name]) ** 2).values + AIII = (((external_data.mean().values - mean.sel(type=observation_name, drop=True)) / sigma.sel( + type=observation_name, drop=True)) ** 2).values skill_score = np.array((AI - BI - CI + AIII) / 1 + AIII) return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array() - def skill_score_mu_case_4(self, data, observation_name="obs", forecast_name="CNN", external_data=None): + def skill_score_mu_case_4(self, data, observation_name, forecast_name, external_data=None): """Calculate CASE IV.""" AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values, @@ -338,12 +366,13 @@ class SkillScores: mean_external = monthly_mean_external.mean() sigma_external = np.sqrt(monthly_mean_external.var()) - # r_mu, p_mu = stats.spearmanr(data.loc[..., [observation_name, observation_name+'X']]) - r_mu, p_mu = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"]) + r_mu, p_mu = stats.pearsonr( + *[data.sel(type=n).values.flatten() for n in [observation_name, observation_name + "X"]]) AIV = np.array(r_mu ** 2) - BIV = ((r_mu - sigma_external / sigma.loc[observation_name]) ** 2).values - CIV = (((mean_external - mean.loc[observation_name]) / sigma.loc[observation_name]) ** 2).values + BIV = ((r_mu - sigma_external / sigma.sel(type=observation_name, drop=True)) ** 2).values + CIV = (((mean_external - mean.sel(type=observation_name, drop=True)) / sigma.sel(type=observation_name, + drop=True)) ** 2).values skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV)) return pd.DataFrame( {"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": CIV}).to_xarray().to_array() @@ -369,7 +398,7 @@ class SkillScores: mu = data.groupby("index.month").mean() for month in mu.month: - monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values + monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values.flatten() return monthly_mean