diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index 19a4893d49c7702cd092858c5c885453e974cbc1..9cbb2d71b86261d5a3a89cb8c1c533aa3538da42 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -260,12 +260,15 @@ class SkillScores: """ models_default = ["cnn", "persi", "ols"] - def __init__(self, external_data: Union[Data, None], models=None, observation_name="obs", ahead_dim="ahead"): + def __init__(self, external_data: Union[Data, None], models=None, observation_name="obs", ahead_dim="ahead", + type_dim="type", index_dim="index"): """Set internal data.""" self.external_data = external_data self.models = self.set_model_names(models) self.observation_name = observation_name self.ahead_dim = ahead_dim + self.type_dim = type_dim + self.index_dim = index_dim def set_model_names(self, models: List[str]) -> List[str]: """Either use given models or use defaults.""" @@ -299,16 +302,12 @@ class SkillScores: count = pd.DataFrame(index=combination_strings) for iahead in ahead_names: data = self.external_data.sel({self.ahead_dim: iahead}) - skill_score[iahead] = [self.general_skill_score(data, + skill_score[iahead] = [self.general_skill_score(data, dim=self.index_dim, forecast_name=first, reference_name=second, observation_name=self.observation_name) for (first, second) in combinations] - count[iahead] = [self.get_count(data, - forecast_name=first, - reference_name=second, - observation_name=self.observation_name) - for (first, second) in combinations] + count[iahead] = [self.get_count(data, dim=self.index_dim) for _ in combinations] return skill_score, count def climatological_skill_scores(self, internal_data: Data, forecast_name: str) -> xr.DataArray: @@ -342,8 +341,8 @@ class SkillScores: skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score( data, mu_type=2, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten()) - if self.external_data is not None and self.observation_name in self.external_data.coords["type"]: - external_data = self.external_data.sel({self.ahead_dim: iahead, "type": [self.observation_name]}) + if self.external_data is not None and self.observation_name in self.external_data.coords[self.type_dim]: + external_data = self.external_data.sel({self.ahead_dim: iahead, self.type_dim: [self.observation_name]}) skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score( data, mu_type=3, forecast_name=forecast_name, observation_name=self.observation_name, external_data=external_data).values.flatten()) @@ -362,7 +361,7 @@ class SkillScores: def general_skill_score(self, data: Data, forecast_name: str, reference_name: str, observation_name: str = None, dim: str = "index") -> np.ndarray: - r""" + """ Calculate general skill score based on mean squared error. :param data: internal data containing data for observation, forecast and reference @@ -375,27 +374,17 @@ class SkillScores: if observation_name is None: observation_name = self.observation_name data = data.dropna(dim) - observation = data.sel(type=observation_name) - forecast = data.sel(type=forecast_name) - reference = data.sel(type=reference_name) + observation = data.sel({self.type_dim: observation_name}) + forecast = data.sel({self.type_dim: forecast_name}) + reference = data.sel({self.type_dim: reference_name}) mse = mean_squared_error skill_score = 1 - mse(observation, forecast, dim=dim) / mse(observation, reference, dim=dim) return skill_score.values - def get_count(self, data: Data, forecast_name: str, reference_name: str, - observation_name: str = None) -> np.ndarray: - r""" - Calculate general skill score based on mean squared error. - - :param data: internal data containing data for observation, forecast and reference - :param observation_name: name of observation - :param forecast_name: name of forecast - :param reference_name: name of reference - - :return: skill score of forecast - """ - data = data.dropna("index") - return data.count("index").max().values + def get_count(self, data: Data, dim: str = "index") -> np.ndarray: + """Count data and return number""" + data = data.dropna(dim) + return data.count(dim).max().values def skill_score_pre_calculations(self, data: Data, observation_name: str, forecast_name: str) -> Tuple[np.ndarray, np.ndarray, @@ -415,18 +404,18 @@ class SkillScores: :returns: Terms AI, BI, and CI, internal data without nans and mean, variance, correlation and its p-value """ - data = data.sel(type=[observation_name, forecast_name]).drop(self.ahead_dim) - data = data.dropna("index") + data = data.sel({self.type_dim: [observation_name, forecast_name]}).drop(self.ahead_dim) + data = data.dropna(self.index_dim) - mean = data.mean("index") - sigma = np.sqrt(data.var("index")) - r, p = stats.pearsonr(*[data.sel(type=n).values.flatten() for n in [forecast_name, observation_name]]) + mean = data.mean(self.index_dim) + sigma = np.sqrt(data.var(self.index_dim)) + r, p = stats.pearsonr(*[data.sel({self.type_dim: n}).values.flatten() for n in [forecast_name, observation_name]]) AI = np.array(r ** 2) - BI = ((r - (sigma.sel(type=forecast_name, drop=True) / sigma.sel(type=observation_name, - drop=True))) ** 2).values - CI = (((mean.sel(type=forecast_name, drop=True) - mean.sel(type=observation_name, drop=True)) / sigma.sel( - type=observation_name, drop=True)) ** 2).values + BI = ((r - (sigma.sel({self.type_dim: forecast_name}, drop=True) / sigma.sel({self.type_dim: observation_name}, + drop=True))) ** 2).values + CI = (((mean.sel({self.type_dim: forecast_name}, drop=True) - mean.sel({self.type_dim: observation_name}, drop=True)) / sigma.sel( + {self.type_dim: observation_name}, drop=True)) ** 2).values suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p} return AI, BI, CI, data, suffix @@ -441,12 +430,12 @@ class SkillScores: """Calculate CASE II.""" AI, BI, CI, data, suffix = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name) monthly_mean = self.create_monthly_mean_from_daily_data(data) - data = xr.concat([data, monthly_mean], dim="type") + data = xr.concat([data, monthly_mean], dim=self.type_dim) sigma = suffix["sigma"] sigma_monthly = np.sqrt(monthly_mean.var()) - r, p = stats.pearsonr(*[data.sel(type=n).values.flatten() for n in [observation_name, observation_name + "X"]]) + r, p = stats.pearsonr(*[data.sel({self.type_dim: n}).values.flatten() for n in [observation_name, observation_name + "X"]]) AII = np.array(r ** 2) - BII = ((r - sigma_monthly / sigma.sel(type=observation_name, drop=True)) ** 2).values + BII = ((r - sigma_monthly / sigma.sel({self.type_dim: observation_name}, drop=True)) ** 2).values skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII)) return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array() @@ -454,31 +443,30 @@ class SkillScores: """Calculate CASE III.""" AI, BI, CI, data, suffix = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name) mean, sigma = suffix["mean"], suffix["sigma"] - AIII = (((external_data.mean().values - mean.sel(type=observation_name, drop=True)) / sigma.sel( - type=observation_name, drop=True)) ** 2).values + AIII = (((external_data.mean().values - mean.sel({self.type_dim: observation_name}, drop=True)) / sigma.sel( + {self.type_dim: observation_name}, drop=True)) ** 2).values skill_score = np.array((AI - BI - CI + AIII) / (1 + AIII)) return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array() def skill_score_mu_case_4(self, internal_data, observation_name, forecast_name, external_data=None): """Calculate CASE IV.""" AI, BI, CI, data, suffix = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name) - monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, index=data.index) - data = xr.concat([data, monthly_mean_external], dim="type").dropna(dim="index") + monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, index=data[self.index_dim]) + data = xr.concat([data, monthly_mean_external], dim=self.type_dim).dropna(dim=self.index_dim) mean, sigma = suffix["mean"], suffix["sigma"] mean_external = monthly_mean_external.mean() sigma_external = np.sqrt(monthly_mean_external.var()) r_mu, p_mu = stats.pearsonr( - *[data.sel(type=n).values.flatten() for n in [observation_name, observation_name + "X"]]) + *[data.sel({self.type_dim: n}).values.flatten() for n in [observation_name, observation_name + "X"]]) AIV = np.array(r_mu ** 2) - BIV = ((r_mu - sigma_external / sigma.sel(type=observation_name, drop=True)) ** 2).values - CIV = (((mean_external - mean.sel(type=observation_name, drop=True)) / sigma.sel(type=observation_name, + BIV = ((r_mu - sigma_external / sigma.sel({self.type_dim: observation_name}, drop=True)) ** 2).values + CIV = (((mean_external - mean.sel({self.type_dim: observation_name}, drop=True)) / sigma.sel({self.type_dim: observation_name}, drop=True)) ** 2).values skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV)) return pd.DataFrame( {"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": CIV}).to_xarray().to_array() - @staticmethod - def create_monthly_mean_from_daily_data(data, columns=None, index=None): + def create_monthly_mean_from_daily_data(self, data, columns=None, index=None): """ Calculate average for each month and save as daily values with flag 'X'. @@ -489,16 +477,16 @@ class SkillScores: :return: data containing monthly means in daily resolution """ if columns is None: - columns = data.type.values + columns = data.coords[self.type_dim].values if index is None: - index = data.index + index = data.coords[self.index_dim].values coordinates = [index, [v + "X" for v in list(columns)]] empty_data = np.full((len(index), len(columns)), np.nan) - monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=["index", "type"]) - mu = data.groupby("index.month").mean() + monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=[self.index_dim, self.type_dim]) + mu = data.groupby(f"{self.index_dim}.month").mean() for month in mu.month: - monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values.flatten() + monthly_mean[monthly_mean[self.index_dim].dt.month == month, :] = mu[mu.month == month].values.flatten() return monthly_mean diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index d73f991ec331a8050e91c2ee449f8b28016d48a4..9cd9fce98ff35b62151f89e1debbbb23c0bcc172 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -365,7 +365,8 @@ class PostProcessing(RunEnvironment): number_of_bootstraps = bootstraps.number_of_bootstraps bootstrap_iter = bootstraps.bootstraps() branch_length = self.get_distinct_branches_from_bootstrap_iter(bootstrap_iter) - skill_scores = statistics.SkillScores(None, ahead_dim=self.ahead_dim) + skill_scores = statistics.SkillScores(None, ahead_dim=self.ahead_dim, type_dim=self.model_type_dim, + index_dim=self.index_dim, observation_name=self.observation_indicator) score = {} for station in self.test_data: # get station labels @@ -969,7 +970,8 @@ class PostProcessing(RunEnvironment): [model_type, self.observation_indicator]), dim=self.index_dim) # skill score - skill_score = statistics.SkillScores(combined, models=model_list, ahead_dim=self.ahead_dim) + skill_score = statistics.SkillScores(combined, models=model_list, ahead_dim=self.ahead_dim, + type_dim=self.model_type_dim, index_dim=self.index_dim) if external_data is not None: skill_score_competitive[station], skill_score_competitive_count[station] = skill_score.skill_scores() @@ -996,7 +998,6 @@ class PostProcessing(RunEnvironment): fill_value=0) return avg_skill_score - @staticmethod def calculate_average_errors(errors): avg_error = {}