diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py
index 382b8dcf6a6c43f0f04301fa3831cc202a5dbb60..d4c25bb8891b88e114ea9244cf308bd810910daf 100644
--- a/mlair/helpers/statistics.py
+++ b/mlair/helpers/statistics.py
@@ -317,12 +317,15 @@ class SkillScores:
     """
     models_default = ["cnn", "persi", "ols"]

-    def __init__(self, external_data: Union[Data, None], models=None, observation_name="obs", ahead_dim="ahead"):
+    def __init__(self, external_data: Union[Data, None], models=None, observation_name="obs", ahead_dim="ahead",
+                 type_dim="type", index_dim="index"):
         """Set internal data."""
         self.external_data = external_data
         self.models = self.set_model_names(models)
         self.observation_name = observation_name
         self.ahead_dim = ahead_dim
+        self.type_dim = type_dim
+        self.index_dim = index_dim

     def set_model_names(self, models: List[str]) -> List[str]:
         """Either use given models or use defaults."""
@@ -356,16 +359,12 @@ class SkillScores:
         count = pd.DataFrame(index=combination_strings)
         for iahead in ahead_names:
             data = self.external_data.sel({self.ahead_dim: iahead})
-            skill_score[iahead] = [self.general_skill_score(data,
+            skill_score[iahead] = [self.general_skill_score(data, dim=self.index_dim,
                                                             forecast_name=first,
                                                             reference_name=second,
                                                             observation_name=self.observation_name)
                                    for (first, second) in combinations]
-            count[iahead] = [self.get_count(data,
-                                            forecast_name=first,
-                                            reference_name=second,
-                                            observation_name=self.observation_name)
-                             for (first, second) in combinations]
+            count[iahead] = [self.get_count(data, dim=self.index_dim) for _ in combinations]
         return skill_score, count

     def climatological_skill_scores(self, internal_data: Data, forecast_name: str) -> xr.DataArray:
@@ -399,8 +398,8 @@
                 skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score(
                     data, mu_type=2, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten())

-            if self.external_data is not None and self.observation_name in self.external_data.coords["type"]:
-                external_data = self.external_data.sel({self.ahead_dim: iahead, "type": [self.observation_name]})
+            if self.external_data is not None and self.observation_name in self.external_data.coords[self.type_dim]:
+                external_data = self.external_data.sel({self.ahead_dim: iahead, self.type_dim: [self.observation_name]})
                 skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score(
                     data, mu_type=3, forecast_name=forecast_name, observation_name=self.observation_name,
                     external_data=external_data).values.flatten())
@@ -419,7 +418,7 @@

     def general_skill_score(self, data: Data, forecast_name: str, reference_name: str, observation_name: str = None,
                             dim: str = "index") -> np.ndarray:
-        r"""
+        """
         Calculate general skill score based on mean squared error.

         :param data: internal data containing data for observation, forecast and reference
@@ -432,27 +431,17 @@ class SkillScores:
         if observation_name is None:
             observation_name = self.observation_name
         data = data.dropna(dim)
-        observation = data.sel(type=observation_name)
-        forecast = data.sel(type=forecast_name)
-        reference = data.sel(type=reference_name)
+        observation = data.sel({self.type_dim: observation_name})
+        forecast = data.sel({self.type_dim: forecast_name})
+        reference = data.sel({self.type_dim: reference_name})
         mse = mean_squared_error
         skill_score = 1 - mse(observation, forecast, dim=dim) / mse(observation, reference, dim=dim)
         return skill_score.values

-    def get_count(self, data: Data, forecast_name: str, reference_name: str,
-                  observation_name: str = None) -> np.ndarray:
-        r"""
-        Calculate general skill score based on mean squared error.
-
-        :param data: internal data containing data for observation, forecast and reference
-        :param observation_name: name of observation
-        :param forecast_name: name of forecast
-        :param reference_name: name of reference
-
-        :return: skill score of forecast
-        """
-        data = data.dropna("index")
-        return data.count("index").max().values
+    def get_count(self, data: Data, dim: str = "index") -> np.ndarray:
+        """Count non-NaN values along dim and return the maximum count."""
+        data = data.dropna(dim)
+        return data.count(dim).max().values

     def skill_score_pre_calculations(self, data: Data, observation_name: str, forecast_name: str) -> Tuple[np.ndarray,
                                                                                                            np.ndarray,
@@ -472,18 +461,18 @@

         :returns: Terms AI, BI, and CI, internal data without nans and mean, variance, correlation and its p-value
         """
-        data = data.sel(type=[observation_name, forecast_name]).drop(self.ahead_dim)
-        data = data.dropna("index")
+        data = data.sel({self.type_dim: [observation_name, forecast_name]}).drop(self.ahead_dim)
+        data = data.dropna(self.index_dim)

-        mean = data.mean("index")
-        sigma = np.sqrt(data.var("index"))
-        r, p = stats.pearsonr(*[data.sel(type=n).values.flatten() for n in [forecast_name, observation_name]])
+        mean = data.mean(self.index_dim)
+        sigma = np.sqrt(data.var(self.index_dim))
+        r, p = stats.pearsonr(*[data.sel({self.type_dim: n}).values.flatten() for n in [forecast_name, observation_name]])
         AI = np.array(r ** 2)
-        BI = ((r - (sigma.sel(type=forecast_name, drop=True) / sigma.sel(type=observation_name,
-                                                                         drop=True))) ** 2).values
-        CI = (((mean.sel(type=forecast_name, drop=True) - mean.sel(type=observation_name, drop=True)) / sigma.sel(
-            type=observation_name, drop=True)) ** 2).values
+        BI = ((r - (sigma.sel({self.type_dim: forecast_name}, drop=True) / sigma.sel({self.type_dim: observation_name},
+                                                                                     drop=True))) ** 2).values
+        CI = (((mean.sel({self.type_dim: forecast_name}, drop=True) - mean.sel({self.type_dim: observation_name}, drop=True)) / sigma.sel(
+            {self.type_dim: observation_name}, drop=True)) ** 2).values

         suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p}
         return AI, BI, CI, data, suffix

@@ -498,12 +487,12 @@
         """Calculate CASE II."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name)
         monthly_mean = self.create_monthly_mean_from_daily_data(data)
-        data = xr.concat([data, monthly_mean], dim="type")
+        data = xr.concat([data, monthly_mean], dim=self.type_dim)
         sigma = suffix["sigma"]
         sigma_monthly = np.sqrt(monthly_mean.var())
-        r, p = stats.pearsonr(*[data.sel(type=n).values.flatten() for n in [observation_name, observation_name + "X"]])
+        r, p = stats.pearsonr(*[data.sel({self.type_dim: n}).values.flatten() for n in [observation_name, observation_name + "X"]])
         AII = np.array(r ** 2)
-        BII = ((r - sigma_monthly / sigma.sel(type=observation_name, drop=True)) ** 2).values
+        BII = ((r - sigma_monthly / sigma.sel({self.type_dim: observation_name}, drop=True)) ** 2).values
         skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII))
         return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()

@@ -511,31 +500,30 @@
         """Calculate CASE III."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name)
         mean, sigma = suffix["mean"], suffix["sigma"]
-        AIII = (((external_data.mean().values - mean.sel(type=observation_name, drop=True)) / sigma.sel(
-            type=observation_name, drop=True)) ** 2).values
+        AIII = (((external_data.mean().values - mean.sel({self.type_dim: observation_name}, drop=True)) / sigma.sel(
+            {self.type_dim: observation_name}, drop=True)) ** 2).values
         skill_score = np.array((AI - BI - CI + AIII) / (1 + AIII))
         return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()

     def skill_score_mu_case_4(self, internal_data, observation_name, forecast_name, external_data=None):
         """Calculate CASE IV."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name)
-        monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, index=data.index)
-        data = xr.concat([data, monthly_mean_external], dim="type").dropna(dim="index")
+        monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, index=data[self.index_dim])
+        data = xr.concat([data, monthly_mean_external], dim=self.type_dim).dropna(dim=self.index_dim)
         mean, sigma = suffix["mean"], suffix["sigma"]
         mean_external = monthly_mean_external.mean()
         sigma_external = np.sqrt(monthly_mean_external.var())
         r_mu, p_mu = stats.pearsonr(
-            *[data.sel(type=n).values.flatten() for n in [observation_name, observation_name + "X"]])
+            *[data.sel({self.type_dim: n}).values.flatten() for n in [observation_name, observation_name + "X"]])
         AIV = np.array(r_mu ** 2)
-        BIV = ((r_mu - sigma_external / sigma.sel(type=observation_name, drop=True)) ** 2).values
-        CIV = (((mean_external - mean.sel(type=observation_name, drop=True)) / sigma.sel(type=observation_name,
+        BIV = ((r_mu - sigma_external / sigma.sel({self.type_dim: observation_name}, drop=True)) ** 2).values
+        CIV = (((mean_external - mean.sel({self.type_dim: observation_name}, drop=True)) / sigma.sel({self.type_dim: observation_name},
                                                                                                      drop=True)) ** 2).values
         skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV))
         return pd.DataFrame(
             {"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": CIV}).to_xarray().to_array()

-    @staticmethod
-    def create_monthly_mean_from_daily_data(data, columns=None, index=None):
+    def create_monthly_mean_from_daily_data(self, data, columns=None, index=None):
         """
         Calculate average for each month and save as daily values with flag 'X'.
@@ -546,16 +534,16 @@ class SkillScores:

         :return: data containing monthly means in daily resolution
         """
         if columns is None:
-            columns = data.type.values
+            columns = data.coords[self.type_dim].values
         if index is None:
-            index = data.index
+            index = data.coords[self.index_dim].values
         coordinates = [index, [v + "X" for v in list(columns)]]
         empty_data = np.full((len(index), len(columns)), np.nan)
-        monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=["index", "type"])
-        mu = data.groupby("index.month").mean()
+        monthly_mean = xr.DataArray(empty_data, coords=coordinates, dims=[self.index_dim, self.type_dim])
+        mu = data.groupby(f"{self.index_dim}.month").mean()

         for month in mu.month:
-            monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values.flatten()
+            monthly_mean[monthly_mean[self.index_dim].dt.month == month, :] = mu[mu.month == month].values.flatten()

         return monthly_mean
diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py
index 3aef279271a0d3be2c1189abdd2c29965b543405..69c8d00b6cafb1d8471f1e2a45c946145dfeff59 100644
--- a/mlair/plotting/postprocessing_plotting.py
+++ b/mlair/plotting/postprocessing_plotting.py
@@ -731,7 +731,7 @@ class PlotFeatureImportanceSkillScore(AbstractPlotClass):  # pragma: no cover
         if "branch" in self._data.columns:
             plot_name = self.plot_name
             for branch in self._data["branch"].unique():
-                self._set_title(model_name, branch)
+                self._set_title(model_name, branch, len(self._data["branch"].unique()))
                 self.plot_name = f"{plot_name}_{branch}"
                 try:
                     self._plot(branch=branch)
@@ -763,7 +763,7 @@ class PlotFeatureImportanceSkillScore(AbstractPlotClass):  # pragma: no cover
     def _set_bootstrap_type(boot_type):
         return {"singleinput": "single input"}.get(boot_type, boot_type)

-    def _set_title(self, model_name, branch=None):
+    def _set_title(self, model_name, branch=None, n_branches=None):
         title_d = {"single input": "Single Inputs", "branch": "Input Branches", "variable": "Variables",
                    "group_of_variables_sector": "grouped variables by sector",
                    "group_of_variables_var_in_sectors": "grouped variables across sectors"}
@@ -771,7 +771,11 @@ class PlotFeatureImportanceSkillScore(AbstractPlotClass):  # pragma: no cover

         additional = []
         if branch is not None:
-            branch_name = self._branches_names[branch] if self._branches_names is not None else branch
+            try:
+                assert n_branches == len(self._branches_names)
+                branch_name = self._branches_names[int(branch)]
+            except (IndexError, TypeError, ValueError, AssertionError):
+                branch_name = branch
             additional.append(branch_name)
         if self._number_of_bootstraps > 1:
             additional.append(f"n={self._number_of_bootstraps}")
diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py
index a351dd92e70166dc5a4917a5bf7c3d5ef2b9686f..91194da9dc8da8bdd6e71ef0c3e13844984b0331 100644
--- a/mlair/run_modules/post_processing.py
+++ b/mlair/run_modules/post_processing.py
@@ -386,12 +386,15 @@ class PostProcessing(RunEnvironment):
         number_of_bootstraps = self.data_store.get("n_boots", "feature_importance")
         forecast_file = f"forecasts_norm_%s_test.nc"
         reference_name = "orig"
+        branch_names = self.data_store.get_default("branch_names", None)

         bootstraps = Bootstraps(self.test_data[0], number_of_bootstraps, bootstrap_type=bootstrap_type,
                                 bootstrap_method=bootstrap_method)
         number_of_bootstraps = bootstraps.number_of_bootstraps
         bootstrap_iter = bootstraps.bootstraps()

-        skill_scores = statistics.SkillScores(None, ahead_dim=self.ahead_dim)
+        branch_length = self.get_distinct_branches_from_bootstrap_iter(bootstrap_iter)
+        skill_scores = statistics.SkillScores(None, ahead_dim=self.ahead_dim, type_dim=self.model_type_dim,
+                                              index_dim=self.index_dim, observation_name=self.observation_indicator)
         score = {}
         for station in self.test_data:  # get station labels
@@ -418,10 +421,11 @@
                    boot_scores.append(
                        skill_scores.general_skill_score(data, forecast_name=boot_var,
                                                         reference_name=reference_name, dim=self.index_dim))
+                boot_var_renamed = self.rename_boot_var_with_branch(boot_var, bootstrap_type, branch_names, expected_len=branch_length)
                 tmp = xr.DataArray(np.expand_dims(np.array(boot_scores), axis=-1),
                                    coords={self.ahead_dim: range(1, self.window_lead_time + 1),
                                            self.uncertainty_estimate_boot_dim: range(number_of_bootstraps),
-                                           self.boot_var_dim: [boot_var]},
+                                           self.boot_var_dim: [boot_var_renamed]},
                                    dims=[self.ahead_dim, self.uncertainty_estimate_boot_dim, self.boot_var_dim])
                 skill.append(tmp)

@@ -429,6 +433,31 @@
             score[str(station)] = xr.concat(skill, dim=self.boot_var_dim)
         return score

+    @staticmethod
+    def get_distinct_branches_from_bootstrap_iter(bootstrap_iter):
+        if isinstance(bootstrap_iter[0], tuple):
+            return len(set(map(lambda x: x[0], bootstrap_iter)))
+        else:
+            return len(bootstrap_iter)
+
+    def rename_boot_var_with_branch(self, boot_var, bootstrap_type, branch_names=None, expected_len=0):
+        if branch_names is None:
+            return boot_var
+        if bootstrap_type == "branch":
+            try:
+                assert len(branch_names) > int(boot_var)
+                assert len(branch_names) == expected_len
+                return branch_names[int(boot_var)]
+            except (AssertionError, TypeError):
+                return boot_var
+        elif bootstrap_type == "singleinput":
+            if "_" in boot_var:
+                branch, other = boot_var.split("_", 1)
+                branch = self.rename_boot_var_with_branch(branch, "branch", branch_names=branch_names, expected_len=expected_len)
+                boot_var = "_".join([branch, other])
+            return boot_var
+        return boot_var
+
     def get_orig_prediction(self, path, file_name, prediction_name=None, reference_name=None):
         if prediction_name is None:
             prediction_name = self.forecast_indicator
@@ -506,6 +535,7 @@

         try:
             if (self.feature_importance_skill_scores is not None) and ("PlotFeatureImportanceSkillScore" in plot_list):
+                branch_names = self.data_store.get_default("branch_names", None)
                 for boot_type, boot_data in self.feature_importance_skill_scores.items():
                     for boot_method, boot_skill_score in boot_data.items():
                         try:
@@ -513,7 +543,7 @@
                                 boot_skill_score, plot_folder=self.plot_path, model_name=self.model_display_name,
                                 sampling=self._sampling, ahead_dim=self.ahead_dim,
                                 separate_vars=to_list(self.target_var), bootstrap_type=boot_type,
-                                bootstrap_method=boot_method)
+                                bootstrap_method=boot_method, branch_names=branch_names)
                         except Exception as e:
                             logging.error(f"Could not create plot PlotFeatureImportanceSkillScore ({boot_type}, "
                                           f"{boot_method}) due to the following error:\n{sys.exc_info()[0]}\n"
@@ -1035,7 +1065,8 @@
                                          [model_type, self.observation_indicator]), dim=self.index_dim)

             # skill score
-            skill_score = statistics.SkillScores(combined, models=model_list, ahead_dim=self.ahead_dim)
+            skill_score = statistics.SkillScores(combined, models=model_list, ahead_dim=self.ahead_dim,
+                                                 type_dim=self.model_type_dim, index_dim=self.index_dim)
             if external_data is not None:
                 skill_score_competitive[station], skill_score_competitive_count[station] = skill_score.skill_scores()
@@ -1062,7 +1093,6 @@
                                                      fill_value=0)
         return avg_skill_score

-    @staticmethod
     def calculate_average_errors(errors):
         avg_error = {}
@@ -1079,6 +1109,7 @@
         report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report")
         path_config.check_path_and_create(report_path)
         res = []
+        max_cols = 0
         for boot_type, d0 in results.items():
             for boot_method, d1 in d0.items():
                 for station_name, vals in d1.items():
@@ -1087,8 +1118,9 @@
                         res.append([boot_type, boot_method, station_name, boot_var, ahead,
                                     *vals.sel({self.boot_var_dim: boot_var,
                                                self.ahead_dim: ahead}).values.round(5).tolist()])
+                        max_cols = max(max_cols, len(res[-1]))
         col_names = [self.model_type_dim, "method", "station", self.boot_var_dim, self.ahead_dim,
                      *list(range(max_cols - 5))]
-                     *list(range(len(res[0]) - 5))]
+                     *list(range(max_cols - 5))]
         df = pd.DataFrame(res, columns=col_names)
         file_name = "feature_importance_skill_score_report_raw.csv"
         df.to_csv(os.path.join(report_path, file_name), sep=";")
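
A minimal usage sketch of the extended SkillScores constructor introduced in this patch. The example array, its coordinate values, and the model names are assumptions chosen for illustration; only the keyword arguments (observation_name, ahead_dim, type_dim, index_dim) and the skill_scores() call are taken from the change itself.

    import numpy as np
    import pandas as pd
    import xarray as xr

    from mlair.helpers.statistics import SkillScores

    # Hypothetical forecast cube with the three dimensions SkillScores works on;
    # the labels "index", "type" and "ahead" mirror the defaults that can now be
    # overridden via index_dim, type_dim and ahead_dim.
    index = pd.date_range("2020-01-01", periods=100)
    data = xr.DataArray(np.random.rand(100, 3, 2),
                        coords={"index": index, "type": ["obs", "nn", "ols"], "ahead": [1, 2]},
                        dims=["index", "type", "ahead"])

    # Dimension names are passed explicitly instead of being hard-coded to "type"/"index".
    skill_scores = SkillScores(data, models=["nn", "ols"], observation_name="obs",
                               ahead_dim="ahead", type_dim="type", index_dim="index")
    competitive, counts = skill_scores.skill_scores()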