diff --git a/CHANGELOG.md b/CHANGELOG.md
index bf0c2b6b3ab672522630b28a1865e020b64ac86b..4f59375d8ee3c245e7d8008e7e8c6d6ff13b3d96 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,38 @@
 # Changelog
 All notable changes to this project will be documented in this file.
 
+## v1.2.0 - 2020-12-18 - parallel preprocessing and improved data handlers
+
+### general:
+
+ * new plots
+ * parallelism for faster preprocessing
+ * improved data handler with mixed sampling types
+ * enhanced test coverage
+
+### new features:
+
+* station map plot now highlights subsets on the map and displays the number of stations for each subset (#227, #231)
+* two new data availability plots `PlotAvailabilityHistogram` (#191, #192, #223)
+* introduced parallel code in preprocessing if the system supports parallelism (#164, #224, #225)
+* data handler `DataHandlerMixedSampling` (and inheritances) supports an offset parameter to end inputs at a different time than 00 hours (#220)
+* args for data handler `DataHandlerMixedSampling` (and inheritances) that differ for input and target can now be parsed as a tuple (#229)
+
+### technical:
+
+* added templates for release and bug issues (#189)
+* improved test coverage (#236, #238, #239, #240, #241, #242, #243, #244, #245)
+* station map plot now includes the number of stations for each subset (#231)
+* postprocessing plots are encapsulated in try/except statements (#107)
+* updated git settings (#213)
+* bug fix for data handler (#235)
+* reordering and bug fix for preprocessing reporting (#207, #232)
+* bug fix for outdated system path style (#226)
+* new plots are included in the default plot list (#211)
+* `helpers/join` connection to ToarDB (e.g. used by DefaultDataHandler) now reports which variable could not be loaded (#222)
+* plot `PlotBootstrapSkillScore` can now additionally highlight specific variables, but this is not yet included in postprocessing (#201)
+* data handler `DataHandlerMixedSampling` now uses reduced data loading (#221)
+
 ## v1.1.0 - 2020-11-18 - hourly resolution support and new data handlers
 
 ### general:
diff --git a/README.md b/README.md
index 2e7b0cff48ba92143263c65c7a3fa82c139b86c8..c48b7cdb44b6f98a6a1f12a81c0a4717cc1e0d41 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ HPC systems, see [here](#special-instructions-for-installation-on-jülich-hpc-sy
 * Installation of **MLAir**:
     * Either clone MLAir from the [gitlab repository](https://gitlab.version.fz-juelich.de/toar/mlair.git)
       and use it without installation (beside the requirements)
-    * or download the distribution file ([current version](https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.1.0-py3-none-any.whl))
+    * or download the distribution file ([current version](https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.2.0-py3-none-any.whl))
      and install it via `pip install <dist_file>.whl`. In this case, you can simply import MLAir in any python script
      inside your virtual environment using `import mlair`.
 * (tf) Currently, TensorFlow-1.13 is mentioned in the requirements. We already tested the TensorFlow-1.15 version and couldn't
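As a quick check that the installed wheel works, the default workflow can be started from any script; a minimal sketch (all experiment options left at their defaults, as shown in the project README):

```python
import mlair

# run the default MLAir workflow end to end with default settings
mlair.run()
```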
diff --git a/dist/mlair-1.2.0-py3-none-any.whl b/dist/mlair-1.2.0-py3-none-any.whl
new file mode 100644
index 0000000000000000000000000000000000000000..7b4c3eff904e45a20591e80eccd3f3720d3d339a
Binary files /dev/null and b/dist/mlair-1.2.0-py3-none-any.whl differ
diff --git a/docs/_source/get-started.rst b/docs/_source/get-started.rst
index 477b4b89e5d56d1ec7a94301a4f9378dc1dce7dd..ede3cebfb7e1d9f673da3751c0cc2ab4dfba12ea 100644
--- a/docs/_source/get-started.rst
+++ b/docs/_source/get-started.rst
@@ -31,7 +31,7 @@ Installation of MLAir
 * Install all requirements from `requirements.txt <https://gitlab.version.fz-juelich.de/toar/machinelearningtools/-/blob/master/requirements.txt>`_
   preferably in a virtual environment
 * Either clone MLAir from the `gitlab repository <https://gitlab.version.fz-juelich.de/toar/machinelearningtools.git>`_
-* or download the distribution file (`current version <https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.1.0-py3-none-any.whl>`_)
+* or download the distribution file (`current version <https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.2.0-py3-none-any.whl>`_)
   and install it via :py:`pip install <dist_file>.whl`. In this case, you can simply
   import MLAir in any python script inside your virtual environment using :py:`import mlair`.
 * (tf) Currently, TensorFlow-1.13 is mentioned in the requirements. We already tested the TensorFlow-1.15 version and couldn't
diff --git a/mlair/__init__.py b/mlair/__init__.py
index 217038ca4ac8992c3294a04e15d92a2d3b18bf80..e9a157ca5bba11b22e80df0f3f18092fb0f32db6 100644
--- a/mlair/__init__.py
+++ b/mlair/__init__.py
@@ -1,6 +1,6 @@
 __version_info__ = {
     'major': 1,
-    'minor': 1,
+    'minor': 2,
     'micro': 0,
 }
diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py
index 3db6618a5e8ebd575d61bc261144ff47ccaf9b53..546a463650ccca4c6f7e2b63b3afb01db9d90a40 100644
--- a/mlair/helpers/statistics.py
+++ b/mlair/helpers/statistics.py
@@ -8,8 +8,9 @@ __date__ = '2019-10-23'
 import numpy as np
 import xarray as xr
 import pandas as pd
-from typing import Union, Tuple, Dict
+from typing import Union, Tuple, Dict, List
 from matplotlib import pyplot as plt
+import itertools
 
 from mlair.helpers import to_list, remove_items
 
@@ -178,29 +179,56 @@ class SkillScores:
         skill_scores_clim = skill_scores_class.climatological_skill_scores(external_data, window_lead_time=3)
 
     """
+    models_default = ["cnn", "persi", "ols"]
 
-    def __init__(self, internal_data: Data):
+    def __init__(self, internal_data: Data, models=None, observation_name="obs"):
         """Set internal data."""
         self.internal_data = internal_data
+        self.models = self.set_model_names(models)
+        self.observation_name = observation_name
+
+    def set_model_names(self, models: List[str]) -> List[str]:
+        """Either use the given models or fall back to the defaults."""
+        if models is None:
+            models = self.models_default
+        return self._reorder(models)
+
+    @staticmethod
+    def _reorder(model_list: List[str]) -> List[str]:
+        """Move the elements persi and obs to the very end of the given list."""
+        for element in ["persi", "obs"]:
+            if element in model_list:
+                model_list.append(model_list.pop(model_list.index(element)))
+        return model_list
+
+    def get_model_name_combinations(self):
+        """Return all combinations of two models, as tuples and as strings."""
+        combinations = list(itertools.combinations(self.models, 2))
+        combination_strings = [f"{first}-{second}" for (first, second) in combinations]
+        return combinations, combination_strings
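To illustrate the new combination logic (model names below are examples; `_reorder` guarantees that `persi` and `obs` end up last, so they always appear on the reference side of a pair):

```python
import itertools

# e.g. with models already reordered so that the reference "persi" comes last
models = ["nn", "ols", "persi"]
combinations = list(itertools.combinations(models, 2))
# -> [("nn", "ols"), ("nn", "persi"), ("ols", "persi")]
combination_strings = [f"{first}-{second}" for first, second in combinations]
# -> ["nn-ols", "nn-persi", "ols-persi"]
```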
 
     def skill_scores(self, window_lead_time: int) -> pd.DataFrame:
         """
-        Calculate skill scores for all combinations of CNN, persistence and OLS.
+        Calculate skill scores for all combinations of model names.
 
         :param window_lead_time: length of forecast steps
 
         :return: skill score for each comparison and forecast step
         """
         ahead_names = list(range(1, window_lead_time + 1))
-        skill_score = pd.DataFrame(index=['cnn-persi', 'ols-persi', 'cnn-ols'])
+        combinations, combination_strings = self.get_model_name_combinations()
+        skill_score = pd.DataFrame(index=combination_strings)
         for iahead in ahead_names:
             data = self.internal_data.sel(ahead=iahead)
-            skill_score[iahead] = [self.general_skill_score(data, forecast_name="CNN", reference_name="persi"),
-                                   self.general_skill_score(data, forecast_name="OLS", reference_name="persi"),
-                                   self.general_skill_score(data, forecast_name="CNN", reference_name="OLS")]
+            skill_score[iahead] = [self.general_skill_score(data,
+                                                            forecast_name=first,
+                                                            reference_name=second,
+                                                            observation_name=self.observation_name)
+                                   for (first, second) in combinations]
         return skill_score
 
-    def climatological_skill_scores(self, external_data: Data, window_lead_time: int) -> xr.DataArray:
+    def climatological_skill_scores(self, external_data: Data, window_lead_time: int,
+                                    forecast_name: str) -> xr.DataArray:
         """
         Calculate climatological skill scores according to Murphy (1988).
 
@@ -209,6 +237,7 @@ class SkillScores:
 
         :param external_data: external data
         :param window_lead_time: interested time step of forecast horizon to select data
+        :param forecast_name: name of the forecast to use for this calculation (must be available in `data`)
 
         :return: all CASES as well as all terms
         """
@@ -224,30 +253,28 @@ class SkillScores:
             data = self.internal_data.sel(ahead=iahead)
 
             skill_score.loc[["CASE I", "AI", "BI", "CI"], iahead] = np.stack(self._climatological_skill_score(
-                data, mu_type=1, forecast_name="CNN").values.flatten())
+                data, mu_type=1, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten())
 
             skill_score.loc[["CASE II", "AII", "BII"], iahead] = np.stack(self._climatological_skill_score(
-                data, mu_type=2, forecast_name="CNN").values.flatten())
+                data, mu_type=2, forecast_name=forecast_name, observation_name=self.observation_name).values.flatten())
 
             if external_data is not None:
                 skill_score.loc[["CASE III", "AIII"], iahead] = np.stack(self._climatological_skill_score(
-                    data, mu_type=3, forecast_name="CNN",
+                    data, mu_type=3, forecast_name=forecast_name, observation_name=self.observation_name,
                     external_data=external_data).values.flatten())
 
                 skill_score.loc[["CASE IV", "AIV", "BIV", "CIV"], iahead] = np.stack(self._climatological_skill_score(
-                    data, mu_type=4, forecast_name="CNN",
+                    data, mu_type=4, forecast_name=forecast_name, observation_name=self.observation_name,
                     external_data=external_data).values.flatten())
 
         return skill_score
 
-    def _climatological_skill_score(self, data, mu_type=1, observation_name="obs", forecast_name="CNN",
-                                    external_data=None):
+    def _climatological_skill_score(self, data, observation_name, forecast_name, mu_type=1, external_data=None):
         kwargs = {"external_data": external_data} if external_data is not None else {}
         return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs)
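For orientation, `general_skill_score` (changed below) computes the standard MSE-based skill of a forecast against a reference; a sketch of the metric per its docstring (function name and signature are illustrative, not part of MLAir):

```python
import numpy as np

# SS > 0 means the forecast beats the reference; SS = 1 is a perfect forecast
def general_skill_score_sketch(obs, forecast, reference) -> float:
    mse = lambda a, b: float(np.mean((np.asarray(a) - np.asarray(b)) ** 2))
    return 1 - mse(forecast, obs) / mse(reference, obs)
```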
-    @staticmethod
-    def general_skill_score(data: Data, observation_name: str = "obs", forecast_name: str = "CNN",
-                            reference_name: str = "persi") -> np.ndarray:
+    def general_skill_score(self, data: Data, forecast_name: str, reference_name: str,
+                            observation_name: str = None) -> np.ndarray:
         r"""
         Calculate general skill score based on mean squared error.
 
@@ -258,6 +285,8 @@ class SkillScores:
 
         :return: skill score of forecast
         """
+        if observation_name is None:
+            observation_name = self.observation_name
         data = data.dropna("index")
         observation = data.sel(type=observation_name)
         forecast = data.sel(type=forecast_name)
@@ -284,49 +313,51 @@ class SkillScores:
 
         :returns: Terms AI, BI, and CI, internal data without nans and mean, variance, correlation and its p-value
         """
-        data = data.loc[..., [observation_name, forecast_name]].drop("ahead")
+        data = data.sel(type=[observation_name, forecast_name]).drop("ahead")
         data = data.dropna("index")
 
         mean = data.mean("index")
         sigma = np.sqrt(data.var("index"))
-        r, p = stats.pearsonr(data.loc[..., forecast_name], data.loc[..., observation_name])
+        r, p = stats.pearsonr(*[data.sel(type=n).values.flatten() for n in [forecast_name, observation_name]])
 
         AI = np.array(r ** 2)
-        BI = ((r - (sigma.loc[..., forecast_name] / sigma.loc[..., observation_name])) ** 2).values
-        CI = (((mean.loc[..., forecast_name] - mean.loc[..., observation_name]) / sigma.loc[
-            ..., observation_name]) ** 2).values
+        BI = ((r - (sigma.sel(type=forecast_name, drop=True) / sigma.sel(type=observation_name,
+                                                                         drop=True))) ** 2).values
+        CI = (((mean.sel(type=forecast_name, drop=True) - mean.sel(type=observation_name, drop=True)) / sigma.sel(
+            type=observation_name, drop=True)) ** 2).values
 
         suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p}
         return AI, BI, CI, data, suffix
 
-    def skill_score_mu_case_1(self, data, observation_name="obs", forecast_name="CNN"):
+    def skill_score_mu_case_1(self, data, observation_name, forecast_name):
         """Calculate CASE I."""
         AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         skill_score = np.array(AI - BI - CI)
         return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array()
 
-    def skill_score_mu_case_2(self, data, observation_name="obs", forecast_name="CNN"):
+    def skill_score_mu_case_2(self, data, observation_name, forecast_name):
         """Calculate CASE II."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean = self.create_monthly_mean_from_daily_data(data)
         data = xr.concat([data, monthly_mean], dim="type")
         sigma = suffix["sigma"]
         sigma_monthly = np.sqrt(monthly_mean.var())
-        r, p = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
+        r, p = stats.pearsonr(*[data.sel(type=n).values.flatten() for n in [observation_name, observation_name + "X"]])
         AII = np.array(r ** 2)
-        BII = ((r - sigma_monthly / sigma.loc[observation_name]) ** 2).values
+        BII = ((r - sigma_monthly / sigma.sel(type=observation_name, drop=True)) ** 2).values
         skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII))
         return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array()
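For reference, all CASE formulas build on the three terms returned by `skill_score_pre_calculations`; CASE I is the plain Murphy (1988) decomposition (subscript f: forecast, o: observation):

```latex
SS_I = \underbrace{r^2}_{AI}
     - \underbrace{\Big(r - \frac{\sigma_f}{\sigma_o}\Big)^2}_{BI}
     - \underbrace{\Big(\frac{\mu_f - \mu_o}{\sigma_o}\Big)^2}_{CI}
```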
-    def skill_score_mu_case_3(self, data, observation_name="obs", forecast_name="CNN", external_data=None):
+    def skill_score_mu_case_3(self, data, observation_name, forecast_name, external_data=None):
         """Calculate CASE III."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         mean, sigma = suffix["mean"], suffix["sigma"]
-        AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name]) ** 2).values
+        AIII = (((external_data.mean().values - mean.sel(type=observation_name, drop=True)) / sigma.sel(
+            type=observation_name, drop=True)) ** 2).values
         skill_score = np.array((AI - BI - CI + AIII) / (1 + AIII))
         return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array()
 
-    def skill_score_mu_case_4(self, data, observation_name="obs", forecast_name="CNN", external_data=None):
+    def skill_score_mu_case_4(self, data, observation_name, forecast_name, external_data=None):
         """Calculate CASE IV."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name)
         monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values,
@@ -337,12 +368,13 @@ class SkillScores:
         mean_external = monthly_mean_external.mean()
         sigma_external = np.sqrt(monthly_mean_external.var())
 
-        # r_mu, p_mu = stats.spearmanr(data.loc[..., [observation_name, observation_name+'X']])
-        r_mu, p_mu = stats.pearsonr(data.loc[..., observation_name], data.loc[..., observation_name + "X"])
+        r_mu, p_mu = stats.pearsonr(
+            *[data.sel(type=n).values.flatten() for n in [observation_name, observation_name + "X"]])
 
         AIV = np.array(r_mu ** 2)
-        BIV = ((r_mu - sigma_external / sigma.loc[observation_name]) ** 2).values
-        CIV = (((mean_external - mean.loc[observation_name]) / sigma.loc[observation_name]) ** 2).values
+        BIV = ((r_mu - sigma_external / sigma.sel(type=observation_name, drop=True)) ** 2).values
+        CIV = (((mean_external - mean.sel(type=observation_name, drop=True)) / sigma.sel(type=observation_name,
+                                                                                         drop=True)) ** 2).values
         skill_score = np.array((AI - BI - CI - AIV + BIV + CIV) / (1 - AIV + BIV + CIV))
         return pd.DataFrame(
             {"skill_score": [skill_score], "AIV": [AIV], "BIV": [BIV], "CIV": [CIV]}).to_xarray().to_array()
@@ -368,7 +400,7 @@ class SkillScores:
 
         mu = data.groupby("index.month").mean()
 
         for month in mu.month:
-            monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values
+            monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values.flatten()
 
         return monthly_mean
 
diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py
index f775f7419dba7530d8dbfbde9250d38c312496fa..2c7d8fdedb2720b1276b77a5e817723094ecf03d 100644
--- a/mlair/plotting/postprocessing_plotting.py
+++ b/mlair/plotting/postprocessing_plotting.py
@@ -695,13 +695,18 @@ class PlotCompetitiveSkillScore(AbstractPlotClass):
 
     """
 
-    def __init__(self, data: pd.DataFrame, plot_folder=".", model_setup="CNN"):
+    def __init__(self, data: pd.DataFrame, plot_folder=".", model_setup="NN"):
         """Initialise."""
         super().__init__(plot_folder, f"skill_score_competitive_{model_setup}")
+        self._model_setup = model_setup
         self._labels = None
         self._data = self._prepare_data(data)
         self._plot()
         self._save()
+        # also draw a vertical version of the plot
+        self.plot_name += "_vertical"
+        self._plot_vertical()
+        self._save()
 
     def _prepare_data(self, data: pd.DataFrame) -> pd.DataFrame:
         """
@@ -720,21 +725,43 @@ class PlotCompetitiveSkillScore(AbstractPlotClass):
         return data.stack(level=0).reset_index(level=2, drop=True).reset_index(name="data")
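The stack/reset_index chain above converts the wide skill-score frame into the long format seaborn expects; roughly (a sketch, station dimension and values invented):

```python
import pandas as pd

# One row per comparison/lead time, matching the seaborn mapping in _plot below
# (comparison -> x, data -> y, ahead -> hue).
df = pd.DataFrame({"comparison": ["nn-persi", "nn-persi", "ols-persi"],
                   "ahead": [1, 2, 1],
                   "data": [0.31, 0.28, 0.11]})
```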
flierprops={"marker": "."}, - order=["cnn-persi", "ols-persi", "cnn-ols"]) + order=order) ax.axhline(y=0, color="grey", linewidth=.5) - ax.set(ylabel="skill score", xlabel="competing models", title="summary of all stations", ylim=self._ylim()) + ax.set(ylabel="skill score", xlabel="competing models", title="summary of all stations", ylim=self._lim()) + handles, _ = ax.get_legend_handles_labels() + plt.xticks(rotation=20) + ax.legend(handles, self._labels) + plt.tight_layout() + + def _plot_vertical(self): + """Plot skill scores of the comparisons, but vertically aligned.""" + fig, ax = plt.subplots() + order = self._create_pseudo_order() + sns.boxplot(y="comparison", x="data", hue="ahead", data=self._data, whis=1., ax=ax, palette="Blues_d", + showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, + order=order) + # ax.axhline(x=0, color="grey", linewidth=.5) + ax.axvline(x=0, color="grey", linewidth=.5) + ax.set(xlabel="skill score", ylabel="competing models", title="summary of all stations", xlim=self._lim()) handles, _ = ax.get_legend_handles_labels() ax.legend(handles, self._labels) plt.tight_layout() - def _ylim(self) -> Tuple[float, float]: + def _create_pseudo_order(self): + """Provide first predefined elements and append all remaining.""" + first_elements = [f"{self._model_setup}-persi", "ols-persi", f"{self._model_setup}-ols"] + uniq, index = np.unique(first_elements + self._data.comparison.unique().tolist(), return_index=True) + return uniq[index.argsort()] + + def _lim(self) -> Tuple[float, float]: """ - Calculate y-axis limits from data. + Calculate axis limits from data (Can be used to set axis extend). Lower limit is the minimum of 0 and data's minimum (reduced by small subtrahend) and upper limit is data's maximum (increased by a small addend). 
@@ -1173,6 +1200,7 @@ class PlotAvailability(AbstractPlotClass):
 
     def _plot(self, plt_dict):
         colors = self.get_dataset_colors()
+        _used_colors = []
         pos = 0
         height = 0.8  # should be <= 1
         yticklabels = []
@@ -1184,13 +1212,15 @@ class PlotAvailability(AbstractPlotClass):
                 plt_data = d.get(subset)
                 if plt_data is None:
                     continue
+                elif color not in _used_colors:  # this is required for a proper legend creation
+                    _used_colors.append(color)
                 ax.broken_barh(plt_data, (pos, height), color=color, edgecolor="white", linewidth=self.linewidth)
             yticklabels.append(station)
 
         ax.set_ylim([height, number_of_stations + 1])
         ax.set_yticks(np.arange(len(plt_dict.keys())) + 1 + height / 2)
         ax.set_yticklabels(yticklabels)
-        handles = [mpatches.Patch(color=c, label=k) for k, c in colors.items()]
+        handles = [mpatches.Patch(color=c, label=k) for k, c in colors.items() if c in _used_colors]
         lgd = plt.legend(handles=handles, bbox_to_anchor=(0, 1, 1, 0.2), loc="lower center", ncol=len(handles))
         return lgd
 
diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py
index 54d2307718bf083cfbfb8296682c9c545157eb72..e772bbaaba5a7618b95f345dcb7056616c56403e 100644
--- a/mlair/run_modules/experiment_setup.py
+++ b/mlair/run_modules/experiment_setup.py
@@ -226,7 +226,7 @@ class ExperimentSetup(RunEnvironment):
                  number_of_bootstraps=None,
                  create_new_bootstraps=None, data_path: str = None, batch_path: str = None, login_nodes=None,
                  hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None,
-                 data_origin: Dict = None, **kwargs):
+                 data_origin: Dict = None, competitors: list = None, competitor_path: str = None, **kwargs):
 
         # create run framework
         super().__init__()
@@ -345,6 +345,12 @@ class ExperimentSetup(RunEnvironment):
         self._set_param("plot_list", plot_list, default=DEFAULT_PLOT_LIST, scope="general.postprocessing")
         self._set_param("neighbors", ["DEBW030"])  # TODO: just for testing
 
+        # set competitors
+        self._set_param("competitors", helpers.to_list(competitors), default=[])
+        competitor_path_default = os.path.join(self.data_store.get("data_path"), "competitors",
+                                               "_".join(self.data_store.get("target_var")))
+        self._set_param("competitor_path", competitor_path, default=competitor_path_default)
+
         # check variables, statistics and target variable
         self._check_target_var()
         self._compare_variables_and_statistics()
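With the two new parameters, competing forecasts could be wired into an experiment roughly as follows (a sketch, assuming keyword arguments are passed through to `ExperimentSetup`; competitor names and the explicit path are placeholders, and if `competitor_path` is omitted the default `<data_path>/competitors/<target_var>` is used):

```python
from mlair.workflows import DefaultWorkflow

# hypothetical run script: evaluate the trained model against two stored forecasts
workflow = DefaultWorkflow(stations=["DEBW107", "DEBW013"],
                           competitors=["ols_reference", "lstm_v1"],
                           competitor_path="/p/project/myexp/competitors/o3")
workflow.run()
```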
diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py
index cd8ee266a092a6afb5111ad7b241b38cdbfb6fa7..ebcf105f3b4b6c36b95d003275a2e3e52c78e61e 100644
--- a/mlair/run_modules/post_processing.py
+++ b/mlair/run_modules/post_processing.py
@@ -15,7 +15,7 @@ import xarray as xr
 
 from mlair.data_handler import BootStraps, KerasIterator
 from mlair.helpers.datastore import NameNotFoundInDataStore
-from mlair.helpers import TimeTracking, statistics, extract_value
+from mlair.helpers import TimeTracking, statistics, extract_value, remove_items
 from mlair.model_modules.linear_model import OrdinaryLeastSquaredModel
 from mlair.model_modules.model_class import AbstractModelClass
 from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotStationMap, PlotClimatologicalSkillScore, \
@@ -64,11 +64,13 @@ class PostProcessing(RunEnvironment):
         """Initialise and run post-processing."""
         super().__init__()
         self.model: keras.Model = self._load_model()
+        self.model_name = self.data_store.get("model_name", "model").rsplit("/", 1)[1].split(".", 1)[0]
         self.ols_model = None
         self.batch_size: int = self.data_store.get_default("batch_size", "model", 64)
         self.test_data = self.data_store.get("data_collection", "test")
         batch_path = self.data_store.get("batch_path", scope="test")
-        self.test_data_distributed = KerasIterator(self.test_data, self.batch_size, model=self.model, name="test", batch_path=batch_path)
+        self.test_data_distributed = KerasIterator(self.test_data, self.batch_size, model=self.model, name="test",
+                                                   batch_path=batch_path)
         self.train_data = self.data_store.get("data_collection", "train")
         self.val_data = self.data_store.get("data_collection", "val")
         self.train_val_data = self.data_store.get("data_collection", "train_val")
@@ -78,6 +80,9 @@ class PostProcessing(RunEnvironment):
         self.window_lead_time = extract_value(self.data_store.get("output_shape", "model"))
         self.skill_scores = None
         self.bootstrap_skill_scores = None
+        self.competitor_path = self.data_store.get("competitor_path")
+        self.competitors = self.data_store.get_default("competitors", default=[])
+        self.forecast_indicator = "nn"
         self._run()
 
     def _run(self):
@@ -87,7 +92,7 @@ class PostProcessing(RunEnvironment):
         # forecasts
         self.make_prediction()
 
-        # skill scores on test data
+        # calculate error metrics on test data
        self.calculate_test_score()
 
         # bootstraps
@@ -103,6 +108,27 @@ class PostProcessing(RunEnvironment):
         # plotting
         self.plot()
 
+    def load_competitors(self, station_name: str) -> xr.DataArray:
+        """
+        Load all requested and available competitors for a given station. Forecasts must be available in the
+        competitor path like `<competitor_path>/<competitor_name>/forecasts_<station_name>_test.nc`. The naming style
+        is the same for all MLAir forecasts, so forecasts from a different experiment can be copied into the
+        competitor path without any change.
+
+        :param station_name: station indicator to load competitors for
+
+        :return: a single xarray with all competing forecasts
+        """
+        competing_predictions = []
+        for competitor_name in self.competitors:
+            try:
+                prediction = self._create_competitor_forecast(station_name, competitor_name)
+                competing_predictions.append(prediction)
+            except (FileNotFoundError, KeyError):
+                logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.")
+                continue
+        return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None
+
     def bootstrap_postprocessing(self, create_new_bootstraps: bool, _iter: int = 0) -> None:
         """
         Calculate skill scores of bootstrapped data.
@@ -217,13 +243,18 @@ class PostProcessing(RunEnvironment):
                 score[str(station)] = xr.DataArray(skill, dims=["boot_var", "ahead"])
         return score
 
-    @staticmethod
-    def get_orig_prediction(path, file_name, number_of_bootstraps, prediction_name="CNN"):
+    def get_orig_prediction(self, path, file_name, number_of_bootstraps, prediction_name=None):
+        if prediction_name is None:
+            prediction_name = self.forecast_indicator
         file = os.path.join(path, file_name)
         prediction = xr.open_dataarray(file).sel(type=prediction_name).squeeze()
         vals = np.tile(prediction.data, (number_of_bootstraps, 1))
         return vals[~np.isnan(vals).any(axis=1), :]
 
+    def _get_model_name(self):
+        """Return the model name without path information."""
+        return self.data_store.get("model_name", "model").rsplit("/", 1)[1].split(".", 1)[0]
+
     def _load_model(self) -> keras.models:
         """
         Load NN model either from data store or from local path.
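Both the new `model_name` attribute and `_get_model_name` simply strip directory and file extension from the stored model path; on a hypothetical path:

```python
# "experiment_path/model/my_model.h5" -> "my_model"
model_name = "experiment_path/model/my_model.h5".rsplit("/", 1)[1].split(".", 1)[0]
```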
@@ -273,7 +304,8 @@ class PostProcessing(RunEnvironment):
 
         try:
             if (self.bootstrap_skill_scores is not None) and ("PlotBootstrapSkillScore" in plot_list):
-                PlotBootstrapSkillScore(self.bootstrap_skill_scores, plot_folder=self.plot_path, model_setup="CNN")
+                PlotBootstrapSkillScore(self.bootstrap_skill_scores, plot_folder=self.plot_path,
+                                        model_setup=self.forecast_indicator)
         except Exception as e:
             logging.error(f"Could not create plot PlotBootstrapSkillScore due to the following error: {e}")
 
@@ -309,15 +341,17 @@ class PostProcessing(RunEnvironment):
 
         try:
             if "PlotClimatologicalSkillScore" in plot_list:
-                PlotClimatologicalSkillScore(self.skill_scores[1], plot_folder=self.plot_path, model_setup="CNN")
+                PlotClimatologicalSkillScore(self.skill_scores[1], plot_folder=self.plot_path,
+                                             model_setup=self.forecast_indicator)
                 PlotClimatologicalSkillScore(self.skill_scores[1], plot_folder=self.plot_path, score_only=False,
-                                             extra_name_tag="all_terms_", model_setup="CNN")
+                                             extra_name_tag="all_terms_", model_setup=self.forecast_indicator)
         except Exception as e:
             logging.error(f"Could not create plot PlotClimatologicalSkillScore due to the following error: {e}")
 
         try:
             if "PlotCompetitiveSkillScore" in plot_list:
-                PlotCompetitiveSkillScore(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN")
+                PlotCompetitiveSkillScore(self.skill_scores[0], plot_folder=self.plot_path,
+                                          model_setup=self.forecast_indicator)
         except Exception as e:
             logging.error(f"Could not create plot PlotCompetitiveSkillScore due to the following error: {e}")
 
@@ -392,16 +426,17 @@ class PostProcessing(RunEnvironment):
                                                           normalised)
 
         # observation
-        observation = self._create_observation(target_data, observation, mean, std, transformation_method, normalised)
+        observation = self._create_observation(target_data, observation, mean, std, transformation_method,
+                                               normalised)
 
         # merge all predictions
         full_index = self.create_fullindex(observation_data.indexes[time_dimension], self._get_frequency())
+        prediction_dict = {self.forecast_indicator: nn_prediction,
+                           "persi": persistence_prediction,
+                           "obs": observation,
+                           "ols": ols_prediction}
         all_predictions = self.create_forecast_arrays(full_index, list(target_data.indexes['window']),
-                                                      time_dimension,
-                                                      CNN=nn_prediction,
-                                                      persi=persistence_prediction,
-                                                      obs=observation,
-                                                      OLS=ols_prediction)
+                                                      time_dimension, **prediction_dict)
 
         # save all forecasts locally
         path = self.data_store.get("forecast_path")
@@ -414,6 +449,26 @@ class PostProcessing(RunEnvironment):
         getter = {"daily": "1D", "hourly": "1H"}
         return getter.get(self._sampling, None)
 
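The competitor files read by `_create_competitor_forecast` (added below) are expected at a fixed location, mirroring MLAir's own forecast naming; a sketch of the convention (all names are examples, the default `competitor_path` set in `ExperimentSetup` is `<data_path>/competitors/<target_var>`):

```python
import os

competitor_path = "/data/myexp/competitors/o3"
competitor_name = "ols_reference"
station_name = "DEBW107"
file = os.path.join(competitor_path, competitor_name, f"forecasts_{station_name}_test.nc")
# -> /data/myexp/competitors/o3/ols_reference/forecasts_DEBW107_test.nc
```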
+    def _create_competitor_forecast(self, station_name: str, competitor_name: str) -> xr.DataArray:
+        """
+        Load and format the competing forecast of a distinct model indicated by `competitor_name` for a distinct
+        station indicated by `station_name`. The name of the competitor is set on the `type` axis as indicator. This
+        method raises either a `FileNotFoundError` or a `KeyError` if no competitor could be found for the given
+        station: either there is no file in the expected path, or the forecast file contains no forecast for the
+        given `competitor_name`.
+
+        :param station_name: name of the station to load data for
+        :param competitor_name: name of the model
+        :return: the forecast of the given competitor
+        """
+        path = os.path.join(self.competitor_path, competitor_name)
+        file = os.path.join(path, f"forecasts_{station_name}_test.nc")
+        data = xr.open_dataarray(file)
+        forecast = data.sel(type=[self.forecast_indicator])
+        forecast.coords["type"] = [competitor_name]
+        return forecast
+
     @staticmethod
     def _create_observation(data, _, mean: xr.DataArray, std: xr.DataArray, transformation_method: str,
                             normalised: bool) -> xr.DataArray:
@@ -441,7 +496,7 @@ class PostProcessing(RunEnvironment):
 
         Inverse transformation is applied to the forecast to get the output in the original space.
 
-        :param data: transposed history from DataPrep
+        :param input_data: transposed history from DataPrep
         :param ols_prediction: empty array in right shape to fill with data
         :param mean: mean of target value transformation
         :param std: standard deviation of target value transformation
@@ -559,7 +614,8 @@ class PostProcessing(RunEnvironment):
         res = xr.DataArray(np.full((len(index.index), len(ahead_names), len(keys)), np.nan),
                            coords=[index.index, ahead_names, keys], dims=['index', 'ahead', 'type'])
         for k, v in kwargs.items():
-            match_index = np.stack(set(res.index.values) & set(v.indexes[time_dimension].values))
+            intersection = set(res.index.values) & set(v.indexes[time_dimension].values)
+            match_index = np.array(list(intersection))
             res.loc[match_index, :, k] = v.loc[match_index]
         return res
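The last post-processing hunk below feeds these competitors into the skill-score calculation; conceptually (a sketch, dimension sizes and names invented for illustration):

```python
import numpy as np
import xarray as xr
from mlair.helpers import remove_items

# merge a competitor into the own forecasts along `type`, then score every
# pairing except against the observation itself
data = xr.DataArray(np.random.rand(10, 3), dims=["index", "type"],
                    coords={"type": ["nn", "persi", "obs"]})
competitor = xr.DataArray(np.random.rand(10, 1), dims=["index", "type"],
                          coords={"type": ["ols_reference"]})
combined = xr.concat([data, competitor], dim="type")
models = remove_items(list(combined.type.values), "obs")  # ["nn", "persi", "ols_reference"]
```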
@@ -600,9 +656,11 @@ class PostProcessing(RunEnvironment):
         for station in self.test_data:
             file = os.path.join(path, f"forecasts_{str(station)}_test.nc")
             data = xr.open_dataarray(file)
-            skill_score = statistics.SkillScores(data)
-            external_data = self._get_external_data(station)
+            competitor = self.load_competitors(str(station))
+            combined = xr.concat([data, competitor], dim="type") if competitor is not None else data
+            skill_score = statistics.SkillScores(combined, models=remove_items(list(combined.type.values), "obs"))
+            external_data = self._get_external_data(station)  # ToDo: check if external is still right?
             skill_score_competitive[station] = skill_score.skill_scores(self.window_lead_time)
-            skill_score_climatological[station] = skill_score.climatological_skill_scores(external_data,
-                                                                                          self.window_lead_time)
+            skill_score_climatological[station] = skill_score.climatological_skill_scores(
+                external_data, self.window_lead_time, forecast_name=self.forecast_indicator)
 
         return skill_score_competitive, skill_score_climatological
diff --git a/run.py b/run.py
index 15f30a7ee775948fa744832a464562cd40c3e460..9f5d0f083081d097e24b86f5dfc2b3b380e28e9b 100644
--- a/run.py
+++ b/run.py
@@ -3,11 +3,26 @@ __date__ = '2020-06-29'
 import argparse
 
 from mlair.workflows import DefaultWorkflow
+from mlair.helpers import remove_items
+from mlair.configuration.defaults import DEFAULT_PLOT_LIST
 
 
-def main(parser_args):
+def load_stations():
+    import json
+    try:
+        filename = 'supplement/station_list_north_german_plain_rural.json'
+        with open(filename, 'r') as jfile:
+            stations = json.load(jfile)
+    except FileNotFoundError:
+        stations = None
+    return stations
+
 
-    workflow = DefaultWorkflow(**parser_args.__dict__)
+def main(parser_args):
+    plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles")
+    workflow = DefaultWorkflow(stations=load_stations(),
+                               train_model=False, create_new_model=False, network="UBA",
+                               evaluate_bootstraps=False, plot_list=["PlotStationMap"],
+                               **parser_args.__dict__)
     workflow.run()
diff --git a/run_hourly.py b/run_hourly.py
index a21c779bc007c7fbe67c98584687be3954e1d62c..48c7205883eda7e08ee1c14fe3c0a8a9f429e3da 100644
--- a/run_hourly.py
+++ b/run_hourly.py
@@ -9,7 +9,7 @@ from mlair.workflows import DefaultWorkflow
 def load_stations():
     import json
     try:
-        filename = 'supplement/station_list_north_german_plain.json'
+        filename = 'supplement/station_list_north_german_plain_rural.json'
         with open(filename, 'r') as jfile:
             stations = json.load(jfile)
     except FileNotFoundError:
@@ -18,8 +18,11 @@ def load_stations():
 
 
 def main(parser_args):
-
-    workflow = DefaultWorkflow(sampling="hourly", window_history_size=48, **parser_args.__dict__)
+    workflow = DefaultWorkflow(sampling="hourly", window_history_size=8, stations=load_stations(),
+                               train_model=False,
+                               create_new_model=False,
+                               network="UBA",
+                               plot_list=["PlotStationMap"], **parser_args.__dict__)
     workflow.run()
diff --git a/supplement/station_list_north_german_plain_rural.json b/supplement/station_list_north_german_plain_rural.json
new file mode 100644
index 0000000000000000000000000000000000000000..2991bfbf4168a101fe340909b4e6be8bf23dd987
--- /dev/null
+++ b/supplement/station_list_north_german_plain_rural.json
@@ -0,0 +1,40 @@
+[
+  "DENI031",
+  "DEBE062",
+  "DEUB020",
+  "DESH001",
+  "DEUB022",
+  "DEBB053",
+  "DEMV017",
+  "DENI063",
+  "DENI058",
+  "DESH014",
+  "DEUB007",
+  "DEUB005",
+  "DEBB051",
+  "DEUB034",
+  "DEST089",
+  "DEUB028",
+  "DESH017",
+  "DEUB030",
+  "DEMV012",
+  "DENI059",
+  "DENI060",
+  "DESH013",
+  "DEUB006",
+  "DEUB027",
+  "DEUB026",
+  "DEUB038",
+  "DEMV001",
+  "DEUB024",
+  "DEUB037",
+  "DESH008",
+  "DEMV004",
+  "DEUB040",
+  "DEMV024",
+  "DEMV026",
+  "DESH056",
+  "DEHH063",
+  "DEUB001",
+  "DEST069"
+]