diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py index d3f6a56a2ec42e41454f0ced8853cb8d1001e495..a41c636b5ab17d2039f7976fca625e9c8e11ce6e 100644 --- a/src/plotting/postprocessing_plotting.py +++ b/src/plotting/postprocessing_plotting.py @@ -50,8 +50,8 @@ class PlotMonthlySummary(RunEnvironment): def _prepare_data(self, stations: List) -> xr.DataArray: """ - Pre-process data required to plot. For each station, load locally saved predictions, extract the CNN and orig - prediction and group them into monthly bins (no aggregation, only sorting them). + Pre-process data required to plot. For each station, load locally saved predictions, extract the CNN prediction + and the observation and group them into monthly bins (no aggregation, only sorting them). :param stations: all stations to plot :return: The entire data set, flagged with the corresponding month. """ @@ -65,10 +65,10 @@ class PlotMonthlySummary(RunEnvironment): if len(data_cnn.shape) > 1: data_cnn.coords["ahead"].values = [f"{days}d" for days in data_cnn.coords["ahead"].values] - data_orig = data.sel(type="orig", ahead=1).squeeze() - data_orig.coords["ahead"] = "orig" + data_obs = data.sel(type="obs", ahead=1).squeeze() + data_obs.coords["ahead"] = "obs" - data_concat = xr.concat([data_orig, data_cnn], dim="ahead") + data_concat = xr.concat([data_obs, data_cnn], dim="ahead") data_concat = data_concat.drop("type") data_concat.index.values = data_concat.index.values.astype("datetime64[M]").astype(int) % 12 + 1 @@ -189,7 +189,7 @@ class PlotStationMap(RunEnvironment): plt.close('all') -def plot_conditional_quantiles(stations: list, plot_folder: str = ".", rolling_window: int = 3, ref_name: str = 'orig', +def plot_conditional_quantiles(stations: list, plot_folder: str = ".", rolling_window: int = 3, ref_name: str = 'obs', pred_name: str = 'CNN', season: str = "", forecast_path: str = None, plot_name_affix: str = "", units: str = "ppb"): """ @@ -222,7 +222,7 @@ def plot_conditional_quantiles(stations: list, plot_folder: str = ".", rolling_w for station in stations: file = os.path.join(forecast_path, f"forecasts_{station}_test.nc") data_tmp = xr.open_dataarray(file) - data_collector.append(data_tmp.loc[:, :, ['CNN', 'orig', 'OLS']].assign_coords(station=station)) + data_collector.append(data_tmp.loc[:, :, ['CNN', 'obs', 'OLS']].assign_coords(station=station)) return xr.concat(data_collector, dim='station').transpose('index', 'type', 'ahead', 'station') def segment_data(data): @@ -252,7 +252,7 @@ def plot_conditional_quantiles(stations: list, plot_folder: str = ".", rolling_w def labels(plot_type, data_unit="ppb"): names = (f"forecast concentration (in {data_unit})", f"observed concentration (in {data_unit})") - if plot_type == "orig": + if plot_type == "obs": return names else: return names[::-1] @@ -515,7 +515,7 @@ class PlotTimeSeries(RunEnvironment): logging.debug(f"... preprocess station {station}") file_name = os.path.join(self._data_path, self._data_name % station) data = xr.open_dataarray(file_name) - return data.sel(type=["CNN", "orig"]) + return data.sel(type=["CNN", "obs"]) def _plot(self, plot_folder): pdf_pages = self._create_pdf_pages(plot_folder) @@ -529,7 +529,7 @@ class PlotTimeSeries(RunEnvironment): for i_half_of_year in range(factor): pos = factor * i_year + i_half_of_year plot_data = self._create_plot_data(data_year, factor, i_half_of_year) - self._plot_orig(axes[pos], plot_data) + self._plot_obs(axes[pos], plot_data) self._plot_ahead(axes[pos], plot_data) if np.isnan(plot_data.values).all(): nan_list.append(pos) @@ -574,10 +574,10 @@ class PlotTimeSeries(RunEnvironment): label = f"{ahead}{self._sampling}" ax.plot(index, plot_data.values, color=color[ahead-1], label=label) - def _plot_orig(self, ax, data): - orig_data = data.sel(type="orig", ahead=1) + def _plot_obs(self, ax, data): + obs_data = data.sel(type="obs", ahead=1) index = data.index + np.timedelta64(1, self._sampling) - ax.plot(index, orig_data.values, color=matplotlib.colors.cnames["green"], label="orig") + ax.plot(index, obs_data.values, color=matplotlib.colors.cnames["green"], label="obs") @staticmethod def _get_time_range(data): diff --git a/src/run_modules/post_processing.py b/src/run_modules/post_processing.py index 3c50799b939da2bc8517d1e58dd2c73baebe367b..06203c879872891f57c719040482fe052824c65e 100644 --- a/src/run_modules/post_processing.py +++ b/src/run_modules/post_processing.py @@ -43,7 +43,7 @@ class PostProcessing(RunEnvironment): with TimeTracking(): self.train_ols_model() logging.info("take a look on the next reported time measure. If this increases a lot, one should think to " - "skip make_prediction() whenever it is possible to save time.") + "skip train_ols_model() whenever it is possible to save time.") with TimeTracking(): self.make_prediction() logging.info("take a look on the next reported time measure. If this increases a lot, one should think to " @@ -64,9 +64,9 @@ class PostProcessing(RunEnvironment): logging.debug("Run plotting routines...") path = self.data_store.get("forecast_path", "general") - plot_conditional_quantiles(self.test_data.stations, pred_name="CNN", ref_name="orig", + plot_conditional_quantiles(self.test_data.stations, pred_name="CNN", ref_name="obs", forecast_path=path, plot_name_affix="cali-ref", plot_folder=self.plot_path) - plot_conditional_quantiles(self.test_data.stations, pred_name="orig", ref_name="CNN", + plot_conditional_quantiles(self.test_data.stations, pred_name="obs", ref_name="CNN", forecast_path=path, plot_name_affix="like-bas", plot_folder=self.plot_path) PlotStationMap(generators={'b': self.test_data}, plot_folder=self.plot_path) PlotMonthlySummary(self.test_data.stations, path, r"forecasts_%s_test.nc", self.target_var, @@ -113,15 +113,15 @@ class PostProcessing(RunEnvironment): # ols ols_prediction = self._create_ols_forecast(input_data, ols_prediction, mean, std, transformation_method) - # orig pred - orig_pred = self._create_orig_forecast(data, None, mean, std, transformation_method) + # observation + observation = self._create_observation(data, None, mean, std, transformation_method) # merge all predictions full_index = self.create_fullindex(data.data.indexes['datetime'], self._get_frequency()) all_predictions = self.create_forecast_arrays(full_index, list(data.label.indexes['window']), CNN=nn_prediction, persi=persistence_prediction, - orig=orig_pred, + obs=observation, OLS=ols_prediction) # save all forecasts locally @@ -134,7 +134,7 @@ class PostProcessing(RunEnvironment): return getter.get(self._sampling, None) @staticmethod - def _create_orig_forecast(data, _, mean, std, transformation_method): + def _create_observation(data, _, mean, std, transformation_method): return statistics.apply_inverse_transformation(data.label.copy(), mean, std, transformation_method) def _create_ols_forecast(self, input_data, ols_prediction, mean, std, transformation_method): @@ -227,7 +227,7 @@ class PostProcessing(RunEnvironment): try: data = self.train_val_data.get_data_generator(station) mean, std, transformation_method = data.get_transformation_information(variable=self.target_var) - external_data = self._create_orig_forecast(data, None, mean, std, transformation_method) + external_data = self._create_observation(data, None, mean, std, transformation_method) external_data = external_data.squeeze("Stations").sel(window=1).drop(["window", "Stations", "variables"]) return external_data.rename({'datetime': 'index'}) except KeyError: diff --git a/src/statistics.py b/src/statistics.py index df73784df830d5f7b96bf0fcd18a65d362516f12..e3481d0e0f0561ac8a903648a69e92c6d6acc40d 100644 --- a/src/statistics.py +++ b/src/statistics.py @@ -126,12 +126,12 @@ class SkillScores(RunEnvironment): return skill_score - def _climatological_skill_score(self, data, mu_type=1, observation_name="orig", forecast_name="CNN", external_data=None): + def _climatological_skill_score(self, data, mu_type=1, observation_name="obs", forecast_name="CNN", external_data=None): kwargs = {"external_data": external_data} if external_data is not None else {} return self.__getattribute__(f"skill_score_mu_case_{mu_type}")(data, observation_name, forecast_name, **kwargs) @staticmethod - def general_skill_score(data, observation_name="orig", forecast_name="CNN", reference_name="persi"): + def general_skill_score(data, observation_name="obs", forecast_name="CNN", reference_name="persi"): data = data.dropna("index") observation = data.sel(type=observation_name) forecast = data.sel(type=forecast_name) @@ -159,12 +159,12 @@ class SkillScores(RunEnvironment): suffix = {"mean": mean, "sigma": sigma, "r": r, "p": p} return AI, BI, CI, data, suffix - def skill_score_mu_case_1(self, data, observation_name="orig", forecast_name="CNN"): + def skill_score_mu_case_1(self, data, observation_name="obs", forecast_name="CNN"): AI, BI, CI, data, _ = self.skill_score_pre_calculations(data, observation_name, forecast_name) skill_score = np.array(AI - BI - CI) return pd.DataFrame({"skill_score": [skill_score], "AI": [AI], "BI": [BI], "CI": [CI]}).to_xarray().to_array() - def skill_score_mu_case_2(self, data, observation_name="orig", forecast_name="CNN"): + def skill_score_mu_case_2(self, data, observation_name="obs", forecast_name="CNN"): AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) monthly_mean = self.create_monthly_mean_from_daily_data(data) data = xr.concat([data, monthly_mean], dim="type") @@ -177,14 +177,14 @@ class SkillScores(RunEnvironment): skill_score = np.array((AI - BI - CI - AII + BII) / (1 - AII + BII)) return pd.DataFrame({"skill_score": [skill_score], "AII": [AII], "BII": [BII]}).to_xarray().to_array() - def skill_score_mu_case_3(self, data, observation_name="orig", forecast_name="CNN", external_data=None): + def skill_score_mu_case_3(self, data, observation_name="obs", forecast_name="CNN", external_data=None): AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) mean, sigma = suffix["mean"], suffix["sigma"] AIII = (((external_data.mean().values - mean.loc[observation_name]) / sigma.loc[observation_name])**2).values skill_score = np.array((AI - BI - CI + AIII) / 1 + AIII) return pd.DataFrame({"skill_score": [skill_score], "AIII": [AIII]}).to_xarray().to_array() - def skill_score_mu_case_4(self, data, observation_name="orig", forecast_name="CNN", external_data=None): + def skill_score_mu_case_4(self, data, observation_name="obs", forecast_name="CNN", external_data=None): AI, BI, CI, data, suffix = self.skill_score_pre_calculations(data, observation_name, forecast_name) monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, columns=data.type.values, index=data.index) data = xr.concat([data, monthly_mean_external], dim="type") diff --git a/test/test_modules/test_experiment_setup.py b/test/test_modules/test_experiment_setup.py index 7a4f16fd055e0af0aea95181d475d061c185ca92..9e6d17627d1697a2150ea7f74a373a720d2f02ac 100644 --- a/test/test_modules/test_experiment_setup.py +++ b/test/test_modules/test_experiment_setup.py @@ -47,7 +47,8 @@ class TestExperimentSetup: data_store = exp_setup.data_store # experiment setup assert data_store.get("data_path", "general") == prepare_host() - assert data_store.get("trainable", "general") is False + assert data_store.get("trainable", "general") is True + assert data_store.get("create_new_model", "general") is True assert data_store.get("fraction_of_training", "general") == 0.8 # set experiment name assert data_store.get("experiment_name", "general") == "TestExperiment" @@ -104,13 +105,14 @@ class TestExperimentSetup: target_var="temp", target_dim="target", window_lead_time=10, dimensions="dim1", interpolate_dim="int_dim", interpolate_method="cubic", limit_nan_fill=5, train_start="2000-01-01", train_end="2000-01-02", val_start="2000-01-03", val_end="2000-01-04", test_start="2000-01-05", - test_end="2000-01-06", use_all_stations_on_all_data_sets=False, trainable=True, - fraction_of_train=0.5, experiment_path=experiment_path) + test_end="2000-01-06", use_all_stations_on_all_data_sets=False, trainable=False, + fraction_of_train=0.5, experiment_path=experiment_path, create_new_model=True) exp_setup = ExperimentSetup(**kwargs) data_store = exp_setup.data_store # experiment setup assert data_store.get("data_path", "general") == prepare_host() assert data_store.get("trainable", "general") is True + assert data_store.get("create_new_model", "general") is True assert data_store.get("fraction_of_training", "general") == 0.5 # set experiment name assert data_store.get("experiment_name", "general") == "TODAY_network" @@ -150,10 +152,30 @@ class TestExperimentSetup: # use all stations on all data sets (train, val, test) assert data_store.get("use_all_stations_on_all_data_sets", "general.test") is False + def test_init_trainable_behaviour(self): + exp_setup = ExperimentSetup(trainable=False, create_new_model=True) + data_store = exp_setup.data_store + assert data_store.get("trainable", "general") is True + assert data_store.get("create_new_model", "general") is True + exp_setup = ExperimentSetup(trainable=False, create_new_model=False) + data_store = exp_setup.data_store + assert data_store.get("trainable", "general") is False + assert data_store.get("create_new_model", "general") is False + exp_setup = ExperimentSetup(trainable=True, create_new_model=True) + data_store = exp_setup.data_store + assert data_store.get("trainable", "general") is True + assert data_store.get("create_new_model", "general") is True + exp_setup = ExperimentSetup(trainable=True, create_new_model=False) + data_store = exp_setup.data_store + assert data_store.get("trainable", "general") is True + assert data_store.get("create_new_model", "general") is False + def test_compare_variables_and_statistics(self): + experiment_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "testExperimentFolder")) kwargs = dict(parser_args={"experiment_date": "TODAY"}, var_all_dict={'o3': 'dma8eu', 'temp': 'maximum'}, - stations=['DEBY053', 'DEBW059', 'DEBW027'], variables=["o3", "relhum"], statistics_per_var=None) + stations=['DEBY053', 'DEBW059', 'DEBW027'], variables=["o3", "relhum"], statistics_per_var=None, + experiment_path=experiment_path) with pytest.raises(ValueError) as e: ExperimentSetup(**kwargs) assert "for the variables: {'relhum'}" in e.value.args[0] diff --git a/test/test_modules/test_model_setup.py b/test/test_modules/test_model_setup.py index 2864ae45bcd7d3c6109d6d84fe5ea152a7d86384..35c5f8ee7581856a9feee3abd0face73ee83952c 100644 --- a/test/test_modules/test_model_setup.py +++ b/test/test_modules/test_model_setup.py @@ -20,6 +20,7 @@ class TestModelSetup: obj.callbacks_name = "placeholder_%s_str.pickle" obj.data_store.set("lr_decay", "dummy_str", "general.model") obj.data_store.set("hist", "dummy_str", "general.model") + obj.model_name = "%s.h5" yield obj RunEnvironment().__del__() diff --git a/test/test_modules/test_training.py b/test/test_modules/test_training.py index 485348ceca740d8263394fca36efbfbde6dd2d0d..ac040c3a286c25dc84853c26c8509278642a1495 100644 --- a/test/test_modules/test_training.py +++ b/test/test_modules/test_training.py @@ -57,10 +57,12 @@ class TestTraining: obj.data_store.set("generator", mock.MagicMock(return_value="mock_test_gen"), "general.test") os.makedirs(path) obj.data_store.set("experiment_path", path, "general") + obj.data_store.set("model_name", os.path.join(path, "test_model.h5"), "general.model") obj.data_store.set("experiment_name", "TestExperiment", "general") path_plot = os.path.join(path, "plots") os.makedirs(path_plot) obj.data_store.set("plot_path", path_plot, "general") + obj._trainable = True yield obj if os.path.exists(path): shutil.rmtree(path) @@ -131,6 +133,7 @@ class TestTraining: obj.data_store.set("generator", generator, "general.test") model.compile(optimizer=keras.optimizers.SGD(), loss=keras.losses.mean_absolute_error) obj.data_store.set("model", model, "general.model") + obj.data_store.set("model_name", os.path.join(path, "test_model.h5"), "general.model") obj.data_store.set("batch_size", 256, "general.model") obj.data_store.set("epochs", 2, "general.model") obj.data_store.set("checkpoint", checkpoint, "general.model") @@ -138,6 +141,9 @@ class TestTraining: obj.data_store.set("hist", HistoryAdvanced(), "general.model") obj.data_store.set("experiment_name", "TestExperiment", "general") obj.data_store.set("experiment_path", path, "general") + obj.data_store.set("trainable", True, "general") + obj.data_store.set("create_new_model" + "", True, "general") path_plot = os.path.join(path, "plots") os.makedirs(path_plot) obj.data_store.set("plot_path", path_plot, "general") @@ -179,7 +185,7 @@ class TestTraining: def test_save_model(self, init_without_run, path, caplog): caplog.set_level(logging.DEBUG) - model_name = "TestExperiment_my_model.h5" + model_name = "test_model.h5" assert model_name not in os.listdir(path) init_without_run.save_model() assert caplog.record_tuples[0] == ("root", 10, PyTestRegex(f"save best model to {os.path.join(path, model_name)}"))