diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 242dcd31d01134b75f70a12c2708d4c99bb94301..dc1ffe2e6602a4ef0c63bca641fd5ebf3428ab49 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -44,6 +44,8 @@ DEFAULT_TEST_END = "2017-12-31" DEFAULT_TEST_MIN_LENGTH = 90 DEFAULT_TRAIN_VAL_MIN_LENGTH = 180 DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS = True +DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH = "1m" +DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS = True DEFAULT_EVALUATE_BOOTSTRAPS = True DEFAULT_CREATE_NEW_BOOTSTRAPS = False DEFAULT_NUMBER_OF_BOOTSTRAPS = 20 diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index 28277d5057698c01594431008b81d959d415c3e2..9126a4eec21d34b8441c1e440bc3af95b8c0b0d3 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -21,7 +21,8 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \ DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM, \ DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, DEFAULT_MAX_NUMBER_MULTIPROCESSING, \ - DEFAULT_BOOTSTRAP_TYPE, DEFAULT_BOOTSTRAP_METHOD, DEFAULT_OVERWRITE_LAZY_DATA + DEFAULT_BOOTSTRAP_TYPE, DEFAULT_BOOTSTRAP_METHOD, DEFAULT_OVERWRITE_LAZY_DATA, \ + DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH, DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS from mlair.data_handler import DefaultDataHandler from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel @@ -219,7 +220,8 @@ class ExperimentSetup(RunEnvironment): data_origin: Dict = None, competitors: list = None, competitor_path: str = None, use_multiprocessing: bool = None, use_multiprocessing_on_debug: bool = None, max_number_multiprocessing: int = None, start_script: Union[Callable, str] = None, - overwrite_lazy_data: bool = None, **kwargs): + overwrite_lazy_data: bool = None, uncertainty_estimate_block_length: str = None, + uncertainty_estimate_evaluate_competitors: bool = None, **kwargs): # create run framework super().__init__() @@ -349,6 +351,11 @@ class ExperimentSetup(RunEnvironment): default=DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS) # set post-processing instructions + self._set_param("uncertainty_estimate_block_length", uncertainty_estimate_block_length, + default=DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH) + self._set_param("uncertainty_estimate_evaluate_competitors", uncertainty_estimate_evaluate_competitors, + default=DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS) + self._set_param("evaluate_bootstraps", evaluate_bootstraps, default=DEFAULT_EVALUATE_BOOTSTRAPS, scope="general.postprocessing") create_new_bootstraps = max([self.data_store.get("train_model", "general"), diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 3cff0447cc25a715242197a108d9ca4f2cdce3ba..e5e6b77196368a571124efc0844ba7f1bb8ed97f 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -91,6 +91,7 @@ class PostProcessing(RunEnvironment): self.ahead_dim = "ahead" self.boot_var_dim = "boot_var" self.model_type_dim = "type" + self.index_dim = "index" self._run() def _run(self): @@ -129,8 +130,9 @@ class PostProcessing(RunEnvironment): # plotting self.plot() - def estimate_sample_uncertainty(self, evaluate_competitors=True, separate_ahead=False, block_length="1m"): - # block_length = self.data_store.get("sample_uncertainty_block_length") + def estimate_sample_uncertainty(self, separate_ahead=False): + block_length = self.data_store.get_default("uncertainty_estimate_block_length", default="1m") + evaluate_competitors = self.data_store.get_default("uncertainty_estimate_evaluate_competitors", default=True) block_mse = self.calculate_block_mse(evaluate_competitors=evaluate_competitors, separate_ahead=separate_ahead, block_length=block_length) @@ -139,7 +141,7 @@ class PostProcessing(RunEnvironment): all_stations = self.data_store.get("stations") start = self.data_store.get("start", "test") end = self.data_store.get("end", "test") - index_dim = "index" + index_dim = self.index_dim coll_dim = "station" collector = [] for station in all_stations: @@ -156,15 +158,18 @@ class PostProcessing(RunEnvironment): else: combined = external_data - # if combined is None: continue else: combined = self.create_full_time_dim(combined, index_dim, self._sampling, start, end) - errors = self.create_error_array(combined) # get squared errors + # get squared errors + errors = self.create_error_array(combined) + # calc mse for each block (single station) mse = errors.resample(indexer={index_dim: block_length}).mean(skipna=True) collector.append(mse.assign_coords({coll_dim: station})) + # calc mse for each block (average over all stations) mse_blocks = xr.concat(collector, dim=coll_dim).mean(dim=coll_dim, skipna=True) + # average also on ahead steps if separate_ahead is False: mse_blocks = mse_blocks.mean(dim=self.ahead_dim, skipna=True) return mse_blocks @@ -257,7 +262,7 @@ class PostProcessing(RunEnvironment): # extract all requirements from data store forecast_path = self.data_store.get("forecast_path") number_of_bootstraps = self.data_store.get("number_of_bootstraps", "postprocessing") - dims = ["index", self.ahead_dim, self.model_type_dim] + dims = [self.index_dim, self.ahead_dim, self.model_type_dim] for station in self.test_data: X, Y = None, None bootstraps = BootStraps(station, number_of_bootstraps, bootstrap_type=bootstrap_type, @@ -317,7 +322,7 @@ class PostProcessing(RunEnvironment): orig = self.get_orig_prediction(forecast_path, forecast_file % str(station), number_of_bootstraps) orig = orig.reshape(shape) coords = (range(shape[0]), range(1, shape[1] + 1), ["orig"]) - orig = xr.DataArray(orig, coords=coords, dims=["index", self.ahead_dim, self.model_type_dim]) + orig = xr.DataArray(orig, coords=coords, dims=[self.index_dim, self.ahead_dim, self.model_type_dim]) # calculate skill scores for each variable skill = pd.DataFrame(columns=range(1, self.window_lead_time + 1)) @@ -587,6 +592,7 @@ class PostProcessing(RunEnvironment): "ols": ols_prediction} all_predictions = self.create_forecast_arrays(full_index, list(target_data.indexes[window_dim]), time_dimension, ahead_dim=self.ahead_dim, + index_dim=self.index_dim, type_dim=self.model_type_dim, **prediction_dict) # save all forecasts locally @@ -746,7 +752,7 @@ class PostProcessing(RunEnvironment): @staticmethod def create_forecast_arrays(index: pd.DataFrame, ahead_names: List[Union[str, int]], time_dimension, - ahead_dim="ahead", **kwargs): + ahead_dim="ahead", index_dim="index", type_dim="type", **kwargs): """ Combine different forecast types into single xarray. @@ -759,7 +765,7 @@ class PostProcessing(RunEnvironment): """ keys = list(kwargs.keys()) res = xr.DataArray(np.full((len(index.index), len(ahead_names), len(keys)), np.nan), - coords=[index.index, ahead_names, keys], dims=['index', ahead_dim, 'type']) + coords=[index.index, ahead_names, keys], dims=[index_dim, ahead_dim, type_dim]) for k, v in kwargs.items(): intersection = set(res.index.values) & set(v.indexes[time_dimension].values) match_index = np.array(list(intersection)) @@ -837,7 +843,7 @@ class PostProcessing(RunEnvironment): errors[model_type] = {} errors[model_type][station] = statistics.calculate_error_metrics( *map(lambda x: external_data.sel(**{self.model_type_dim: x}), - [model_type, self.observation_indicator]), dim="index") + [model_type, self.observation_indicator]), dim=self.index_dim) # load competitors competitor = self.load_competitors(station) @@ -856,7 +862,7 @@ class PostProcessing(RunEnvironment): errors[model_type] = {} errors[model_type][station] = statistics.calculate_error_metrics( *map(lambda x: combined.sel(**{self.model_type_dim: x}), - [model_type, self.observation_indicator]), dim="index") + [model_type, self.observation_indicator]), dim=self.index_dim) # skill score skill_score = statistics.SkillScores(combined, models=model_list, ahead_dim=self.ahead_dim)