class BootStraps(RunEnvironment):
    """
    Create shuffled ("bootstrapped") copies of the test data and store them as netCDF files.

    On construction the test data generator and the bootstrap directory are read from the data store. For every
    station of the generator that has no suitable local bootstrap file, a new file is written to the bootstrap
    directory.
    """

    def __init__(self, number_bootstraps: int = 200):
        """
        Read the required settings from the data store and create all missing bootstrap files.

        :param number_bootstraps: minimum number of bootstrap realisations a local file must provide (default: 200)
        """
        super().__init__()
        self.test_data: DataGenerator = self.data_store.get("generator", "general.test")
        self.number_bootstraps = number_bootstraps
        self.bootstrap_path = self.data_store.get("bootstrap_path", "general")
        self.create_shuffled_data()

    def create_shuffled_data(self):
        """
        Create shuffled data for every station of the test data generator.

        The history data of a station is copied and extended by a trailing dimension 'boots'. Afterwards, the values
        of each variable are replaced by values drawn randomly from this variable. If there is already a suitable
        local file for the requested window size and number of bootstraps (see `valid_bootstrap_file`), no new file
        is created for this station.
        """
        variables_str = '_'.join(sorted(self.test_data.variables))
        window = self.test_data.window_history_size
        for station in self.test_data.stations:
            valid, nboot = self.valid_bootstrap_file(station, variables_str, window)
            if valid:
                continue  # a matching file already exists, nothing to do for this station
            logging.info(f'create bootstap data for {station}')
            hist, _ = self.test_data[station]
            file_name = f"{station}_{variables_str}_hist{window}_nboots{nboot}_shuffled.nc"
            file_path = os.path.join(self.bootstrap_path, file_name)
            data = hist.copy().expand_dims({'boots': range(nboot)}, axis=-1)
            shuffled_variable = np.full(data.shape, np.nan)
            for i, var in enumerate(data.coords['variables']):
                single_variable = data.sel(variables=var).values
                # NOTE(review): this indexing assumes that 'variables' is the second to last dimension (directly in
                # front of the appended 'boots' dimension) - confirm against the layout returned by DataGenerator.
                shuffled_variable[..., i, :] = self.shuffle_single_variable(single_variable)
            shuffled_data = xr.DataArray(shuffled_variable, coords=data.coords, dims=data.dims)
            shuffled_data.to_netcdf(file_path)

    def valid_bootstrap_file(self, station, variables, window):
        """
        Compare the local bootstrap files with the given settings for station, variables, window and number of boots.

        A local file is a match if its name starts with "<station>_<variables>_hist<W>_nboots<N>_shuffled" (any file
        extension is accepted) and if both W >= window and N >= self.number_bootstraps hold. All files that belong to
        this station and these variables but do not fulfil both conditions are deleted, because it is assumed that a
        new file with a larger or at least equal window size and number of boots will be created afterwards. All
        files of the directory are inspected before returning, so the result and the set of deleted files do not
        depend on the order in which the directory is listed.

        :param station: name of the station, used as prefix of the file name
        :param variables: all variable names joined by underscores, as used in the file name
        :param window: minimum window history size the file has to provide
        :return: (True, None) if at least one matching file exists, else (False, max_nboot), where max_nboot is the
            maximum of self.number_bootstraps and the highest boot number found in the local file names
        """
        # escape both parts, they are plain text and must not be interpreted as regular expression syntax
        regex = re.compile(rf"{re.escape(str(station))}_{re.escape(str(variables))}_hist(\d+)_nboots(\d+)_shuffled")
        max_nboot = self.number_bootstraps
        found_valid = False
        for file in os.listdir(self.bootstrap_path):
            match = regex.match(file)
            if match is None:
                continue
            window_file = int(match.group(1))
            nboot_file = int(match.group(2))
            max_nboot = max(max_nboot, nboot_file)
            if window_file >= window and nboot_file >= self.number_bootstraps:
                found_valid = True
            else:
                os.remove(os.path.join(self.bootstrap_path, file))
        if found_valid:
            return True, None
        return False, max_nboot

    @staticmethod
    def shuffle_single_variable(data: np.ndarray) -> np.ndarray:
        """
        Draw a random sample (with replacement) from all values in data.

        :param data: array holding the values of a single variable
        :return: array with the same shape as data, filled with randomly selected values of data
        """
        return np.random.choice(data.reshape(-1), size=data.shape)
class TestBootstraps:
    """Tests for the local bootstrap file handling and the shuffling helper of BootStraps."""

    @pytest.fixture
    def path(self, tmp_path):
        """
        Return an empty directory for the bootstrap files.

        The directory is provided by pytest's tmp_path fixture and is therefore unique for each test. This keeps the
        "empty directory" precondition of the tests valid on every run, as no files of earlier runs are left over.
        """
        return str(tmp_path)

    @pytest.fixture
    def boot_no_init(self, path):
        """Return a BootStraps object without running BootStraps.__init__ (no data store access, no file creation)."""
        obj = object.__new__(BootStraps)
        super(BootStraps, obj).__init__()
        obj.number_bootstraps = 50
        obj.bootstrap_path = path
        return obj

    @staticmethod
    def _touch(path, file_name):
        """Create an empty file (portable replacement for os.mknod, which is not available on all platforms)."""
        with open(os.path.join(path, file_name), "a"):
            pass

    def test_valid_bootstrap_file(self, path, boot_no_init):
        station = "TESTSTATION"
        variables = "var1_var2_var3"
        window = 5
        # empty case
        assert len(os.listdir(path)) == 0
        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (False, 50)
        # different cases, where files with bigger range are existing
        self._touch(path, f"{station}_{variables}_hist5_nboots50_shuffled.dat")
        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
        self._touch(path, f"{station}_{variables}_hist5_nboots100_shuffled.dat")
        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
        self._touch(path, f"{station}_{variables}_hist10_nboots50_shuffled.dat")
        self._touch(path, f"{station}1_{variables}_hist10_nboots50_shuffled.dat")
        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
        # need to reload data and therefore remove not fitting files for this station
        assert boot_no_init.valid_bootstrap_file(station, variables, 20) == (False, 100)
        assert len(os.listdir(path)) == 1
        # reload because expanded boot number
        self._touch(path, f"{station}_{variables}_hist5_nboots50_shuffled.dat")
        boot_no_init.number_bootstraps = 60
        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (False, 60)
        assert len(os.listdir(path)) == 1
        # reload because of expanded window size, but use maximum boot number from file names
        self._touch(path, f"{station}_{variables}_hist5_nboots60_shuffled.dat")
        boot_no_init.number_bootstraps = 50
        assert boot_no_init.valid_bootstrap_file(station, variables, 20) == (False, 60)

    def test_shuffle_single_variale(self, boot_no_init):
        data = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
        res = boot_no_init.shuffle_single_variable(data)
        assert res.shape == data.shape
        # only values of the original data may occur; do not require that min and max are drawn, this is random
        assert set(np.unique(res)).issubset({1, 2, 3})

    def test_create_shuffled_data(self):
        # TODO: not implemented yet, requires a data generator providing test data
        pass