include first dev of create shuffled data, /close #48

Pair issue048 feat create shuffled data. See merge request toar/machinelearningtools!34

include first dev of create shuffled data, /close #48
c50e6c55 · lukas leufen · ebc4ff73 · 3dd830af · c50e6c55 · c50e6c55
Commit c50e6c55 authored Feb 10, 2020 by lukas leufen
--- a/src/data_handling/bootstraps.py
+++ b/src/data_handling/bootstraps.py
+__author__ = 'Felix Kleinert, Lukas Leufen'
+__date__ = '2020-02-07'
+
+
+from src.run_modules.run_environment import RunEnvironment
+from src.data_handling.data_generator import DataGenerator
+import numpy as np
+import logging
+import xarray as xr
+import os
+import re
+
+
+class BootStraps(RunEnvironment):
+    """
+    Create shuffled ("bootstrap") copies of the test data and store them as netCDF files, one file per station.
+
+    On instantiation, the test data generator and the bootstrap path are read from the data store and
+    create_shuffled_data() is executed immediately.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.test_data: DataGenerator = self.data_store.get("generator", "general.test")
+        # minimum number of bootstrap realisations a local file must provide to be reused
+        self.number_bootstraps = 200
+        self.bootstrap_path = self.data_store.get("bootstrap_path", "general")
+        self.create_shuffled_data()
+
+    def create_shuffled_data(self):
+        """
+        Create shuffled data. Use original test data, add dimension 'boots' with length number of bootstraps and insert
+        randomly selected variables. If there is a suitable local file for requested window size and number of
+        bootstraps, no additional file will be created inside this function.
+        """
+        variables_str = '_'.join(sorted(self.test_data.variables))
+        window = self.test_data.window_history_size
+        for station in self.test_data.stations:
+            valid, nboot = self.valid_bootstrap_file(station, variables_str, window)
+            if valid:
+                # a suitable file already exists locally, nothing to do for this station
+                continue
+            logging.info(f'create bootstap data for {station}')
+            hist, _ = self.test_data[station]
+            file_name = f"{station}_{variables_str}_hist{window}_nboots{nboot}_shuffled.nc"
+            file_path = os.path.join(self.bootstrap_path, file_name)
+            # work on a copy and append the new dimension 'boots' as last axis
+            data = hist.copy().expand_dims({'boots': range(nboot)}, axis=-1)
+            shuffled_variable = np.full(data.shape, np.nan)
+            for i, var in enumerate(data.coords['variables']):
+                single_variable = data.sel(variables=var).values
+                # NOTE(review): this index pattern assumes that 'variables' is the second to last dimension after
+                # 'boots' has been appended - confirm against the layout returned by the data generator
+                shuffled_variable[..., i, :] = self.shuffle_single_variable(single_variable)
+            shuffled_data = xr.DataArray(shuffled_variable, coords=data.coords, dims=data.dims)
+            shuffled_data.to_netcdf(file_path)
+
+    def valid_bootstrap_file(self, station, variables, window):
+        """
+        Compare local bootstrap files with given settings for station, variables, window and number of bootstraps. If a
+        match was found, this method returns a tuple (True, None). In any other case, it returns (False, max_nboot),
+        where max_nboot is the highest boot number found in the local storage (but at least the number of bootstraps of
+        this class). A match is defined so that the window length is ge than the given window size from args and the
+        number of boots is also ge than the given number of boots from this class. Furthermore, this function deletes
+        local files, if they match the station pattern but don't fit the window and bootstrap condition. This is
+        performed, because it is assumed, that the corresponding file will be created with a longer or at least same
+        window size and numbers of bootstraps. The scan stops at the first fitting file, files listed after it are not
+        inspected.
+        :param station: station name, used as first part of the file name
+        :param variables: variables string as used in the file name (variable names joined by '_')
+        :param window: minimum window history size a local file must provide
+        :return: tuple (True, None) if a fitting file exists, else (False, max_nboot)
+        """
+        # station and variables are taken literally (escaped), so special characters in a name cannot match foreign
+        # files; only the beginning of the file name is compared, the file extension is not checked
+        regex = re.compile(rf"{re.escape(station)}_{re.escape(variables)}_hist(\d+)_nboots(\d+)_shuffled")
+        max_nboot = self.number_bootstraps
+        for file in os.listdir(self.bootstrap_path):
+            match = regex.match(file)
+            if match is None:
+                continue
+            window_file, nboot_file = int(match.group(1)), int(match.group(2))
+            max_nboot = max(max_nboot, nboot_file)
+            if window_file >= window and nboot_file >= self.number_bootstraps:
+                return True, None
+            # file of this station does not fit anymore and will be replaced by a newly created one
+            os.remove(os.path.join(self.bootstrap_path, file))
+        return False, max_nboot
+
+    @staticmethod
+    def shuffle_single_variable(data: np.ndarray) -> np.ndarray:
+        """
+        Draw random samples from all values of data (np.random.choice on the flattened array) and return them in the
+        shape of data.
+        :param data: array to draw the samples from
+        :return: array with same shape as data, filled with randomly drawn values of data
+        """
+        return np.random.choice(data.reshape(-1), size=data.shape)
+
+
+if __name__ == "__main__":
+
+    from src.run_modules.experiment_setup import ExperimentSetup
+    from src.run_modules.run_environment import RunEnvironment
+    from src.run_modules.pre_processing import PreProcessing
+
+    # manual run of the bootstrap creation: configure logging first, then set up an experiment for three stations,
+    # run the pre-processing and finally create the shuffled data
+    log_format = '%(asctime)s - %(levelname)s: %(message)s  [%(filename)s:%(funcName)s:%(lineno)s]'
+    logging.basicConfig(level=logging.INFO, format=log_format)
+
+    setup_kwargs = dict(stations=['DEBW107', 'DEBY081', 'DEBW013'], station_type='background', trainable=True,
+                        window_history_size=9)
+    with RunEnvironment():
+        ExperimentSetup(**setup_kwargs)
+        PreProcessing()
+        BootStraps()
--- a/src/helpers.py
+++ b/src/helpers.py
@@ -145,6 +145,13 @@ def set_experiment_name(experiment_date=None, experiment_path=None):
    return experiment_name, experiment_path


+def set_bootstrap_path(bootstrap_path, data_path, sampling):
+    """Return bootstrap_path (default: <data_path>/../bootstrap_<sampling>) after calling check_path_and_create."""
+    path = bootstrap_path if bootstrap_path is not None else os.path.join(data_path, "..", f"bootstrap_{sampling}")
+    check_path_and_create(path)
+    return path
+
+
 class PyTestRegex:
    """Assert that a given string meets some expectations."""


--- a/src/run_modules/experiment_setup.py
+++ b/src/run_modules/experiment_setup.py
@@ -33,13 +33,17 @@ class ExperimentSetup(RunEnvironment):
                 window_lead_time=None, dimensions=None, interpolate_dim=None, interpolate_method=None,
                 limit_nan_fill=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None,
                 test_end=None, use_all_stations_on_all_data_sets=True, trainable=False, fraction_of_train=None,
-                 experiment_path=None, plot_path=None, forecast_path=None, overwrite_local_data=None):
+                 experiment_path=None, plot_path=None, forecast_path=None, overwrite_local_data=None, sampling="daily",
+                 bootstrap_path=None):

        # create run framework
        super().__init__()

        # experiment setup
        self._set_param("data_path", helpers.prepare_host())
+        data_path = self.data_store.get("data_path", "general")
+        bootstrap_path = helpers.set_bootstrap_path(bootstrap_path, data_path, sampling)
+        self._set_param("bootstrap_path", bootstrap_path)
        self._set_param("trainable", trainable, default=False)
        self._set_param("fraction_of_training", fraction_of_train, default=0.8)


--- a/test/test_data_handling/test_bootstraps.py
+++ b/test/test_data_handling/test_bootstraps.py
+
+from src.data_handling.bootstraps import BootStraps
+
+import pytest
+import os
+
+import numpy as np
+
+
+class TestBootstraps:
+    """Unit tests for BootStraps. The instance is created without calling BootStraps.__init__."""
+
+    @pytest.fixture
+    def path(self, tmp_path):
+        """
+        Provide a fresh and empty directory for each test, so that files left over from earlier runs cannot break the
+        empty directory assertion.
+        """
+        return str(tmp_path)
+
+    @pytest.fixture
+    def boot_no_init(self, path):
+        """Create a BootStraps object without BootStraps.__init__ (only the parent __init__ is executed)."""
+        obj = object.__new__(BootStraps)
+        super(BootStraps, obj).__init__()
+        obj.number_bootstraps = 50
+        obj.bootstrap_path = path
+        return obj
+
+    def test_valid_bootstrap_file(self, path, boot_no_init):
+
+        def touch(file_name):
+            # create an empty file; portable replacement for os.mknod, which is not available on every platform
+            with open(os.path.join(path, file_name), "w"):
+                pass
+
+        station = "TESTSTATION"
+        variables = "var1_var2_var3"
+        window = 5
+        # empty case
+        assert len(os.listdir(path)) == 0
+        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (False, 50)
+        # different cases, where files with bigger range are existing
+        touch(f"{station}_{variables}_hist5_nboots50_shuffled.dat")
+        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
+        touch(f"{station}_{variables}_hist5_nboots100_shuffled.dat")
+        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
+        touch(f"{station}_{variables}_hist10_nboots50_shuffled.dat")
+        touch(f"{station}1_{variables}_hist10_nboots50_shuffled.dat")
+        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
+        # need to reload data and therefore remove not fitting files for this station
+        assert boot_no_init.valid_bootstrap_file(station, variables, 20) == (False, 100)
+        assert len(os.listdir(path)) == 1
+        # reload because expanded boot number
+        touch(f"{station}_{variables}_hist5_nboots50_shuffled.dat")
+        boot_no_init.number_bootstraps = 60
+        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (False, 60)
+        assert len(os.listdir(path)) == 1
+        # reload because of expanded window size, but use maximum boot number from file names
+        touch(f"{station}_{variables}_hist5_nboots60_shuffled.dat")
+        boot_no_init.number_bootstraps = 50
+        assert boot_no_init.valid_bootstrap_file(station, variables, 20) == (False, 60)
+
+    def test_shuffle_single_variable(self, boot_no_init):
+        data = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
+        res = boot_no_init.shuffle_single_variable(data)
+        assert res.shape == data.shape
+        # result is drawn randomly, therefore only bounds and the set of possible values can be checked
+        assert res.max() <= data.max()
+        assert res.min() >= data.min()
+        assert set(np.unique(res)).issubset({1, 2, 3})
+
+    def test_create_shuffled_data(self):
+        # TODO: not implemented yet
+        pass
\ No newline at end of file