Skip to content
Snippets Groups Projects
Commit c50e6c55 authored by lukas leufen's avatar lukas leufen
Browse files

include first dev of create shuffled data, /close #48

Pair issue048 feat create shuffled data

See merge request toar/machinelearningtools!34
parents ebc4ff73 3dd830af
No related branches found
No related tags found
3 merge requests: !59 Develop, !52 implemented bootstraps, !34 Pair issue048 feat create shuffled data
Pipeline #29245 passed
__author__ = 'Felix Kleinert, Lukas Leufen'
__date__ = '2020-02-07'
from src.run_modules.run_environment import RunEnvironment
from src.data_handling.data_generator import DataGenerator
import numpy as np
import logging
import xarray as xr
import os
import re
class BootStraps(RunEnvironment):
    """
    Create shuffled (bootstrapped) versions of the test data and store them as netCDF files.

    On initialisation, the test data generator and the bootstrap path are read from the data store and
    `create_shuffled_data` is executed immediately.
    """

    def __init__(self):
        super().__init__()
        self.test_data: DataGenerator = self.data_store.get("generator", "general.test")
        self.number_bootstraps = 200
        self.bootstrap_path = self.data_store.get("bootstrap_path", "general")
        self.create_shuffled_data()

    def create_shuffled_data(self) -> None:
        """
        Create shuffled data. Use original test data, add dimension 'boots' with length number of bootstraps and insert
        randomly selected variables. If there is a suitable local file for requested window size and number of
        bootstraps, no additional file will be created inside this function.
        """
        variables_str = '_'.join(sorted(self.test_data.variables))
        window = self.test_data.window_history_size
        for station in self.test_data.stations:
            valid, nboot = self.valid_bootstrap_file(station, variables_str, window)
            if valid:
                # a suitable file already exists for this station, nothing to do
                continue
            logging.info(f'create bootstap data for {station}')
            hist, _ = self.test_data[station]
            file_name = f"{station}_{variables_str}_hist{window}_nboots{nboot}_shuffled.nc"
            file_path = os.path.join(self.bootstrap_path, file_name)
            # append the new 'boots' dimension as last axis
            data = hist.copy().expand_dims({'boots': range(nboot)}, axis=-1)
            shuffled_variable = np.full(data.shape, np.nan)
            for i, var in enumerate(data.coords['variables']):
                single_variable = data.sel(variables=var).values
                # NOTE(review): this indexing assumes 'variables' is the second-last axis, i.e. it was the last
                # dimension of the history data before 'boots' was appended - confirm against DataGenerator
                shuffled_variable[..., i, :] = self.shuffle_single_variable(single_variable)
            shuffled_data = xr.DataArray(shuffled_variable, coords=data.coords, dims=data.dims)
            shuffled_data.to_netcdf(file_path)

    def valid_bootstrap_file(self, station, variables, window):
        """
        Compare local bootstrap files with given settings for station, variables, window and number of bootstraps.

        If a match is found, this method returns the tuple (True, None) and leaves the local storage untouched. In
        any other case, it returns (False, max_nboot), where max_nboot is the highest boot number found in the local
        storage (but at least the number of bootstraps of this class). A match is defined so that the window length
        of the file is greater than or equal to the given window size and the number of boots of the file is greater
        than or equal to the number of boots of this class.

        If no match is found, all local files that fit the station and variables pattern are deleted, because it is
        assumed that the corresponding file will be created again with a longer or at least the same window size and
        number of bootstraps. Deletion is postponed until all files have been inspected, so that the outcome does
        not depend on the (arbitrary) order in which the directory content is listed.

        :param station: name of the station, used as first part of the file name
        :param variables: all variables joined by underscores, used as second part of the file name
        :param window: requested window history size
        :return: (True, None) if a suitable file exists, else (False, max_nboot)
        """
        # escape the name parts to match them literally; the file extension is deliberately left open
        regex = re.compile(rf"{re.escape(str(station))}_{re.escape(str(variables))}_hist(\d+)_nboots(\d+)_shuffled")
        max_nboot = self.number_bootstraps
        outdated_files = []
        for file in os.listdir(self.bootstrap_path):
            match = regex.match(file)
            if match is None:
                continue
            window_file = int(match.group(1))
            nboot_file = int(match.group(2))
            if (window_file >= window) and (nboot_file >= self.number_bootstraps):
                return True, None
            max_nboot = max(max_nboot, nboot_file)
            outdated_files.append(file)
        # no suitable file found: remove all files that will be superseded by the newly created one
        for file in outdated_files:
            os.remove(os.path.join(self.bootstrap_path, file))
        return False, max_nboot

    @staticmethod
    def shuffle_single_variable(data: np.ndarray) -> np.ndarray:
        """
        Draw randomly (with replacement) from all values of data and return them in the shape of data.

        :param data: array to draw the values from
        :return: array of same shape as data, filled with randomly selected values of data
        """
        return np.random.choice(data.reshape(-1), size=data.shape)
if __name__ == "__main__":
    # manual entry point: set up a small experiment and create the shuffled data for it
    from src.run_modules.experiment_setup import ExperimentSetup
    from src.run_modules.run_environment import RunEnvironment
    from src.run_modules.pre_processing import PreProcessing

    log_format = '%(asctime)s - %(levelname)s: %(message)s [%(filename)s:%(funcName)s:%(lineno)s]'
    logging.basicConfig(format=log_format, level=logging.INFO)

    setup_kwargs = {
        "stations": ['DEBW107', 'DEBY081', 'DEBW013'],
        "station_type": 'background',
        "trainable": True,
        "window_history_size": 9,
    }

    with RunEnvironment():
        ExperimentSetup(**setup_kwargs)
        PreProcessing()
        BootStraps()
......@@ -145,6 +145,13 @@ def set_experiment_name(experiment_date=None, experiment_path=None):
return experiment_name, experiment_path
def set_bootstrap_path(bootstrap_path, data_path, sampling):
    """
    Return the path to store bootstrap data in and hand it to `check_path_and_create`.

    :param bootstrap_path: custom path; if None, a folder named "bootstrap_<sampling>" next to data_path is used
    :param data_path: path of the data, only used to derive the default bootstrap path
    :param sampling: sampling name, only used as suffix of the default folder name
    :return: the given bootstrap path or the derived default
    """
    resolved_path = bootstrap_path
    if resolved_path is None:
        default_folder = f"bootstrap_{sampling}"
        resolved_path = os.path.join(data_path, "..", default_folder)
    check_path_and_create(resolved_path)
    return resolved_path
class PyTestRegex:
"""Assert that a given string meets some expectations."""
......
......@@ -33,13 +33,17 @@ class ExperimentSetup(RunEnvironment):
window_lead_time=None, dimensions=None, interpolate_dim=None, interpolate_method=None,
limit_nan_fill=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None,
test_end=None, use_all_stations_on_all_data_sets=True, trainable=False, fraction_of_train=None,
experiment_path=None, plot_path=None, forecast_path=None, overwrite_local_data=None):
experiment_path=None, plot_path=None, forecast_path=None, overwrite_local_data=None, sampling="daily",
bootstrap_path=None):
# create run framework
super().__init__()
# experiment setup
self._set_param("data_path", helpers.prepare_host())
data_path = self.data_store.get("data_path", "general")
bootstrap_path = helpers.set_bootstrap_path(bootstrap_path, data_path, sampling)
self._set_param("bootstrap_path", bootstrap_path)
self._set_param("trainable", trainable, default=False)
self._set_param("fraction_of_training", fraction_of_train, default=0.8)
......
from src.data_handling.bootstraps import BootStraps
import pytest
import os
import numpy as np
class TestBootstraps:
    """Tests for the bootstrap file validation and the shuffling of single variables in BootStraps."""

    @staticmethod
    def _touch(directory, file_name):
        """Create an empty file inside directory (portable replacement for os.mknod)."""
        with open(os.path.join(directory, file_name), "a"):
            pass

    @pytest.fixture
    def path(self, tmp_path):
        """Return a fresh, empty directory for each test, so that repeated test runs start from a clean state."""
        return str(tmp_path)

    @pytest.fixture
    def boot_no_init(self, path):
        """Return a BootStraps object without running BootStraps.__init__ (only the parent class is initialised)."""
        obj = object.__new__(BootStraps)
        super(BootStraps, obj).__init__()
        obj.number_bootstraps = 50
        obj.bootstrap_path = path
        return obj

    def test_valid_bootstrap_file(self, path, boot_no_init):
        station = "TESTSTATION"
        variables = "var1_var2_var3"
        window = 5
        # empty case
        assert len(os.listdir(path)) == 0
        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (False, 50)
        # different cases, where files with bigger range are existing
        self._touch(path, f"{station}_{variables}_hist5_nboots50_shuffled.dat")
        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
        self._touch(path, f"{station}_{variables}_hist5_nboots100_shuffled.dat")
        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
        self._touch(path, f"{station}_{variables}_hist10_nboots50_shuffled.dat")
        self._touch(path, f"{station}1_{variables}_hist10_nboots50_shuffled.dat")
        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
        # need to reload data and therefore remove not fitting files for this station
        assert boot_no_init.valid_bootstrap_file(station, variables, 20) == (False, 100)
        assert len(os.listdir(path)) == 1
        # reload because expanded boot number
        self._touch(path, f"{station}_{variables}_hist5_nboots50_shuffled.dat")
        boot_no_init.number_bootstraps = 60
        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (False, 60)
        assert len(os.listdir(path)) == 1
        # reload because of expanded window size, but use maximum boot number from file names
        self._touch(path, f"{station}_{variables}_hist5_nboots60_shuffled.dat")
        boot_no_init.number_bootstraps = 50
        assert boot_no_init.valid_bootstrap_file(station, variables, 20) == (False, 60)

    def test_shuffle_single_variale(self, boot_no_init):
        data = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
        res = boot_no_init.shuffle_single_variable(data)
        # shuffling must keep the shape and may only return values that are present in the input
        assert res.shape == data.shape
        assert res.max() == data.max()
        assert res.min() == data.min()
        assert set(np.unique(res)).issubset({1, 2, 3})

    def test_create_shuffled_data(self):
        # TODO: not implemented yet
        pass
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment