# Patch summary for src/data_handling/bootstraps.py (class BootStraps).
#
# What the hunks below change:
#   * Adds the module-level import `dask.array as da`.
#   * Lowers `self.number_bootstraps` from 200 to 100.
#   * In the hunk that writes the "*_shuffled.nc" file: the NaN-filled buffer
#     from `np.full(data.shape, np.nan)` that was filled in place per variable
#     is replaced by a list of per-variable results, which are combined with
#     `da.stack(..., axis=-2)`, rechunked with `rechunk("auto")`, wrapped in an
#     `xr.DataArray` and written with `to_netcdf`.
#   * `shuffle_single_variable` gains a `chunks` parameter and calls
#     `da.random.choice` instead of `np.random.choice`; it still samples from
#     the flattened input with `size=data.shape`.
#
# NOTE(review): the new signature annotates `data: da.array`, but the caller
#   passes `data.sel(variables=var).values`; presumably `np.ndarray` was
#   meant -- confirm.
# NOTE(review): the return annotation is still `np.ndarray` although the body
#   now returns the result of `da.random.choice` (presumably a dask array) --
#   confirm and update the annotation.
# NOTE(review): the loop index `i` from `enumerate(...)` is no longer used by
#   the added lines.
# NOTE(review): the `chunks` tuple is built from `data.shape` (the expanded
#   array), not from `single_variable.shape` -- verify that the chosen axes
#   line up with the per-variable array.
# NOTE(review): line breaks and indentation appear to have been collapsed in
#   this copy of the patch; it presumably needs to be regenerated before it
#   can be applied.
diff --git a/src/data_handling/bootstraps.py b/src/data_handling/bootstraps.py index 6ac33e2a2555fe3f253593423e9e71e0aa97f4af..808678228348bd45dbc8f9de8a7e77de3b585cbd 100644 --- a/src/data_handling/bootstraps.py +++ b/src/data_handling/bootstraps.py @@ -6,6 +6,7 @@ from src.run_modules.run_environment import RunEnvironment from src.data_handling.data_generator import DataGenerator import numpy as np import logging +import dask.array as da import xarray as xr import os import re @@ -17,7 +18,7 @@ class BootStraps(RunEnvironment): super().__init__() self.test_data: DataGenerator = self.data_store.get("generator", "general.test") - self.number_bootstraps = 200 + self.number_bootstraps = 100 self.bootstrap_path = self.data_store.get("bootstrap_path", "general") self.create_shuffled_data() @@ -38,11 +39,12 @@ class BootStraps(RunEnvironment): file_name = f"{station}_{variables_str}_hist{window}_nboots{nboot}_shuffled.nc" file_path = os.path.join(self.bootstrap_path, file_name) data = data.expand_dims({'boots': range(nboot)}, axis=-1) - shuffled_variable = np.full(data.shape, np.nan) + shuffled_variable = [] for i, var in enumerate(data.coords['variables']): single_variable = data.sel(variables=var).values - shuffled_variable[..., i, :] = self.shuffle_single_variable(single_variable) - shuffled_data = xr.DataArray(shuffled_variable, coords=data.coords, dims=data.dims) + shuffled_variable.append(self.shuffle_single_variable(single_variable, chunks=(100, *data.shape[1:3], data.shape[-1]))) + shuffled_variable_da = da.stack(shuffled_variable, axis=-2, ).rechunk("auto") + shuffled_data = xr.DataArray(shuffled_variable_da, coords=data.coords, dims=data.dims) shuffled_data.to_netcdf(file_path) def valid_bootstrap_file(self, station, variables, window): @@ -74,10 +76,9 @@ class BootStraps(RunEnvironment): return False, max_nboot @staticmethod - def shuffle_single_variable(data: np.ndarray) -> np.ndarray: - orig_shape = data.shape - size = orig_shape - return 
np.random.choice(data.reshape(-1,), size=size) + def shuffle_single_variable(data: da.array, chunks) -> np.ndarray: + size = data.shape + return da.random.choice(data.reshape(-1,), size=size, chunks=chunks) if __name__ == "__main__":