From c3bca05012ce386d6d6d3f256e06178836111eac Mon Sep 17 00:00:00 2001 From: leufen1 <l.leufen@fz-juelich.de> Date: Thu, 5 Nov 2020 16:42:39 +0100 Subject: [PATCH] path inside a data handler is more often parsed as arg to enable further inheritance adjustments, data handler data is now stored separatly from the raw data location and inside experiment, data_path lost sampling attribute (is added by the handler itself), --- mlair/configuration/defaults.py | 1 + mlair/configuration/path_config.py | 16 ++++----- .../data_handler_single_station.py | 36 ++++++------------- mlair/data_handler/default_data_handler.py | 4 +-- mlair/run_modules/experiment_setup.py | 29 +++++++++------ test/test_configuration/test_path_config.py | 19 +++++----- test/test_run_modules/test_training.py | 3 +- 7 files changed, 49 insertions(+), 59 deletions(-) diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 51d4beaf..3da91b18 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -49,6 +49,7 @@ DEFAULT_NUMBER_OF_BOOTSTRAPS = 20 DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", "PlotAvailability"] +DEFAULT_SAMPLING = "daily" def get_defaults(): diff --git a/mlair/configuration/path_config.py b/mlair/configuration/path_config.py index 9b3d6f25..bf40c361 100644 --- a/mlair/configuration/path_config.py +++ b/mlair/configuration/path_config.py @@ -20,7 +20,7 @@ def prepare_host(create_new=True, data_path=None, sampling="daily") -> str: :param create_new: Create new path if enabled :param data_path: Parse your custom path (and therefore ignore preset paths fitting to known hosts) - :param sampling: sampling rate to separate data physically by temporal resolution + :param sampling: sampling rate to separate data physically by temporal resolution (deprecated) :return: full path to data """ @@ -32,17 +32,14 @@ def prepare_host(create_new=True, data_path=None, sampling="daily") -> str: data_path = f"/home/{user}/Data/toar_{sampling}/" elif hostname == "zam347": data_path = f"/home/{user}/Data/toar_{sampling}/" - elif hostname == "linux-aa9b": - data_path = f"/home/{user}/mlair/data/toar_{sampling}/" elif (len(hostname) > 2) and (hostname[:2] == "jr"): data_path = f"/p/project/cjjsc42/{user}/DATA/toar_{sampling}/" elif (len(hostname) > 2) and (hostname[:2] in ['jw', 'ju'] or hostname[:5] in ['hdfml']): - data_path = f"/p/project/deepacf/intelliaq/{user}/DATA/toar_{sampling}/" + data_path = f"/p/project/deepacf/intelliaq/{user}/DATA/MLAIR/" elif runner_regex.match(hostname) is not None: - data_path = f"/home/{user}/mlair/data/toar_{sampling}/" + data_path = f"/home/{user}/mlair/data/" else: - data_path = os.path.join(os.getcwd(), "data", sampling) - # raise OSError(f"unknown host '{hostname}'") + data_path = os.path.join(os.getcwd(), "data") if not os.path.exists(data_path): try: @@ -97,7 +94,7 @@ def set_experiment_name(name: str = None, sampling: str = None) -> str: return experiment_name -def set_bootstrap_path(bootstrap_path: str, data_path: str, sampling: str) -> str: +def set_bootstrap_path(bootstrap_path: str, data_path: str) -> str: """ Set path for bootstrap input data. @@ -105,12 +102,11 @@ def set_bootstrap_path(bootstrap_path: str, data_path: str, sampling: str) -> st :param bootstrap_path: custom path to store bootstrap data :param data_path: path of data for default bootstrap path - :param sampling: sampling rate to add, if path is set to default :return: full bootstrap path """ if bootstrap_path is None: - bootstrap_path = os.path.join(data_path, "..", f"bootstrap_{sampling}") + bootstrap_path = os.path.join(data_path, "bootstrap") check_path_and_create(bootstrap_path) return os.path.abspath(bootstrap_path) diff --git a/mlair/data_handler/data_handler_single_station.py b/mlair/data_handler/data_handler_single_station.py index 4cbd0c6a..e780c620 100644 --- a/mlair/data_handler/data_handler_single_station.py +++ b/mlair/data_handler/data_handler_single_station.py @@ -52,7 +52,7 @@ class DataHandlerSingleStation(AbstractDataHandler): min_length: int = 0, start=None, end=None, variables=None, **kwargs): super().__init__() # path, station, statistics_per_var, transformation, **kwargs) self.station = helpers.to_list(station) - self.path = os.path.abspath(data_path) # ToDo: data_path could be a dict or list? + self.path = self.setup_data_path(data_path, sampling) self.statistics_per_var = statistics_per_var self.do_transformation = transformation is not None self.input_data, self.target_data = self.setup_transformation(transformation) @@ -141,7 +141,7 @@ class DataHandlerSingleStation(AbstractDataHandler): """ Setup samples. This method prepares and creates samples X, and labels Y. """ - self.load_data(self.station, self.statistics_per_var, self.sampling, self.station_type, self.network, + self.load_data(self.path, self.station, self.statistics_per_var, self.sampling, self.station_type, self.network, self.store_data_locally) self.interpolate(dim=self.time_dim, method=self.interpolation_method, limit=self.interpolation_limit) self.set_inputs_and_targets() @@ -161,7 +161,7 @@ class DataHandlerSingleStation(AbstractDataHandler): self.make_observation(self.target_dim, self.target_var, self.time_dim) self.remove_nan(self.time_dim) - def load_data(self, station, statistics_per_var, sampling, station_type=None, network=None, + def load_data(self, path, station, statistics_per_var, sampling, station_type=None, network=None, store_data_locally=False): """ Load data and meta data either from local disk (preferred) or download new data by using a custom download method. @@ -170,9 +170,9 @@ class DataHandlerSingleStation(AbstractDataHandler): cases, downloaded data is only stored locally if store_data_locally is not disabled. If this parameter is not set, it is assumed, that data should be saved locally. """ - check_path_and_create(self.path) - file_name = self._set_file_name(self.path, station, statistics_per_var) - meta_file = self._set_meta_file_name(self.path, station, statistics_per_var) + check_path_and_create(path) + file_name = self._set_file_name(path, station, statistics_per_var) + meta_file = self._set_meta_file_name(path, station, statistics_per_var) if self.overwrite_local_data is True: logging.debug(f"overwrite_local_data is true, therefore reload {file_name}") if os.path.exists(file_name): @@ -265,10 +265,15 @@ class DataHandlerSingleStation(AbstractDataHandler): """ chem_vars = ["benzene", "ch4", "co", "ethane", "no", "no2", "nox", "o3", "ox", "pm1", "pm10", "pm2p5", "propane", "so2", "toluene"] + # used_chem_vars = list(set(chem_vars) & set(self.statistics_per_var.keys())) used_chem_vars = list(set(chem_vars) & set(self.variables)) data.loc[..., used_chem_vars] = data.loc[..., used_chem_vars].clip(min=minimum) return data + @staticmethod + def setup_data_path(data_path, sampling): + return os.path.join(os.path.abspath(data_path), sampling) + def shift(self, data: xr.DataArray, dim: str, window: int) -> xr.DataArray: """ Shift data multiple times to represent history (if window <= 0) or lead time (if window > 0). @@ -462,25 +467,6 @@ class DataHandlerSingleStation(AbstractDataHandler): """ return data.loc[{coord: slice(str(start), str(end))}] - def check_for_negative_concentrations(self, data: xr.DataArray, minimum: int = 0) -> xr.DataArray: - """ - Set all negative concentrations to zero. - - Names of all concentrations are extracted from https://join.fz-juelich.de/services/rest/surfacedata/ - #2.1 Parameters. Currently, this check is applied on "benzene", "ch4", "co", "ethane", "no", "no2", "nox", - "o3", "ox", "pm1", "pm10", "pm2p5", "propane", "so2", and "toluene". - - :param data: data array containing variables to check - :param minimum: minimum value, by default this should be 0 - - :return: corrected data - """ - chem_vars = ["benzene", "ch4", "co", "ethane", "no", "no2", "nox", "o3", "ox", "pm1", "pm10", "pm2p5", - "propane", "so2", "toluene"] - used_chem_vars = list(set(chem_vars) & set(self.statistics_per_var.keys())) - data.loc[..., used_chem_vars] = data.loc[..., used_chem_vars].clip(min=minimum) - return data - @staticmethod def setup_transformation(transformation: statistics.TransformationClass): """ diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index e6dde10b..584151e3 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -30,7 +30,7 @@ class DefaultDataHandler(AbstractDataHandler): _requirements = remove_items(inspect.getfullargspec(data_handler).args, ["self", "station"]) - def __init__(self, id_class: data_handler, data_path: str, min_length: int = 0, + def __init__(self, id_class: data_handler, experiment_path: str, min_length: int = 0, extreme_values: num_or_list = None, extremes_on_right_tail_only: bool = False, name_affix=None, store_processed_data=True): super().__init__() @@ -42,7 +42,7 @@ class DefaultDataHandler(AbstractDataHandler): self._X_extreme = None self._Y_extreme = None _name_affix = str(f"{str(self.id_class)}_{name_affix}" if name_affix is not None else id(self)) - self._save_file = os.path.join(data_path, f"data_preparation_{_name_affix}.pickle") + self._save_file = os.path.join(experiment_path, "data", f"{_name_affix}.pickle") self._collection = self._create_collection() self.harmonise_X() self.multiply_extremes(extreme_values, extremes_on_right_tail_only, dim=self.interpolation_dim) diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index 5de7ef5f..25cd7c09 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -17,7 +17,7 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_TRAIN_START, DEFAULT_TRAIN_END, DEFAULT_TRAIN_MIN_LENGTH, DEFAULT_VAL_START, DEFAULT_VAL_END, \ DEFAULT_VAL_MIN_LENGTH, DEFAULT_TEST_START, DEFAULT_TEST_END, DEFAULT_TEST_MIN_LENGTH, DEFAULT_TRAIN_VAL_MIN_LENGTH, \ DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \ - DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST + DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING from mlair.data_handler import DefaultDataHandler from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.model_class import MyLittleModel as VanillaModel @@ -214,20 +214,25 @@ class ExperimentSetup(RunEnvironment): dimensions=None, time_dim=None, interpolation_method=None, - interpolation_limit=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, - test_end=None, use_all_stations_on_all_data_sets=None, train_model: bool = None, fraction_of_train: float = None, - experiment_path=None, plot_path: str = None, forecast_path: str = None, overwrite_local_data = None, sampling: str = "daily", - create_new_model = None, bootstrap_path=None, permute_data_on_training = None, transformation=None, + interpolation_limit=None, train_start=None, train_end=None, val_start=None, val_end=None, + test_start=None, + test_end=None, use_all_stations_on_all_data_sets=None, train_model: bool = None, + fraction_of_train: float = None, + experiment_path=None, plot_path: str = None, forecast_path: str = None, overwrite_local_data=None, + sampling: str = None, + create_new_model=None, bootstrap_path=None, permute_data_on_training=None, transformation=None, train_min_length=None, val_min_length=None, test_min_length=None, extreme_values: list = None, - extremes_on_right_tail_only: bool = None, evaluate_bootstraps=None, plot_list=None, number_of_bootstraps=None, + extremes_on_right_tail_only: bool = None, evaluate_bootstraps=None, plot_list=None, + number_of_bootstraps=None, create_new_bootstraps=None, data_path: str = None, batch_path: str = None, login_nodes=None, - hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None, **kwargs): + hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None, sampling_inputs=None, + sampling_outputs=None, **kwargs): # create run framework super().__init__() # experiment setup, hyperparameters - self._set_param("data_path", path_config.prepare_host(data_path=data_path, sampling=sampling)) + self._set_param("data_path", path_config.prepare_host(data_path=data_path)) self._set_param("hostname", path_config.get_host()) self._set_param("hpc_hosts", hpc_hosts, default=DEFAULT_HPC_HOST_LIST + DEFAULT_HPC_LOGIN_LIST) self._set_param("login_nodes", login_nodes, default=DEFAULT_HPC_LOGIN_LIST) @@ -235,7 +240,7 @@ class ExperimentSetup(RunEnvironment): if self.data_store.get("create_new_model"): train_model = True data_path = self.data_store.get("data_path") - bootstrap_path = path_config.set_bootstrap_path(bootstrap_path, data_path, sampling) + bootstrap_path = path_config.set_bootstrap_path(bootstrap_path, data_path) self._set_param("bootstrap_path", bootstrap_path) self._set_param("train_model", train_model, default=DEFAULT_TRAIN_MODEL) self._set_param("fraction_of_training", fraction_of_train, default=DEFAULT_FRACTION_OF_TRAINING) @@ -250,6 +255,7 @@ class ExperimentSetup(RunEnvironment): self._set_param("epochs", epochs, default=DEFAULT_EPOCHS) # set experiment name + sampling = self._set_param("sampling", sampling, default=DEFAULT_SAMPLING) experiment_name = path_config.set_experiment_name(name=experiment_date, sampling=sampling) experiment_path = path_config.set_experiment_path(name=experiment_name, path=experiment_path) self._set_param("experiment_name", experiment_name) @@ -287,7 +293,7 @@ class ExperimentSetup(RunEnvironment): self._set_param("window_history_size", window_history_size, default=DEFAULT_WINDOW_HISTORY_SIZE) self._set_param("overwrite_local_data", overwrite_local_data, default=DEFAULT_OVERWRITE_LOCAL_DATA, scope="preprocessing") - self._set_param("sampling", sampling) + self._set_param("sampling_inputs", sampling_inputs, default=sampling) self._set_param("transformation", transformation, default=DEFAULT_TRANSFORMATION) self._set_param("transformation", None, scope="preprocessing") self._set_param("data_handler", data_handler, default=DefaultDataHandler) @@ -356,7 +362,7 @@ class ExperimentSetup(RunEnvironment): f"conflict with an existing entry with same naming: {k}={self.data_store.get(k)}") def _set_param(self, param: str, value: Any, default: Any = None, scope: str = "general", - apply: Callable = None) -> None: + apply: Callable = None) -> Any: """Set given parameter and log in debug. Use apply parameter to adjust the stored value (e.g. to transform value to a list use apply=helpers.to_list).""" if value is None and default is not None: @@ -365,6 +371,7 @@ class ExperimentSetup(RunEnvironment): value = apply(value) self.data_store.set(param, value, scope) logging.debug(f"set experiment attribute: {param}({scope})={value}") + return value def _compare_variables_and_statistics(self): """ diff --git a/test/test_configuration/test_path_config.py b/test/test_configuration/test_path_config.py index b9776363..2ba80a3b 100644 --- a/test/test_configuration/test_path_config.py +++ b/test/test_configuration/test_path_config.py @@ -11,22 +11,21 @@ from mlair.helpers import PyTestRegex class TestPrepareHost: - @mock.patch("socket.gethostname", side_effect=["linux-aa9b", "ZAM144", "zam347", "jrtest", "jwtest", + @mock.patch("socket.gethostname", side_effect=["ZAM144", "zam347", "jrtest", "jwtest", "runner-6HmDp9Qd-project-2411-concurrent-01"]) @mock.patch("getpass.getuser", return_value="testUser") @mock.patch("os.path.exists", return_value=True) def test_prepare_host(self, mock_host, mock_user, mock_path): - assert prepare_host() == "/home/testUser/mlair/data/toar_daily/" assert prepare_host() == "/home/testUser/Data/toar_daily/" assert prepare_host() == "/home/testUser/Data/toar_daily/" assert prepare_host() == "/p/project/cjjsc42/testUser/DATA/toar_daily/" - assert prepare_host() == "/p/project/deepacf/intelliaq/testUser/DATA/toar_daily/" - assert prepare_host() == '/home/testUser/mlair/data/toar_daily/' + assert prepare_host() == "/p/project/deepacf/intelliaq/testUser/DATA/MLAIR/" + assert prepare_host() == '/home/testUser/mlair/data/' @mock.patch("socket.gethostname", return_value="NotExistingHostName") @mock.patch("getpass.getuser", return_value="zombie21") def test_prepare_host_unknown(self, mock_user, mock_host): - assert prepare_host() == os.path.join(os.path.abspath(os.getcwd()), 'data', 'daily') + assert prepare_host() == os.path.join(os.path.abspath(os.getcwd()), 'data') @mock.patch("getpass.getuser", return_value="zombie21") @mock.patch("mlair.configuration.path_config.check_path_and_create", side_effect=PermissionError) @@ -42,13 +41,13 @@ class TestPrepareHost: # assert "does not exist for host 'linux-aa9b'" in e.value.args[0] assert PyTestRegex(r"path '.*' does not exist for host '.*'\.") == e.value.args[0] - @mock.patch("socket.gethostname", side_effect=["linux-aa9b"]) + @mock.patch("socket.gethostname", side_effect=["zam347"]) @mock.patch("getpass.getuser", return_value="testUser") @mock.patch("os.path.exists", return_value=False) @mock.patch("os.makedirs", side_effect=None) def test_os_path_exists(self, mock_host, mock_user, mock_path, mock_check): path = prepare_host() - assert path == "/home/testUser/mlair/data/toar_daily/" + assert path == "/home/testUser/Data/toar_daily/" class TestSetExperimentName: @@ -80,12 +79,12 @@ class TestSetBootstrapPath: @mock.patch("os.makedirs", side_effect=None) def test_bootstrap_path_is_none(self, mock_makedir): - bootstrap_path = set_bootstrap_path(None, 'TestDataPath/', 'daily') - assert bootstrap_path == os.path.abspath('TestDataPath/../bootstrap_daily') + bootstrap_path = set_bootstrap_path(None, 'TestDataPath/') + assert bootstrap_path == os.path.abspath('TestDataPath/bootstrap') @mock.patch("os.makedirs", side_effect=None) def test_bootstap_path_is_given(self, mock_makedir): - bootstrap_path = set_bootstrap_path('Test/path/to/boots', None, None) + bootstrap_path = set_bootstrap_path('Test/path/to/boots', None) assert bootstrap_path == os.path.abspath('./Test/path/to/boots') diff --git a/test/test_run_modules/test_training.py b/test/test_run_modules/test_training.py index c0b625ef..84cc5f48 100644 --- a/test/test_run_modules/test_training.py +++ b/test/test_run_modules/test_training.py @@ -125,7 +125,8 @@ class TestTraining: @pytest.fixture def data_collection(self, path, window_history_size, window_lead_time, statistics_per_var): - data_prep = DefaultDataHandler.build(['DEBW107'], data_path=os.path.join(os.path.dirname(__file__), 'data'), + data_prep = DefaultDataHandler.build(['DEBW107'], data_path=os.path.join(path, 'data'), + experiment_path=os.path.join(path, 'exp_path'), statistics_per_var=statistics_per_var, station_type="background", network="AIRBASE", sampling="daily", target_dim="variables", target_var="o3", time_dim="datetime", -- GitLab