diff --git a/.gitignore b/.gitignore index 9ac8bb7635bb12d5e3bc32182a90d0f3ba985c58..305a5d1b9420eb62da24772fc1f4b263c1f3efe1 100644 --- a/.gitignore +++ b/.gitignore @@ -60,7 +60,7 @@ Thumbs.db htmlcov/ .pytest_cache /test/data/ -/test/test_modules/data/ +/test/test_run_modules/data/ report.html /TestExperiment/ /testrun_network*/ diff --git a/CI/run_pytest_coverage.sh b/CI/run_pytest_coverage.sh index 45916427f1521843923fb94e49dc661241dc0369..24d916b1a32da714abc2e5de0ac2b4c2790752a9 100644 --- a/CI/run_pytest_coverage.sh +++ b/CI/run_pytest_coverage.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # run coverage twice, 1) for html deploy 2) for success evaluation -python3.6 -m pytest --cov=src --cov-report term --cov-report html test/ | tee coverage_results.out +python3.6 -m pytest --cov=mlair --cov-report term --cov-report html test/ | tee coverage_results.out IS_FAILED=$? diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 3f38e14f8ab8d471e7b2a94813566ce21e1a8748..31746ec889cc82ebbae8de82a05c5cff02a22ac0 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -13,7 +13,8 @@ DEFAULT_START = "1997-01-01" DEFAULT_END = "2017-12-31" DEFAULT_WINDOW_HISTORY_SIZE = 13 DEFAULT_OVERWRITE_LOCAL_DATA = False -DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise", "mean": "estimate"} +# DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise", "mean": "estimate"} +DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise"} DEFAULT_HPC_LOGIN_LIST = ["ju", "hdfmll"] # ju[wels} #hdfmll(ogin) DEFAULT_HPC_HOST_LIST = ["jw", "hdfmlc"] # first part of node names for Juwels (jw[comp], hdfmlc(ompute). DEFAULT_CREATE_NEW_MODEL = True @@ -28,9 +29,9 @@ DEFAULT_TARGET_VAR = "o3" DEFAULT_TARGET_DIM = "variables" DEFAULT_WINDOW_LEAD_TIME = 3 DEFAULT_DIMENSIONS = {"new_index": ["datetime", "Stations"]} -DEFAULT_INTERPOLATION_DIM = "datetime" +DEFAULT_TIME_DIM = "datetime" DEFAULT_INTERPOLATION_METHOD = "linear" -DEFAULT_LIMIT_NAN_FILL = 1 +DEFAULT_INTERPOLATION_LIMIT = 1 DEFAULT_TRAIN_START = "1997-01-01" DEFAULT_TRAIN_END = "2007-12-31" DEFAULT_TRAIN_MIN_LENGTH = 90 diff --git a/mlair/configuration/path_config.py b/mlair/configuration/path_config.py index 0ef082b58cf7028ea4f71e86b6d0c4ecad6ff54d..9b3d6f250d97d93dd1d06004690885f44de30073 100644 --- a/mlair/configuration/path_config.py +++ b/mlair/configuration/path_config.py @@ -33,13 +33,13 @@ def prepare_host(create_new=True, data_path=None, sampling="daily") -> str: elif hostname == "zam347": data_path = f"/home/{user}/Data/toar_{sampling}/" elif hostname == "linux-aa9b": - data_path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" + data_path = f"/home/{user}/mlair/data/toar_{sampling}/" elif (len(hostname) > 2) and (hostname[:2] == "jr"): data_path = f"/p/project/cjjsc42/{user}/DATA/toar_{sampling}/" elif (len(hostname) > 2) and (hostname[:2] in ['jw', 'ju'] or hostname[:5] in ['hdfml']): data_path = f"/p/project/deepacf/intelliaq/{user}/DATA/toar_{sampling}/" elif runner_regex.match(hostname) is not None: - data_path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" + data_path = f"/home/{user}/mlair/data/toar_{sampling}/" else: data_path = os.path.join(os.getcwd(), "data", sampling) # raise OSError(f"unknown host '{hostname}'") diff --git a/mlair/data_handling/__init__.py b/mlair/data_handler/__init__.py similarity index 59% rename from mlair/data_handling/__init__.py rename to mlair/data_handler/__init__.py index 
cb5aa5db0f29cf51d32ed54e810fa9b363d80cc6..451868b838ab7a0d165942e36b5ec6aa03e42721 100644 --- a/mlair/data_handling/__init__.py +++ b/mlair/data_handler/__init__.py @@ -10,6 +10,6 @@ __date__ = '2020-04-17' from .bootstraps import BootStraps -from .data_preparation_join import DataPrepJoin -from .data_generator import DataGenerator -from .data_distributor import Distributor +from .iterator import KerasIterator, DataCollection +from .advanced_data_handler import DefaultDataPreparation, AbstractDataPreparation +from .data_preparation_neighbors import DataPreparationNeighbors diff --git a/mlair/data_handler/advanced_data_handler.py b/mlair/data_handler/advanced_data_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..57a9667f2a42575faa02d50e439252738a8dc8bb --- /dev/null +++ b/mlair/data_handler/advanced_data_handler.py @@ -0,0 +1,356 @@ + +__author__ = 'Lukas Leufen' +__date__ = '2020-07-08' + + +from mlair.helpers import to_list, remove_items +import numpy as np +import xarray as xr +import pickle +import os +import pandas as pd +import datetime as dt +import shutil +import inspect +import copy + +from typing import Union, List, Tuple, Dict +import logging +from functools import reduce +from mlair.data_handler.station_preparation import StationPrep +from mlair.helpers.join import EmptyQueryResult + + +number = Union[float, int] +num_or_list = Union[number, List[number]] + + +class DummyDataSingleStation: # pragma: no cover + + def __init__(self, name, number_of_samples=None): + self.name = name + self.number_of_samples = number_of_samples if number_of_samples is not None else np.random.randint(100, 150) + + def get_X(self): + X1 = np.random.randint(0, 10, size=(self.number_of_samples, 14, 5)) # samples, window, variables + datelist = pd.date_range(dt.datetime.today().date(), periods=self.number_of_samples, freq="H").tolist() + return xr.DataArray(X1, dims=['datetime', 'window', 'variables'], coords={"datetime": datelist, + "window": range(14), + "variables": range(5)}) + + def get_Y(self): + Y1 = np.round(0.5 * np.random.randn(self.number_of_samples, 5, 1), 1) # samples, window, variables + datelist = pd.date_range(dt.datetime.today().date(), periods=self.number_of_samples, freq="H").tolist() + return xr.DataArray(Y1, dims=['datetime', 'window', 'variables'], coords={"datetime": datelist, + "window": range(5), + "variables": range(1)}) + + def __str__(self): + return self.name + + +class AbstractDataPreparation: + + _requirements = [] + + def __init__(self, *args, **kwargs): + pass + + @classmethod + def build(cls, *args, **kwargs): + """Return initialised class.""" + return cls(*args, **kwargs) + + @classmethod + def requirements(cls): + """Return requirements and own arguments without duplicates.""" + return list(set(cls._requirements + cls.own_args())) + + @classmethod + def own_args(cls, *args): + return remove_items(inspect.getfullargspec(cls).args, ["self"] + list(args)) + + @classmethod + def transformation(cls, *args, **kwargs): + return None + + def get_X(self, upsampling=False, as_numpy=False): + raise NotImplementedError + + def get_Y(self, upsampling=False, as_numpy=False): + raise NotImplementedError + + def get_data(self, upsampling=False, as_numpy=False): + return self.get_X(upsampling, as_numpy), self.get_Y(upsampling, as_numpy) + + def get_coordinates(self) -> Union[None, Dict]: + return None + + +class DefaultDataPreparation(AbstractDataPreparation): + + _requirements = remove_items(inspect.getfullargspec(StationPrep).args, ["self", "station"]) 
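(For illustration, not part of this patch: the AbstractDataPreparation interface above only fixes get_X()/get_Y() plus the requirements()/own_args() bookkeeping. A minimal, hypothetical handler that satisfies this contract could look like the sketch below, assuming it sits next to the definitions above; the class name and the pre-computed arrays are invented for illustration.)

class PrecomputedDataPreparation(AbstractDataPreparation):
    """Hypothetical handler serving pre-computed arrays; for illustration only."""

    _requirements = []  # claims no further constructor arguments from the shared kwargs

    def __init__(self, name, X, Y):
        super().__init__()
        self.name = name
        self._X, self._Y = X, Y  # lists of arrays prepared elsewhere

    def get_X(self, upsampling=False, as_numpy=False):
        return self._X  # upsampling/as_numpy are ignored in this toy sketch

    def get_Y(self, upsampling=False, as_numpy=False):
        return self._Y

Because requirements() merges _requirements with the class's own constructor arguments, DefaultDataPreparation.build() below can be fed one flat kwargs dict and route each key either to StationPrep or to the handler itself.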
+ + def __init__(self, id_class, data_path, min_length=0, + extreme_values: num_or_list = None, extremes_on_right_tail_only: bool = False, name_affix=None): + super().__init__() + self.id_class = id_class + self.interpolation_dim = "datetime" + self.min_length = min_length + self._X = None + self._Y = None + self._X_extreme = None + self._Y_extreme = None + _name_affix = str(f"{str(self.id_class)}_{name_affix}" if name_affix is not None else id(self)) + self._save_file = os.path.join(data_path, f"data_preparation_{_name_affix}.pickle") + self._collection = self._create_collection() + self.harmonise_X() + self.multiply_extremes(extreme_values, extremes_on_right_tail_only, dim=self.interpolation_dim) + self._store(fresh_store=True) + + @classmethod + def build(cls, station, **kwargs): + sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls._requirements if k in kwargs} + sp = StationPrep(station, **sp_keys) + dp_args = {k: copy.deepcopy(kwargs[k]) for k in cls.own_args("id_class") if k in kwargs} + return cls(sp, **dp_args) + + def _create_collection(self): + return [self.id_class] + + @classmethod + def requirements(cls): + return remove_items(super().requirements(), "id_class") + + def _reset_data(self): + self._X, self._Y, self._X_extreme, self._Y_extreme = None, None, None, None + + def _cleanup(self): + directory = os.path.dirname(self._save_file) + if os.path.exists(directory) is False: + os.makedirs(directory) + if os.path.exists(self._save_file): + shutil.rmtree(self._save_file, ignore_errors=True) + + def _store(self, fresh_store=False): + self._cleanup() if fresh_store is True else None + data = {"X": self._X, "Y": self._Y, "X_extreme": self._X_extreme, "Y_extreme": self._Y_extreme} + with open(self._save_file, "wb") as f: + pickle.dump(data, f) + logging.debug(f"save pickle data to {self._save_file}") + self._reset_data() + + def _load(self): + try: + with open(self._save_file, "rb") as f: + data = pickle.load(f) + logging.debug(f"load pickle data from {self._save_file}") + self._X, self._Y = data["X"], data["Y"] + self._X_extreme, self._Y_extreme = data["X_extreme"], data["Y_extreme"] + except FileNotFoundError: + pass + + def get_data(self, upsampling=False, as_numpy=True): + self._load() + X = self.get_X(upsampling, as_numpy) + Y = self.get_Y(upsampling, as_numpy) + self._reset_data() + return X, Y + + def __repr__(self): + return ";".join(list(map(lambda x: str(x), self._collection))) + + def get_X_original(self): + X = [] + for data in self._collection: + X.append(data.get_X()) + return X + + def get_Y_original(self): + Y = self._collection[0].get_Y() + return Y + + @staticmethod + def _to_numpy(d): + return list(map(lambda x: np.copy(x), d)) + + def get_X(self, upsampling=False, as_numpy=True): + no_data = (self._X is None) + self._load() if no_data is True else None + X = self._X if upsampling is False else self._X_extreme + self._reset_data() if no_data is True else None + return self._to_numpy(X) if as_numpy is True else X + + def get_Y(self, upsampling=False, as_numpy=True): + no_data = (self._Y is None) + self._load() if no_data is True else None + Y = self._Y if upsampling is False else self._Y_extreme + self._reset_data() if no_data is True else None + return self._to_numpy([Y]) if as_numpy is True else Y + + def harmonise_X(self): + X_original, Y_original = self.get_X_original(), self.get_Y_original() + dim = self.interpolation_dim + intersect = reduce(np.intersect1d, map(lambda x: x.coords[dim].values, X_original)) + if len(intersect) < max(self.min_length, 1): + X, Y 
= None, None + else: + X = list(map(lambda x: x.sel({dim: intersect}), X_original)) + Y = Y_original.sel({dim: intersect}) + self._X, self._Y = X, Y + + def get_observation(self): + return self.id_class.observation.copy().squeeze() + + def get_transformation_Y(self): + return self.id_class.get_transformation_information() + + def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, + timedelta: Tuple[int, str] = (1, 'm'), dim="datetime"): + """ + Multiply extremes. + + This method extracts extreme values from self._Y which are defined in the argument extreme_values. One can + also decide to extract only extremes on the right tail of the distribution. When extreme_values is a list of + floats/ints, all values larger than extreme_values (and smaller than negative extreme_values; extraction is performed in standardised + space) are extracted iteratively. If for example extreme_values = [1., 2.] then a value of 1.5 would be + extracted once (for the 0th entry in the list), while a 2.5 would be extracted twice (once for each entry). Timedelta is + used to mark those extracted values by adding one minute to each timestamp. As TOAR data are hourly, one can + identify those "artificial" data points later easily. Extreme inputs and labels are stored in + self._X_extreme and self._Y_extreme, respectively. + + :param extreme_values: user definition of extreme values + :param extremes_on_right_tail_only: if False also multiply values which are smaller than -extreme_values, + if True only extract values larger than extreme_values + :param timedelta: used as arguments for np.timedelta64 in order to mark extreme values on datetime + """ + # check if X or Y is None + if (self._X is None) or (self._Y is None): + logging.debug(f"{str(self.id_class)} has no data for X or Y, skip multiply extremes") + return + if extreme_values is None: + logging.debug(f"No extreme values given, skip multiply extremes") + self._X_extreme, self._Y_extreme = self._X, self._Y + return + + # check type of inputs + extreme_values = to_list(extreme_values) + for i in extreme_values: + if not isinstance(i, number.__args__): + raise TypeError(f"Elements of list extreme_values have to be {number.__args__}, but at least element " + f"{i} is type {type(i)}") + + for extr_val in sorted(extreme_values): + # check if some extreme values are already extracted + if (self._X_extreme is None) or (self._Y_extreme is None): + X = self._X + Y = self._Y + else: # one extr value iteration is done already: self._Y_extreme is NOT None...
+ X = self._X_extreme + Y = self._Y_extreme + + # extract extremes based on occurrence in labels + other_dims = remove_items(list(Y.dims), dim) + if extremes_on_right_tail_only: + extreme_idx = (Y > extr_val).any(dim=other_dims) + else: + extreme_idx = xr.concat([(Y < -extr_val).any(dim=other_dims[0]), + (Y > extr_val).any(dim=other_dims[0])], + dim=other_dims[1]).any(dim=other_dims[1]) + + extremes_X = list(map(lambda x: x.sel(**{dim: extreme_idx}), X)) + self._add_timedelta(extremes_X, dim, timedelta) + # extremes_X = list(map(lambda x: x.coords[dim].values + np.timedelta64(*timedelta), extremes_X)) + + extremes_Y = Y.sel(**{dim: extreme_idx}) + extremes_Y.coords[dim].values += np.timedelta64(*timedelta) + + self._Y_extreme = xr.concat([Y, extremes_Y], dim=dim) + self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=dim), X, extremes_X)) + + @staticmethod + def _add_timedelta(data, dim, timedelta): + for d in data: + d.coords[dim].values += np.timedelta64(*timedelta) + + @classmethod + def transformation(cls, set_stations, **kwargs): + sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls._requirements if k in kwargs} + transformation_dict = sp_keys.pop("transformation") + if transformation_dict is None: + return + scope = transformation_dict.pop("scope") + method = transformation_dict.pop("method") + if transformation_dict.pop("mean", None) is not None: + return + mean, std = None, None + for station in set_stations: + try: + sp = StationPrep(station, transformation={"method": method}, **sp_keys) + mean = sp.mean.copy(deep=True) if mean is None else mean.combine_first(sp.mean) + std = sp.std.copy(deep=True) if std is None else std.combine_first(sp.std) + except (AttributeError, EmptyQueryResult): + continue + if mean is None: + return None + mean_estimated = mean.mean("Stations") + std_estimated = std.mean("Stations") + return {"scope": scope, "method": method, "mean": mean_estimated, "std": std_estimated} + + def get_coordinates(self): + return self.id_class.get_coordinates() + + +def run_data_prep(): + + from .data_preparation_neighbors import DataPreparationNeighbors + data = DummyDataSingleStation("main_class") + data.get_X() + data.get_Y() + + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + data_prep = DataPreparationNeighbors(DummyDataSingleStation("main_class"), + path, + neighbors=[DummyDataSingleStation("neighbor1"), + DummyDataSingleStation("neighbor2")], + extreme_values=[1., 1.2]) + data_prep.get_data(upsampling=False) + + +def create_data_prep(): + + from .data_preparation_neighbors import DataPreparationNeighbors + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + station_type = None + network = 'UBA' + sampling = 'daily' + target_dim = 'variables' + target_var = 'o3' + interpolation_dim = 'datetime' + window_history_size = 7 + window_lead_time = 3 + central_station = StationPrep("DEBW011", path, {'o3': 'dma8eu', 'temp': 'maximum'}, {},station_type, network, sampling, target_dim, + target_var, interpolation_dim, window_history_size, window_lead_time) + neighbor1 = StationPrep("DEBW013", path, {'o3': 'dma8eu', 'temp-rea-miub': 'maximum'}, {},station_type, network, sampling, target_dim, + target_var, interpolation_dim, window_history_size, window_lead_time) + neighbor2 = StationPrep("DEBW034", path, {'o3': 'dma8eu', 'temp': 'maximum'}, {}, station_type, network, sampling, target_dim, + target_var, interpolation_dim, window_history_size, window_lead_time) + + data_prep = [] + 
data_prep.append(DataPreparationNeighbors(central_station, path, neighbors=[neighbor1, neighbor2])) + data_prep.append(DataPreparationNeighbors(neighbor1, path, neighbors=[central_station, neighbor2])) + data_prep.append(DataPreparationNeighbors(neighbor2, path, neighbors=[neighbor1, central_station])) + return data_prep + + +if __name__ == "__main__": + from mlair.data_handler.station_preparation import StationPrep + from mlair.data_handler.iterator import KerasIterator, DataCollection + data_prep = create_data_prep() + data_collection = DataCollection(data_prep) + for data in data_collection: + print(data) + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata", "keras") + keras_it = KerasIterator(data_collection, 100, path, upsampling=True) + keras_it[2] + diff --git a/mlair/data_handler/bootstraps.py b/mlair/data_handler/bootstraps.py new file mode 100644 index 0000000000000000000000000000000000000000..91603b41822b92e28fbd077c502d84707fff746f --- /dev/null +++ b/mlair/data_handler/bootstraps.py @@ -0,0 +1,130 @@ +""" +Collections of bootstrap methods and classes. + +How to use +---------- + +test + +""" + +__author__ = 'Felix Kleinert, Lukas Leufen' +__date__ = '2020-02-07' + + +import os +from collections import Iterator, Iterable +from itertools import chain + +import numpy as np +import xarray as xr + +from mlair.data_handler.advanced_data_handler import AbstractDataPreparation + + +class BootstrapIterator(Iterator): + + _position: int = None + + def __init__(self, data: "BootStraps"): + assert isinstance(data, BootStraps) + self._data = data + self._dimension = data.bootstrap_dimension + self._collection = self._data.bootstraps() + self._position = 0 + + def __next__(self): + """Return next element or stop iteration.""" + try: + index, dimension = self._collection[self._position] + nboot = self._data.number_of_bootstraps + _X, _Y = self._data.data.get_data(as_numpy=False) + _X = list(map(lambda x: x.expand_dims({'boots': range(nboot)}, axis=-1), _X)) + _Y = _Y.expand_dims({"boots": range(nboot)}, axis=-1) + single_variable = _X[index].sel({self._dimension: [dimension]}) + shuffled_variable = self.shuffle(single_variable.values) + shuffled_data = xr.DataArray(shuffled_variable, coords=single_variable.coords, dims=single_variable.dims) + _X[index] = shuffled_data.combine_first(_X[index]).reindex_like(_X[index]) + self._position += 1 + except IndexError: + raise StopIteration() + _X, _Y = self._to_numpy(_X), self._to_numpy(_Y) + return self._reshape(_X), self._reshape(_Y), (index, dimension) + + @staticmethod + def _reshape(d): + if isinstance(d, list): + return list(map(lambda x: np.rollaxis(x, -1, 0).reshape(x.shape[0] * x.shape[-1], *x.shape[1:-1]), d)) + else: + shape = d.shape + return np.rollaxis(d, -1, 0).reshape(shape[0] * shape[-1], *shape[1:-1]) + + @staticmethod + def _to_numpy(d): + if isinstance(d, list): + return list(map(lambda x: x.values, d)) + else: + return d.values + + @staticmethod + def shuffle(data: np.ndarray) -> np.ndarray: + """ + Shuffle randomly from given data (draw elements with replacement). + + :param data: data to shuffle + :return: shuffled data as numpy array + """ + size = data.shape + return np.random.choice(data.reshape(-1, ), size=size) + + +class BootStraps(Iterable): + """ + Main class to perform bootstrap operations. 
+ + This class requires a data handler following the definition of the AbstractDataPreparation, the number of bootstraps + to create and the dimension along which this bootstrapping is performed (default dimension is `variables`). + + When iterating over this class, it returns the bootstrapped X, Y and a tuple with (position of variable in X, name of + this variable). The tuple is interesting if X consists of multiple input streams X_i (e.g. two or more stations) + because it shows which variable of which input X_i has been bootstrapped. All bootstrap combinations can be + retrieved by calling the .bootstraps() method. Furthermore, by calling the .get_orig_prediction() method this class + imitates the original prediction by repeating it according to the set number of bootstraps. + """ + def __init__(self, data: AbstractDataPreparation, number_of_bootstraps: int = 10, + bootstrap_dimension: str = "variables"): + """ + Create iterable class to be ready to iter. + + :param data: a data generator object to get data / history + :param number_of_bootstraps: the number of bootstrap realisations + """ + self.data = data + self.number_of_bootstraps = number_of_bootstraps + self.bootstrap_dimension = bootstrap_dimension + + def __iter__(self): + return BootstrapIterator(self) + + def __len__(self): + return len(self.bootstraps()) + + def bootstraps(self): + l = [] + for i, x in enumerate(self.data.get_X(as_numpy=False)): + l.append(list(map(lambda y: (i, y), x.indexes['variables']))) + return list(chain(*l)) + + def get_orig_prediction(self, path: str, file_name: str, prediction_name: str = "CNN") -> np.ndarray: + """ + Repeat predictions from given file(_name) in path by the number of boots. + + :param path: path to file + :param file_name: file name + :param prediction_name: name of the prediction to select from loaded file (default CNN) + :return: repeated predictions + """ + file = os.path.join(path, file_name) + prediction = xr.open_dataarray(file).sel(type=prediction_name).squeeze() + vals = np.tile(prediction.data, (self.number_of_bootstraps, 1)) + return vals[~np.isnan(vals).any(axis=1), :] diff --git a/mlair/data_handler/data_preparation_neighbors.py b/mlair/data_handler/data_preparation_neighbors.py new file mode 100644 index 0000000000000000000000000000000000000000..0c95b242e1046618403ebb6592407ef8b680e890 --- /dev/null +++ b/mlair/data_handler/data_preparation_neighbors.py @@ -0,0 +1,64 @@ + +__author__ = 'Lukas Leufen' +__date__ = '2020-07-17' + + +from mlair.helpers import to_list +from mlair.data_handler.station_preparation import StationPrep +from mlair.data_handler.advanced_data_handler import DefaultDataPreparation +import os + +from typing import Union, List + +number = Union[float, int] +num_or_list = Union[number, List[number]] + + +class DataPreparationNeighbors(DefaultDataPreparation): + + def __init__(self, id_class, data_path, neighbors=None, min_length=0, + extreme_values: num_or_list = None, extremes_on_right_tail_only: bool = False): + self.neighbors = to_list(neighbors) if neighbors is not None else [] + super().__init__(id_class, data_path, min_length=min_length, extreme_values=extreme_values, + extremes_on_right_tail_only=extremes_on_right_tail_only) + + @classmethod + def build(cls, station, **kwargs): + sp_keys = {k: kwargs[k] for k in cls._requirements if k in kwargs} + sp = StationPrep(station, **sp_keys) + n_list = [] + for neighbor in kwargs.get("neighbors", []): + n_list.append(StationPrep(neighbor, **sp_keys)) + else: + kwargs["neighbors"] = n_list if len(n_list) > 0 else None + dp_args =
{k: kwargs[k] for k in cls.own_args("id_class") if k in kwargs} + return cls(sp, **dp_args) + + def _create_collection(self): + return [self.id_class] + self.neighbors + + def get_coordinates(self, include_neighbors=False): + neighbors = list(map(lambda n: n.get_coordinates(), self.neighbors)) if include_neighbors is True else [] + return [super(DataPreparationNeighbors, self).get_coordinates()].append(neighbors) + + +if __name__ == "__main__": + + a = DataPreparationNeighbors + requirements = a.requirements() + + kwargs = {"path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata"), + "station_type": None, + "network": 'UBA', + "sampling": 'daily', + "target_dim": 'variables', + "target_var": 'o3', + "time_dim": 'datetime', + "window_history_size": 7, + "window_lead_time": 3, + "neighbors": ["DEBW034"], + "data_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata"), + "statistics_per_var": {'o3': 'dma8eu', 'temp': 'maximum'}, + "transformation": None,} + a_inst = a.build("DEBW011", **kwargs) + print(a_inst) diff --git a/mlair/data_handler/iterator.py b/mlair/data_handler/iterator.py new file mode 100644 index 0000000000000000000000000000000000000000..49569405a587920da795820d48f8d968a8142cc7 --- /dev/null +++ b/mlair/data_handler/iterator.py @@ -0,0 +1,213 @@ + +__author__ = 'Lukas Leufen' +__date__ = '2020-07-07' + +from collections import Iterator, Iterable +import keras +import numpy as np +import math +import os +import shutil +import pickle +from typing import Tuple, List + + +class StandardIterator(Iterator): + + _position: int = None + + def __init__(self, collection: list): + assert isinstance(collection, list) + self._collection = collection + self._position = 0 + + def __next__(self): + """Return next element or stop iteration.""" + try: + value = self._collection[self._position] + self._position += 1 + except IndexError: + raise StopIteration() + return value + + +class DataCollection(Iterable): + + def __init__(self, collection: list = None): + if collection is None: + collection = [] + assert isinstance(collection, list) + self._collection = collection + self._mapping = {} + self._set_mapping() + + def __len__(self): + return len(self._collection) + + def __iter__(self) -> Iterator: + return StandardIterator(self._collection) + + def __getitem__(self, index): + if isinstance(index, int): + return self._collection[index] + else: + return self._collection[self._mapping[str(index)]] + + def add(self, element): + self._collection.append(element) + self._mapping[str(element)] = len(self._collection) + + def _set_mapping(self): + for i, e in enumerate(self._collection): + self._mapping[str(e)] = i + + def keys(self): + return list(self._mapping.keys()) + + +class KerasIterator(keras.utils.Sequence): + + def __init__(self, collection: DataCollection, batch_size: int, batch_path: str, shuffle_batches: bool = False, + model=None, upsampling=False, name=None): + self._collection = collection + batch_path = os.path.join(batch_path, str(name if name is not None else id(self))) + self._path = os.path.join(batch_path, "%i.pickle") + self.batch_size = batch_size + self.model = model + self.shuffle = shuffle_batches + self.upsampling = upsampling + self.indexes: list = [] + self._cleanup_path(batch_path) + self._prepare_batches() + + def __len__(self) -> int: + return len(self.indexes) + + def __getitem__(self, index: int) -> Tuple[np.ndarray, np.ndarray]: + """Get batch for given index.""" + return self.__data_generation(self.indexes[index]) + + def 
_get_model_rank(self): + if self.model is not None: + mod_out = self.model.output_shape + if isinstance(mod_out, tuple): # only one output branch: (None, ahead) + mod_rank = 1 + elif isinstance(mod_out, list): # multiple output branches, e.g.: [(None, ahead), (None, ahead)] + mod_rank = len(mod_out) + else: # pragma: no cover + raise TypeError("model output shape must either be tuple or list.") + return mod_rank + else: # no model provided, assume to use single output + return 1 + + def __data_generation(self, index: int) -> Tuple[np.ndarray, np.ndarray]: + """Load pickle data from disk.""" + file = self._path % index + with open(file, "rb") as f: + data = pickle.load(f) + return data["X"], data["Y"] + + @staticmethod + def _concatenate(new: List[np.ndarray], old: List[np.ndarray]) -> List[np.ndarray]: + """Concatenate two lists of data along axis=0.""" + return list(map(lambda n1, n2: np.concatenate((n1, n2), axis=0), old, new)) + + def _get_batch(self, data_list: List[np.ndarray], b: int) -> List[np.ndarray]: + """Get batch according to batch size from data list.""" + return list(map(lambda data: data[b * self.batch_size:(b+1) * self.batch_size, ...], data_list)) + + def _permute_data(self, X, Y): + p = np.random.permutation(len(X[0])) # equiv to .shape[0] + X = list(map(lambda x: x[p], X)) + Y = list(map(lambda x: x[p], Y)) + return X, Y + + def _prepare_batches(self) -> None: + """ + Prepare all batches as locally stored files. + + Walk through all elements of collection and split (or merge) data according to the batch size. Too long data + sets are divided into multiple batches. Not fully filled batches are merged with data from the next collection + element. If data is remaining after the last element, it is saved as smaller batch. All batches are enumerated + beginning from 0. A list with all batch numbers is stored in class's parameter indexes. 
+ """ + index = 0 + remaining = None + mod_rank = self._get_model_rank() + for data in self._collection: + X = data.get_X(upsampling=self.upsampling) + Y = [data.get_Y(upsampling=self.upsampling)[0] for _ in range(mod_rank)] + if self.upsampling: + X, Y = self._permute_data(X, Y) + if remaining is not None: + X, Y = self._concatenate(X, remaining[0]), self._concatenate(Y, remaining[1]) + length = X[0].shape[0] + batches = self._get_number_of_mini_batches(length) + for b in range(batches): + batch_X, batch_Y = self._get_batch(X, b), self._get_batch(Y, b) + self._save_to_pickle(X=batch_X, Y=batch_Y, index=index) + index += 1 + if (batches * self.batch_size) < length: # keep remaining to concatenate with next data element + remaining = (self._get_batch(X, batches), self._get_batch(Y, batches)) + else: + remaining = None + if remaining is not None: # add remaining as smaller batch + self._save_to_pickle(X=remaining[0], Y=remaining[1], index=index) + index += 1 + self.indexes = np.arange(0, index).tolist() + + def _save_to_pickle(self, X: List[np.ndarray], Y: List[np.ndarray], index: int) -> None: + """Save data as pickle file with variables X and Y and given index as <index>.pickle .""" + data = {"X": X, "Y": Y} + file = self._path % index + with open(file, "wb") as f: + pickle.dump(data, f) + + def _get_number_of_mini_batches(self, number_of_samples: int) -> int: + """Return number of mini batches as the floored ration of number of samples to batch size.""" + return math.floor(number_of_samples / self.batch_size) + + @staticmethod + def _cleanup_path(path: str, create_new: bool = True) -> None: + """First remove existing path, second create empty path if enabled.""" + if os.path.exists(path): + shutil.rmtree(path) + if create_new is True: + os.makedirs(path) + + def on_epoch_end(self) -> None: + """Randomly shuffle indexes if enabled.""" + if self.shuffle is True: + np.random.shuffle(self.indexes) + + +class DummyData: # pragma: no cover + + def __init__(self, number_of_samples=np.random.randint(100, 150)): + self.number_of_samples = number_of_samples + + def get_X(self): + X1 = np.random.randint(0, 10, size=(self.number_of_samples, 14, 5)) # samples, window, variables + X2 = np.random.randint(21, 30, size=(self.number_of_samples, 10, 2)) # samples, window, variables + X3 = np.random.randint(-5, 0, size=(self.number_of_samples, 1, 2)) # samples, window, variables + return [X1, X2, X3] + + def get_Y(self): + Y1 = np.random.randint(0, 10, size=(self.number_of_samples, 5, 1)) # samples, window, variables + Y2 = np.random.randint(21, 30, size=(self.number_of_samples, 5, 1)) # samples, window, variables + return [Y1, Y2] + + +if __name__ == "__main__": + + collection = [] + for _ in range(3): + collection.append(DummyData(50)) + + data_collection = DataCollection(collection=collection) + + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + iterator = KerasIterator(data_collection, 25, path, shuffle=True) + + for data in data_collection: + print(data) \ No newline at end of file diff --git a/mlair/data_handling/data_preparation.py b/mlair/data_handler/station_preparation.py similarity index 56% rename from mlair/data_handling/data_preparation.py rename to mlair/data_handler/station_preparation.py index 1dce5c87c2b076621ee08ae0f18906fd47d95e95..ff8496ab30a3b6392ea2314ef2526c80e0f57591 100644 --- a/mlair/data_handling/data_preparation.py +++ b/mlair/data_handler/station_preparation.py @@ -1,13 +1,13 @@ """Data Preparation class to handle data processing for machine 
learning.""" -__author__ = 'Lukas Leufen' -__date__ = '2020-06-29' +__author__ = 'Lukas Leufen, Felix Kleinert' +__date__ = '2020-07-20' import datetime as dt import logging import os from functools import reduce -from typing import Union, List, Iterable, Tuple +from typing import Union, List, Iterable, Tuple, Dict import numpy as np import pandas as pd @@ -24,61 +24,178 @@ number = Union[float, int] num_or_list = Union[number, List[number]] data_or_none = Union[xr.DataArray, None] +# defaults +DEFAULT_STATION_TYPE = "background" +DEFAULT_NETWORK = "AIRBASE" +DEFAULT_VAR_ALL_DICT = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', + 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', 'cloudcover': 'average_values', + 'pblheight': 'maximum'} +DEFAULT_WINDOW_LEAD_TIME = 3 +DEFAULT_WINDOW_HISTORY_SIZE = 13 +DEFAULT_TIME_DIM = "datetime" +DEFAULT_TARGET_VAR = "o3" +DEFAULT_TARGET_DIM = "variables" +DEFAULT_SAMPLING = "daily" +DEFAULT_INTERPOLATION_METHOD = "linear" + + +class AbstractStationPrep(object): + def __init__(self): #, path, station, statistics_per_var, transformation, **kwargs): + pass + + def get_X(self): + raise NotImplementedError -class AbstractDataPrep(object): - """ - This class prepares data to be used in neural networks. - - The instance searches for local stored data, that meet the given demands. If no local data is found, the DataPrep - instance will load data from TOAR database and store this data locally to use the next time. For the moment, there - is only support for daily aggregated time series. The aggregation can be set manually and differ for each variable. - - After data loading, different data pre-processing steps can be executed to prepare the data for further - applications. Especially the following methods can be used for the pre-processing step: - - - interpolate: interpolate between data points by using xarray's interpolation method - - standardise: standardise data to mean=1 and std=1, centralise to mean=0, additional methods like normalise on \ - interval [0, 1] are not implemented yet. - - make window history: represent the history (time steps before) for training/ testing; X - - make labels: create target vector with given leading time steps for training/ testing; y - - remove Nans jointly from desired input and output, only keeps time steps where no NaNs are present in X AND y. \ - Use this method after the creation of the window history and labels to clean up the data cube. - - To create a DataPrep instance, it is needed to specify the stations by id (e.g. "DEBW107"), its network (e.g. UBA, - "Umweltbundesamt") and the variables to use. Further options can be set in the instance. + def get_Y(self): + raise NotImplementedError - * `statistics_per_var`: define a specific statistic to extract from the TOAR database for each variable. - * `start`: define a start date for the data cube creation. Default: Use the first entry in time series - * `end`: set the end date for the data cube. Default: Use last date in time series. - * `store_data_locally`: store recently downloaded data on local disk. 
Default: True - * set further parameters for xarray's interpolation methods to modify the interpolation scheme - """ +class StationPrep(AbstractStationPrep): - def __init__(self, path: str, station: Union[str, List[str]], variables: List[str], **kwargs): - """Construct instance.""" - self.path = os.path.abspath(path) + def __init__(self, station, data_path, statistics_per_var, station_type=DEFAULT_STATION_TYPE, + network=DEFAULT_NETWORK, sampling=DEFAULT_SAMPLING, target_dim=DEFAULT_TARGET_DIM, + target_var=DEFAULT_TARGET_VAR, time_dim=DEFAULT_TIME_DIM, + window_history_size=DEFAULT_WINDOW_HISTORY_SIZE, window_lead_time=DEFAULT_WINDOW_LEAD_TIME, + interpolation_limit: int = 0, interpolation_method: str = DEFAULT_INTERPOLATION_METHOD, + overwrite_local_data: bool = False, transformation=None, store_data_locally: bool = True, + min_length: int = 0, start=None, end=None, **kwargs): + super().__init__() # path, station, statistics_per_var, transformation, **kwargs) self.station = helpers.to_list(station) - self.variables = variables - self.mean: data_or_none = None - self.std: data_or_none = None - self.history: data_or_none = None - self.label: data_or_none = None - self.observation: data_or_none = None - self.extremes_history: data_or_none = None - self.extremes_label: data_or_none = None - self.kwargs = kwargs + self.path = os.path.abspath(data_path) + self.statistics_per_var = statistics_per_var + self.transformation = self.setup_transformation(transformation) + + self.station_type = station_type + self.network = network + self.sampling = sampling + self.target_dim = target_dim + self.target_var = target_var + self.time_dim = time_dim + self.window_history_size = window_history_size + self.window_lead_time = window_lead_time + + self.interpolation_limit = interpolation_limit + self.interpolation_method = interpolation_method + + self.overwrite_local_data = overwrite_local_data + self.store_data_locally = store_data_locally + self.min_length = min_length + self.start = start + self.end = end + + # internal self.data = None self.meta = None + self.variables = kwargs.get('variables', list(statistics_per_var.keys())) + self.history = None + self.label = None + self.observation = None + + # internal for transformation + self.mean = None + self.std = None + self.max = None + self.min = None self._transform_method = None - self.statistics_per_var = kwargs.get("statistics_per_var", None) - self.sampling = kwargs.get("sampling", "daily") - if self.statistics_per_var is not None or self.sampling == "hourly": - self.load_data() + + self.kwargs = kwargs + # self.kwargs["overwrite_local_data"] = overwrite_local_data + + # self.make_samples() + self.setup_samples() + + def __str__(self): + return self.station[0] + + def __len__(self): + assert len(self.get_X()) == len(self.get_Y()) + return len(self.get_X()) + + @property + def shape(self): + return self.data.shape, self.get_X().shape, self.get_Y().shape + + def __repr__(self): + return f"StationPrep(station={self.station}, data_path='{self.path}', " \ + f"statistics_per_var={self.statistics_per_var}, " \ + f"station_type='{self.station_type}', network='{self.network}', " \ + f"sampling='{self.sampling}', target_dim='{self.target_dim}', target_var='{self.target_var}', " \ + f"time_dim='{self.time_dim}', window_history_size={self.window_history_size}, " \ + f"window_lead_time={self.window_lead_time}, interpolation_limit={self.interpolation_limit}, " \ + f"interpolation_method='{self.interpolation_method}', 
overwrite_local_data={self.overwrite_local_data}, " \ + f"transformation={self._print_transformation_as_string}, **{self.kwargs})" + + @property + def _print_transformation_as_string(self): + str_name = '' + if self.transformation is None: + str_name = f'{None}' else: - raise NotImplementedError("Either select hourly data or provide statistics_per_var.") + for k, v in self.transformation.items(): + if v is not None: + try: + v_pr = f"xr.DataArray.from_dict({v.to_dict()})" + except AttributeError: + v_pr = f"'{v}'" + str_name += f"'{k}':{v_pr}, " + str_name = f"{{{str_name}}}" + return str_name + + def get_transposed_history(self) -> xr.DataArray: + """Return history. - def load_data(self, source_name=""): + :return: history with dimensions datetime, window, Stations, variables. + """ + return self.history.transpose("datetime", "window", "Stations", "variables").copy() + + def get_transposed_label(self) -> xr.DataArray: + """Return label. + + :return: label with dimensions datetime*, window*, Stations, variables. + """ + return self.label.squeeze("Stations").transpose("datetime", "window").copy() + + def get_X(self): + return self.get_transposed_history() + + def get_Y(self): + return self.get_transposed_label() + + def get_coordinates(self): + coords = self.meta.loc[["station_lon", "station_lat"]].astype(float) + return coords.rename(index={"station_lon": "lon", "station_lat": "lat"}).to_dict()[str(self)] + + def call_transform(self, inverse=False): + self.transform(dim=self.time_dim, method=self.transformation["method"], + mean=self.transformation['mean'], std=self.transformation["std"], + min_val=self.transformation["min"], max_val=self.transformation["max"], + inverse=inverse + ) + + def set_transformation(self, transformation: dict): + if self._transform_method is not None: + self.call_transform(inverse=True) + self.transformation = self.setup_transformation(transformation) + self.call_transform() + self.make_samples() + + def setup_samples(self): + """ + Setup samples. This method prepares and creates samples X, and labels Y. + """ + self.load_data() + self.interpolate(dim=self.time_dim, method=self.interpolation_method, limit=self.interpolation_limit) + if self.transformation is not None: + self.call_transform() + self.make_samples() + + def make_samples(self): + self.make_history_window(self.target_dim, self.window_history_size, self.time_dim) + self.make_labels(self.target_dim, self.target_var, self.time_dim, self.window_lead_time) + self.make_observation(self.target_dim, self.target_var, self.time_dim) + self.remove_nan(self.time_dim) + + def read_data_from_disk(self, source_name=""): """ Load data and meta data either from local disk (preferred) or download new data by using a custom download method. @@ -90,7 +207,7 @@ class AbstractDataPrep(object): check_path_and_create(self.path) file_name = self._set_file_name() meta_file = self._set_meta_file_name() - if self.kwargs.get('overwrite_local_data', False): + if self.overwrite_local_data is True: logging.debug(f"overwrite_local_data is true, therefore reload {file_name}{source_name}") if os.path.exists(file_name): os.remove(file_name) @@ -114,24 +231,111 @@ class AbstractDataPrep(object): data = self._slice_prep(data) self.data = self.check_for_negative_concentrations(data) - def download_data(self, file_name, meta_file) -> [xr.DataArray, pd.DataFrame]: + def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]: """ - Download data and meta. 
+ Download data from TOAR database using the JOIN interface. + + Data is transformed to a xarray dataset. If class attribute store_data_locally is true, data is additionally + stored locally using given names for file and meta file. :param file_name: name of file to save data to (containing full path) :param meta_file: name of the meta data file (also containing full path) - """ - raise NotImplementedError + + :return: downloaded data and its meta data + """ + df_all = {} + df, meta = join.download_join(station_name=self.station, stat_var=self.statistics_per_var, + station_type=self.station_type, network_name=self.network, sampling=self.sampling) + df_all[self.station[0]] = df + # convert df_all to xarray + xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()} + xarr = xr.Dataset(xarr).to_array(dim='Stations') + if self.store_data_locally is True: + # save locally as nc/csv file + xarr.to_netcdf(path=file_name) + meta.to_csv(meta_file) + return xarr, meta + + def download_data(self, file_name, meta_file): + data, meta = self.download_data_from_join(file_name, meta_file) + return data, meta def check_station_meta(self): """ - Placeholder function to implement some additional station meta data check if desired. + Search for the entries in meta data and compare the value with the requested values. - Ideally, this method should raise a FileNotFoundError if a value mismatch to load fresh data from a source. If - this method is not required for your application just inherit and add the `pass` command inside the method. The - NotImplementedError is more a reminder that you could use it. + Will raise a FileNotFoundError if the values mismatch. """ - raise NotImplementedError + if self.station_type is not None: + check_dict = {"station_type": self.station_type, "network_name": self.network} + for (k, v) in check_dict.items(): + if v is None: + continue + if self.meta.at[k, self.station[0]] != v: + logging.debug(f"meta data does not agree with given request for {k}: {v} (requested) != " + f"{self.meta.at[k, self.station[0]]} (local). Raise FileNotFoundError to trigger new " + f"grapping from web.") + raise FileNotFoundError + + def check_for_negative_concentrations(self, data: xr.DataArray, minimum: int = 0) -> xr.DataArray: + """ + Set all negative concentrations to zero. + + Names of all concentrations are extracted from https://join.fz-juelich.de/services/rest/surfacedata/ + #2.1 Parameters. Currently, this check is applied on "benzene", "ch4", "co", "ethane", "no", "no2", "nox", + "o3", "ox", "pm1", "pm10", "pm2p5", "propane", "so2", and "toluene". + + :param data: data array containing variables to check + :param minimum: minimum value, by default this should be 0 + + :return: corrected data + """ + chem_vars = ["benzene", "ch4", "co", "ethane", "no", "no2", "nox", "o3", "ox", "pm1", "pm10", "pm2p5", + "propane", "so2", "toluene"] + used_chem_vars = list(set(chem_vars) & set(self.variables)) + data.loc[..., used_chem_vars] = data.loc[..., used_chem_vars].clip(min=minimum) + return data + + def shift(self, dim: str, window: int) -> xr.DataArray: + """ + Shift data multiple times to represent history (if window <= 0) or lead time (if window > 0). 
+ + :param dim: dimension along shift is applied + :param window: number of steps to shift (corresponds to the window length) + + :return: shifted data + """ + start = 1 + end = 1 + if window <= 0: + start = window + else: + end = window + 1 + res = [] + for w in range(start, end): + res.append(self.data.shift({dim: -w})) + window_array = self.create_index_array('window', range(start, end), squeeze_dim=self.target_dim) + res = xr.concat(res, dim=window_array) + return res + + @staticmethod + def create_index_array(index_name: str, index_value: Iterable[int], squeeze_dim: str) -> xr.DataArray: + """ + Create an 1D xr.DataArray with given index name and value. + + :param index_name: name of dimension + :param index_value: values of this dimension + + :return: this array + """ + ind = pd.DataFrame({'val': index_value}, index=index_value) + # res = xr.Dataset.from_dataframe(ind).to_array().rename({'index': index_name}).squeeze(dim=squeez/e_dim, drop=True) + res = xr.Dataset.from_dataframe(ind).to_array(squeeze_dim).rename({'index': index_name}).squeeze( + dim=squeeze_dim, + drop=True + ) + res.name = index_name + return res def _set_file_name(self): all_vars = sorted(self.statistics_per_var.keys()) @@ -141,11 +345,6 @@ class AbstractDataPrep(object): all_vars = sorted(self.statistics_per_var.keys()) return os.path.join(self.path, f"{''.join(self.station)}_{'_'.join(all_vars)}_meta.csv") - def __repr__(self): - """Represent class attributes.""" - return f"AbstractDataPrep(path='{self.path}', station={self.station}, variables={self.variables}, " \ - f"**{self.kwargs})" - def interpolate(self, dim: str, method: str = 'linear', limit: int = None, use_coordinate: Union[bool, str] = True, **kwargs): """ @@ -187,126 +386,6 @@ class AbstractDataPrep(object): self.data = self.data.interpolate_na(dim=dim, method=method, limit=limit, use_coordinate=use_coordinate, **kwargs) - @staticmethod - def check_inverse_transform_params(mean: data_or_none, std: data_or_none, method: str) -> None: - """ - Support inverse_transformation method. - - Validate if all required statistics are available for given method. E.g. centering requires mean only, whereas - normalisation requires mean and standard deviation. Will raise an AttributeError on missing requirements. - - :param mean: data with all mean values - :param std: data with all standard deviation values - :param method: name of transformation method - """ - msg = "" - if method in ['standardise', 'centre'] and mean is None: - msg += "mean, " - if method == 'standardise' and std is None: - msg += "std, " - if len(msg) > 0: - raise AttributeError(f"Inverse transform {method} can not be executed because following is None: {msg}") - - def inverse_transform(self) -> None: - """ - Perform inverse transformation. - - Will raise an AssertionError, if no transformation was performed before. Checks first, if all required - statistics are available for inverse transformation. Class attributes data, mean and std are overwritten by - new data afterwards. Thereby, mean, std, and the private transform method are set to None to indicate, that the - current data is not transformed. 
- """ - - def f_inverse(data, mean, std, method_inverse): - if method_inverse == 'standardise': - return statistics.standardise_inverse(data, mean, std), None, None - elif method_inverse == 'centre': - return statistics.centre_inverse(data, mean), None, None - elif method_inverse == 'normalise': - raise NotImplementedError - else: - raise NotImplementedError - - if self._transform_method is None: - raise AssertionError("Inverse transformation method is not set. Data cannot be inverse transformed.") - self.check_inverse_transform_params(self.mean, self.std, self._transform_method) - self.data, self.mean, self.std = f_inverse(self.data, self.mean, self.std, self._transform_method) - self._transform_method = None - - def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False, mean=None, - std=None) -> None: - """ - Transform data according to given transformation settings. - - This function transforms a xarray.dataarray (along dim) or pandas.DataFrame (along axis) either with mean=0 - and std=1 (`method=standardise`) or centers the data with mean=0 and no change in data scale - (`method=centre`). Furthermore, this sets an internal instance attribute for later inverse transformation. This - method will raise an AssertionError if an internal transform method was already set ('inverse=False') or if the - internal transform method, internal mean and internal standard deviation weren't set ('inverse=True'). - - :param string/int dim: This param is not used for inverse transformation. - | for xarray.DataArray as string: name of dimension which should be standardised - | for pandas.DataFrame as int: axis of dimension which should be standardised - :param method: Choose the transformation method from 'standardise' and 'centre'. 'normalise' is not implemented - yet. This param is not used for inverse transformation. - :param inverse: Switch between transformation and inverse transformation. - - :return: xarray.DataArrays or pandas.DataFrames: - #. mean: Mean of data - #. std: Standard deviation of data - #. data: Standardised data - """ - - def f(data): - if method == 'standardise': - return statistics.standardise(data, dim) - elif method == 'centre': - return statistics.centre(data, dim) - elif method == 'normalise': - # use min/max of data or given min/max - raise NotImplementedError - else: - raise NotImplementedError - - def f_apply(data): - if method == "standardise": - return mean, std, statistics.standardise_apply(data, mean, std) - elif method == "centre": - return mean, None, statistics.centre_apply(data, mean) - else: - raise NotImplementedError - - if not inverse: - if self._transform_method is not None: - raise AssertionError(f"Transform method is already set. Therefore, data was already transformed with " - f"{self._transform_method}. Please perform inverse transformation of data first.") - self.mean, self.std, self.data = locals()["f" if mean is None else "f_apply"](self.data) - self._transform_method = method - else: - self.inverse_transform() - - def get_transformation_information(self, variable: str) -> Tuple[data_or_none, data_or_none, str]: - """ - Extract transformation statistics and method. - - Get mean and standard deviation for given variable and the transformation method if set. If a transformation - depends only on particular statistics (e.g. only mean is required for centering), the remaining statistics are - returned with None as fill value. - - :param variable: Variable for which the information on transformation is requested. 
- - :return: mean, standard deviation and transformation method - """ - try: - mean = self.mean.sel({'variables': variable}).values - except AttributeError: - mean = None - try: - std = self.std.sel({'variables': variable}).values - except AttributeError: - std = None - return mean, std, self._transform_method - def make_history_window(self, dim_name_of_inputs: str, window: int, dim_name_of_shift: str) -> None: """ Create a xr.DataArray containing history data. @@ -324,28 +403,6 @@ class AbstractDataPrep(object): window = -abs(window) self.history = self.shift(dim_name_of_shift, window).sel({dim_name_of_inputs: self.variables}) - def shift(self, dim: str, window: int) -> xr.DataArray: - """ - Shift data multiple times to represent history (if window <= 0) or lead time (if window > 0). - - :param dim: dimension along shift is applied - :param window: number of steps to shift (corresponds to the window length) - - :return: shifted data - """ - start = 1 - end = 1 - if window <= 0: - start = window - else: - end = window + 1 - res = [] - for w in range(start, end): - res.append(self.data.shift({dim: -w})) - window_array = self.create_index_array('window', range(start, end)) - res = xr.concat(res, dim=window_array) - return res - def make_labels(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str, window: int) -> None: """ @@ -390,8 +447,7 @@ class AbstractDataPrep(object): intersect = reduce(np.intersect1d, (non_nan_history.coords[dim].values, non_nan_label.coords[dim].values, non_nan_observation.coords[dim].values)) - min_length = self.kwargs.get("min_length", 0) - if len(intersect) < max(min_length, 1): + if len(intersect) < max(self.min_length, 1): self.history = None self.label = None self.observation = None @@ -400,21 +456,6 @@ class AbstractDataPrep(object): self.label = self.label.sel({dim: intersect}) self.observation = self.observation.sel({dim: intersect}) - @staticmethod - def create_index_array(index_name: str, index_value: Iterable[int]) -> xr.DataArray: - """ - Create an 1D xr.DataArray with given index name and value. - - :param index_name: name of dimension - :param index_value: values of this dimension - - :return: this array - """ - ind = pd.DataFrame({'val': index_value}, index=index_value) - res = xr.Dataset.from_dataframe(ind).to_array().rename({'index': index_name}).squeeze(dim='variable', drop=True) - res.name = index_name - return res - def _slice_prep(self, data: xr.DataArray, coord: str = 'datetime') -> xr.DataArray: """ Set start and end date for slicing and execute self._slice(). @@ -424,8 +465,8 @@ class AbstractDataPrep(object): :return: sliced data """ - start = self.kwargs.get('start', data.coords[coord][0].values) - end = self.kwargs.get('end', data.coords[coord][-1].values) + start = self.start if self.start is not None else data.coords[coord][0].values + end = self.end if self.end is not None else data.coords[coord][-1].values return self._slice(data, start, end, coord) @staticmethod @@ -461,98 +502,200 @@ class AbstractDataPrep(object): data.loc[..., used_chem_vars] = data.loc[..., used_chem_vars].clip(min=minimum) return data - def get_transposed_history(self) -> xr.DataArray: - """Return history. + @staticmethod + def setup_transformation(transformation: Dict): + """ + Set up transformation by extracting all relevant information. - :return: history with dimensions datetime, window, Stations, variables. + Extract all information from transformation dictionary. Possible keys are method, mean, std, min, max. 
+ * If a transformation should be applied on base of existing values, these need to be provided in the respective + keys "mean" and "std" (again only if required for given method). + + :param transformation: the transformation dictionary as described above. + + :return: updated transformation dictionary """ - return self.history.transpose("datetime", "window", "Stations", "variables").copy() + if transformation is None: + return + elif not isinstance(transformation, dict): + raise TypeError(f"`transformation' must be either `None' or dict like e.g. `{{'method': 'standardise'}}," + f" but transformation is of type {type(transformation)}.") + transformation = transformation.copy() + method = transformation.get("method", None) + mean = transformation.get("mean", None) + std = transformation.get("std", None) + max_val = transformation.get("max", None) + min_val = transformation.get("min", None) + + transformation["method"] = method + transformation["mean"] = mean + transformation["std"] = std + transformation["max"] = max_val + transformation["min"] = min_val + return transformation + + def load_data(self): + try: + self.read_data_from_disk() + except FileNotFoundError: + self.download_data() + self.load_data() - def get_transposed_label(self) -> xr.DataArray: - """Return label. + def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False, mean=None, + std=None, min_val=None, max_val=None) -> None: + """ + Transform data according to given transformation settings. + + This function transforms a xarray.dataarray (along dim) or pandas.DataFrame (along axis) either with mean=0 + and std=1 (`method=standardise`) or centers the data with mean=0 and no change in data scale + (`method=centre`). Furthermore, this sets an internal instance attribute for later inverse transformation. This + method will raise an AssertionError if an internal transform method was already set ('inverse=False') or if the + internal transform method, internal mean and internal standard deviation weren't set ('inverse=True'). + + :param string/int dim: This param is not used for inverse transformation. + | for xarray.DataArray as string: name of dimension which should be standardised + | for pandas.DataFrame as int: axis of dimension which should be standardised + :param method: Choose the transformation method from 'standardise' and 'centre'. 'normalise' is not implemented + yet. This param is not used for inverse transformation. + :param inverse: Switch between transformation and inverse transformation. + :param mean: Used for transformation (if required by 'method') based on external data. If 'None' the mean is + calculated over the data in this class instance. + :param std: Used for transformation (if required by 'method') based on external data. If 'None' the std is + calculated over the data in this class instance. + :param min_val: Used for transformation (if required by 'method') based on external data. If 'None' min_val is + extracted from the data in this class instance. + :param max_val: Used for transformation (if required by 'method') based on external data. If 'None' max_val is + extracted from the data in this class instance. - :return: label with dimensions datetime, window, Stations, variables. + :return: xarray.DataArrays or pandas.DataFrames: + #. mean: Mean of data + #. std: Standard deviation of data + #. 
data: Standardised data """ - return self.label.squeeze("Stations").transpose("datetime", "window").copy() - def get_extremes_history(self) -> xr.DataArray: - """Return extremes history. + def f(data): + if method == 'standardise': + return statistics.standardise(data, dim) + elif method == 'centre': + return statistics.centre(data, dim) + elif method == 'normalise': + # use min/max of data or given min/max + raise NotImplementedError + else: + raise NotImplementedError + + def f_apply(data): + if method == "standardise": + return mean, std, statistics.standardise_apply(data, mean, std) + elif method == "centre": + return mean, None, statistics.centre_apply(data, mean) + else: + raise NotImplementedError + + if not inverse: + if self._transform_method is not None: + raise AssertionError(f"Transform method is already set. Therefore, data was already transformed with " + f"{self._transform_method}. Please perform inverse transformation of data first.") + # apply transformation on local data instance (f) if mean is None, else apply by using mean (and std) from + # external data. + self.mean, self.std, self.data = locals()["f" if mean is None else "f_apply"](self.data) + + # set transform method to find correct method for inverse transformation. + self._transform_method = method + else: + self.inverse_transform() - :return: extremes history with dimensions datetime, window, Stations, variables. + @staticmethod + def check_inverse_transform_params(mean: data_or_none, std: data_or_none, method: str) -> None: """ - return self.extremes_history.transpose("datetime", "window", "Stations", "variables").copy() + Support inverse_transformation method. + + Validate if all required statistics are available for given method. E.g. centering requires mean only, whereas + normalisation requires mean and standard deviation. Will raise an AttributeError on missing requirements. - def get_extremes_label(self) -> xr.DataArray: - """Return extremes label. + :param mean: data with all mean values + :param std: data with all standard deviation values + :param method: name of transformation method + """ + msg = "" + if method in ['standardise', 'centre'] and mean is None: + msg += "mean, " + if method == 'standardise' and std is None: + msg += "std, " + if len(msg) > 0: + raise AttributeError(f"Inverse transform {method} can not be executed because following is None: {msg}") - :return: extremes label with dimensions datetime, window, Stations, variables. + def inverse_transform(self) -> None: """ - return self.extremes_label.squeeze("Stations").transpose("datetime", "window").copy() + Perform inverse transformation. - def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, - timedelta: Tuple[int, str] = (1, 'm')): + Will raise an AssertionError, if no transformation was performed before. Checks first, if all required + statistics are available for inverse transformation. Class attributes data, mean and std are overwritten by + new data afterwards. Thereby, mean, std, and the private transform method are set to None to indicate, that the + current data is not transformed. """ - Multiply extremes. - This method extracts extreme values from self.labels which are defined in the argument extreme_values. One can - also decide only to extract extremes on the right tail of the distribution. 
When extreme_values is a list of - floats/ints all values larger (and smaller than negative extreme_values; extraction is performed in standardised - space) than are extracted iteratively. If for example extreme_values = [1.,2.] then a value of 1.5 would be - extracted once (for 0th entry in list), while a 2.5 would be extracted twice (once for each entry). Timedelta is - used to mark those extracted values by adding one min to each timestamp. As TOAR Data are hourly one can - identify those "artificial" data points later easily. Extreme inputs and labels are stored in - self.extremes_history and self.extreme_labels, respectively. + def f_inverse(data, mean, std, method_inverse): + if method_inverse == 'standardise': + return statistics.standardise_inverse(data, mean, std), None, None + elif method_inverse == 'centre': + return statistics.centre_inverse(data, mean), None, None + elif method_inverse == 'normalise': + raise NotImplementedError + else: + raise NotImplementedError + + if self._transform_method is None: + raise AssertionError("Inverse transformation method is not set. Data cannot be inverse transformed.") + self.check_inverse_transform_params(self.mean, self.std, self._transform_method) + self.data, self.mean, self.std = f_inverse(self.data, self.mean, self.std, self._transform_method) + self._transform_method = None + # update X and Y + self.make_samples() - :param extreme_values: user definition of extreme - :param extremes_on_right_tail_only: if False also multiply values which are smaller then -extreme_values, - if True only extract values larger than extreme_values - :param timedelta: used as arguments for np.timedelta in order to mark extreme values on datetime + def get_transformation_information(self, variable: str = None) -> Tuple[data_or_none, data_or_none, str]: """ - # check if labels or history is None - if (self.label is None) or (self.history is None): - logging.debug(f"{self.station} has `None' labels, skip multiply extremes") - return + Extract transformation statistics and method. + + Get mean and standard deviation for given variable and the transformation method if set. If a transformation + depends only on particular statistics (e.g. only mean is required for centering), the remaining statistics are + returned with None as fill value. + + :param variable: Variable for which the information on transformation is requested. 
- # check type if inputs - extreme_values = helpers.to_list(extreme_values) - for i in extreme_values: - if not isinstance(i, number.__args__): - raise TypeError(f"Elements of list extreme_values have to be {number.__args__}, but at least element " - f"{i} is type {type(i)}") - - for extr_val in sorted(extreme_values): - # check if some extreme values are already extracted - if (self.extremes_label is None) or (self.extremes_history is None): - # extract extremes based on occurance in labels - if extremes_on_right_tail_only: - extreme_label_idx = (self.label > extr_val).any(axis=0).values.reshape(-1, ) - else: - extreme_label_idx = np.concatenate(((self.label < -extr_val).any(axis=0).values.reshape(-1, 1), - (self.label > extr_val).any(axis=0).values.reshape(-1, 1)), - axis=1).any(axis=1) - extremes_label = self.label[..., extreme_label_idx] - extremes_history = self.history[..., extreme_label_idx, :] - extremes_label.datetime.values += np.timedelta64(*timedelta) - extremes_history.datetime.values += np.timedelta64(*timedelta) - self.extremes_label = extremes_label # .squeeze('Stations').transpose('datetime', 'window') - self.extremes_history = extremes_history # .transpose('datetime', 'window', 'Stations', 'variables') - else: # one extr value iteration is done already: self.extremes_label is NOT None... - if extremes_on_right_tail_only: - extreme_label_idx = (self.extremes_label > extr_val).any(axis=0).values.reshape(-1, ) - else: - extreme_label_idx = np.concatenate( - ((self.extremes_label < -extr_val).any(axis=0).values.reshape(-1, 1), - (self.extremes_label > extr_val).any(axis=0).values.reshape(-1, 1) - ), axis=1).any(axis=1) - # check on existing extracted extremes to minimise computational costs for comparison - extremes_label = self.extremes_label[..., extreme_label_idx] - extremes_history = self.extremes_history[..., extreme_label_idx, :] - extremes_label.datetime.values += np.timedelta64(*timedelta) - extremes_history.datetime.values += np.timedelta64(*timedelta) - self.extremes_label = xr.concat([self.extremes_label, extremes_label], dim='datetime') - self.extremes_history = xr.concat([self.extremes_history, extremes_history], dim='datetime') + :return: mean, standard deviation and transformation method + """ + variable = self.target_var if variable is None else variable + try: + mean = self.mean.sel({'variables': variable}).values + except AttributeError: + mean = None + try: + std = self.std.sel({'variables': variable}).values + except AttributeError: + std = None + return mean, std, self._transform_method if __name__ == "__main__": - dp = AbstractDataPrep('data/', 'dummy', 'DEBW107', ['o3', 'temp'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}) - print(dp) + # dp = AbstractDataPrep('data/', 'dummy', 'DEBW107', ['o3', 'temp'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}) + # print(dp) + statistics_per_var = {'o3': 'dma8eu', 'temp-rea-miub': 'maximum'} + sp = StationPrep(data_path='/home/felix/PycharmProjects/mlt_new/data/', station='DEBY122', + statistics_per_var=statistics_per_var, station_type='background', + network='UBA', sampling='daily', target_dim='variables', target_var='o3', + time_dim='datetime', window_history_size=7, window_lead_time=3, + interpolation_limit=0 + ) # transformation={'method': 'standardise'}) + # sp.set_transformation({'method': 'standardise', 'mean': sp.mean+2, 'std': sp.std+1}) + sp2 = StationPrep(data_path='/home/felix/PycharmProjects/mlt_new/data/', station='DEBY122', + statistics_per_var=statistics_per_var, 
station_type='background', + network='UBA', sampling='daily', target_dim='variables', target_var='o3', + time_dim='datetime', window_history_size=7, window_lead_time=3, + transformation={'method': 'standardise'}) + sp2.transform(inverse=True) + sp.get_X() + sp.get_Y() + print(len(sp)) + print(sp.shape) + print(sp) diff --git a/mlair/data_handling/bootstraps.py b/mlair/data_handling/bootstraps.py deleted file mode 100644 index 4e72b2b81476d04aec819cc6be0fdfd585e5eaf9..0000000000000000000000000000000000000000 --- a/mlair/data_handling/bootstraps.py +++ /dev/null @@ -1,383 +0,0 @@ -""" -Collections of bootstrap methods and classes. - -How to use ----------- - -test - -""" - -__author__ = 'Felix Kleinert, Lukas Leufen' -__date__ = '2020-02-07' - - -import logging -import os -import re -from typing import List, Union, Pattern, Tuple - -import dask.array as da -import keras -import numpy as np -import xarray as xr - -from mlair import helpers -from mlair.data_handling.data_generator import DataGenerator - - -class BootStrapGenerator(keras.utils.Sequence): - """ - Generator that returns bootstrapped history objects for given boot index while iteration. - - generator for bootstraps as keras sequence inheritance. Initialise with number of boots, the original history, the - shuffled data, all used variables and the current shuffled variable. While iterating over this generator, it returns - the bootstrapped history for given boot index (this is the iterator index) in the same format like the original - history ready to use. Note, that in some cases some samples can contain nan values (in these cases the entire data - row is null, not only single entries). - """ - - def __init__(self, number_of_boots: int, history: xr.DataArray, shuffled: xr.DataArray, variables: List[str], - shuffled_variable: str): - """ - Set up the generator. - - :param number_of_boots: number of bootstrap realisations - :param history: original history (the ground truth) - :param shuffled: the shuffled history - :param variables: list with all variables of interest - :param shuffled_variable: name of the variable that shall be bootstrapped - """ - self.number_of_boots = number_of_boots - self.variables = variables - self.history_orig = history - self.history = history.sel(variables=helpers.remove_items(self.variables, shuffled_variable)) - self.shuffled = shuffled.sel(variables=shuffled_variable) - - def __len__(self) -> int: - """ - Return number of bootstraps. - - :return: number of bootstraps - """ - return self.number_of_boots - - def __getitem__(self, index: int) -> xr.DataArray: - """ - Return bootstrapped history for given bootstrap index in same index structure like the original history object. - - :param index: boot index e [0, nboots-1] - :return: bootstrapped history ready to use - """ - logging.debug(f"boot: {index}") - boot_hist = self.history.copy() - boot_hist = boot_hist.combine_first(self.__get_shuffled(index)) - return boot_hist.reindex_like(self.history_orig) - - def __get_shuffled(self, index: int) -> xr.DataArray: - """ - Return shuffled data for given boot index from shuffled attribute. - - :param index: boot index e [0, nboots-1] - :return: shuffled data - """ - shuffled_var = self.shuffled.sel(boots=index).expand_dims("variables").drop("boots") - return shuffled_var.transpose("datetime", "window", "Stations", "variables") - - -class CreateShuffledData: - """ - Verify and create shuffled data for all data contained in given data generator class. 
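# Illustrative sketch of the bootstrap replacement performed by the removed
# BootStrapGenerator.__getitem__/__get_shuffled: for one boot index, a single variable of
# the history is swapped for its shuffled counterpart while all other variables stay
# untouched. Toy data, standalone xarray/numpy only.
import numpy as np
import xarray as xr

rng = np.random.default_rng(0)
dims = ("datetime", "window", "Stations", "variables")
coords = {"datetime": np.arange(5), "window": np.arange(3),
          "Stations": ["DEBW107"], "variables": ["o3", "temp"]}
history = xr.DataArray(rng.random((5, 3, 1, 2)), dims=dims, coords=coords)
shuffled = xr.DataArray(rng.random((5, 3, 1, 2, 4)), dims=dims + ("boots",),
                        coords={**coords, "boots": np.arange(4)})

def bootstrapped_history(index, shuffled_variable="o3"):
    keep = [v for v in history.coords["variables"].values if v != shuffled_variable]
    boot_hist = history.sel(variables=keep)                                   # untouched variables
    boot_var = shuffled.sel(variables=[shuffled_variable]).isel(boots=index, drop=True)
    return boot_hist.combine_first(boot_var).reindex_like(history)            # same layout as history

assert bootstrapped_history(0).shape == history.shape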
- - Starts automatically on initialisation, no further calls are required. Check and new creations are all performed - inside bootstrap_path. - """ - - def __init__(self, data: DataGenerator, number_of_bootstraps: int, bootstrap_path: str): - """ - Shuffled data is automatically created in initialisation. - - :param data: data to shuffle - :param number_of_bootstraps: - :param bootstrap_path: Path to find and store the bootstraps - """ - self.data = data - self.number_of_bootstraps = number_of_bootstraps - self.bootstrap_path = bootstrap_path - self.create_shuffled_data() - - def create_shuffled_data(self) -> None: - """ - Create shuffled data. - - Use original test data, add dimension 'boots' with length number of bootstraps and insert randomly selected - variables. If there is a suitable local file for requested window size and number of bootstraps, no additional - file will be created inside this function. - """ - logging.info("create / check shuffled bootstrap data") - variables_str = '_'.join(sorted(self.data.variables)) - window = self.data.window_history_size - for station in self.data.stations: - valid, nboot = self.valid_bootstrap_file(station, variables_str, window) - if not valid: - logging.info(f'create bootstap data for {station}') - hist = self.data.get_data_generator(station).get_transposed_history() - file_path = self._set_file_path(station, variables_str, window, nboot) - hist = hist.expand_dims({'boots': range(nboot)}, axis=-1) - shuffled_variable = [] - chunks = (100, *hist.shape[1:3], hist.shape[-1]) - for i, var in enumerate(hist.coords['variables']): - single_variable = hist.sel(variables=var).values - shuffled_variable.append(self.shuffle(single_variable, chunks=chunks)) - shuffled_variable_da = da.stack(shuffled_variable, axis=-2).rechunk("auto") - shuffled_data = xr.DataArray(shuffled_variable_da, coords=hist.coords, dims=hist.dims) - shuffled_data.to_netcdf(file_path) - - def _set_file_path(self, station: str, variables: str, window: int, nboots: int) -> str: - """ - Set file name. - - Set file name following naming convention <station>_<var1>_<var2>_..._hist<window>_nboots<nboots>_shuffled.nc - and create joined path using bootstrap_path attribute set on initialisation. - - :param station: station name - :param variables: variables already preprocessed as single string with all variables seperated by underscore - :param window: window length - :param nboots: number of boots - :return: full file path - """ - file_name = f"{station}_{variables}_hist{window}_nboots{nboots}_shuffled.nc" - return os.path.join(self.bootstrap_path, file_name) - - def valid_bootstrap_file(self, station: str, variables: str, window: int) -> [bool, Union[None, int]]: - """ - Compare local bootstrap file with given settings for station, variables, window and number of bootstraps. - - If a match was found, this method returns a tuple (True, None). In any other case, it returns (False, - max_nboot), where max_nboot is the highest boot number found in the local storage. A match is defined so that - the window length is ge than given window size form args and the number of boots is also ge than the given - number of boots from this class. Furthermore, this functions deletes local files, if the match the station - pattern but don't fit the window and bootstrap condition. This is performed, because it is assumed, that the - corresponding file will be created with a longer or at the least same window size and numbers of bootstraps. 
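# Illustrative sketch of the file-name convention and the validity check described above: a
# locally stored shuffled file can be reused if its window and boot numbers are at least as
# large as requested. Standalone, uses only `re`.
import re

requested_window, requested_nboots = 13, 20
file_name = "DEBW107_no_no2_o3_temp_hist13_nboots20_shuffled.nc"
regex = re.compile(r"DEBW107_no_no2_o3_temp_hist(\d+)_nboots(\d+)_shuffled")
match = regex.match(file_name)
window_file, nboot_file = int(match.group(1)), int(match.group(2))
assert (window_file >= requested_window) and (nboot_file >= requested_nboots)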
- - :param station: name of the station to validate - :param variables: all variables already merged in single string seperated by underscore - :param window: required window size - :return: tuple containing information if valid file was found first and second the number of boots that needs to - be used for the new boot creation (this is only relevant, if no valid file was found - otherwise the return - statement is anyway None). - """ - regex = re.compile(rf"{station}_{variables}_hist(\d+)_nboots(\d+)_shuffled") - max_nboot = self.number_of_bootstraps - for file in os.listdir(self.bootstrap_path): - match = regex.match(file) - if match: - window_file = int(match.group(1)) - nboot_file = int(match.group(2)) - max_nboot = max([max_nboot, nboot_file]) - if (window_file >= window) and (nboot_file >= self.number_of_bootstraps): - return True, None - else: - os.remove(os.path.join(self.bootstrap_path, file)) - return False, max_nboot - - @staticmethod - def shuffle(data: da.array, chunks: Tuple) -> da.core.Array: - """ - Shuffle randomly from given data (draw elements with replacement). - - :param data: data to shuffle - :param chunks: chunk size for dask - :return: shuffled data as dask core array (not computed yet) - """ - size = data.shape - return da.random.choice(data.reshape(-1, ), size=size, chunks=chunks) - - -class BootStraps: - """ - Main class to perform bootstrap operations. - - This class requires a DataGenerator object and a path, where to find and store all data related to the bootstrap - operation. In initialisation, this class will automatically call the class CreateShuffleData to set up the shuffled - data sets. How to use BootStraps: - - * call .get_generator(<station>, <variable>) to get a generator for given station and variable combination that \ - iterates over all bootstrap realisations (as keras sequence) - * call .get_labels(<station>) to get the measured observations in the same format as bootstrap predictions - * call .get_bootstrap_predictions(<station>, <variable>) to get the bootstrapped predictions - * call .get_orig_prediction(<station>) to get the non-bootstrapped predictions (referred as original predictions) - """ - - def __init__(self, data: DataGenerator, bootstrap_path: str, number_of_bootstraps: int = 10): - """ - Automatically check and create (if needed) shuffled data on initialisation. - - :param data: a data generator object to get data / history - :param bootstrap_path: path to find and store the bootstrap data - :param number_of_bootstraps: the number of bootstrap realisations - """ - self.data = data - self.number_of_bootstraps = number_of_bootstraps - self.bootstrap_path = bootstrap_path - CreateShuffledData(data, number_of_bootstraps, bootstrap_path) - - @property - def stations(self) -> List[str]: - """ - Station property inherits directly from data generator object. - - :return: list with all stations - """ - return self.data.stations - - @property - def variables(self) -> List[str]: - """ - Variables property inherits directly from data generator object. - - :return: list with all variables - """ - return self.data.variables - - @property - def window_history_size(self) -> int: - """ - Window history size property inherits directly from data generator object. - - :return: the window history size - """ - return self.data.window_history_size - - def get_generator(self, station: str, variable: str) -> BootStrapGenerator: - """ - Return the actual generator to use for the bootstrap evaluation. 
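# Illustrative sketch of the static `shuffle` step shown above: draw with replacement from
# the flattened data and keep the original shape, lazily with dask. Standalone toy data.
import dask.array as da
import numpy as np

data = np.arange(24.0).reshape(2, 3, 4)                  # e.g. (datetime, window, variables)
shuffled = da.random.choice(data.reshape(-1), size=data.shape, chunks=(1, 3, 4))
assert shuffled.compute().shape == data.shape            # values resampled with replacement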
- - The generator requires information on station and bootstrapped variable. There is only a loop on the bootstrap - realisation and not on stations or variables. - - :param station: name of the station - :param variable: name of the variable to bootstrap - :return: BootStrapGenerator class ready to use. - """ - hist, _ = self.data[station] - shuffled_data = self._load_shuffled_data(station, self.variables).reindex_like(hist) - return BootStrapGenerator(self.number_of_bootstraps, hist, shuffled_data, self.variables, variable) - - def get_labels(self, station: str) -> np.ndarray: - """ - Repeat labels for given key by the number of boots and returns as single array. - - :param station: name of station - :return: repeated labels as single array - """ - labels = self.data[station][1] - return np.tile(labels.data, (self.number_of_bootstraps, 1)) - - def get_orig_prediction(self, path: str, file_name: str, prediction_name: str = "CNN") -> np.ndarray: - """ - Repeat predictions from given file(_name) in path by the number of boots. - - :param path: path to file - :param file_name: file name - :param prediction_name: name of the prediction to select from loaded file (default CNN) - :return: repeated predictions - """ - file = os.path.join(path, file_name) - prediction = xr.open_dataarray(file).sel(type=prediction_name).squeeze() - vals = np.tile(prediction.data, (self.number_of_bootstraps, 1)) - return vals[~np.isnan(vals).any(axis=1), :] - - def _load_shuffled_data(self, station: str, variables: List[str]) -> xr.DataArray: - """ - Load shuffled data from bootstrap path. - - Data is stored as '<station>_<var1>_<var2>_..._hist<histsize>_nboots<nboots>_shuffled.nc', e.g. - 'DEBW107_cloudcover_no_no2_temp_u_v_hist13_nboots20_shuffled.nc' - - :param station: name of station - :param variables: list of variables - :return: shuffled data as xarray - """ - file_name = self._get_shuffled_data_file(station, variables) - shuffled_data = xr.open_dataarray(file_name, chunks=100) - return shuffled_data - - def _get_shuffled_data_file(self, station: str, variables: List[str]) -> str: - """ - Look for data file using regular expressions and returns found file or raise FileNotFoundError. - - :param station: name of station - :param variables: name of variables - :return: found file with complete path - """ - files = os.listdir(self.bootstrap_path) - regex = self._create_file_regex(station, variables) - file = self._filter_files(regex, files, self.window_history_size, self.number_of_bootstraps) - if file: - return os.path.join(self.bootstrap_path, file) - else: - raise FileNotFoundError(f"Could not find a file to match pattern {regex}") - - @staticmethod - def _create_file_regex(station: str, variables: List[str]) -> Pattern: - """ - Create regex for given station and variables. - - With this regex, it is possible to look for shuffled data with pattern: - `<station>(_<var>)*_hist(<hist>)_nboots(<nboots>)_shuffled.nc` - - :param station: station name to use as prefix - :param variables: variables to add after station - :return: compiled regular expression - """ - var_regex = "".join([rf"(_\w+)*_{v}(_\w+)*" for v in sorted(variables)]) - regex = re.compile(rf"{station}{var_regex}_hist(\d+)_nboots(\d+)_shuffled\.nc") - return regex - - @staticmethod - def _filter_files(regex: Pattern, files: List[str], window: int, nboot: int) -> Union[str, None]: - """ - Filter list of files by regex. 
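# Illustrative sketch of how labels and original predictions are repeated per bootstrap
# realisation (see get_labels / get_orig_prediction above): np.tile stacks the same block
# once per bootstrap.
import numpy as np

labels = np.array([[1.0, 2.0, 3.0],
                   [4.0, 5.0, 6.0]])                     # (samples, lead time)
number_of_bootstraps = 3
repeated = np.tile(labels, (number_of_bootstraps, 1))
assert repeated.shape == (6, 3)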
- - Regex has to be structured to match the following string structure - `<station>(_<var>)*_hist(<hist>)_nboots(<nboots>)_shuffled.nc`. Hist and nboots values have to be included as - group. All matches are compared to given window and nboot parameters. A valid file must have the same value (or - larger) than these parameters and contain all variables. - - :param regex: compiled regular expression pattern following the style from method description - :param files: list of file names to filter - :param window: minimum length of window to look for - :param nboot: minimal number of boots to search - :return: matching file name or None, if no valid file was found - """ - for f in files: - match = regex.match(f) - if match: - last = match.lastindex - if (int(match.group(last - 1)) >= window) and (int(match.group(last)) >= nboot): - return f - - -if __name__ == "__main__": - - from mlair.run_modules.experiment_setup import ExperimentSetup - from mlair.run_modules.run_environment import RunEnvironment - from mlair.run_modules.pre_processing import PreProcessing - - formatter = '%(asctime)s - %(levelname)s: %(message)s [%(filename)s:%(funcName)s:%(lineno)s]' - logging.basicConfig(format=formatter, level=logging.INFO) - - with RunEnvironment() as run_env: - ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013'], - station_type='background', trainable=True, window_history_size=9) - PreProcessing() - - data = run_env.data_store.get("generator", "general.test") - path = run_env.data_store.get("bootstrap_path", "general") - number_bootstraps = 10 - - boots = BootStraps(data, path, number_bootstraps) - for b in boots.boot_strap_generator(): - a, c = b - logging.info(f"len is {len(boots.get_boot_strap_meta())}") diff --git a/mlair/data_handling/data_distributor.py b/mlair/data_handling/data_distributor.py deleted file mode 100644 index bba5f2636f802e2d6843ef4a5ba5e6537c70dd61..0000000000000000000000000000000000000000 --- a/mlair/data_handling/data_distributor.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Data Distribution Module. - -How to use ----------- - -Create distributor object from a generator object and parse it to the fit generator method. Provide the number of -steps per epoch with distributor's length method. - -.. code-block:: python - - model = YourKerasModel() - data_generator = DataGenerator(*args, **kwargs) - data_distributor = Distributor(data_generator, model, **kwargs) - history = model.fit_generator(generator=data_distributor.distribute_on_batches(), - steps_per_epoch=len(data_distributor), - epochs=10,) - -Additionally, a validation data set can be parsed using the length and distribute methods. -""" - -from __future__ import generator_stop - -__author__ = "Lukas Leufen, Felix Kleinert" -__date__ = '2019-12-05' - -import math - -import keras -import numpy as np - -from mlair.data_handling.data_generator import DataGenerator - - -class Distributor(keras.utils.Sequence): - """Distribute data generator elements according to mini batch size.""" - - def __init__(self, generator: DataGenerator, model: keras.models, batch_size: int = 256, - permute_data: bool = False, upsampling: bool = False): - """ - Set up distributor. 
- - :param generator: The generator object must be iterable and return inputs and targets on each iteration - :param model: a keras model with one or more output branches - :param batch_size: batch size to use - :param permute_data: data is randomly permuted if enabled on each train step - :param upsampling: upsample data with upsample extremes data from generator object and shuffle data or use only - the standard input data. - """ - self.generator = generator - self.model = model - self.batch_size = batch_size - self.do_data_permutation = permute_data - self.upsampling = upsampling - - def _get_model_rank(self): - mod_out = self.model.output_shape - if isinstance(mod_out, tuple): - # only one output branch: (None, ahead) - mod_rank = 1 - elif isinstance(mod_out, list): - # multiple output branches, e.g.: [(None, ahead), (None, ahead)] - mod_rank = len(mod_out) - else: # pragma: no cover - raise TypeError("model output shape must either be tuple or list.") - return mod_rank - - def _get_number_of_mini_batches(self, values): - return math.ceil(values.shape[0] / self.batch_size) - - def _permute_data(self, x, y): - """ - Permute inputs x and labels y if permutation is enabled in instance. - - :param x: inputs - :param y: labels - :return: permuted or original data - """ - if self.do_data_permutation: - p = np.random.permutation(len(x)) # equiv to .shape[0] - x = x[p] - y = y[p] - return x, y - - def distribute_on_batches(self, fit_call=True): - """ - Create generator object to distribute mini batches. - - Split data from given generator object (usually for single station) according to the given batch size. Also - perform upsampling if enabled and random shuffling (either if data permutation is enabled or if upsampling is - enabled). Lastly multiply targets if provided model has multiple output branches. - - :param fit_call: switch to exit while loop after first iteration. This is used to determine the length of all - distributed mini batches. For default, fit_call is True to obtain infinite loop for training. - :return: yields next mini batch - """ - while True: - for k, v in enumerate(self.generator): - # get rank of output - mod_rank = self._get_model_rank() - # get data - x_total = np.copy(v[0]) - y_total = np.copy(v[1]) - if self.upsampling: - try: - s = self.generator.get_data_generator(k) - x_total = np.concatenate([x_total, np.copy(s.get_extremes_history())], axis=0) - y_total = np.concatenate([y_total, np.copy(s.get_extremes_label())], axis=0) - except AttributeError: # no extremes history / labels available, copy will fail - pass - # get number of mini batches - num_mini_batches = self._get_number_of_mini_batches(x_total) - # permute order for mini-batches - x_total, y_total = self._permute_data(x_total, y_total) - for prev, curr in enumerate(range(1, num_mini_batches + 1)): - x = x_total[prev * self.batch_size:curr * self.batch_size, ...] - y = [y_total[prev * self.batch_size:curr * self.batch_size, ...] for _ in range(mod_rank)] - if x is not None: # pragma: no branch - yield x, y - if (k + 1) == len(self.generator) and curr == num_mini_batches and not fit_call: - return - - def __len__(self) -> int: - """ - Total number of distributed mini batches. 
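# Illustrative sketch of the mini-batch logic of the removed Distributor: optional
# permutation, math.ceil(samples / batch_size) batches, and duplication of the targets once
# per output branch (model rank). Standalone numpy only.
import math
import numpy as np

x_total = np.random.random((10, 8, 1, 5))                # (samples, window, station, variables)
y_total = np.random.random((10, 3))                      # (samples, lead time)
batch_size, mod_rank = 4, 2                              # two output branches

p = np.random.permutation(len(x_total))                  # as in _permute_data
x_total, y_total = x_total[p], y_total[p]
num_mini_batches = math.ceil(x_total.shape[0] / batch_size)
for prev, curr in enumerate(range(1, num_mini_batches + 1)):
    x = x_total[prev * batch_size:curr * batch_size]
    y = [y_total[prev * batch_size:curr * batch_size] for _ in range(mod_rank)]
    print(x.shape, len(y))                               # last batch keeps the remaining 2 samples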
- - :return: the length of the distribute on batches object - """ - num_batch = 0 - for _ in self.distribute_on_batches(fit_call=False): - num_batch += 1 - return num_batch diff --git a/mlair/data_handling/data_generator.py b/mlair/data_handling/data_generator.py deleted file mode 100644 index 0088d00a95bf4d741bd3c71d6c0fcb011915d94f..0000000000000000000000000000000000000000 --- a/mlair/data_handling/data_generator.py +++ /dev/null @@ -1,366 +0,0 @@ -"""Data Generator class to handle large arrays for machine learning.""" - -__author__ = 'Felix Kleinert, Lukas Leufen' -__date__ = '2019-11-07' - -import logging -import os -import pickle -from typing import Union, List, Tuple, Any, Dict - -import dask.array as da -import keras -import xarray as xr - -from mlair import helpers -from mlair.data_handling.data_preparation import AbstractDataPrep -from mlair.helpers.join import EmptyQueryResult - -number = Union[float, int] -num_or_list = Union[number, List[number]] -data_or_none = Union[xr.DataArray, None] - - -class DataGenerator(keras.utils.Sequence): - """ - This class is a generator to handle large arrays for machine learning. - - .. code-block:: python - - data_generator = DataGenerator(**args, **kwargs) - - Data generator item can be called manually by position (integer) or station id (string). Methods also accept lists - with exactly one entry of integer or string. - - .. code-block:: - - # select generator elements by position index - first_element = data_generator.get_data_generator([0]) # 1st element - n_element = data_generator.get_data_generator([4]) # 5th element - - # select by name - station_xy = data_generator.get_data_generator(["station_xy"]) # will raise KeyError if not available - - If used as iterator or directly called by get item method, the data generator class returns transposed labels and - history object from underlying data preparation class DataPrep. - - .. code-block:: python - - # select history and label by position - hist, labels = data_generator[0] - # by name - hist, labels = data_generator["station_xy"] - # as iterator - for (hist, labels) in data_generator: - pass - - This class can also be used with keras' fit_generator and predict_generator. Individual stations are the iterables. - """ - - def __init__(self, data_path: str, stations: Union[str, List[str]], variables: List[str], - interpolation_dim: str, target_dim: str, target_var: str, station_type: str = None, - interpolation_method: str = "linear", limit_nan_fill: int = 1, window_history_size: int = 7, - window_lead_time: int = 4, transformation: Dict = None, extreme_values: num_or_list = None, - data_preparation=None, **kwargs): - """ - Set up data generator. - - :param data_path: path to data - :param stations: list with all stations to include - :param variables: list with all used variables - :param interpolation_dim: dimension along which interpolation is applied - :param target_dim: dimension of target variable - :param target_var: name of target variable - :param station_type: TOAR station type classification (background, traffic) - :param interpolation_method: method of interpolation - :param limit_nan_fill: maximum gab in data to fill by interpolation - :param window_history_size: length of the history window - :param window_lead_time: lenght of the label window - :param transformation: transformation method to apply on data - :param extreme_values: set up the extreme value upsampling - :param kwargs: additional kwargs that are used in either DataPrep (transformation, start / stop period, ...) 
- or extreme values - """ - self.data_path = os.path.abspath(data_path) - self.data_path_tmp = os.path.join(os.path.abspath(data_path), "tmp") - if not os.path.exists(self.data_path_tmp): - os.makedirs(self.data_path_tmp) - self.stations = helpers.to_list(stations) - self.variables = variables - self.interpolation_dim = interpolation_dim - self.target_dim = target_dim - self.target_var = target_var - self.station_type = station_type - self.interpolation_method = interpolation_method - self.limit_nan_fill = limit_nan_fill - self.window_history_size = window_history_size - self.window_lead_time = window_lead_time - self.extreme_values = extreme_values - self.DataPrep = data_preparation if data_preparation is not None else AbstractDataPrep - self.kwargs = kwargs - self.transformation = self.setup_transformation(transformation) - - def __repr__(self): - """Display all class attributes.""" - return f"DataGenerator(path='{self.data_path}', stations={self.stations}, " \ - f"variables={self.variables}, station_type={self.station_type}, " \ - f"interpolation_dim='{self.interpolation_dim}', target_dim='{self.target_dim}', " \ - f"target_var='{self.target_var}', **{self.kwargs})" - - def __len__(self): - """Return the number of stations.""" - return len(self.stations) - - def __iter__(self) -> "DataGenerator": - """ - Define the __iter__ part of the iterator protocol to iterate through this generator. - - Sets the private attribute `_iterator` to 0. - """ - self._iterator = 0 - return self - - def __next__(self) -> Tuple[xr.DataArray, xr.DataArray]: - """ - Get the data generator, and return the history and label data of this generator. - - This is the implementation of the __next__ method of the iterator protocol. - """ - if self._iterator < self.__len__(): - data = self.get_data_generator() - self._iterator += 1 - if data.history is not None and data.label is not None: # pragma: no branch - return data.get_transposed_history(), data.get_transposed_label() - else: - self.__next__() # pragma: no cover - else: - raise StopIteration - - def __getitem__(self, item: Union[str, int]) -> Tuple[xr.DataArray, xr.DataArray]: - """ - Define the get item method for this generator. - - Retrieve data from generator and return history and labels. - - :param item: station key to choose the data generator. - :return: The generator's time series of history data and its labels - """ - data = self.get_data_generator(key=item) - return data.get_transposed_history(), data.get_transposed_label() - - def setup_transformation(self, transformation: Dict): - """ - Set up transformation by extracting all relevant information. - - Extract all information from transformation dictionary. Possible keys are scope. method, mean, and std. Scope - can either be station or data. Station scope means, that data transformation is performed for each station - independently (somehow like batch normalisation), whereas data scope means a transformation applied on the - entire data set. - - * If using data scope, mean and standard deviation (each only if required by transformation method) can either - be calculated accurate or as an estimate (faster implementation). This must be set in dictionary either - as "mean": "accurate" or "mean": "estimate". In both cases, the required statistics are calculated and saved. - After this calculations, the mean key is overwritten by the actual values to use. - * If using station scope, no additional information is required. 
- * If a transformation should be applied on base of existing values, these need to be provided in the respective - keys "mean" and "std" (again only if required for given method). - - :param transformation: the transformation dictionary as described above. - - :return: updated transformation dictionary - """ - if transformation is None: - return - transformation = transformation.copy() - scope = transformation.get("scope", "station") - method = transformation.get("method", "standardise") - mean = transformation.get("mean", None) - std = transformation.get("std", None) - if scope == "data": - if isinstance(mean, str): - if mean == "accurate": - mean, std = self.calculate_accurate_transformation(method) - elif mean == "estimate": - mean, std = self.calculate_estimated_transformation(method) - else: - raise ValueError(f"given mean attribute must either be equal to strings 'accurate' or 'estimate' or" - f"be an array with already calculated means. Given was: {mean}") - elif scope == "station": - mean, std = None, None - else: - raise ValueError(f"Scope argument can either be 'station' or 'data'. Given was: {scope}") - transformation["method"] = method - transformation["mean"] = mean - transformation["std"] = std - return transformation - - def calculate_accurate_transformation(self, method: str) -> Tuple[data_or_none, data_or_none]: - """ - Calculate accurate transformation statistics. - - Use all stations of this generator and calculate mean and standard deviation on entire data set using dask. - Because there can be much data, this can take a while. - - :param method: name of transformation method - - :return: accurate calculated mean and std (depending on transformation) - """ - tmp = [] - mean = None - std = None - for station in self.stations: - try: - data = self.DataPrep(self.data_path, station, self.variables, station_type=self.station_type, - **self.kwargs) - chunks = (1, 100, data.data.shape[2]) - tmp.append(da.from_array(data.data.data, chunks=chunks)) - except EmptyQueryResult: - continue - tmp = da.concatenate(tmp, axis=1) - if method in ["standardise", "centre"]: - mean = da.nanmean(tmp, axis=1).compute() - mean = xr.DataArray(mean.flatten(), coords={"variables": sorted(self.variables)}, dims=["variables"]) - if method == "standardise": - std = da.nanstd(tmp, axis=1).compute() - std = xr.DataArray(std.flatten(), coords={"variables": sorted(self.variables)}, dims=["variables"]) - else: - raise NotImplementedError - return mean, std - - def calculate_estimated_transformation(self, method): - """ - Calculate estimated transformation statistics. - - Use all stations of this generator and calculate mean and standard deviation first for each station separately. - Afterwards, calculate the average mean and standard devation as estimated statistics. Because this method does - not consider the length of each data set, the estimated mean distinguishes from the real data mean. Furthermore, - the estimated standard deviation is assumed to be the mean (also not weighted) of all deviations. But this is - mathematically not true, but still a rough and faster estimation of the true standard deviation. Do not use this - method for further statistical calculation. However, in the scope of data preparation for machine learning, this - approach is decent ("it is just scaling"). 
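# Illustrative sketch of the difference between the "accurate" and the "estimate" statistics
# discussed above: the estimate averages per-station means without weighting by station
# length, so it deviates from the true mean of the pooled data.
import numpy as np

station_a = np.array([1.0, 2.0, 3.0, 4.0])               # four samples
station_b = np.array([10.0, 20.0])                       # two samples
accurate_mean = np.concatenate([station_a, station_b]).mean()      # 6.67, length-weighted
estimated_mean = np.mean([station_a.mean(), station_b.mean()])     # 8.75, unweighted mean of means
print(accurate_mean, estimated_mean)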
- - :param method: name of transformation method - - :return: accurate calculated mean and std (depending on transformation) - """ - data = [[]] * len(self.variables) - coords = {"variables": self.variables, "Stations": range(0)} - mean = xr.DataArray(data, coords=coords, dims=["variables", "Stations"]) - std = xr.DataArray(data, coords=coords, dims=["variables", "Stations"]) - for station in self.stations: - try: - data = self.DataPrep(self.data_path, station, self.variables, station_type=self.station_type, - **self.kwargs) - data.transform("datetime", method=method) - mean = mean.combine_first(data.mean) - std = std.combine_first(data.std) - data.transform("datetime", method=method, inverse=True) - except EmptyQueryResult: - continue - return mean.mean("Stations") if mean.shape[1] > 0 else None, std.mean("Stations") if std.shape[1] > 0 else None - - def get_data_generator(self, key: Union[str, int] = None, load_local_tmp_storage: bool = True, - save_local_tmp_storage: bool = True) -> AbstractDataPrep: - """ - Create DataPrep object and preprocess data for given key. - - Select data for given key, create a DataPrep object and - * apply transformation (optional) - * interpolate - * make history, labels, and observation - * remove nans - * upsample extremes (optional). - Processed data can be stored locally in a .pickle file. If load local tmp storage is enabled, the get data - generator tries first to load data from local pickle file and only creates a new DataPrep object if it couldn't - load this data from disk. - - :param key: station key to choose the data generator. - :param load_local_tmp_storage: say if data should be processed from scratch or loaded as already processed data - from tmp pickle file to save computational time (but of course more disk space required). - :param save_local_tmp_storage: save processed data as temporal file locally (default True) - - :return: preprocessed data as a DataPrep instance - """ - station = self.get_station_key(key) - try: - if not load_local_tmp_storage: - raise FileNotFoundError - data = self._load_pickle_data(station, self.variables) - except FileNotFoundError: - logging.debug(f"load not pickle data for {station}") - data = self.DataPrep(self.data_path, station, self.variables, station_type=self.station_type, - **self.kwargs) - if self.transformation is not None: - data.transform("datetime", **helpers.remove_items(self.transformation, "scope")) - data.interpolate(self.interpolation_dim, method=self.interpolation_method, limit=self.limit_nan_fill) - data.make_history_window(self.target_dim, self.window_history_size, self.interpolation_dim) - data.make_labels(self.target_dim, self.target_var, self.interpolation_dim, self.window_lead_time) - data.make_observation(self.target_dim, self.target_var, self.interpolation_dim) - data.remove_nan(self.interpolation_dim) - if self.extreme_values is not None: - kwargs = {"extremes_on_right_tail_only": self.kwargs.get("extremes_on_right_tail_only", False)} - data.multiply_extremes(self.extreme_values, **kwargs) - if save_local_tmp_storage: - self._save_pickle_data(data) - return data - - def _save_pickle_data(self, data: Any): - """ - Save given data locally as .pickle in self.data_path_tmp with name '<station>_<var1>_<var2>_..._<varX>.pickle'. 
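# Illustrative sketch of the local pickle cache used by get_data_generator above: processed
# stations are stored under '<station>_<var1>_..._<start>_<end>_.pickle' and reloaded instead
# of being prepared again. Standalone; writes a placeholder object to a temp directory.
import os
import pickle
import tempfile

station, variables, start, end = "DEBW107", ["temp", "o3"], "1997-01-01", "2017-12-31"
file = os.path.join(tempfile.gettempdir(),
                    f"{station}_{'_'.join(sorted(variables))}_{start}_{end}_.pickle")
with open(file, "wb") as f:
    pickle.dump({"history": [1, 2, 3]}, f)               # placeholder for the prepared data
with open(file, "rb") as f:
    assert pickle.load(f) == {"history": [1, 2, 3]}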
- - :param data: any data, that should be saved - """ - date = f"{self.kwargs.get('start')}_{self.kwargs.get('end')}" - vars = '_'.join(sorted(data.variables)) - station = ''.join(data.station) - file = os.path.join(self.data_path_tmp, f"{station}_{vars}_{date}_.pickle") - with open(file, "wb") as f: - pickle.dump(data, f) - logging.debug(f"save pickle data to {file}") - - def _load_pickle_data(self, station: Union[str, List[str]], variables: List[str]) -> Any: - """ - Load locally saved data from self.data_path_tmp and name '<station>_<var1>_<var2>_..._<varX>.pickle'. - - :param station: station to load - :param variables: list of variables to load - :return: loaded data - """ - date = f"{self.kwargs.get('start')}_{self.kwargs.get('end')}" - vars = '_'.join(sorted(variables)) - station = ''.join(station) - file = os.path.join(self.data_path_tmp, f"{station}_{vars}_{date}_.pickle") - with open(file, "rb") as f: - data = pickle.load(f) - logging.debug(f"load pickle data from {file}") - return data - - def get_station_key(self, key: Union[None, str, int, List[Union[None, str, int]]]) -> str: - """ - Return a valid station key or raise KeyError if this wasn't possible. - - :param key: station key to choose the data generator. - :return: station key (id from database) - """ - # extract value if given as list - if isinstance(key, list): - if len(key) == 1: - key = key[0] - else: - raise KeyError(f"More than one key was given: {key}") - # return station name either from key or the recent element from iterator - if key is None: - return self.stations[self._iterator] - else: - if isinstance(key, int): - if key < self.__len__(): - return self.stations[key] - else: - raise KeyError(f"{key} is not in range(0, {self.__len__()})") - elif isinstance(key, str): - if key in self.stations: - return key - else: - raise KeyError(f"{key} is not in stations") - else: - raise KeyError(f"Key has to be from Union[str, int]. Given was {key} ({type(key)})") diff --git a/mlair/data_handling/data_preparation_join.py b/mlair/data_handling/data_preparation_join.py deleted file mode 100644 index 516be5b3d4cebdbca4e9328f4886988008efbeb8..0000000000000000000000000000000000000000 --- a/mlair/data_handling/data_preparation_join.py +++ /dev/null @@ -1,124 +0,0 @@ -"""Data Preparation class to handle data processing for machine learning.""" - -__author__ = 'Felix Kleinert, Lukas Leufen' -__date__ = '2019-10-16' - -import datetime as dt -import inspect -import logging -from typing import Union, List - -import pandas as pd -import xarray as xr - -from mlair import helpers -from mlair.helpers import join -from mlair.data_handling.data_preparation import AbstractDataPrep - -# define a more general date type for type hinting -date = Union[dt.date, dt.datetime] -str_or_list = Union[str, List[str]] -number = Union[float, int] -num_or_list = Union[number, List[number]] -data_or_none = Union[xr.DataArray, None] - - -class DataPrepJoin(AbstractDataPrep): - """ - This class prepares data to be used in neural networks. - - The instance searches for local stored data, that meet the given demands. If no local data is found, the DataPrep - instance will load data from TOAR database and store this data locally to use the next time. For the moment, there - is only support for daily aggregated time series. The aggregation can be set manually and differ for each variable. - - After data loading, different data pre-processing steps can be executed to prepare the data for further - applications. 
Especially the following methods can be used for the pre-processing step: - - - interpolate: interpolate between data points by using xarray's interpolation method - - standardise: standardise data to mean=1 and std=1, centralise to mean=0, additional methods like normalise on \ - interval [0, 1] are not implemented yet. - - make window history: represent the history (time steps before) for training/ testing; X - - make labels: create target vector with given leading time steps for training/ testing; y - - remove Nans jointly from desired input and output, only keeps time steps where no NaNs are present in X AND y. \ - Use this method after the creation of the window history and labels to clean up the data cube. - - To create a DataPrep instance, it is needed to specify the stations by id (e.g. "DEBW107"), its network (e.g. UBA, - "Umweltbundesamt") and the variables to use. Further options can be set in the instance. - - * `statistics_per_var`: define a specific statistic to extract from the TOAR database for each variable. - * `start`: define a start date for the data cube creation. Default: Use the first entry in time series - * `end`: set the end date for the data cube. Default: Use last date in time series. - * `store_data_locally`: store recently downloaded data on local disk. Default: True - * set further parameters for xarray's interpolation methods to modify the interpolation scheme - - """ - - def __init__(self, path: str, station: Union[str, List[str]], variables: List[str], network: str = None, - station_type: str = None, **kwargs): - self.network = network - self.station_type = station_type - params = helpers.remove_items(inspect.getfullargspec(AbstractDataPrep.__init__).args, "self") - kwargs = {**{k: v for k, v in locals().items() if k in params and v is not None}, **kwargs} - super().__init__(**kwargs) - - def download_data(self, file_name, meta_file): - """ - Download data and meta from join. - - :param file_name: name of file to save data to (containing full path) - :param meta_file: name of the meta data file (also containing full path) - """ - data, meta = self.download_data_from_join(file_name, meta_file) - return data, meta - - def check_station_meta(self): - """ - Search for the entries in meta data and compare the value with the requested values. - - Will raise a FileNotFoundError if the values mismatch. - """ - if self.station_type is not None: - check_dict = {"station_type": self.station_type, "network_name": self.network} - for (k, v) in check_dict.items(): - if v is None: - continue - if self.meta.at[k, self.station[0]] != v: - logging.debug(f"meta data does not agree with given request for {k}: {v} (requested) != " - f"{self.meta.at[k, self.station[0]]} (local). Raise FileNotFoundError to trigger new " - f"grapping from web.") - raise FileNotFoundError - - def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]: - """ - Download data from TOAR database using the JOIN interface. - - Data is transformed to a xarray dataset. If class attribute store_data_locally is true, data is additionally - stored locally using given names for file and meta file. 
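# Illustrative sketch of the JOIN download post-processing described above (its body follows
# in the next hunk): per-station DataFrames are converted to DataArrays and stacked along a
# new 'Stations' dimension. Toy data instead of a real JOIN response.
import pandas as pd
import xarray as xr

df = pd.DataFrame({"o3": [30.0, 32.5], "temp": [12.1, 14.0]},
                  index=pd.date_range("2020-01-01", periods=2, freq="D", name="datetime"))
df_all = {"DEBW107": df}
xarr = {k: xr.DataArray(v, dims=["datetime", "variables"]) for k, v in df_all.items()}
xarr = xr.Dataset(xarr).to_array(dim="Stations")
assert xarr.dims == ("Stations", "datetime", "variables")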
- - :param file_name: name of file to save data to (containing full path) - :param meta_file: name of the meta data file (also containing full path) - - :return: downloaded data and its meta data - """ - df_all = {} - df, meta = join.download_join(station_name=self.station, stat_var=self.statistics_per_var, - station_type=self.station_type, network_name=self.network, sampling=self.sampling) - df_all[self.station[0]] = df - # convert df_all to xarray - xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()} - xarr = xr.Dataset(xarr).to_array(dim='Stations') - if self.kwargs.get('store_data_locally', True): - # save locally as nc/csv file - xarr.to_netcdf(path=file_name) - meta.to_csv(meta_file) - return xarr, meta - - def __repr__(self): - """Represent class attributes.""" - return f"Dataprep(path='{self.path}', network='{self.network}', station={self.station}, " \ - f"variables={self.variables}, station_type={self.station_type}, **{self.kwargs})" - - -if __name__ == "__main__": - dp = DataPrepJoin('data/', 'dummy', 'DEBW107', ['o3', 'temp'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}) - print(dp) diff --git a/mlair/helpers/__init__.py b/mlair/helpers/__init__.py index 546713b3f18f2cb64c1527b57d1e9e2138e927aa..9e2f612c86dc0477693567210493fbdcf3002954 100644 --- a/mlair/helpers/__init__.py +++ b/mlair/helpers/__init__.py @@ -3,4 +3,4 @@ from .testing import PyTestRegex, PyTestAllEqual from .time_tracking import TimeTracking, TimeTrackingWrapper from .logger import Logger -from .helpers import remove_items, float_round, dict_to_xarray, to_list +from .helpers import remove_items, float_round, dict_to_xarray, to_list, extract_value diff --git a/mlair/helpers/helpers.py b/mlair/helpers/helpers.py index 968ee5385f5a44cdbbce5653a864875011874150..b12d9028747aa677802c4a99e35852b514128e4c 100644 --- a/mlair/helpers/helpers.py +++ b/mlair/helpers/helpers.py @@ -92,3 +92,10 @@ def remove_items(obj: Union[List, Dict], items: Any): return remove_from_dict(obj, items) else: raise TypeError(f"{inspect.stack()[0][3]} does not support type {type(obj)}.") + + +def extract_value(encapsulated_value): + try: + return extract_value(encapsulated_value[0]) + except TypeError: + return encapsulated_value diff --git a/mlair/model_modules/linear_model.py b/mlair/model_modules/linear_model.py index e556f0358a2a5e5247f7b6cc7d416af25a8a664d..341c787e3060fd7e7cc3ff468ba40add9b9936d2 100644 --- a/mlair/model_modules/linear_model.py +++ b/mlair/model_modules/linear_model.py @@ -42,21 +42,27 @@ class OrdinaryLeastSquaredModel: return self.ordinary_least_squared_model(self.x, self.y) def _set_x_y_from_generator(self): - data_x = None - data_y = None + data_x, data_y = None, None for item in self.generator: - x = self.reshape_xarray_to_numpy(item[0]) - y = item[1].values - data_x = np.concatenate((data_x, x), axis=0) if data_x is not None else x - data_y = np.concatenate((data_y, y), axis=0) if data_y is not None else y - self.x = data_x - self.y = data_y + x, y = item.get_data(as_numpy=True) + x = self.flatten(x) + data_x = self._concatenate(x, data_x) + data_y = self._concatenate(y, data_y) + self.x, self.y = np.concatenate(data_x, axis=1), data_y[0] + + def _concatenate(self, new, old): + return list(map(lambda n1, n2: np.concatenate((n1, n2), axis=0), old, new)) if old is not None else new def predict(self, data): """Apply OLS model on data.""" - data = sm.add_constant(self.reshape_xarray_to_numpy(data), has_constant="add") + data = 
sm.add_constant(np.concatenate(self.flatten(data), axis=1), has_constant="add") return np.atleast_2d(self.model.predict(data)) + @staticmethod + def flatten(data): + shapes = list(map(lambda x: x.shape, data)) + return list(map(lambda x, shape: x.reshape(shape[0], -1), data, shapes)) + @staticmethod def reshape_xarray_to_numpy(data): """Reshape xarray data to numpy data and flatten.""" diff --git a/mlair/model_modules/model_class.py b/mlair/model_modules/model_class.py index b1779ecd7c087519e8cb8e78b2c9998214d12758..56e7b4c347a69781854a9cf8ad9a719f7d6ac8b9 100644 --- a/mlair/model_modules/model_class.py +++ b/mlair/model_modules/model_class.py @@ -139,7 +139,7 @@ class AbstractModelClass(ABC): the corresponding loss function. """ - def __init__(self) -> None: + def __init__(self, shape_inputs, shape_outputs) -> None: """Predefine internal attributes for model and loss.""" self.__model = None self.model_name = self.__class__.__name__ @@ -153,6 +153,8 @@ class AbstractModelClass(ABC): 'target_tensors': None } self.__compile_options = self.__allowed_compile_options + self.shape_inputs = shape_inputs + self.shape_outputs = self.__extract_from_tuple(shape_outputs) def __getattr__(self, name: str) -> Any: """ @@ -273,6 +275,11 @@ class AbstractModelClass(ABC): raise ValueError( f"Got different values or arguments for same argument: self.{allow_k}={new_v_attr.__class__} and '{allow_k}': {new_v_dic.__class__}") + @staticmethod + def __extract_from_tuple(tup): + """Return element of tuple if it contains only a single element.""" + return tup[0] if isinstance(tup, tuple) and len(tup) == 1 else tup + @staticmethod def __compare_keras_optimizers(first, second): if first.__class__ == second.__class__ and first.__module__ == 'keras.optimizers': @@ -340,24 +347,19 @@ class MyLittleModel(AbstractModelClass): Dense layer. """ - def __init__(self, window_history_size, window_lead_time, channels): + def __init__(self, shape_inputs: list, shape_outputs: list): """ Sets model and loss depending on the given arguments. - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param regularizer: <not used here> - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer + :param shape_inputs: list of input shapes (expect len=1 with shape=(window_hist, station, variables)) + :param shape_outputs: list of output shapes (expect len=1 with shape=(window_forecast)) """ - super().__init__() + assert len(shape_inputs) == 1 + assert len(shape_outputs) == 1 + super().__init__(shape_inputs[0], shape_outputs[0]) # settings - self.window_history_size = window_history_size - self.window_lead_time = window_lead_time - self.channels = channels self.dropout_rate = 0.1 self.regularizer = keras.regularizers.l2(0.1) self.activation = keras.layers.PReLU @@ -370,17 +372,10 @@ class MyLittleModel(AbstractModelClass): def set_model(self): """ Build the model. 
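# Illustrative sketch of the reworked OLS input handling above: every input branch is
# flattened to (samples, features) and the branches are concatenated into one design matrix;
# the new `extract_value` helper unwraps nested single-element containers. Standalone numpy.
import numpy as np

def extract_value(encapsulated_value):
    try:
        return extract_value(encapsulated_value[0])
    except TypeError:
        return encapsulated_value

assert extract_value([[3]]) == 3

x_branches = [np.random.random((6, 8, 1, 5)), np.random.random((6, 4, 1, 3))]   # two input branches
flat = [x.reshape(x.shape[0], -1) for x in x_branches]                          # as in `flatten`
design_matrix = np.concatenate(flat, axis=1)
assert design_matrix.shape == (6, 8 * 1 * 5 + 4 * 1 * 3)                        # (6, 52)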
- - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer - :return: built keras model """ # add 1 to window_size to include current time step t0 - x_input = keras.layers.Input(shape=(self.window_history_size + 1, 1, self.channels)) + x_input = keras.layers.Input(shape=self.shape_inputs) x_in = keras.layers.Conv2D(32, (1, 1), padding='same', name='{}_Conv_1x1'.format("major"))(x_input) x_in = self.activation(name='{}_conv_act'.format("major"))(x_in) x_in = keras.layers.Flatten(name='{}'.format("major"))(x_in) @@ -391,16 +386,16 @@ class MyLittleModel(AbstractModelClass): x_in = self.activation()(x_in) x_in = keras.layers.Dense(16, name='{}_Dense_16'.format("major"))(x_in) x_in = self.activation()(x_in) - x_in = keras.layers.Dense(self.window_lead_time, name='{}_Dense'.format("major"))(x_in) + x_in = keras.layers.Dense(self.shape_outputs, name='{}_Dense'.format("major"))(x_in) out_main = self.activation()(x_in) self.model = keras.Model(inputs=x_input, outputs=[out_main]) def set_compile_options(self): self.initial_lr = 1e-2 - self.optimizer = keras.optimizers.SGD(lr=self.initial_lr, momentum=0.9) + self.optimizer = keras.optimizers.adam(lr=self.initial_lr) self.lr_decay = mlair.model_modules.keras_extensions.LearningRateDecay(base_lr=self.initial_lr, drop=.94, epochs_drop=10) - self.compile_options = {"loss": keras.losses.mean_squared_error, "metrics": ["mse", "mae"]} + self.compile_options = {"loss": [keras.losses.mean_squared_error], "metrics": ["mse", "mae"]} class MyBranchedModel(AbstractModelClass): @@ -412,24 +407,19 @@ class MyBranchedModel(AbstractModelClass): Dense layer. """ - def __init__(self, window_history_size, window_lead_time, channels): + def __init__(self, shape_inputs: list, shape_outputs: list): """ Sets model and loss depending on the given arguments. - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param regularizer: <not used here> - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer + :param shape_inputs: list of input shapes (expect len=1 with shape=(window_hist, station, variables)) + :param shape_outputs: list of output shapes (expect len=1 with shape=(window_forecast)) """ - super().__init__() + assert len(shape_inputs) == 1 + assert len(shape_outputs) == 1 + super().__init__(shape_inputs[0], shape_outputs[0]) # settings - self.window_history_size = window_history_size - self.window_lead_time = window_lead_time - self.channels = channels self.dropout_rate = 0.1 self.regularizer = keras.regularizers.l2(0.1) self.activation = keras.layers.PReLU @@ -442,32 +432,25 @@ class MyBranchedModel(AbstractModelClass): def set_model(self): """ Build the model. 
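# Illustrative sketch of the new shape-driven model construction above: instead of
# window_history_size/channels, the model receives lists of input and output shapes, builds
# its input layer directly from shape_inputs and its output width from shape_outputs.
# Minimal standalone keras model, not the MLAir class itself.
import keras

shape_inputs, shape_outputs = [(15, 1, 5)], [(3,)]       # (window_hist + 1, station, variables), (lead time,)
assert len(shape_inputs) == 1 and len(shape_outputs) == 1
n_out = shape_outputs[0][0]                              # what __extract_from_tuple yields for (3,)

x_input = keras.layers.Input(shape=shape_inputs[0])
x = keras.layers.Flatten()(x_input)
x = keras.layers.Dense(16, activation="relu")(x)
out = keras.layers.Dense(n_out)(x)
model = keras.Model(inputs=x_input, outputs=[out])
model.compile(optimizer=keras.optimizers.Adam(lr=1e-2),
              loss=[keras.losses.mean_squared_error], metrics=["mse", "mae"])
print(model.output_shape)                                # (None, 3)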
- - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer - :return: built keras model """ # add 1 to window_size to include current time step t0 - x_input = keras.layers.Input(shape=(self.window_history_size + 1, 1, self.channels)) + x_input = keras.layers.Input(shape=self.shape_inputs) x_in = keras.layers.Conv2D(32, (1, 1), padding='same', name='{}_Conv_1x1'.format("major"))(x_input) x_in = self.activation(name='{}_conv_act'.format("major"))(x_in) x_in = keras.layers.Flatten(name='{}'.format("major"))(x_in) x_in = keras.layers.Dropout(self.dropout_rate, name='{}_Dropout_1'.format("major"))(x_in) x_in = keras.layers.Dense(64, name='{}_Dense_64'.format("major"))(x_in) x_in = self.activation()(x_in) - out_minor_1 = keras.layers.Dense(self.window_lead_time, name='{}_Dense'.format("minor_1"))(x_in) + out_minor_1 = keras.layers.Dense(self.shape_outputs, name='{}_Dense'.format("minor_1"))(x_in) out_minor_1 = self.activation(name="minor_1")(out_minor_1) x_in = keras.layers.Dense(32, name='{}_Dense_32'.format("major"))(x_in) x_in = self.activation()(x_in) - out_minor_2 = keras.layers.Dense(self.window_lead_time, name='{}_Dense'.format("minor_2"))(x_in) + out_minor_2 = keras.layers.Dense(self.shape_outputs, name='{}_Dense'.format("minor_2"))(x_in) out_minor_2 = self.activation(name="minor_2")(out_minor_2) x_in = keras.layers.Dense(16, name='{}_Dense_16'.format("major"))(x_in) x_in = self.activation()(x_in) - x_in = keras.layers.Dense(self.window_lead_time, name='{}_Dense'.format("major"))(x_in) + x_in = keras.layers.Dense(self.shape_outputs, name='{}_Dense'.format("major"))(x_in) out_main = self.activation(name="main")(x_in) self.model = keras.Model(inputs=x_input, outputs=[out_minor_1, out_minor_2, out_main]) @@ -482,24 +465,19 @@ class MyBranchedModel(AbstractModelClass): class MyTowerModel(AbstractModelClass): - def __init__(self, window_history_size, window_lead_time, channels): + def __init__(self, shape_inputs: list, shape_outputs: list): """ Sets model and loss depending on the given arguments. - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param regularizer: <not used here> - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer + :param shape_inputs: list of input shapes (expect len=1 with shape=(window_hist, station, variables)) + :param shape_outputs: list of output shapes (expect len=1 with shape=(window_forecast)) """ - super().__init__() + assert len(shape_inputs) == 1 + assert len(shape_outputs) == 1 + super().__init__(shape_inputs[0], shape_outputs[0]) # settings - self.window_history_size = window_history_size - self.window_lead_time = window_lead_time - self.channels = channels self.dropout_rate = 1e-2 self.regularizer = keras.regularizers.l2(0.1) self.initial_lr = 1e-2 @@ -515,13 +493,6 @@ class MyTowerModel(AbstractModelClass): def set_model(self): """ Build the model. 
- - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer - :return: built keras model """ activation = self.activation conv_settings_dict1 = { @@ -555,9 +526,7 @@ class MyTowerModel(AbstractModelClass): ########################################## inception_model = InceptionModelBase() - X_input = keras.layers.Input( - shape=( - self.window_history_size + 1, 1, self.channels)) # add 1 to window_size to include current time step t0 + X_input = keras.layers.Input(shape=self.shape_inputs) X_in = inception_model.inception_block(X_input, conv_settings_dict1, pool_settings_dict1, regularizer=self.regularizer, @@ -579,7 +548,7 @@ class MyTowerModel(AbstractModelClass): # out_main = flatten_tail(X_in, 'Main', activation=activation, bound_weight=True, dropout_rate=self.dropout_rate, # reduction_filter=64, inner_neurons=64, output_neurons=self.window_lead_time) - out_main = flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=self.window_lead_time, + out_main = flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=self.shape_outputs, output_activation='linear', reduction_filter=64, name='Main', bound_weight=True, dropout_rate=self.dropout_rate, kernel_regularizer=self.regularizer @@ -594,24 +563,19 @@ class MyTowerModel(AbstractModelClass): class MyPaperModel(AbstractModelClass): - def __init__(self, window_history_size, window_lead_time, channels): + def __init__(self, shape_inputs: list, shape_outputs: list): """ Sets model and loss depending on the given arguments. 
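# A minimal sketch of a custom model written against the new AbstractModelClass interface from
# this diff, which now takes shape_inputs/shape_outputs instead of window_history_size,
# window_lead_time and channels. The class name, layer sizes and the __init__ wiring
# (calling set_model and set_compile_options) are illustrative assumptions that follow the
# pattern of MyLittleModel above; this is not part of the patch itself.
import keras

from mlair.model_modules.model_class import AbstractModelClass


class MyTinyModel(AbstractModelClass):

    def __init__(self, shape_inputs: list, shape_outputs: list):
        assert len(shape_inputs) == 1 and len(shape_outputs) == 1
        super().__init__(shape_inputs[0], shape_outputs[0])
        self.set_model()
        self.set_compile_options()

    def set_model(self):
        # shape_inputs is the per-sample input shape, e.g. (window_history_size + 1, 1, channels)
        x_input = keras.layers.Input(shape=self.shape_inputs)
        x_in = keras.layers.Flatten()(x_input)
        x_in = keras.layers.Dense(16, activation="relu")(x_in)
        # shape_outputs has been reduced to a plain integer (the forecast window) by the base class
        out_main = keras.layers.Dense(self.shape_outputs)(x_in)
        self.model = keras.Model(inputs=x_input, outputs=[out_main])

    def set_compile_options(self):
        self.optimizer = keras.optimizers.SGD(lr=1e-2, momentum=0.9)
        self.compile_options = {"loss": [keras.losses.mean_squared_error], "metrics": ["mse"]}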
- :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param regularizer: <not used here> - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer + :param shape_inputs: list of input shapes (expect len=1 with shape=(window_hist, station, variables)) + :param shape_outputs: list of output shapes (expect len=1 with shape=(window_forecast)) """ - super().__init__() + assert len(shape_inputs) == 1 + assert len(shape_outputs) == 1 + super().__init__(shape_inputs[0], shape_outputs[0]) # settings - self.window_history_size = window_history_size - self.window_lead_time = window_lead_time - self.channels = channels self.dropout_rate = .3 self.regularizer = keras.regularizers.l2(0.001) self.initial_lr = 1e-3 @@ -676,9 +640,7 @@ class MyPaperModel(AbstractModelClass): ########################################## inception_model = InceptionModelBase() - X_input = keras.layers.Input( - shape=( - self.window_history_size + 1, 1, self.channels)) # add 1 to window_size to include current time step t0 + X_input = keras.layers.Input(shape=self.shape_inputs) pad_size = PadUtils.get_padding_for_same(first_kernel) # X_in = adv_pad.SymmetricPadding2D(padding=pad_size)(X_input) @@ -696,7 +658,7 @@ class MyPaperModel(AbstractModelClass): padding=self.padding) # out_minor1 = flatten_tail(X_in, 'minor_1', False, self.dropout_rate, self.window_lead_time, # self.activation, 32, 64) - out_minor1 = flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=self.window_lead_time, + out_minor1 = flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=self.shape_outputs, output_activation='linear', reduction_filter=32, name='minor_1', bound_weight=False, dropout_rate=self.dropout_rate, kernel_regularizer=self.regularizer @@ -714,7 +676,7 @@ class MyPaperModel(AbstractModelClass): # batch_normalisation=True) ############################################# - out_main = flatten_tail(X_in, inner_neurons=64 * 2, activation=activation, output_neurons=self.window_lead_time, + out_main = flatten_tail(X_in, inner_neurons=64 * 2, activation=activation, output_neurons=self.shape_outputs, output_activation='linear', reduction_filter=64 * 2, name='Main', bound_weight=False, dropout_rate=self.dropout_rate, kernel_regularizer=self.regularizer diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index ff5c2bc3ee2ef0923ac50f91ce5acd6807e1eb2e..5cc449aac88ebab58689656820769fe7751f6098 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -19,7 +19,7 @@ import xarray as xr from matplotlib.backends.backend_pdf import PdfPages from mlair import helpers -from mlair.data_handling import DataGenerator +from mlair.data_handler.iterator import DataCollection from mlair.helpers import TimeTrackingWrapper logging.getLogger('matplotlib').setLevel(logging.WARNING) @@ -236,12 +236,10 @@ class PlotStationMap(AbstractPlotClass): import cartopy.crs as ccrs if generators is not None: - for color, gen in generators.items(): - for k, v in enumerate(gen): - station_coords = gen.get_data_generator(k).meta.loc[['station_lon', 'station_lat']] - # station_names = gen.get_data_generator(k).meta.loc[['station_id']] - IDx, IDy = float(station_coords.loc['station_lon'].values), float( - 
station_coords.loc['station_lat'].values) + for color, data_collection in generators.items(): + for station in data_collection: + coords = station.get_coordinates() + IDx, IDy = coords["lon"], coords["lat"] self._ax.plot(IDx, IDy, mfc=color, mec='k', marker='s', markersize=6, transform=ccrs.PlateCarree()) def _plot(self, generators: Dict): @@ -713,6 +711,8 @@ class PlotBootstrapSkillScore(AbstractPlotClass): """ data = helpers.dict_to_xarray(data, "station").sortby(self._x_name) self._labels = [str(i) + "d" for i in data.coords["ahead"].values] + if "station" not in data.dims: + data = data.expand_dims("station") return data.to_dataframe("data").reset_index(level=[0, 1, 2]) def _label_add(self, score_only: bool): @@ -785,8 +785,8 @@ class PlotTimeSeries: def _plot(self, plot_folder): pdf_pages = self._create_pdf_pages(plot_folder) - start, end = self._get_time_range(self._load_data(self._stations[0])) for pos, station in enumerate(self._stations): + start, end = self._get_time_range(self._load_data(self._stations[0])) data = self._load_data(station) fig, axes, factor = self._create_subplots(start, end) nan_list = [] @@ -896,11 +896,12 @@ class PlotAvailability(AbstractPlotClass): """ - def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily", - summary_name="data availability"): + def __init__(self, generators: Dict[str, DataCollection], plot_folder: str = ".", sampling="daily", + summary_name="data availability", time_dimension="datetime"): """Initialise.""" # create standard Gantt plot for all stations (currently in single pdf file with single page) super().__init__(plot_folder, "data_availability") + self.dim = time_dimension self.sampling = self._get_sampling(sampling) plot_dict = self._prepare_data(generators) lgd = self._plot(plot_dict) @@ -923,34 +924,30 @@ class PlotAvailability(AbstractPlotClass): elif sampling == "hourly": return "h" - def _prepare_data(self, generators: Dict[str, DataGenerator]): + def _prepare_data(self, generators: Dict[str, DataCollection]): plt_dict = {} - for subset, generator in generators.items(): - stations = generator.stations - for station in stations: - station_data = generator.get_data_generator(station) - labels = station_data.get_transposed_label().resample(datetime=self.sampling, skipna=True).mean() + for subset, data_collection in generators.items(): + for station in data_collection: + labels = station.get_Y(as_numpy=False).resample({self.dim: self.sampling}, skipna=True).mean() labels_bool = labels.sel(window=1).notnull() - group = (labels_bool != labels_bool.shift(datetime=1)).cumsum() + group = (labels_bool != labels_bool.shift({self.dim: 1})).cumsum() plot_data = pd.DataFrame({"avail": labels_bool.values, "group": group.values}, - index=labels.datetime.values) + index=labels.coords[self.dim].values) t = plot_data.groupby("group").apply(lambda x: (x["avail"].head(1)[0], x.index[0], x.shape[0])) t2 = [i[1:] for i in t if i[0]] - if plt_dict.get(station) is None: - plt_dict[station] = {subset: t2} + if plt_dict.get(str(station)) is None: + plt_dict[str(station)] = {subset: t2} else: - plt_dict[station].update({subset: t2}) + plt_dict[str(station)].update({subset: t2}) return plt_dict - def _summarise_data(self, generators: Dict[str, DataGenerator], summary_name: str): + def _summarise_data(self, generators: Dict[str, DataCollection], summary_name: str): plt_dict = {} - for subset, generator in generators.items(): + for subset, data_collection in generators.items(): all_data = None - stations = 
generator.stations - for station in stations: - station_data = generator.get_data_generator(station) - labels = station_data.get_transposed_label().resample(datetime=self.sampling, skipna=True).mean() + for station in data_collection: + labels = station.get_Y(as_numpy=False).resample({self.dim: self.sampling}, skipna=True).mean() labels_bool = labels.sel(window=1).notnull() if all_data is None: all_data = labels_bool @@ -959,8 +956,9 @@ class PlotAvailability(AbstractPlotClass): all_data = np.logical_or(tmp, labels_bool).combine_first( all_data) # apply logical on merge and fill missing with all_data - group = (all_data != all_data.shift(datetime=1)).cumsum() - plot_data = pd.DataFrame({"avail": all_data.values, "group": group.values}, index=all_data.datetime.values) + group = (all_data != all_data.shift({self.dim: 1})).cumsum() + plot_data = pd.DataFrame({"avail": all_data.values, "group": group.values}, + index=all_data.coords[self.dim].values) t = plot_data.groupby("group").apply(lambda x: (x["avail"].head(1)[0], x.index[0], x.shape[0])) t2 = [i[1:] for i in t if i[0]] if plt_dict.get(summary_name) is None: diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index d93b8c02641acf3127cd63d0814709cc1f56cee2..407465ad4cd99b85c3c5b37eb2aef6e9e71c6424 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -13,12 +13,12 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_HPC_LOGIN_LIST, DEFAULT_HPC_HOST_LIST, DEFAULT_CREATE_NEW_MODEL, DEFAULT_TRAINABLE, \ DEFAULT_FRACTION_OF_TRAINING, DEFAULT_EXTREME_VALUES, DEFAULT_EXTREMES_ON_RIGHT_TAIL_ONLY, DEFAULT_PERMUTE_DATA, \ DEFAULT_BATCH_SIZE, DEFAULT_EPOCHS, DEFAULT_TARGET_VAR, DEFAULT_TARGET_DIM, DEFAULT_WINDOW_LEAD_TIME, \ - DEFAULT_DIMENSIONS, DEFAULT_INTERPOLATION_DIM, DEFAULT_INTERPOLATION_METHOD, DEFAULT_LIMIT_NAN_FILL, \ + DEFAULT_DIMENSIONS, DEFAULT_TIME_DIM, DEFAULT_INTERPOLATION_METHOD, DEFAULT_INTERPOLATION_LIMIT, \ DEFAULT_TRAIN_START, DEFAULT_TRAIN_END, DEFAULT_TRAIN_MIN_LENGTH, DEFAULT_VAL_START, DEFAULT_VAL_END, \ DEFAULT_VAL_MIN_LENGTH, DEFAULT_TEST_START, DEFAULT_TEST_END, DEFAULT_TEST_MIN_LENGTH, DEFAULT_TRAIN_VAL_MIN_LENGTH, \ DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \ DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST -from mlair.data_handling import DataPrepJoin +from mlair.data_handler.advanced_data_handler import DefaultDataPreparation from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.model_class import MyLittleModel as VanillaModel @@ -50,8 +50,6 @@ class ExperimentSetup(RunEnvironment): * `plot_path` [.] * `forecast_path` [.] * `stations` [.] - * `network` [.] - * `station_type` [.] * `statistics_per_var` [.] * `variables` [.] * `start` [.] @@ -66,7 +64,7 @@ class ExperimentSetup(RunEnvironment): # interpolation self._set_param("dimensions", dimensions, default={'new_index': ['datetime', 'Stations']}) - self._set_param("interpolation_dim", interpolation_dim, default='datetime') + self._set_param("time_dim", time_dim, default='datetime') self._set_param("interpolation_method", interpolation_method, default='linear') self._set_param("limit_nan_fill", limit_nan_fill, default=1) @@ -116,10 +114,6 @@ class ExperimentSetup(RunEnvironment): investigations are stored outside this structure. :param stations: list of stations or single station to use in experiment. 
If not provided, stations are set to :py:const:`default stations <DEFAULT_STATIONS>`. - :param network: name of network to restrict to use only stations from this measurement network. Default is - `AIRBASE` . - :param station_type: restrict network type to one of TOAR's categories (background, traffic, industrial). Default is - `None` to use all categories. :param variables: list of all variables to use. Valid names can be found in `Section 2.1 Parameters <https://join.fz-juelich.de/services/rest/surfacedata/>`_. If not provided, this parameter is filled with keys from ``statistics_per_var``. @@ -140,7 +134,7 @@ class ExperimentSetup(RunEnvironment): :param window_lead_time: number of time steps to predict by model (default 3). Time steps `t_0+1` to `t_0+w` are predicted. :param dimensions: - :param interpolation_dim: + :param time_dim: :param interpolation_method: :param limit_nan_fill: :param train_start: @@ -209,8 +203,6 @@ class ExperimentSetup(RunEnvironment): def __init__(self, experiment_date=None, stations: Union[str, List[str]] = None, - network: str = None, - station_type: str = None, variables: Union[str, List[str]] = None, statistics_per_var: Dict = None, start: str = None, @@ -220,16 +212,16 @@ class ExperimentSetup(RunEnvironment): target_dim=None, window_lead_time: int = None, dimensions=None, - interpolation_dim=None, + time_dim=None, interpolation_method=None, - limit_nan_fill=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, + interpolation_limit=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None, use_all_stations_on_all_data_sets=None, trainable: bool = None, fraction_of_train: float = None, experiment_path=None, plot_path: str = None, forecast_path: str = None, overwrite_local_data = None, sampling: str = "daily", create_new_model = None, bootstrap_path=None, permute_data_on_training = None, transformation=None, train_min_length=None, val_min_length=None, test_min_length=None, extreme_values: list = None, extremes_on_right_tail_only: bool = None, evaluate_bootstraps=None, plot_list=None, number_of_bootstraps=None, - create_new_bootstraps=None, data_path: str = None, login_nodes=None, hpc_hosts=None, model=None, - batch_size=None, epochs=None, data_preparation=None): + create_new_bootstraps=None, data_path: str = None, batch_path: str = None, login_nodes=None, + hpc_hosts=None, model=None, batch_size=None, epochs=None, data_preparation=None, **kwargs): # create run framework super().__init__() @@ -265,6 +257,9 @@ class ExperimentSetup(RunEnvironment): logging.info(f"Experiment path is: {experiment_path}") path_config.check_path_and_create(self.data_store.get("experiment_path")) + # batch path (temporary) + self._set_param("batch_path", batch_path, default=os.path.join(experiment_path, "batch_data")) + # set model path self._set_param("model_path", None, os.path.join(experiment_path, "model")) path_config.check_path_and_create(self.data_store.get("model_path")) @@ -285,8 +280,6 @@ class ExperimentSetup(RunEnvironment): # setup for data self._set_param("stations", stations, default=DEFAULT_STATIONS) - self._set_param("network", network, default=DEFAULT_NETWORK) - self._set_param("station_type", station_type, default=DEFAULT_STATION_TYPE) self._set_param("statistics_per_var", statistics_per_var, default=DEFAULT_VAR_ALL_DICT) self._set_param("variables", variables, default=list(self.data_store.get("statistics_per_var").keys())) self._set_param("start", start, default=DEFAULT_START) @@ 
-297,7 +290,7 @@ class ExperimentSetup(RunEnvironment): self._set_param("sampling", sampling) self._set_param("transformation", transformation, default=DEFAULT_TRANSFORMATION) self._set_param("transformation", None, scope="preprocessing") - self._set_param("data_preparation", data_preparation, default=DataPrepJoin) + self._set_param("data_preparation", data_preparation, default=DefaultDataPreparation) # target self._set_param("target_var", target_var, default=DEFAULT_TARGET_VAR) @@ -306,9 +299,9 @@ class ExperimentSetup(RunEnvironment): # interpolation self._set_param("dimensions", dimensions, default=DEFAULT_DIMENSIONS) - self._set_param("interpolation_dim", interpolation_dim, default=DEFAULT_INTERPOLATION_DIM) + self._set_param("time_dim", time_dim, default=DEFAULT_TIME_DIM) self._set_param("interpolation_method", interpolation_method, default=DEFAULT_INTERPOLATION_METHOD) - self._set_param("limit_nan_fill", limit_nan_fill, default=DEFAULT_LIMIT_NAN_FILL) + self._set_param("interpolation_limit", interpolation_limit, default=DEFAULT_INTERPOLATION_LIMIT) # train set parameters self._set_param("start", train_start, default=DEFAULT_TRAIN_START, scope="train") @@ -344,6 +337,7 @@ class ExperimentSetup(RunEnvironment): self._set_param("number_of_bootstraps", number_of_bootstraps, default=DEFAULT_NUMBER_OF_BOOTSTRAPS, scope="general.postprocessing") self._set_param("plot_list", plot_list, default=DEFAULT_PLOT_LIST, scope="general.postprocessing") + self._set_param("neighbors", ["DEBW030"]) # TODO: just for testing # check variables, statistics and target variable self._check_target_var() @@ -352,6 +346,15 @@ class ExperimentSetup(RunEnvironment): # set model architecture class self._set_param("model_class", model, VanillaModel) + # set remaining kwargs + if len(kwargs) > 0: + for k, v in kwargs.items(): + if len(self.data_store.search_name(k)) == 0: + self._set_param(k, v) + else: + raise KeyError(f"Given argument {k} with value {v} cannot be set for this experiment due to a " + f"conflict with an existing entry with same naming: {k}={self.data_store.get(k)}") + def _set_param(self, param: str, value: Any, default: Any = None, scope: str = "general") -> None: """Set given parameter and log in debug.""" if value is None and default is not None: @@ -391,6 +394,7 @@ class ExperimentSetup(RunEnvironment): if not set(target_var).issubset(stat.keys()): raise ValueError(f"Could not find target variable {target_var} in statistics_per_var.") + if __name__ == "__main__": formatter = '%(asctime)s - %(levelname)s: %(message)s [%(filename)s:%(funcName)s:%(lineno)s]' logging.basicConfig(format=formatter, level=logging.DEBUG) diff --git a/mlair/run_modules/model_setup.py b/mlair/run_modules/model_setup.py index 9b282c50c7ebccb740fe98b5159eb086aa8828c9..3dc56f01c4f37ce9fc53086d837386af81e5f53d 100644 --- a/mlair/run_modules/model_setup.py +++ b/mlair/run_modules/model_setup.py @@ -34,8 +34,6 @@ class ModelSetup(RunEnvironment): * `trainable` [.] * `create_new_model` [.] * `generator` [train] - * `window_lead_time` [.] - * `window_history_size` [.] * `model_class` [.] 
Optional objects @@ -73,7 +71,7 @@ class ModelSetup(RunEnvironment): def _run(self): # set channels depending on inputs - self._set_channels() + self._set_shapes() # build model graph using settings from my_model_settings() self.build_model() @@ -94,10 +92,12 @@ class ModelSetup(RunEnvironment): # report settings self.report_model() - def _set_channels(self): - """Set channels as number of variables of train generator.""" - channels = self.data_store.get("generator", "train")[0][0].shape[-1] - self.data_store.set("channels", channels, self.scope) + def _set_shapes(self): + """Set input and output shapes from train collection.""" + shape = list(map(lambda x: x.shape[1:], self.data_store.get("data_collection", "train")[0].get_X())) + self.data_store.set("shape_inputs", shape, self.scope) + shape = list(map(lambda y: y.shape[1:], self.data_store.get("data_collection", "train")[0].get_Y())) + self.data_store.set("shape_outputs", shape, self.scope) def compile_model(self): """ @@ -134,8 +134,8 @@ class ModelSetup(RunEnvironment): logging.info('no weights to reload...') def build_model(self): - """Build model using window_history_size, window_lead_time and channels from data store.""" - args_list = ["window_history_size", "window_lead_time", "channels"] + """Build model using input and output shapes from data store.""" + args_list = ["shape_inputs", "shape_outputs"] args = self.data_store.create_args_dict(args_list, self.scope) model = self.data_store.get("model_class") self.model = model(**args) @@ -165,7 +165,7 @@ class ModelSetup(RunEnvironment): v = ",".join(self._clean_name(str(u)) for u in v) if "<" in str(v): v = self._clean_name(str(v)) - df.loc[k] = v + df.loc[k] = str(v) df.sort_index(inplace=True) column_format = "ll" path = os.path.join(self.data_store.get("experiment_path"), "latex_report") diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index d390ecf05b2e3144b15edba0e30da7eb2b7e430c..d4f409ec503ba0ae37bdd1d1bec4b0207eec453c 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -13,9 +13,9 @@ import numpy as np import pandas as pd import xarray as xr -from mlair.data_handling import BootStraps, Distributor, DataGenerator, DataPrepJoin +from mlair.data_handler import BootStraps, KerasIterator from mlair.helpers.datastore import NameNotFoundInDataStore -from mlair.helpers import TimeTracking, statistics +from mlair.helpers import TimeTracking, statistics, extract_value from mlair.model_modules.linear_model import OrdinaryLeastSquaredModel from mlair.model_modules.model_class import AbstractModelClass from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotStationMap, PlotClimatologicalSkillScore, \ @@ -42,7 +42,7 @@ class PostProcessing(RunEnvironment): * `model_path` [.] * `target_var` [.] * `sampling` [.] - * `window_lead_time` [.] 
+ * `output_shape` [model] * `evaluate_bootstraps` [postprocessing] and if enabled: * `create_new_bootstraps` [postprocessing] @@ -65,14 +65,16 @@ class PostProcessing(RunEnvironment): self.model: keras.Model = self._load_model() self.ols_model = None self.batch_size: int = self.data_store.get_default("batch_size", "model", 64) - self.test_data: DataGenerator = self.data_store.get("generator", "test") - self.test_data_distributed = Distributor(self.test_data, self.model, self.batch_size) - self.train_data: DataGenerator = self.data_store.get("generator", "train") - self.val_data: DataGenerator = self.data_store.get("generator", "val") - self.train_val_data: DataGenerator = self.data_store.get("generator", "train_val") + self.test_data = self.data_store.get("data_collection", "test") + batch_path = self.data_store.get("batch_path", scope="test") + self.test_data_distributed = KerasIterator(self.test_data, self.batch_size, model=self.model, name="test", batch_path=batch_path) + self.train_data = self.data_store.get("data_collection", "train") + self.val_data = self.data_store.get("data_collection", "val") + self.train_val_data = self.data_store.get("data_collection", "train_val") self.plot_path: str = self.data_store.get("plot_path") self.target_var = self.data_store.get("target_var") self._sampling = self.data_store.get("sampling") + self.window_lead_time = extract_value(self.data_store.get("shape_outputs", "model")) self.skill_scores = None self.bootstrap_skill_scores = None self._run() @@ -141,34 +143,29 @@ class PostProcessing(RunEnvironment): bootstrap_path = self.data_store.get("bootstrap_path") forecast_path = self.data_store.get("forecast_path") number_of_bootstraps = self.data_store.get("number_of_bootstraps", "postprocessing") - - # set bootstrap class - bootstraps = BootStraps(self.test_data, bootstrap_path, number_of_bootstraps) - - # create bootstrapped predictions for all stations and variables and save it to disk dims = ["index", "ahead", "type"] - for station in bootstraps.stations: - with TimeTracking(name=station): - logging.info(station) - for var in bootstraps.variables: - station_bootstrap = bootstraps.get_generator(station, var) - - # make bootstrap predictions - bootstrap_predictions = self.model.predict_generator(generator=station_bootstrap, - workers=2, - use_multiprocessing=True) - if isinstance(bootstrap_predictions, list): # if model is branched model - bootstrap_predictions = bootstrap_predictions[-1] - # save bootstrap predictions separately for each station and variable combination - bootstrap_predictions = np.expand_dims(bootstrap_predictions, axis=-1) - shape = bootstrap_predictions.shape - coords = (range(shape[0]), range(1, shape[1] + 1)) - tmp = xr.DataArray(bootstrap_predictions, coords=(*coords, [var]), dims=dims) - file_name = os.path.join(forecast_path, f"bootstraps_{var}_{station}.nc") - tmp.to_netcdf(file_name) + for station in self.test_data: + logging.info(str(station)) + X, Y = None, None + bootstraps = BootStraps(station, number_of_bootstraps) + for boot in bootstraps: + X, Y, (index, dimension) = boot + # make bootstrap predictions + bootstrap_predictions = self.model.predict(X) + if isinstance(bootstrap_predictions, list): # if model is branched model + bootstrap_predictions = bootstrap_predictions[-1] + # save bootstrap predictions separately for each station and variable combination + bootstrap_predictions = np.expand_dims(bootstrap_predictions, axis=-1) + shape = bootstrap_predictions.shape + coords = (range(shape[0]), range(1, shape[1] + 
1)) + var = f"{index}_{dimension}" + tmp = xr.DataArray(bootstrap_predictions, coords=(*coords, [var]), dims=dims) + file_name = os.path.join(forecast_path, f"bootstraps_{station}_{var}.nc") + tmp.to_netcdf(file_name) + else: # store also true labels for each station - labels = np.expand_dims(bootstraps.get_labels(station), axis=-1) - file_name = os.path.join(forecast_path, f"bootstraps_labels_{station}.nc") + labels = np.expand_dims(Y, axis=-1) + file_name = os.path.join(forecast_path, f"bootstraps_{station}_labels.nc") labels = xr.DataArray(labels, coords=(*coords, ["obs"]), dims=dims) labels.to_netcdf(file_name) @@ -186,42 +183,50 @@ class PostProcessing(RunEnvironment): # extract all requirements from data store bootstrap_path = self.data_store.get("bootstrap_path") forecast_path = self.data_store.get("forecast_path") - window_lead_time = self.data_store.get("window_lead_time") number_of_bootstraps = self.data_store.get("number_of_bootstraps", "postprocessing") - bootstraps = BootStraps(self.test_data, bootstrap_path, number_of_bootstraps) - + forecast_file = f"forecasts_norm_%s_test.nc" + bootstraps = BootStraps(self.test_data[0], number_of_bootstraps).bootstraps() skill_scores = statistics.SkillScores(None) score = {} - for station in self.test_data.stations: + for station in self.test_data: logging.info(station) # get station labels - file_name = os.path.join(forecast_path, f"bootstraps_labels_{station}.nc") + file_name = os.path.join(forecast_path, f"bootstraps_{str(station)}_labels.nc") labels = xr.open_dataarray(file_name) shape = labels.shape # get original forecasts - orig = bootstraps.get_orig_prediction(forecast_path, f"forecasts_norm_{station}_test.nc").reshape(shape) + orig = self.get_orig_prediction(forecast_path, forecast_file % str(station), number_of_bootstraps) + orig = orig.reshape(shape) coords = (range(shape[0]), range(1, shape[1] + 1), ["orig"]) orig = xr.DataArray(orig, coords=coords, dims=["index", "ahead", "type"]) # calculate skill scores for each variable - skill = pd.DataFrame(columns=range(1, window_lead_time + 1)) - for boot in self.test_data.variables: - file_name = os.path.join(forecast_path, f"bootstraps_{boot}_{station}.nc") + skill = pd.DataFrame(columns=range(1, self.window_lead_time + 1)) + for boot_set in bootstraps: + boot_var = f"{boot_set[0]}_{boot_set[1]}" + file_name = os.path.join(forecast_path, f"bootstraps_{station}_{boot_var}.nc") boot_data = xr.open_dataarray(file_name) boot_data = boot_data.combine_first(labels).combine_first(orig) boot_scores = [] - for ahead in range(1, window_lead_time + 1): + for ahead in range(1, self.window_lead_time + 1): data = boot_data.sel(ahead=ahead) boot_scores.append( - skill_scores.general_skill_score(data, forecast_name=boot, reference_name="orig")) - skill.loc[boot] = np.array(boot_scores) + skill_scores.general_skill_score(data, forecast_name=boot_var, reference_name="orig")) + skill.loc[boot_var] = np.array(boot_scores) # collect all results in single dictionary - score[station] = xr.DataArray(skill, dims=["boot_var", "ahead"]) + score[str(station)] = xr.DataArray(skill, dims=["boot_var", "ahead"]) return score + @staticmethod + def get_orig_prediction(path, file_name, number_of_bootstraps, prediction_name="CNN"): + file = os.path.join(path, file_name) + prediction = xr.open_dataarray(file).sel(type=prediction_name).squeeze() + vals = np.tile(prediction.data, (number_of_bootstraps, 1)) + return vals[~np.isnan(vals).any(axis=1), :] + def _load_model(self) -> keras.models: """ Load NN model either from 
data store or from local path. @@ -259,12 +264,13 @@ class PostProcessing(RunEnvironment): path = self.data_store.get("forecast_path") plot_list = self.data_store.get("plot_list", "postprocessing") + time_dimension = self.data_store.get("time_dim") if self.bootstrap_skill_scores is not None and "PlotBootstrapSkillScore" in plot_list: PlotBootstrapSkillScore(self.bootstrap_skill_scores, plot_folder=self.plot_path, model_setup="CNN") if "PlotConditionalQuantiles" in plot_list: - PlotConditionalQuantiles(self.test_data.stations, data_pred_path=path, plot_folder=self.plot_path) + PlotConditionalQuantiles(self.test_data.keys(), data_pred_path=path, plot_folder=self.plot_path) if "PlotStationMap" in plot_list: if self.data_store.get("hostname")[:2] in self.data_store.get("hpc_hosts") or self.data_store.get( "hostname")[:6] in self.data_store.get("hpc_hosts"): @@ -273,7 +279,7 @@ class PostProcessing(RunEnvironment): else: PlotStationMap(generators={'b': self.test_data}, plot_folder=self.plot_path) if "PlotMonthlySummary" in plot_list: - PlotMonthlySummary(self.test_data.stations, path, r"forecasts_%s_test.nc", self.target_var, + PlotMonthlySummary(self.test_data.keys(), path, r"forecasts_%s_test.nc", self.target_var, plot_folder=self.plot_path) if "PlotClimatologicalSkillScore" in plot_list: PlotClimatologicalSkillScore(self.skill_scores[1], plot_folder=self.plot_path, model_setup="CNN") @@ -282,16 +288,16 @@ class PostProcessing(RunEnvironment): if "PlotCompetitiveSkillScore" in plot_list: PlotCompetitiveSkillScore(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN") if "PlotTimeSeries" in plot_list: - PlotTimeSeries(self.test_data.stations, path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, + PlotTimeSeries(self.test_data.keys(), path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, sampling=self._sampling) if "PlotAvailability" in plot_list: avail_data = {"train": self.train_data, "val": self.val_data, "test": self.test_data} - PlotAvailability(avail_data, plot_folder=self.plot_path) + PlotAvailability(avail_data, plot_folder=self.plot_path, time_dimension=time_dimension) def calculate_test_score(self): """Evaluate test score of model and save locally.""" - test_score = self.model.evaluate_generator(generator=self.test_data_distributed.distribute_on_batches(), - use_multiprocessing=False, verbose=0, steps=1) + test_score = self.model.evaluate_generator(generator=self.test_data_distributed, + use_multiprocessing=True, verbose=0, steps=1) path = self.data_store.get("model_path") with open(os.path.join(path, "test_scores.txt"), "a") as f: for index, item in enumerate(test_score): @@ -311,24 +317,26 @@ class PostProcessing(RunEnvironment): be found inside `forecast_path`. 
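# A short usage sketch for the forecast files written below ("forecasts_<station>_test.nc",
# dims index/ahead/type); the forecast_path value and the station id are placeholders, and the
# "CNN"/"obs" type labels follow the create_forecast_arrays call further down.
import os

import xarray as xr

forecast_path = "TestExperiment/forecasts"  # assumed location inside the experiment folder
station = "DEBW107"                         # placeholder station id
forecast = xr.open_dataarray(os.path.join(forecast_path, f"forecasts_{station}_test.nc"))
bias = (forecast.sel(type="CNN") - forecast.sel(type="obs")).mean(dim="index")
print(bias.values)  # mean difference between NN forecast and observation per lead time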
""" logging.debug("start make_prediction") - for i, _ in enumerate(self.test_data): - data = self.test_data.get_data_generator(i) - input_data = data.get_transposed_history() + time_dimension = self.data_store.get("time_dim") + for i, data in enumerate(self.test_data): + input_data = data.get_X() + target_data = data.get_Y(as_numpy=False) + observation_data = data.get_observation() # get scaling parameters - mean, std, transformation_method = data.get_transformation_information(variable=self.target_var) + mean, std, transformation_method = data.get_transformation_Y() for normalised in [True, False]: # create empty arrays nn_prediction, persistence_prediction, ols_prediction, observation = self._create_empty_prediction_arrays( - data, count=4) + target_data, count=4) # nn forecast nn_prediction = self._create_nn_forecast(input_data, nn_prediction, mean, std, transformation_method, normalised) # persistence - persistence_prediction = self._create_persistence_forecast(data, persistence_prediction, mean, std, + persistence_prediction = self._create_persistence_forecast(observation_data, persistence_prediction, mean, std, transformation_method, normalised) # ols @@ -336,11 +344,12 @@ class PostProcessing(RunEnvironment): normalised) # observation - observation = self._create_observation(data, observation, mean, std, transformation_method, normalised) + observation = self._create_observation(target_data, observation, mean, std, transformation_method, normalised) # merge all predictions - full_index = self.create_fullindex(data.data.indexes['datetime'], self._get_frequency()) - all_predictions = self.create_forecast_arrays(full_index, list(data.label.indexes['window']), + full_index = self.create_fullindex(observation_data.indexes[time_dimension], self._get_frequency()) + all_predictions = self.create_forecast_arrays(full_index, list(target_data.indexes['window']), + time_dimension, CNN=nn_prediction, persi=persistence_prediction, obs=observation, @@ -349,7 +358,7 @@ class PostProcessing(RunEnvironment): # save all forecasts locally path = self.data_store.get("forecast_path") prefix = "forecasts_norm" if normalised else "forecasts" - file = os.path.join(path, f"{prefix}_{data.station[0]}_test.nc") + file = os.path.join(path, f"{prefix}_{str(data)}_test.nc") all_predictions.to_netcdf(file) def _get_frequency(self) -> str: @@ -358,14 +367,14 @@ class PostProcessing(RunEnvironment): return getter.get(self._sampling, None) @staticmethod - def _create_observation(data: DataPrepJoin, _, mean: xr.DataArray, std: xr.DataArray, transformation_method: str, + def _create_observation(data, _, mean: xr.DataArray, std: xr.DataArray, transformation_method: str, normalised: bool) -> xr.DataArray: """ Create observation as ground truth from given data. Inverse transformation is applied to the ground truth to get the output in the original space. 
- :param data: transposed observation from DataPrep + :param data: observation :param mean: mean of target value transformation :param std: standard deviation of target value transformation :param transformation_method: target values transformation method @@ -373,10 +382,9 @@ class PostProcessing(RunEnvironment): :return: filled data array with observation """ - obs = data.label.copy() if not normalised: - obs = statistics.apply_inverse_transformation(obs, mean, std, transformation_method) - return obs + data = statistics.apply_inverse_transformation(data, mean, std, transformation_method) + return data def _create_ols_forecast(self, input_data: xr.DataArray, ols_prediction: xr.DataArray, mean: xr.DataArray, std: xr.DataArray, transformation_method: str, normalised: bool) -> xr.DataArray: @@ -397,12 +405,11 @@ class PostProcessing(RunEnvironment): tmp_ols = self.ols_model.predict(input_data) if not normalised: tmp_ols = statistics.apply_inverse_transformation(tmp_ols, mean, std, transformation_method) - tmp_ols = np.expand_dims(tmp_ols, axis=1) target_shape = ols_prediction.values.shape ols_prediction.values = np.swapaxes(tmp_ols, 2, 0) if target_shape != tmp_ols.shape else tmp_ols return ols_prediction - def _create_persistence_forecast(self, data: DataPrepJoin, persistence_prediction: xr.DataArray, mean: xr.DataArray, + def _create_persistence_forecast(self, data, persistence_prediction: xr.DataArray, mean: xr.DataArray, std: xr.DataArray, transformation_method: str, normalised: bool) -> xr.DataArray: """ Create persistence forecast with given data. @@ -410,7 +417,7 @@ class PostProcessing(RunEnvironment): Persistence is deviated from the value at t=0 and applied to all following time steps (t+1, ..., t+window). Inverse transformation is applied to the forecast to get the output in the original space. - :param data: DataPrep + :param data: observation :param persistence_prediction: empty array in right shape to fill with data :param mean: mean of target value transformation :param std: standard deviation of target value transformation @@ -419,12 +426,10 @@ class PostProcessing(RunEnvironment): :return: filled data array with persistence predictions """ - tmp_persi = data.observation.copy().sel({'window': 0}) + tmp_persi = data.copy() if not normalised: tmp_persi = statistics.apply_inverse_transformation(tmp_persi, mean, std, transformation_method) - window_lead_time = self.data_store.get("window_lead_time") - persistence_prediction.values = np.expand_dims(np.tile(tmp_persi.squeeze('Stations'), (window_lead_time, 1)), - axis=1) + persistence_prediction.values = np.tile(tmp_persi, (self.window_lead_time, 1)).T return persistence_prediction def _create_nn_forecast(self, input_data: xr.DataArray, nn_prediction: xr.DataArray, mean: xr.DataArray, @@ -449,18 +454,20 @@ class PostProcessing(RunEnvironment): if not normalised: tmp_nn = statistics.apply_inverse_transformation(tmp_nn, mean, std, transformation_method) if isinstance(tmp_nn, list): - nn_prediction.values = np.swapaxes(np.expand_dims(tmp_nn[-1], axis=1), 2, 0) + nn_prediction.values = tmp_nn[-1] elif tmp_nn.ndim == 3: - nn_prediction.values = np.swapaxes(np.expand_dims(tmp_nn[-1, ...], axis=1), 2, 0) + nn_prediction.values = tmp_nn[-1, ...] 
elif tmp_nn.ndim == 2: - nn_prediction.values = np.swapaxes(np.expand_dims(tmp_nn, axis=1), 2, 0) + nn_prediction.values = tmp_nn else: raise NotImplementedError(f"Number of dimension of model output must be 2 or 3, but not {tmp_nn.dims}.") return nn_prediction @staticmethod - def _create_empty_prediction_arrays(generator, count=1): - return [generator.label.copy() for _ in range(count)] + def _create_empty_prediction_arrays(target_data, count=1): + """ + Create array to collect all predictions. Expand target data by a station dimension. """ + return [target_data.copy() for _ in range(count)] @staticmethod def create_fullindex(df: Union[xr.DataArray, pd.DataFrame, pd.DatetimeIndex], freq: str) -> pd.DataFrame: @@ -488,7 +495,7 @@ class PostProcessing(RunEnvironment): return index @staticmethod - def create_forecast_arrays(index: pd.DataFrame, ahead_names: List[Union[str, int]], **kwargs): + def create_forecast_arrays(index: pd.DataFrame, ahead_names: List[Union[str, int]], time_dimension, **kwargs): """ Combine different forecast types into single xarray. @@ -503,12 +510,8 @@ class PostProcessing(RunEnvironment): res = xr.DataArray(np.full((len(index.index), len(ahead_names), len(keys)), np.nan), coords=[index.index, ahead_names, keys], dims=['index', 'ahead', 'type']) for k, v in kwargs.items(): - try: - match_index = np.stack(set(res.index.values) & set(v.index.values)) - res.loc[match_index, :, k] = v.loc[match_index] - except AttributeError: # v is xarray type and has no attribute .index - match_index = np.stack(set(res.index.values) & set(v.indexes['datetime'].values)) - res.loc[match_index, :, k] = v.sel({'datetime': match_index}).squeeze('Stations').transpose() + match_index = np.stack(set(res.index.values) & set(v.indexes[time_dimension].values)) + res.loc[match_index, :, k] = v.loc[match_index] return res def _get_external_data(self, station: str) -> Union[xr.DataArray, None]: @@ -521,12 +524,15 @@ class PostProcessing(RunEnvironment): :param station: name of station to load external data. 
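# A toy illustration of the empty array layout assembled by create_forecast_arrays above
# (index x ahead x type); the dates, lead times and the single "obs" key are made up.
import numpy as np
import pandas as pd
import xarray as xr

full_index = pd.DataFrame(index=pd.date_range("2020-01-01", periods=4, freq="D"))
ahead_names = [1, 2, 3]
keys = ["obs"]
res = xr.DataArray(np.full((len(full_index.index), len(ahead_names), len(keys)), np.nan),
                   coords=[full_index.index, ahead_names, keys],
                   dims=["index", "ahead", "type"])
print(res.shape)  # (4, 3, 1)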
""" try: - data = self.train_val_data.get_data_generator(station) - mean, std, transformation_method = data.get_transformation_information(variable=self.target_var) - external_data = self._create_observation(data, None, mean, std, transformation_method, normalised=False) - external_data = external_data.squeeze("Stations").sel(window=1).drop(["window", "Stations", "variables"]) - return external_data.rename({'datetime': 'index'}) - except KeyError: + data = self.train_val_data[station] + # target_data = data.get_Y(as_numpy=False) + observation = data.get_observation() + mean, std, transformation_method = data.get_transformation_Y() + # external_data = self._create_observation(target_data, None, mean, std, transformation_method, normalised=False) + # external_data = external_data.squeeze("Stations").sel(window=1).drop(["window", "Stations", "variables"]) + external_data = self._create_observation(observation, None, mean, std, transformation_method, normalised=False) + return external_data.rename({external_data.dims[0]: 'index'}) + except IndexError: return None def calculate_skill_scores(self) -> Tuple[Dict, Dict]: @@ -540,15 +546,14 @@ class PostProcessing(RunEnvironment): :return: competitive and climatological skill scores """ path = self.data_store.get("forecast_path") - window_lead_time = self.data_store.get("window_lead_time") skill_score_competitive = {} skill_score_climatological = {} - for station in self.test_data.stations: - file = os.path.join(path, f"forecasts_{station}_test.nc") + for station in self.test_data: + file = os.path.join(path, f"forecasts_{str(station)}_test.nc") data = xr.open_dataarray(file) skill_score = statistics.SkillScores(data) external_data = self._get_external_data(station) - skill_score_competitive[station] = skill_score.skill_scores(window_lead_time) + skill_score_competitive[station] = skill_score.skill_scores(self.window_lead_time) skill_score_climatological[station] = skill_score.climatological_skill_scores(external_data, - window_lead_time) + self.window_lead_time) return skill_score_competitive, skill_score_climatological diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 243daf20e4e99331fb32ed89769dbf584c235110..b4185df2f6699cb20ac96e32661433e7a6164abc 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -5,22 +5,17 @@ __date__ = '2019-11-25' import logging import os -from typing import Tuple, Dict, List +from typing import Tuple import numpy as np import pandas as pd -from mlair.data_handling import DataGenerator +from mlair.data_handler import DataCollection from mlair.helpers import TimeTracking from mlair.configuration import path_config from mlair.helpers.join import EmptyQueryResult from mlair.run_modules.run_environment import RunEnvironment -DEFAULT_ARGS_LIST = ["data_path", "stations", "variables", "interpolation_dim", "target_dim", "target_var"] -DEFAULT_KWARGS_LIST = ["limit_nan_fill", "window_history_size", "window_lead_time", "statistics_per_var", "min_length", - "station_type", "overwrite_local_data", "start", "end", "sampling", "transformation", - "extreme_values", "extremes_on_right_tail_only", "network", "data_preparation"] - class PreProcessing(RunEnvironment): """ @@ -59,10 +54,11 @@ class PreProcessing(RunEnvironment): self._run() def _run(self): - args = self.data_store.create_args_dict(DEFAULT_ARGS_LIST, scope="preprocessing") - kwargs = self.data_store.create_args_dict(DEFAULT_KWARGS_LIST, scope="preprocessing") stations = 
self.data_store.get("stations") - valid_stations = self.check_valid_stations(args, kwargs, stations, load_tmp=False, save_tmp=False, name="all") + data_preparation = self.data_store.get("data_preparation") + _, valid_stations = self.validate_station(data_preparation, stations, "preprocessing", overwrite_local_data=True) + if len(valid_stations) == 0: + raise ValueError("Couldn't find any valid data according to given parameters. Abort experiment run.") self.data_store.set("stations", valid_stations) self.split_train_val_test() self.report_pre_processing() @@ -70,16 +66,14 @@ class PreProcessing(RunEnvironment): def report_pre_processing(self): """Log some metrics on data and create latex report.""" logging.debug(20 * '##') - n_train = len(self.data_store.get('generator', 'train')) - n_val = len(self.data_store.get('generator', 'val')) - n_test = len(self.data_store.get('generator', 'test')) + n_train = len(self.data_store.get('data_collection', 'train')) + n_val = len(self.data_store.get('data_collection', 'val')) + n_test = len(self.data_store.get('data_collection', 'test')) n_total = n_train + n_val + n_test logging.debug(f"Number of all stations: {n_total}") logging.debug(f"Number of training stations: {n_train}") logging.debug(f"Number of val stations: {n_val}") logging.debug(f"Number of test stations: {n_test}") - logging.debug(f"TEST SHAPE OF GENERATOR CALL: {self.data_store.get('generator', 'test')[0][0].shape}" - f"{self.data_store.get('generator', 'test')[0][1].shape}") self.create_latex_report() def create_latex_report(self): @@ -121,11 +115,12 @@ class PreProcessing(RunEnvironment): set_names = ["train", "val", "test"] df = pd.DataFrame(columns=meta_data + set_names) for set_name in set_names: - data: DataGenerator = self.data_store.get("generator", set_name) - for station in data.stations: - df.loc[station, set_name] = data.get_data_generator(station).get_transposed_label().shape[0] - if df.loc[station, meta_data].isnull().any(): - df.loc[station, meta_data] = data.get_data_generator(station).meta.loc[meta_data].values.flatten() + data = self.data_store.get("data_collection", set_name) + for station in data: + station_name = str(station.id_class) + df.loc[station_name, set_name] = station.get_Y()[0].shape[0] + if df.loc[station_name, meta_data].isnull().any(): + df.loc[station_name, meta_data] = station.id_class.meta.loc[meta_data].values.flatten() df.loc["# Samples", set_name] = df.loc[:, set_name].sum() df.loc["# Stations", set_name] = df.loc[:, set_name].count() df[meta_round] = df[meta_round].astype(float).round(precision) @@ -147,7 +142,7 @@ class PreProcessing(RunEnvironment): Split data into subsets. Currently: train, val, test and train_val (actually this is only the merge of train and val, but as an separate - generator). IMPORTANT: Do not change to order of the execution of create_set_split. The train subset needs + data_collection). IMPORTANT: Do not change to order of the execution of create_set_split. The train subset needs always to be executed at first, to set a proper transformation. """ fraction_of_training = self.data_store.get("fraction_of_training") @@ -184,40 +179,20 @@ class PreProcessing(RunEnvironment): return train_index, val_index, test_index, train_val_index def create_set_split(self, index_list: slice, set_name: str) -> None: - """ - Create subsets and store in data store. - - Create the subset for given split index and stores the DataGenerator with given set name in data store as - `generator`. 
Check for all valid stations using the default (kw)args for given scope and create the - DataGenerator for all valid stations. Also set all transformation information, if subset is training set. Make - sure, that the train set is executed first, and all other subsets afterwards. - - :param index_list: list of all stations to use for the set. If attribute use_all_stations_on_all_data_sets=True, - this list is ignored. - :param set_name: name to load/save all information from/to data store. - """ - args = self.data_store.create_args_dict(DEFAULT_ARGS_LIST, scope=set_name) - kwargs = self.data_store.create_args_dict(DEFAULT_KWARGS_LIST, scope=set_name) - stations = args["stations"] + # get set stations + stations = self.data_store.get("stations", scope=set_name) if self.data_store.get("use_all_stations_on_all_data_sets"): set_stations = stations else: set_stations = stations[index_list] logging.debug(f"{set_name.capitalize()} stations (len={len(set_stations)}): {set_stations}") - # validate set - set_stations = self.check_valid_stations(args, kwargs, set_stations, load_tmp=False, name=set_name) - self.data_store.set("stations", set_stations, scope=set_name) - # create set generator and store - set_args = self.data_store.create_args_dict(DEFAULT_ARGS_LIST, scope=set_name) - data_set = DataGenerator(**set_args, **kwargs) - self.data_store.set("generator", data_set, scope=set_name) - # extract transformation from train set - if set_name == "train": - self.data_store.set("transformation", data_set.transformation) + # create set data_collection and store + data_preparation = self.data_store.get("data_preparation") + collection, valid_stations = self.validate_station(data_preparation, set_stations, set_name) + self.data_store.set("stations", valid_stations, scope=set_name) + self.data_store.set("data_collection", collection, scope=set_name) - @staticmethod - def check_valid_stations(args: Dict, kwargs: Dict, all_stations: List[str], load_tmp=True, save_tmp=True, - name=None): + def validate_station(self, data_preparation, set_stations, set_name=None, overwrite_local_data=False): """ Check if all given stations in `all_stations` are valid. @@ -225,7 +200,7 @@ class PreProcessing(RunEnvironment): loading time are logged in debug mode. :param args: Dictionary with required parameters for DataGenerator class (`data_path`, `network`, `stations`, - `variables`, `interpolation_dim`, `target_dim`, `target_var`). + `variables`, `time_dim`, `target_dim`, `target_var`). :param kwargs: positional parameters for the DataGenerator class (e.g. `start`, `interpolation_method`, `window_lead_time`). :param all_stations: All stations to check. @@ -234,26 +209,31 @@ class PreProcessing(RunEnvironment): :return: Corrected list containing only valid station IDs. 
""" t_outer = TimeTracking() - t_inner = TimeTracking(start=False) - logging.info(f"check valid stations started{' (%s)' % name if name else ''}") + logging.info(f"check valid stations started{' (%s)' % (set_name if set_name is not None else 'all')}") + # calculate transformation using train data + if set_name == "train": + self.transformation(data_preparation, set_stations) + # start station check + collection = DataCollection() valid_stations = [] - - # all required arguments of the DataGenerator can be found in args, positional arguments in args and kwargs - data_gen = DataGenerator(**args, **kwargs) - for pos, station in enumerate(all_stations): - t_inner.run() - logging.info(f"check station {station} ({pos + 1} / {len(all_stations)})") + kwargs = self.data_store.create_args_dict(data_preparation.requirements(), scope=set_name) + for station in set_stations: try: - data = data_gen.get_data_generator(key=station, load_local_tmp_storage=load_tmp, - save_local_tmp_storage=save_tmp) - if data.history is None: - raise AttributeError + dp = data_preparation.build(station, name_affix=set_name, **kwargs) + collection.add(dp) valid_stations.append(station) - logging.debug( - f'{station}: history_shape = {data.history.transpose("datetime", "window", "Stations", "variables").shape}') - logging.debug(f"{station}: loading time = {t_inner}") except (AttributeError, EmptyQueryResult): continue - logging.info(f"run for {t_outer} to check {len(all_stations)} station(s). Found {len(valid_stations)}/" - f"{len(all_stations)} valid stations.") - return valid_stations + logging.info(f"run for {t_outer} to check {len(set_stations)} station(s). Found {len(collection)}/" + f"{len(set_stations)} valid stations.") + return collection, valid_stations + + def transformation(self, data_preparation, stations): + if hasattr(data_preparation, "transformation"): + kwargs = self.data_store.create_args_dict(data_preparation.requirements(), scope="train") + transformation_dict = data_preparation.transformation(stations, **kwargs) + if transformation_dict is not None: + self.data_store.set("transformation", transformation_dict) + + + diff --git a/mlair/run_modules/training.py b/mlair/run_modules/training.py index 23347a30b6e55c6903154128aab055d39045c965..f8909e15341f959455b1e8da0b0cb7502bdfa81b 100644 --- a/mlair/run_modules/training.py +++ b/mlair/run_modules/training.py @@ -11,7 +11,7 @@ from typing import Union import keras from keras.callbacks import Callback, History -from mlair.data_handling import Distributor +from mlair.data_handler import KerasIterator from mlair.model_modules.keras_extensions import CallbackHandler from mlair.plotting.training_monitoring import PlotModelHistory, PlotModelLearningRate from mlair.run_modules.run_environment import RunEnvironment @@ -65,9 +65,9 @@ class Training(RunEnvironment): """Set up and run training.""" super().__init__() self.model: keras.Model = self.data_store.get("model", "model") - self.train_set: Union[Distributor, None] = None - self.val_set: Union[Distributor, None] = None - self.test_set: Union[Distributor, None] = None + self.train_set: Union[KerasIterator, None] = None + self.val_set: Union[KerasIterator, None] = None + self.test_set: Union[KerasIterator, None] = None self.batch_size = self.data_store.get("batch_size") self.epochs = self.data_store.get("epochs") self.callbacks: CallbackHandler = self.data_store.get("callbacks", "model") @@ -104,9 +104,9 @@ class Training(RunEnvironment): :param mode: name of set, should be from ["train", "val", "test"] """ - gen = 
self.data_store.get("generator", mode) - kwargs = self.data_store.create_args_dict(["permute_data", "upsampling"], scope=mode) - setattr(self, f"{mode}_set", Distributor(gen, self.model, self.batch_size, **kwargs)) + collection = self.data_store.get("data_collection", mode) + kwargs = self.data_store.create_args_dict(["upsampling", "shuffle_batches", "batch_path"], scope=mode) + setattr(self, f"{mode}_set", KerasIterator(collection, self.batch_size, model=self.model, name=mode, **kwargs)) def set_generators(self) -> None: """ @@ -130,15 +130,15 @@ class Training(RunEnvironment): """ logging.info(f"Train with {len(self.train_set)} mini batches.") logging.info(f"Train with option upsampling={self.train_set.upsampling}.") - logging.info(f"Train with option data_permutation={self.train_set.do_data_permutation}.") + logging.info(f"Train with option shuffle={self.train_set.shuffle}.") checkpoint = self.callbacks.get_checkpoint() if not os.path.exists(checkpoint.filepath) or self._create_new_model: - history = self.model.fit_generator(generator=self.train_set.distribute_on_batches(), + history = self.model.fit_generator(generator=self.train_set, steps_per_epoch=len(self.train_set), epochs=self.epochs, verbose=2, - validation_data=self.val_set.distribute_on_batches(), + validation_data=self.val_set, validation_steps=len(self.val_set), callbacks=self.callbacks.get_callbacks(as_dict=False)) else: @@ -148,11 +148,11 @@ class Training(RunEnvironment): self.model = keras.models.load_model(checkpoint.filepath) hist: History = self.callbacks.get_callback_by_name("hist") initial_epoch = max(hist.epoch) + 1 - _ = self.model.fit_generator(generator=self.train_set.distribute_on_batches(), + _ = self.model.fit_generator(generator=self.train_set, steps_per_epoch=len(self.train_set), epochs=self.epochs, verbose=2, - validation_data=self.val_set.distribute_on_batches(), + validation_data=self.val_set, validation_steps=len(self.val_set), callbacks=self.callbacks.get_callbacks(as_dict=False), initial_epoch=initial_epoch) @@ -234,7 +234,7 @@ class Training(RunEnvironment): def report_training(self): data = {"mini batches": len(self.train_set), "upsampling extremes": self.train_set.upsampling, - "shuffling": self.train_set.do_data_permutation, + "shuffling": self.train_set.shuffle, "created new model": self._create_new_model, "epochs": self.epochs, "batch size": self.batch_size} diff --git a/mlair/run_script.py b/mlair/run_script.py index 55e20e1e6914de27fc9d13893edacc504ab554f7..00a28f686bf392f76787b56a48790999e9fa5c05 100644 --- a/mlair/run_script.py +++ b/mlair/run_script.py @@ -6,17 +6,15 @@ import inspect def run(stations=None, - station_type=None, trainable=None, create_new_model=None, window_history_size=None, experiment_date="testrun", - network=None, variables=None, statistics_per_var=None, start=None, end=None, target_var=None, target_dim=None, window_lead_time=None, dimensions=None, - interpolate_method=None, interpolate_dim=None, limit_nan_fill=None, + interpolation_method=None, interpolation_dim=None, interpolation_limit=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None, use_all_stations_on_all_data_sets=None, fraction_of_train=None, experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None, overwrite_local_data=None, @@ -29,15 +27,17 @@ def run(stations=None, model=None, batch_size=None, epochs=None, - data_preparation=None): + data_preparation=None, + **kwargs): params = inspect.getfullargspec(DefaultWorkflow).args - kwargs = {k: v 
for k, v in locals().items() if k in params and v is not None} + kwargs_default = {k: v for k, v in locals().items() if k in params and v is not None} - workflow = DefaultWorkflow(**kwargs) + workflow = DefaultWorkflow(**kwargs_default, **kwargs) workflow.run() if __name__ == "__main__": - - run() + from mlair.model_modules.model_class import MyBranchedModel + run(statistics_per_var={'o3': 'dma8eu', "temp": "maximum"}, trainable=True, + create_new_model=True, model=MyBranchedModel, station_type="background") diff --git a/mlair/workflows/abstract_workflow.py b/mlair/workflows/abstract_workflow.py index f187ff11e849960b4a63eddd5d11e2ce1ddf2a11..d3fe480fdfe09393fbf2051d8795735e9217a8ad 100644 --- a/mlair/workflows/abstract_workflow.py +++ b/mlair/workflows/abstract_workflow.py @@ -26,4 +26,4 @@ class Workflow: """Run workflow embedded in a run environment and according to the stage's ordering.""" with RunEnvironment(): for stage, kwargs in self._registry.items(): - stage(**kwargs) \ No newline at end of file + stage(**kwargs) diff --git a/mlair/workflows/default_workflow.py b/mlair/workflows/default_workflow.py index f42c0389d81f655fb0c8582a15e42acc853f757d..3dba7e6c5c5773fa4d74860b2cba67a5804123b7 100644 --- a/mlair/workflows/default_workflow.py +++ b/mlair/workflows/default_workflow.py @@ -14,17 +14,15 @@ class DefaultWorkflow(Workflow): the mentioned ordering.""" def __init__(self, stations=None, - station_type=None, trainable=None, create_new_model=None, window_history_size=None, experiment_date="testrun", - network=None, variables=None, statistics_per_var=None, start=None, end=None, target_var=None, target_dim=None, window_lead_time=None, dimensions=None, - interpolate_method=None, interpolate_dim=None, limit_nan_fill=None, + interpolation_method=None, time_dim=None, limit_nan_fill=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None, use_all_stations_on_all_data_sets=None, fraction_of_train=None, experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None, overwrite_local_data=None, @@ -37,13 +35,14 @@ class DefaultWorkflow(Workflow): model=None, batch_size=None, epochs=None, - data_preparation=None): + data_preparation=None, + **kwargs): super().__init__() # extract all given kwargs arguments params = remove_items(inspect.getfullargspec(self.__init__).args, "self") - kwargs = {k: v for k, v in locals().items() if k in params and v is not None} - self._setup(**kwargs) + kwargs_default = {k: v for k, v in locals().items() if k in params and v is not None} + self._setup(**kwargs_default, **kwargs) def _setup(self, **kwargs): """Set up default workflow.""" @@ -59,17 +58,15 @@ class DefaultWorkflowHPC(Workflow): Training and PostProcessing in exact the mentioned ordering.""" def __init__(self, stations=None, - station_type=None, trainable=None, create_new_model=None, window_history_size=None, experiment_date="testrun", - network=None, variables=None, statistics_per_var=None, start=None, end=None, target_var=None, target_dim=None, window_lead_time=None, dimensions=None, - interpolate_method=None, interpolate_dim=None, limit_nan_fill=None, + interpolation_method=None, time_dim=None, limit_nan_fill=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None, use_all_stations_on_all_data_sets=None, fraction_of_train=None, experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None, overwrite_local_data=None, @@ -82,13 +79,13 @@ class DefaultWorkflowHPC(Workflow): 
model=None, batch_size=None, epochs=None, - data_preparation=None): + data_preparation=None, **kwargs): super().__init__() # extract all given kwargs arguments params = remove_items(inspect.getfullargspec(self.__init__).args, "self") - kwargs = {k: v for k, v in locals().items() if k in params and v is not None} - self._setup(**kwargs) + kwargs_default = {k: v for k, v in locals().items() if k in params and v is not None} + self._setup(**kwargs_default, **kwargs) def _setup(self, **kwargs): """Set up default workflow.""" diff --git a/requirements.txt b/requirements.txt index 71bb1338effff38092510982d4a2c1f37f7b026a..7da29a05b748531fd4ec327ff17f432ff1ecaabb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -38,9 +38,9 @@ pydot==1.4.1 pyparsing==2.4.6 pyproj==2.5.0 pyshp==2.1.0 -pytest==5.3.5 -pytest-cov==2.8.1 -pytest-html==2.0.1 +pytest==6.0.0 +pytest-cov==2.10.0 +pytest-html==2.1.1 pytest-lazy-fixture==0.6.3 pytest-metadata==1.8.0 pytest-sugar diff --git a/test/test_configuration/test_path_config.py b/test/test_configuration/test_path_config.py index 128ddfceeed53920e6424d8d5d8f6addf5451c44..b97763632922fc2aaffaf267cfbc76ff99e25b6f 100644 --- a/test/test_configuration/test_path_config.py +++ b/test/test_configuration/test_path_config.py @@ -16,12 +16,12 @@ class TestPrepareHost: @mock.patch("getpass.getuser", return_value="testUser") @mock.patch("os.path.exists", return_value=True) def test_prepare_host(self, mock_host, mock_user, mock_path): - assert prepare_host() == "/home/testUser/machinelearningtools/data/toar_daily/" + assert prepare_host() == "/home/testUser/mlair/data/toar_daily/" assert prepare_host() == "/home/testUser/Data/toar_daily/" assert prepare_host() == "/home/testUser/Data/toar_daily/" assert prepare_host() == "/p/project/cjjsc42/testUser/DATA/toar_daily/" assert prepare_host() == "/p/project/deepacf/intelliaq/testUser/DATA/toar_daily/" - assert prepare_host() == '/home/testUser/machinelearningtools/data/toar_daily/' + assert prepare_host() == '/home/testUser/mlair/data/toar_daily/' @mock.patch("socket.gethostname", return_value="NotExistingHostName") @mock.patch("getpass.getuser", return_value="zombie21") @@ -48,7 +48,7 @@ class TestPrepareHost: @mock.patch("os.makedirs", side_effect=None) def test_os_path_exists(self, mock_host, mock_user, mock_path, mock_check): path = prepare_host() - assert path == "/home/testUser/machinelearningtools/data/toar_daily/" + assert path == "/home/testUser/mlair/data/toar_daily/" class TestSetExperimentName: diff --git a/test/test_data_handling/test_bootstraps.py b/test/test_data_handler/old_t_bootstraps.py similarity index 98% rename from test/test_data_handling/test_bootstraps.py rename to test/test_data_handler/old_t_bootstraps.py index 0d5f3a69b08fa646b66691e1265b9bfe05f114a5..9616ed3f457d74e44e8a9eae5a3ed862fa804011 100644 --- a/test/test_data_handling/test_bootstraps.py +++ b/test/test_data_handler/old_t_bootstraps.py @@ -7,9 +7,8 @@ import numpy as np import pytest import xarray as xr -from mlair.data_handling.bootstraps import BootStraps, CreateShuffledData, BootStrapGenerator -from mlair.data_handling.data_generator import DataGenerator -from mlair.data_handling import DataPrepJoin +from mlair.data_handler.bootstraps import BootStraps +from mlair.data_handler import DataPrepJoin @pytest.fixture diff --git a/test/test_data_handling/test_data_generator.py b/test/test_data_handler/old_t_data_generator.py similarity index 98% rename from test/test_data_handling/test_data_generator.py rename to
test/test_data_handler/old_t_data_generator.py index 413d25dd4ac2fe722600bc44f5b2307388e8307a..9198923e2f75601f2ce7e6dc18a663da647eaadb 100644 --- a/test/test_data_handling/test_data_generator.py +++ b/test/test_data_handler/old_t_data_generator.py @@ -6,8 +6,7 @@ import numpy as np import pytest import xarray as xr -from mlair.data_handling.data_generator import DataGenerator -from mlair.data_handling import DataPrepJoin +from mlair.data_handler import DataPrepJoin from mlair.helpers.join import EmptyQueryResult @@ -80,7 +79,7 @@ class TestDataGenerator: assert gen.stations == ['DEBW107'] assert gen.variables == ['o3', 'temp'] assert gen.station_type is None - assert gen.interpolation_dim == 'datetime' + assert gen.time_dim == 'datetime' assert gen.target_dim == 'variables' assert gen.target_var == 'o3' assert gen.interpolation_method == "linear" diff --git a/test/test_data_handling/test_data_preparation.py b/test/test_data_handler/old_t_data_preparation.py similarity index 99% rename from test/test_data_handling/test_data_preparation.py rename to test/test_data_handler/old_t_data_preparation.py index ebd351b020ce8a5902cbe7ed201876ce610b8f6a..586e17158a93880e2a98bf64189fa947299a64f3 100644 --- a/test/test_data_handling/test_data_preparation.py +++ b/test/test_data_handler/old_t_data_preparation.py @@ -8,8 +8,8 @@ import pandas as pd import pytest import xarray as xr -from mlair.data_handling.data_preparation import AbstractDataPrep -from mlair.data_handling import DataPrepJoin as DataPrep +from mlair.data_handler.data_preparation import AbstractDataPrep +from mlair.data_handler import DataPrepJoin as DataPrep from mlair.helpers.join import EmptyQueryResult diff --git a/test/test_data_handler/test_iterator.py b/test/test_data_handler/test_iterator.py new file mode 100644 index 0000000000000000000000000000000000000000..ff81fc7b89b2cede0f47cdf209e77e373cd0d656 --- /dev/null +++ b/test/test_data_handler/test_iterator.py @@ -0,0 +1,228 @@ + +from mlair.data_handler.iterator import DataCollection, StandardIterator, KerasIterator +from mlair.helpers.testing import PyTestAllEqual +from mlair.model_modules.model_class import MyLittleModel, MyBranchedModel + +import numpy as np +import pytest +import mock +import os +import shutil + + +class TestStandardIterator: + + @pytest.fixture + def collection(self): + return list(range(10)) + + def test_blank(self): + std_iterator = object.__new__(StandardIterator) + assert std_iterator._position is None + + def test_init(self, collection): + std_iterator = StandardIterator(collection) + assert std_iterator._collection == list(range(10)) + assert std_iterator._position == 0 + + def test_next(self, collection): + std_iterator = StandardIterator(collection) + for i in range(10): + assert i == next(std_iterator) + with pytest.raises(StopIteration): + next(std_iterator) + std_iterator = StandardIterator(collection) + for e, i in enumerate(iter(std_iterator)): + assert i == e + + +class TestDataCollection: + + @pytest.fixture + def collection(self): + return list(range(10)) + + def test_init(self, collection): + data_collection = DataCollection(collection) + assert data_collection._collection == collection + + def test_iter(self, collection): + data_collection = DataCollection(collection) + assert isinstance(iter(data_collection), StandardIterator) + for e, i in enumerate(data_collection): + assert i == e + + +class DummyData: + + def __init__(self, number_of_samples=np.random.randint(100, 150)): + self.number_of_samples = number_of_samples + + def get_X(self,
upsampling=False, as_numpy=True): + X1 = np.random.randint(0, 10, size=(self.number_of_samples, 14, 5)) # samples, window, variables + X2 = np.random.randint(21, 30, size=(self.number_of_samples, 10, 2)) # samples, window, variables + X3 = np.random.randint(-5, 0, size=(self.number_of_samples, 1, 2)) # samples, window, variables + return [X1, X2, X3] + + def get_Y(self, upsampling=False, as_numpy=True): + Y1 = np.random.randint(0, 10, size=(self.number_of_samples, 5, 1)) # samples, window, variables + Y2 = np.random.randint(21, 30, size=(self.number_of_samples, 5, 1)) # samples, window, variables + return [Y1, Y2] + + +class TestKerasIterator: + + @pytest.fixture + def collection(self): + coll = [] + for i in range(3): + coll.append(DummyData(50 + i)) + data_coll = DataCollection(collection=coll) + return data_coll + + @pytest.fixture + def path(self): + p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + shutil.rmtree(p, ignore_errors=True) if os.path.exists(p) else None + yield p + shutil.rmtree(p, ignore_errors=True) + + def test_init(self, collection, path): + iterator = KerasIterator(collection, 25, path) + assert isinstance(iterator._collection, DataCollection) + assert iterator._path == os.path.join(path, str(id(iterator)), "%i.pickle") + assert iterator.batch_size == 25 + assert iterator.shuffle is False + + def test_cleanup_path(self, path): + assert os.path.exists(path) is False + iterator = object.__new__(KerasIterator) + iterator._cleanup_path(path, create_new=False) + assert os.path.exists(path) is False + iterator._cleanup_path(path) + assert os.path.exists(path) is True + iterator._cleanup_path(path, create_new=False) + assert os.path.exists(path) is False + + def test_get_number_of_mini_batches(self): + iterator = object.__new__(KerasIterator) + iterator.batch_size = 36 + assert iterator._get_number_of_mini_batches(30) == 0 + assert iterator._get_number_of_mini_batches(40) == 1 + assert iterator._get_number_of_mini_batches(72) == 2 + + def test_len(self): + iterator = object.__new__(KerasIterator) + iterator.indexes = [0, 1, 2, 3, 4, 5] + assert len(iterator) == 6 + + def test_concatenate(self): + arr1 = DummyData(10).get_X() + arr2 = DummyData(50).get_X() + iterator = object.__new__(KerasIterator) + new_arr = iterator._concatenate(arr2, arr1) + test_arr = [np.concatenate((arr1[0], arr2[0]), axis=0), + np.concatenate((arr1[1], arr2[1]), axis=0), + np.concatenate((arr1[2], arr2[2]), axis=0)] + for i in range(3): + assert PyTestAllEqual([new_arr[i], test_arr[i]]) + + def test_get_batch(self): + arr = DummyData(20).get_X() + iterator = object.__new__(KerasIterator) + iterator.batch_size = 19 + batch1 = iterator._get_batch(arr, 0) + assert batch1[0].shape[0] == 19 + batch2 = iterator._get_batch(arr, 1) + assert batch2[0].shape[0] == 1 + + def test_save_to_pickle(self, path): + os.makedirs(path) + d = DummyData(20) + X, Y = d.get_X(), d.get_Y() + iterator = object.__new__(KerasIterator) + iterator._path = os.path.join(path, "%i.pickle") + assert os.path.exists(iterator._path % 2) is False + iterator._save_to_pickle(X=X, Y=Y, index=2) + assert os.path.exists(iterator._path % 2) is True + + def test_prepare_batches(self, collection, path): + iterator = object.__new__(KerasIterator) + iterator._collection = collection + iterator.batch_size = 50 + iterator.indexes = [] + iterator.model = None + iterator.upsampling = False + iterator._path = os.path.join(path, "%i.pickle") + os.makedirs(path) + iterator._prepare_batches() + assert len(os.listdir(path)) == 4 + 
assert len(iterator.indexes) == 4 + assert len(iterator) == 4 + assert iterator.indexes == [0, 1, 2, 3] + + def test_prepare_batches_no_remaining(self, path): + iterator = object.__new__(KerasIterator) + iterator._collection = DataCollection([DummyData(50)]) + iterator.batch_size = 50 + iterator.indexes = [] + iterator.model = None + iterator.upsampling = False + iterator._path = os.path.join(path, "%i.pickle") + os.makedirs(path) + iterator._prepare_batches() + assert len(os.listdir(path)) == 1 + assert len(iterator.indexes) == 1 + assert len(iterator) == 1 + assert iterator.indexes == [0] + + def test_data_generation(self, collection, path): + iterator = KerasIterator(collection, 50, path) + X, Y = iterator._KerasIterator__data_generation(0) + expected = next(iter(collection)) + assert PyTestAllEqual([X, expected.get_X()]) + assert PyTestAllEqual([Y, expected.get_Y()]) + + def test_getitem(self, collection, path): + iterator = KerasIterator(collection, 50, path) + X, Y = iterator[0] + expected = next(iter(collection)) + assert PyTestAllEqual([X, expected.get_X()]) + assert PyTestAllEqual([Y, expected.get_Y()]) + reversed(iterator.indexes) + X, Y = iterator[3] + assert PyTestAllEqual([X, expected.get_X()]) + assert PyTestAllEqual([Y, expected.get_Y()]) + + def test_on_epoch_end(self): + iterator = object.__new__(KerasIterator) + iterator.indexes = [0, 1, 2, 3, 4] + iterator.shuffle = False + iterator.on_epoch_end() + assert iterator.indexes == [0, 1, 2, 3, 4] + iterator.shuffle = True + while iterator.indexes == sorted(iterator.indexes): + iterator.on_epoch_end() + assert iterator.indexes != [0, 1, 2, 3, 4] + assert sorted(iterator.indexes) == [0, 1, 2, 3, 4] + + def test_get_model_rank_no_model(self): + iterator = object.__new__(KerasIterator) + iterator.model = None + assert iterator._get_model_rank() == 1 + + def test_get_model_rank_single_output_branch(self): + iterator = object.__new__(KerasIterator) + iterator.model = MyLittleModel(shape_inputs=[(14, 1, 2)], shape_outputs=[(3,)]) + assert iterator._get_model_rank() == 1 + + def test_get_model_rank_multiple_output_branch(self): + iterator = object.__new__(KerasIterator) + iterator.model = MyBranchedModel(shape_inputs=[(14, 1, 2)], shape_outputs=[(3,)]) + assert iterator._get_model_rank() == 3 + + def test_get_model_rank_error(self): + iterator = object.__new__(KerasIterator) + iterator.model = mock.MagicMock(return_value=1) + with pytest.raises(TypeError): + iterator._get_model_rank() diff --git a/test/test_data_handling/test_data_distributor.py b/test/test_data_handling/test_data_distributor.py deleted file mode 100644 index d01133b58c37567f557543e7a4663717d15d71c7..0000000000000000000000000000000000000000 --- a/test/test_data_handling/test_data_distributor.py +++ /dev/null @@ -1,121 +0,0 @@ -import math -import os - -import keras -import numpy as np -import pytest - -from mlair.data_handling.data_distributor import Distributor -from mlair.data_handling.data_generator import DataGenerator -from mlair.data_handling import DataPrepJoin -from test.test_modules.test_training import my_test_model - - -class TestDistributor: - - @pytest.fixture - def generator(self): - return DataGenerator(os.path.join(os.path.dirname(__file__), 'data'), 'DEBW107', ['o3', 'temp'], - 'datetime', 'variables', 'o3', statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, - data_preparation=DataPrepJoin) - - @pytest.fixture - def generator_two_stations(self): - return DataGenerator(os.path.join(os.path.dirname(__file__), 'data'), ['DEBW107', 'DEBW013'], - 
['o3', 'temp'], 'datetime', 'variables', 'o3', - statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, - data_preparation=DataPrepJoin) - - @pytest.fixture - def model(self): - return my_test_model(keras.layers.PReLU, 5, 3, 0.1, False) - - @pytest.fixture - def model_with_minor_branch(self): - return my_test_model(keras.layers.PReLU, 5, 3, 0.1, True) - - @pytest.fixture - def distributor(self, generator, model): - return Distributor(generator, model) - - def test_init_defaults(self, distributor): - assert distributor.batch_size == 256 - assert distributor.do_data_permutation is False - - def test_get_model_rank(self, distributor, model_with_minor_branch): - assert distributor._get_model_rank() == 1 - distributor.model = model_with_minor_branch - assert distributor._get_model_rank() == 2 - distributor.model = 1 - - def test_get_number_of_mini_batches(self, distributor): - values = np.zeros((2311, 19)) - assert distributor._get_number_of_mini_batches(values) == math.ceil(2311 / distributor.batch_size) - - def test_distribute_on_batches_single_loop(self, generator_two_stations, model): - d = Distributor(generator_two_stations, model) - for e in d.distribute_on_batches(fit_call=False): - assert e[0].shape[0] <= d.batch_size - - def test_distribute_on_batches_infinite_loop(self, generator_two_stations, model): - d = Distributor(generator_two_stations, model) - elements = [] - for i, e in enumerate(d.distribute_on_batches()): - if i < len(d): - elements.append(e[0]) - elif i == 2 * len(d): # check if all elements are repeated - assert np.testing.assert_array_equal(e[0], elements[i - len(d)]) is None - else: # break when 3rd iteration starts (is called as infinite loop) - break - - def test_len(self, distributor): - assert len(distributor) == math.ceil(len(distributor.generator[0][0]) / 256) - - def test_len_two_stations(self, generator_two_stations, model): - gen = generator_two_stations - d = Distributor(gen, model) - expected = math.ceil(len(gen[0][0]) / 256) + math.ceil(len(gen[1][0]) / 256) - assert len(d) == expected - - def test_permute_data_no_permutation(self, distributor): - x = np.array(range(20)).reshape(2, 10).T - y = np.array(range(10)).reshape(10, 1) - x_perm, y_perm = distributor._permute_data(x, y) - assert np.testing.assert_equal(x, x_perm) is None - assert np.testing.assert_equal(y, y_perm) is None - - def test_permute_data(self, distributor): - x = np.array(range(20)).reshape(2, 10).T - y = np.array(range(10)).reshape(10, 1) - distributor.do_data_permutation = True - x_perm, y_perm = distributor._permute_data(x, y) - assert x_perm[0, 0] == y_perm[0] - assert x_perm[0, 1] == y_perm[0] + 10 - assert x_perm[5, 0] == y_perm[5] - assert x_perm[5, 1] == y_perm[5] + 10 - assert x_perm[-1, 0] == y_perm[-1] - assert x_perm[-1, 1] == y_perm[-1] + 10 - # resort x_perm and compare if equal to x - x_perm.sort(axis=0) - y_perm.sort(axis=0) - assert np.testing.assert_equal(x, x_perm) is None - assert np.testing.assert_equal(y, y_perm) is None - - def test_distribute_on_batches_upsampling_no_extremes_given(self, generator, model): - d = Distributor(generator, model, upsampling=True) - gen_len = d.generator.get_data_generator(0, load_local_tmp_storage=False).get_transposed_label().shape[0] - num_mini_batches = math.ceil(gen_len / d.batch_size) - i = 0 - for i, e in enumerate(d.distribute_on_batches(fit_call=False)): - assert e[0].shape[0] <= d.batch_size - assert i + 1 == num_mini_batches - - def test_distribute_on_batches_upsampling(self, generator, model): - generator.extreme_values = [1] 
- d = Distributor(generator, model, upsampling=True) - gen_len = d.generator.get_data_generator(0, load_local_tmp_storage=False).get_transposed_label().shape[0] - extr_len = d.generator.get_data_generator(0, load_local_tmp_storage=False).get_extremes_label().shape[0] - i = 0 - for i, e in enumerate(d.distribute_on_batches(fit_call=False)): - assert e[0].shape[0] <= d.batch_size - assert i + 1 == math.ceil((gen_len + extr_len) / d.batch_size) diff --git a/test/test_model_modules/test_model_class.py b/test/test_model_modules/test_model_class.py index 6025516ba01abdcb35ea65b9c4570d5a8b0928b5..3e77fd17c4cd8151fe76816abf0bef323adb2e96 100644 --- a/test/test_model_modules/test_model_class.py +++ b/test/test_model_modules/test_model_class.py @@ -12,7 +12,7 @@ class Paddings: class AbstractModelSubClass(AbstractModelClass): def __init__(self): - super().__init__() + super().__init__(shape_inputs=(12, 1, 2), shape_outputs=3) self.test_attr = "testAttr" @@ -20,7 +20,7 @@ class TestAbstractModelClass: @pytest.fixture def amc(self): - return AbstractModelClass() + return AbstractModelClass(shape_inputs=(14, 1, 2), shape_outputs=(3,)) @pytest.fixture def amsc(self): @@ -31,6 +31,8 @@ class TestAbstractModelClass: # assert amc.loss is None assert amc.model_name == "AbstractModelClass" assert amc.custom_objects == {} + assert amc.shape_inputs == (14, 1, 2) + assert amc.shape_outputs == 3 def test_model_property(self, amc): amc.model = keras.Model() @@ -179,8 +181,10 @@ class TestAbstractModelClass: assert amc.compile == amc.model.compile def test_get_settings(self, amc, amsc): - assert amc.get_settings() == {"model_name": "AbstractModelClass"} - assert amsc.get_settings() == {"test_attr": "testAttr", "model_name": "AbstractModelSubClass"} + assert amc.get_settings() == {"model_name": "AbstractModelClass", "shape_inputs": (14, 1, 2), + "shape_outputs": 3} + assert amsc.get_settings() == {"test_attr": "testAttr", "model_name": "AbstractModelSubClass", + "shape_inputs": (12, 1, 2), "shape_outputs": 3} def test_custom_objects(self, amc): amc.custom_objects = {"Test": 123} @@ -200,7 +204,7 @@ class TestMyPaperModel: @pytest.fixture def mpm(self): - return MyPaperModel(window_history_size=6, window_lead_time=4, channels=9) + return MyPaperModel(shape_inputs=[(7, 1, 9)], shape_outputs=[(4,)]) def test_init(self, mpm): # check if loss number of loss functions fit to model outputs diff --git a/test/test_modules/test_experiment_setup.py b/test/test_run_modules/test_experiment_setup.py similarity index 94% rename from test/test_modules/test_experiment_setup.py rename to test/test_run_modules/test_experiment_setup.py index 0f1f7a0cb918b4a1ab4e776fe9f9a563eb244149..abd265f5815d974d6edb474e5a03ed08dc5843cc 100644 --- a/test/test_modules/test_experiment_setup.py +++ b/test/test_run_modules/test_experiment_setup.py @@ -51,8 +51,6 @@ class TestExperimentSetup: # setup for data default_stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] assert data_store.get("stations", "general") == default_stations - assert data_store.get("network", "general") == "AIRBASE" - assert data_store.get("station_type", "general") == "background" assert data_store.get("variables", "general") == list(default_statistics_per_var.keys()) assert data_store.get("statistics_per_var", "general") == default_statistics_per_var assert data_store.get("start", "general") == "1997-01-01" @@ -64,9 +62,9 @@ class TestExperimentSetup: assert data_store.get("window_lead_time", "general") == 3 # interpolation assert data_store.get("dimensions", 
"general") == {'new_index': ['datetime', 'Stations']} - assert data_store.get("interpolation_dim", "general") == "datetime" + assert data_store.get("time_dim", "general") == "datetime" assert data_store.get("interpolation_method", "general") == "linear" - assert data_store.get("limit_nan_fill", "general") == 1 + assert data_store.get("interpolation_limit", "general") == 1 # train parameters assert data_store.get("start", "general.train") == "1997-01-01" assert data_store.get("end", "general.train") == "2007-12-31" @@ -93,7 +91,7 @@ class TestExperimentSetup: stations=['DEBY053', 'DEBW059', 'DEBW027'], network="INTERNET", station_type="background", variables=["o3", "temp"], start="1999-01-01", end="2001-01-01", window_history_size=4, target_var="relhum", target_dim="target", window_lead_time=10, dimensions="dim1", - interpolation_dim="int_dim", interpolation_method="cubic", limit_nan_fill=5, train_start="2000-01-01", + time_dim="int_dim", interpolation_method="cubic", interpolation_limit=5, train_start="2000-01-01", train_end="2000-01-02", val_start="2000-01-03", val_end="2000-01-04", test_start="2000-01-05", test_end="2000-01-06", use_all_stations_on_all_data_sets=False, trainable=False, fraction_of_train=0.5, experiment_path=experiment_path, create_new_model=True, val_min_length=20) @@ -125,9 +123,9 @@ class TestExperimentSetup: assert data_store.get("window_lead_time", "general") == 10 # interpolation assert data_store.get("dimensions", "general") == "dim1" - assert data_store.get("interpolation_dim", "general") == "int_dim" + assert data_store.get("time_dim", "general") == "int_dim" assert data_store.get("interpolation_method", "general") == "cubic" - assert data_store.get("limit_nan_fill", "general") == 5 + assert data_store.get("interpolation_limit", "general") == 5 # train parameters assert data_store.get("start", "general.train") == "2000-01-01" assert data_store.get("end", "general.train") == "2000-01-02" diff --git a/test/test_modules/test_model_setup.py b/test/test_run_modules/test_model_setup.py similarity index 52% rename from test/test_modules/test_model_setup.py rename to test/test_run_modules/test_model_setup.py index 2b83d2549ea2f649091d2f16b67bf0d93789af52..1b3e43b2bbfda44f1a5b5463e876adc578360ff3 100644 --- a/test/test_modules/test_model_setup.py +++ b/test/test_run_modules/test_model_setup.py @@ -1,9 +1,11 @@ import os +import numpy as np +import shutil import pytest -from mlair.data_handling import DataPrepJoin -from mlair.data_handling.data_generator import DataGenerator +from mlair.data_handler import KerasIterator +from mlair.data_handler import DataCollection from mlair.helpers.datastore import EmptyScope from mlair.model_modules.keras_extensions import CallbackHandler from mlair.model_modules.model_class import AbstractModelClass, MyLittleModel @@ -29,29 +31,40 @@ class TestModelSetup: RunEnvironment().__del__() @pytest.fixture - def gen(self): - return DataGenerator(os.path.join(os.path.dirname(__file__), 'data'), 'DEBW107', ['o3', 'temp'], - 'datetime', 'variables', 'o3', statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, - data_preparation=DataPrepJoin) + def path(self): + p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + shutil.rmtree(p, ignore_errors=True) if os.path.exists(p) else None + yield p + shutil.rmtree(p, ignore_errors=True) @pytest.fixture - def setup_with_gen(self, setup, gen): - setup.data_store.set("generator", gen, "general.train") - setup.data_store.set("window_history_size", gen.window_history_size, "general") - 
setup.data_store.set("window_lead_time", gen.window_lead_time, "general") - setup.data_store.set("channels", 2, "general") + def keras_iterator(self, path): + coll = [] + for i in range(3): + coll.append(DummyData(50 + i)) + data_coll = DataCollection(collection=coll) + KerasIterator(data_coll, 25, path) + return data_coll + + @pytest.fixture + def setup_with_gen(self, setup, keras_iterator): + setup.data_store.set("data_collection", keras_iterator, "train") + shape_inputs = [keras_iterator[0].get_X()[0].shape[1:]] + setup.data_store.set("shape_inputs", shape_inputs, "model") + shape_outputs = [keras_iterator[0].get_Y()[0].shape[1:]] + setup.data_store.set("shape_outputs", shape_outputs, "model") yield setup RunEnvironment().__del__() @pytest.fixture - def setup_with_gen_tiny(self, setup, gen): - setup.data_store.set("generator", gen, "general.train") + def setup_with_gen_tiny(self, setup, keras_iterator): + setup.data_store.set("data_collection", keras_iterator, "train") yield setup RunEnvironment().__del__() @pytest.fixture def setup_with_model(self, setup): - setup.model = AbstractModelClass() + setup.model = AbstractModelClass(shape_inputs=(12, 1), shape_outputs=2) setup.model.test_param = "42" yield setup RunEnvironment().__del__() @@ -89,14 +102,17 @@ class TestModelSetup: assert setup_with_gen.model is None setup_with_gen.build_model() assert isinstance(setup_with_gen.model, AbstractModelClass) - expected = {"window_history_size", "window_lead_time", "channels", "dropout_rate", "regularizer", "initial_lr", - "optimizer", "activation"} + expected = {"lr_decay", "model_name", "dropout_rate", "regularizer", "initial_lr", "optimizer", "activation", + "shape_inputs", "shape_outputs"} assert expected <= self.current_scope_as_set(setup_with_gen) - def test_set_channels(self, setup_with_gen_tiny): - assert len(setup_with_gen_tiny.data_store.search_name("channels")) == 0 - setup_with_gen_tiny._set_channels() - assert setup_with_gen_tiny.data_store.get("channels", setup_with_gen_tiny.scope) == 2 + def test_set_shapes(self, setup_with_gen_tiny): + assert len(setup_with_gen_tiny.data_store.search_name("shape_inputs")) == 0 + assert len(setup_with_gen_tiny.data_store.search_name("shape_outputs")) == 0 + setup_with_gen_tiny._set_shapes() + assert setup_with_gen_tiny.data_store.get("shape_inputs", setup_with_gen_tiny.scope) == [(14, 1, 5), (10, 1, 2), + (1, 1, 2)] + assert setup_with_gen_tiny.data_store.get("shape_outputs", setup_with_gen_tiny.scope) == [(5,), (3,)] def test_load_weights(self): pass @@ -109,3 +125,20 @@ class TestModelSetup: def test_init(self): pass + + +class DummyData: + + def __init__(self, number_of_samples=np.random.randint(100, 150)): + self.number_of_samples = number_of_samples + + def get_X(self, upsampling=False, as_numpy=True): + X1 = np.random.randint(0, 10, size=(self.number_of_samples, 14, 1, 5)) # samples, window, variables + X2 = np.random.randint(21, 30, size=(self.number_of_samples, 10, 1, 2)) # samples, window, variables + X3 = np.random.randint(-5, 0, size=(self.number_of_samples, 1, 1, 2)) # samples, window, variables + return [X1, X2, X3] + + def get_Y(self, upsampling=False, as_numpy=True): + Y1 = np.random.randint(0, 10, size=(self.number_of_samples, 5)) # samples, window + Y2 = np.random.randint(21, 30, size=(self.number_of_samples, 3)) # samples, window + return [Y1, Y2] \ No newline at end of file diff --git a/test/test_modules/test_partition_check.py b/test/test_run_modules/test_partition_check.py similarity index 95% rename from 
test/test_modules/test_partition_check.py rename to test/test_run_modules/test_partition_check.py index 1e576a8ce47c98e395468b76d3496dafa3cc0525..ba5b3d7ef127258eaa6c4f2a1a0b4d0b531eeac5 100644 --- a/test/test_modules/test_partition_check.py +++ b/test/test_run_modules/test_partition_check.py @@ -5,7 +5,6 @@ import mock from mlair.run_modules.experiment_setup import ExperimentSetup from mlair.run_modules.partition_check import PartitionCheck from mlair.run_modules.run_environment import RunEnvironment -from mlair.configuration import get_host class TestPartitionCheck: @@ -24,6 +23,7 @@ class TestPartitionCheck: @mock.patch("os.path.exists", return_value=False) @mock.patch("os.makedirs", side_effect=None) def obj_with_exp_setup_login(self, mock_host, mock_user, mock_path, mock_check): + RunEnvironment().__del__() ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, station_type="background") pre = object.__new__(PartitionCheck) @@ -37,6 +37,7 @@ class TestPartitionCheck: @mock.patch("os.path.exists", return_value=False) @mock.patch("os.makedirs", side_effect=None) def obj_with_exp_setup_compute(self, mock_host, mock_user, mock_path, mock_check): + RunEnvironment().__del__() ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, station_type="background") pre = object.__new__(PartitionCheck) @@ -71,5 +72,5 @@ class TestPartitionCheck: @mock.patch("os.path.exists", return_value=False) @mock.patch("os.makedirs", side_effect=None) def test_run_compute(self, mock_host, mock_user, mock_path, mock_check, obj_with_exp_setup_compute, caplog): - - assert obj_with_exp_setup_compute.__next__()._run() is None + obj = obj_with_exp_setup_compute.__next__() + assert obj._run() is None diff --git a/test/test_modules/test_post_processing.py b/test/test_run_modules/test_post_processing.py similarity index 100% rename from test/test_modules/test_post_processing.py rename to test/test_run_modules/test_post_processing.py diff --git a/test/test_modules/test_pre_processing.py b/test/test_run_modules/test_pre_processing.py similarity index 68% rename from test/test_modules/test_pre_processing.py rename to test/test_run_modules/test_pre_processing.py index a35e810c2d62ab746004442bffee51d85dc17ab2..97e73204068d334590ee98271080acddf29dfc5f 100644 --- a/test/test_modules/test_pre_processing.py +++ b/test/test_run_modules/test_pre_processing.py @@ -2,12 +2,11 @@ import logging import pytest -from mlair.data_handling import DataPrepJoin -from mlair.data_handling.data_generator import DataGenerator +from mlair.data_handler import DefaultDataPreparation, DataCollection, AbstractDataPreparation from mlair.helpers.datastore import NameNotFoundInScope from mlair.helpers import PyTestRegex from mlair.run_modules.experiment_setup import ExperimentSetup -from mlair.run_modules.pre_processing import PreProcessing, DEFAULT_ARGS_LIST, DEFAULT_KWARGS_LIST +from mlair.run_modules.pre_processing import PreProcessing from mlair.run_modules.run_environment import RunEnvironment @@ -29,7 +28,7 @@ class TestPreProcessing: def obj_with_exp_setup(self): ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, station_type="background", - data_preparation=DataPrepJoin) + data_preparation=DefaultDataPreparation) pre = object.__new__(PreProcessing) 
super(PreProcessing, pre).__init__() yield pre @@ -42,25 +41,26 @@ class TestPreProcessing: caplog.set_level(logging.INFO) with PreProcessing(): assert caplog.record_tuples[0] == ('root', 20, 'PreProcessing started') - assert caplog.record_tuples[1] == ('root', 20, 'check valid stations started (all)') + assert caplog.record_tuples[1] == ('root', 20, 'check valid stations started (preprocessing)') assert caplog.record_tuples[-1] == ('root', 20, PyTestRegex(r'run for \d+:\d+:\d+ \(hh:mm:ss\) to check 5 ' r'station\(s\). Found 5/5 valid stations.')) RunEnvironment().__del__() def test_run(self, obj_with_exp_setup): - assert obj_with_exp_setup.data_store.search_name("generator") == [] + assert obj_with_exp_setup.data_store.search_name("data_collection") == [] assert obj_with_exp_setup._run() is None - assert obj_with_exp_setup.data_store.search_name("generator") == sorted(["general.train", "general.val", - "general.train_val", "general.test"]) + assert obj_with_exp_setup.data_store.search_name("data_collection") == sorted(["general.train", "general.val", + "general.train_val", + "general.test"]) def test_split_train_val_test(self, obj_with_exp_setup): - assert obj_with_exp_setup.data_store.search_name("generator") == [] + assert obj_with_exp_setup.data_store.search_name("data_collection") == [] obj_with_exp_setup.split_train_val_test() data_store = obj_with_exp_setup.data_store - expected_params = ["generator", "start", "end", "stations", "permute_data", "min_length", "extreme_values", - "extremes_on_right_tail_only", "upsampling"] + expected_params = ["data_collection", "start", "end", "stations", "permute_data", "min_length", + "extreme_values", "extremes_on_right_tail_only", "upsampling"] assert data_store.search_scope("general.train") == sorted(expected_params) - assert data_store.search_name("generator") == sorted(["general.train", "general.val", "general.test", + assert data_store.search_name("data_collection") == sorted(["general.train", "general.val", "general.test", "general.train_val"]) def test_create_set_split_not_all_stations(self, caplog, obj_with_exp_setup): @@ -69,9 +69,9 @@ class TestPreProcessing: obj_with_exp_setup.create_set_split(slice(0, 2), "awesome") assert ('root', 10, "Awesome stations (len=2): ['DEBW107', 'DEBY081']") in caplog.record_tuples data_store = obj_with_exp_setup.data_store - assert isinstance(data_store.get("generator", "general.awesome"), DataGenerator) + assert isinstance(data_store.get("data_collection", "general.awesome"), DataCollection) with pytest.raises(NameNotFoundInScope): - data_store.get("generator", "general") + data_store.get("data_collection", "general") assert data_store.get("stations", "general.awesome") == ["DEBW107", "DEBY081"] def test_create_set_split_all_stations(self, caplog, obj_with_exp_setup): @@ -80,22 +80,22 @@ class TestPreProcessing: message = "Awesome stations (len=6): ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001']" assert ('root', 10, message) in caplog.record_tuples data_store = obj_with_exp_setup.data_store - assert isinstance(data_store.get("generator", "general.awesome"), DataGenerator) + assert isinstance(data_store.get("data_collection", "general.awesome"), DataCollection) with pytest.raises(NameNotFoundInScope): - data_store.get("generator", "general") + data_store.get("data_collection", "general") assert data_store.get("stations", "general.awesome") == ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] @pytest.mark.parametrize("name", (None, "tester")) - def test_check_valid_stations(self, 
caplog, obj_with_exp_setup, name): + def test_validate_station(self, caplog, obj_with_exp_setup, name): pre = obj_with_exp_setup caplog.set_level(logging.INFO) - args = pre.data_store.create_args_dict(DEFAULT_ARGS_LIST) - kwargs = pre.data_store.create_args_dict(DEFAULT_KWARGS_LIST) stations = pre.data_store.get("stations", "general") - valid_stations = pre.check_valid_stations(args, kwargs, stations, name=name) + data_preparation = pre.data_store.get("data_preparation") + collection, valid_stations = pre.validate_station(data_preparation, stations, set_name=name) + assert isinstance(collection, DataCollection) assert len(valid_stations) < len(stations) assert valid_stations == stations[:-1] - expected = 'check valid stations started (tester)' if name else 'check valid stations started' + expected = "check valid stations started" + ' (%s)' % (name if name else 'all') assert caplog.record_tuples[0] == ('root', 20, expected) assert caplog.record_tuples[-1] == ('root', 20, PyTestRegex(r'run for \d+:\d+:\d+ \(hh:mm:ss\) to check 6 ' r'station\(s\). Found 5/6 valid stations.')) @@ -107,3 +107,11 @@ class TestPreProcessing: assert dummy_list[val] == list(range(10, 13)) assert dummy_list[test] == list(range(13, 15)) assert dummy_list[train_val] == list(range(0, 13)) + + def test_transformation(self): + pre = object.__new__(PreProcessing) + data_preparation = AbstractDataPreparation + stations = ['DEBW107', 'DEBY081'] + assert pre.transformation(data_preparation, stations) is None + class data_preparation_no_trans: pass + assert pre.transformation(data_preparation_no_trans, stations) is None diff --git a/test/test_modules/test_run_environment.py b/test/test_run_modules/test_run_environment.py similarity index 100% rename from test/test_modules/test_run_environment.py rename to test/test_run_modules/test_run_environment.py diff --git a/test/test_modules/test_training.py b/test/test_run_modules/test_training.py similarity index 72% rename from test/test_modules/test_training.py rename to test/test_run_modules/test_training.py index b80570bb51ec5886f163842a3a40411148df3419..1fec8f4e56e2925bff0bc4af859dac1fe5fbb2b6 100644 --- a/test/test_modules/test_training.py +++ b/test/test_run_modules/test_training.py @@ -9,9 +9,7 @@ import mock import pytest from keras.callbacks import History -from mlair.data_handling import DataPrepJoin -from mlair.data_handling.data_distributor import Distributor -from mlair.data_handling.data_generator import DataGenerator +from mlair.data_handler import DataCollection, KerasIterator, DefaultDataPreparation from mlair.helpers import PyTestRegex from mlair.model_modules.flatten import flatten_tail from mlair.model_modules.inception_model import InceptionModelBase @@ -20,7 +18,7 @@ from mlair.run_modules.run_environment import RunEnvironment from mlair.run_modules.training import Training -def my_test_model(activation, window_history_size, channels, dropout_rate, add_minor_branch=False): +def my_test_model(activation, window_history_size, channels, output_size, dropout_rate, add_minor_branch=False): inception_model = InceptionModelBase() conv_settings_dict1 = { 'tower_1': {'reduction_filter': 8, 'tower_filter': 8 * 2, 'tower_kernel': (3, 1), 'activation': activation}, @@ -29,7 +27,6 @@ def my_test_model(activation, window_history_size, channels, dropout_rate, add_m X_input = keras.layers.Input(shape=(window_history_size + 1, 1, channels)) X_in = inception_model.inception_block(X_input, conv_settings_dict1, pool_settings_dict1) if add_minor_branch: - # out = [flatten_tail(X_in, 
'Minor_1', activation=activation)] out = [flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=4, output_activation='linear', reduction_filter=64, name='Minor_1', dropout_rate=dropout_rate, @@ -37,8 +34,7 @@ def my_test_model(activation, window_history_size, channels, dropout_rate, add_m else: out = [] X_in = keras.layers.Dropout(dropout_rate)(X_in) - # out.append(flatten_tail(X_in, 'Main', activation=activation)) - out.append(flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=4, + out.append(flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=output_size, output_activation='linear', reduction_filter=64, name='Main', dropout_rate=dropout_rate, )) @@ -48,7 +44,7 @@ def my_test_model(activation, window_history_size, channels, dropout_rate, add_m class TestTraining: @pytest.fixture - def init_without_run(self, path: str, model: keras.Model, callbacks: CallbackHandler, model_path): + def init_without_run(self, path: str, model: keras.Model, callbacks: CallbackHandler, model_path, batch_path): obj = object.__new__(Training) super(Training, obj).__init__() obj.model = model @@ -62,15 +58,18 @@ class TestTraining: obj.lr_sc = lr obj.hist = hist obj.experiment_name = "TestExperiment" - obj.data_store.set("generator", mock.MagicMock(return_value="mock_train_gen"), "general.train") - obj.data_store.set("generator", mock.MagicMock(return_value="mock_val_gen"), "general.val") - obj.data_store.set("generator", mock.MagicMock(return_value="mock_test_gen"), "general.test") + obj.data_store.set("data_collection", mock.MagicMock(return_value="mock_train_gen"), "general.train") + obj.data_store.set("data_collection", mock.MagicMock(return_value="mock_val_gen"), "general.val") + obj.data_store.set("data_collection", mock.MagicMock(return_value="mock_test_gen"), "general.test") os.makedirs(path) obj.data_store.set("experiment_path", path, "general") + os.makedirs(batch_path) + obj.data_store.set("batch_path", batch_path, "general") os.makedirs(model_path) obj.data_store.set("model_path", model_path, "general") obj.data_store.set("model_name", os.path.join(model_path, "test_model.h5"), "general.model") obj.data_store.set("experiment_name", "TestExperiment", "general") + path_plot = os.path.join(path, "plots") os.makedirs(path_plot) obj.data_store.set("plot_path", path_plot, "general") @@ -109,14 +108,35 @@ class TestTraining: return os.path.join(path, "model") @pytest.fixture - def generator(self, path): - return DataGenerator(os.path.join(os.path.dirname(__file__), 'data'), ['DEBW107'], ['o3', 'temp'], 'datetime', - 'variables', 'o3', statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, - data_preparation=DataPrepJoin) + def batch_path(self, path): + return os.path.join(path, "batch") + + @pytest.fixture + def window_history_size(self): + return 7 + + @pytest.fixture + def window_lead_time(self): + return 2 @pytest.fixture - def model(self): - return my_test_model(keras.layers.PReLU, 7, 2, 0.1, False) + def statistics_per_var(self): + return {'o3': 'dma8eu', 'temp': 'maximum'} + + @pytest.fixture + def data_collection(self, path, window_history_size, window_lead_time, statistics_per_var): + data_prep = DefaultDataPreparation.build(['DEBW107'], data_path=os.path.join(os.path.dirname(__file__), 'data'), + statistics_per_var=statistics_per_var, station_type="background", + network="AIRBASE", sampling="daily", target_dim="variables", + target_var="o3", time_dim="datetime", + window_history_size=window_history_size, + 
window_lead_time=window_lead_time, name_affix="train") + return DataCollection([data_prep]) + + @pytest.fixture + def model(self, window_history_size, window_lead_time, statistics_per_var): + channels = len(list(statistics_per_var.keys())) + return my_test_model(keras.layers.PReLU, window_history_size, channels, window_lead_time, 0.1, False) @pytest.fixture def callbacks(self, path): @@ -130,29 +150,31 @@ class TestTraining: return clbk, hist, lr @pytest.fixture - def ready_to_train(self, generator: DataGenerator, init_without_run: Training): - init_without_run.train_set = Distributor(generator, init_without_run.model, init_without_run.batch_size) - init_without_run.val_set = Distributor(generator, init_without_run.model, init_without_run.batch_size) + def ready_to_train(self, data_collection: DataCollection, init_without_run: Training, batch_path: str): + batch_size = init_without_run.batch_size + model = init_without_run.model + init_without_run.train_set = KerasIterator(data_collection, batch_size, batch_path, model=model, name="train") + init_without_run.val_set = KerasIterator(data_collection, batch_size, batch_path, model=model, name="val") init_without_run.model.compile(optimizer=keras.optimizers.SGD(), loss=keras.losses.mean_absolute_error) return init_without_run @pytest.fixture - def ready_to_run(self, generator, init_without_run): + def ready_to_run(self, data_collection, init_without_run): obj = init_without_run - obj.data_store.set("generator", generator, "general.train") - obj.data_store.set("generator", generator, "general.val") - obj.data_store.set("generator", generator, "general.test") + obj.data_store.set("data_collection", data_collection, "general.train") + obj.data_store.set("data_collection", data_collection, "general.val") + obj.data_store.set("data_collection", data_collection, "general.test") obj.model.compile(optimizer=keras.optimizers.SGD(), loss=keras.losses.mean_absolute_error) return obj @pytest.fixture - def ready_to_init(self, generator, model, callbacks, path, model_path): + def ready_to_init(self, data_collection, model, callbacks, path, model_path, batch_path): os.makedirs(path) os.makedirs(model_path) obj = RunEnvironment() - obj.data_store.set("generator", generator, "general.train") - obj.data_store.set("generator", generator, "general.val") - obj.data_store.set("generator", generator, "general.test") + obj.data_store.set("data_collection", data_collection, "general.train") + obj.data_store.set("data_collection", data_collection, "general.val") + obj.data_store.set("data_collection", data_collection, "general.test") model.compile(optimizer=keras.optimizers.SGD(), loss=keras.losses.mean_absolute_error) obj.data_store.set("model", model, "general.model") obj.data_store.set("model_path", model_path, "general") @@ -167,6 +189,8 @@ class TestTraining: obj.data_store.set("experiment_path", path, "general") obj.data_store.set("trainable", True, "general") obj.data_store.set("create_new_model", True, "general") + os.makedirs(batch_path) + obj.data_store.set("batch_path", batch_path, "general") path_plot = os.path.join(path, "plots") os.makedirs(path_plot) obj.data_store.set("plot_path", path_plot, "general") @@ -177,6 +201,13 @@ class TestTraining: def test_init(self, ready_to_init): assert isinstance(Training(), Training) # just test, if nothing fails + def test_no_training(self, ready_to_init, caplog): + caplog.set_level(logging.INFO) + ready_to_init.data_store.set("trainable", False) + Training() + message = "No training has started, because trainable 
parameter was false." + assert caplog.record_tuples[-2] == ("root", 20, message) + def test_run(self, ready_to_run): assert ready_to_run._run() is None # just test, if nothing fails @@ -188,8 +219,8 @@ class TestTraining: def test_set_gen(self, init_without_run): assert init_without_run.train_set is None init_without_run._set_gen("train") - assert isinstance(init_without_run.train_set, Distributor) - assert init_without_run.train_set.generator.return_value == "mock_train_gen" + assert isinstance(init_without_run.train_set, KerasIterator) + assert init_without_run.train_set._collection.return_value == "mock_train_gen" def test_set_generators(self, init_without_run): sets = ["train", "val", "test"] @@ -197,7 +228,7 @@ class TestTraining: init_without_run.set_generators() assert not all([getattr(init_without_run, f"{obj}_set") is None for obj in sets]) assert all( - [getattr(init_without_run, f"{obj}_set").generator.return_value == f"mock_{obj}_gen" for obj in sets]) + [getattr(init_without_run, f"{obj}_set")._collection.return_value == f"mock_{obj}_gen" for obj in sets]) def test_train(self, ready_to_train, path): assert not hasattr(ready_to_train.model, "history")
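
Usage sketch (not part of the patch itself): the hunks above replace the old DataGenerator/Distributor pair with DataCollection plus the keras-Sequence-style KerasIterator, which is handed directly to fit_generator instead of going through distribute_on_batches(). The snippet below assembles these pieces the way the new tests and the Training run module do. ToyStation and the tiny keras network are made-up stand-ins, and the KerasIterator argument names (collection, batch_size, batch_path, model=..., name=..., shuffle_batches=...) are inferred from the _set_gen and test hunks rather than verified against the full implementation.

import os

import keras
import numpy as np

from mlair.data_handler import DataCollection, KerasIterator


class ToyStation:
    """Illustrative stand-in for a prepared data handler; KerasIterator only needs get_X/get_Y."""

    def __init__(self, number_of_samples=120):
        self.number_of_samples = number_of_samples

    def get_X(self, upsampling=False, as_numpy=True):
        # one input branch: samples, window, 1, variables
        return [np.random.rand(self.number_of_samples, 14, 1, 2)]

    def get_Y(self, upsampling=False, as_numpy=True):
        # one output branch: samples, lead times
        return [np.random.rand(self.number_of_samples, 3)]


# a single-branch toy network matching the shapes produced by ToyStation
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=(14, 1, 2)),
    keras.layers.Dense(3, activation="linear"),
])
model.compile(optimizer=keras.optimizers.SGD(), loss=keras.losses.mean_absolute_error)

# the collection simply wraps prepared stations; the iterator pickles its mini
# batches below batch_path while it is constructed (see test_prepare_batches above)
collection = DataCollection([ToyStation(), ToyStation(90)])
batch_path = os.path.join(os.getcwd(), "batch_data")
train_set = KerasIterator(collection, 32, batch_path, model=model, name="train", shuffle_batches=True)

# unlike the removed Distributor, the iterator itself goes into fit_generator;
# per-epoch shuffling happens in on_epoch_end instead of _permute_data
model.fit_generator(generator=train_set, steps_per_epoch=len(train_set), epochs=2, verbose=2)

Passing a branched network such as MyBranchedModel only changes the model argument; judging from test_get_model_rank_multiple_output_branch, the iterator appears to use the model rank to repeat the targets per output branch, so no manual target handling is needed.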