diff --git a/.gitignore b/.gitignore index 9ac8bb7635bb12d5e3bc32182a90d0f3ba985c58..305a5d1b9420eb62da24772fc1f4b263c1f3efe1 100644 --- a/.gitignore +++ b/.gitignore @@ -60,7 +60,7 @@ Thumbs.db htmlcov/ .pytest_cache /test/data/ -/test/test_modules/data/ +/test/test_run_modules/data/ report.html /TestExperiment/ /testrun_network*/ diff --git a/CI/run_pytest_coverage.sh b/CI/run_pytest_coverage.sh index 45916427f1521843923fb94e49dc661241dc0369..24d916b1a32da714abc2e5de0ac2b4c2790752a9 100644 --- a/CI/run_pytest_coverage.sh +++ b/CI/run_pytest_coverage.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # run coverage twice, 1) for html deploy 2) for success evaluation -python3.6 -m pytest --cov=src --cov-report term --cov-report html test/ | tee coverage_results.out +python3.6 -m pytest --cov=mlair --cov-report term --cov-report html test/ | tee coverage_results.out IS_FAILED=$? diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 3f38e14f8ab8d471e7b2a94813566ce21e1a8748..31746ec889cc82ebbae8de82a05c5cff02a22ac0 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -13,7 +13,8 @@ DEFAULT_START = "1997-01-01" DEFAULT_END = "2017-12-31" DEFAULT_WINDOW_HISTORY_SIZE = 13 DEFAULT_OVERWRITE_LOCAL_DATA = False -DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise", "mean": "estimate"} +# DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise", "mean": "estimate"} +DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise"} DEFAULT_HPC_LOGIN_LIST = ["ju", "hdfmll"] # ju[wels} #hdfmll(ogin) DEFAULT_HPC_HOST_LIST = ["jw", "hdfmlc"] # first part of node names for Juwels (jw[comp], hdfmlc(ompute). DEFAULT_CREATE_NEW_MODEL = True @@ -28,9 +29,9 @@ DEFAULT_TARGET_VAR = "o3" DEFAULT_TARGET_DIM = "variables" DEFAULT_WINDOW_LEAD_TIME = 3 DEFAULT_DIMENSIONS = {"new_index": ["datetime", "Stations"]} -DEFAULT_INTERPOLATION_DIM = "datetime" +DEFAULT_TIME_DIM = "datetime" DEFAULT_INTERPOLATION_METHOD = "linear" -DEFAULT_LIMIT_NAN_FILL = 1 +DEFAULT_INTERPOLATION_LIMIT = 1 DEFAULT_TRAIN_START = "1997-01-01" DEFAULT_TRAIN_END = "2007-12-31" DEFAULT_TRAIN_MIN_LENGTH = 90 diff --git a/mlair/configuration/path_config.py b/mlair/configuration/path_config.py index 0ef082b58cf7028ea4f71e86b6d0c4ecad6ff54d..9b3d6f250d97d93dd1d06004690885f44de30073 100644 --- a/mlair/configuration/path_config.py +++ b/mlair/configuration/path_config.py @@ -33,13 +33,13 @@ def prepare_host(create_new=True, data_path=None, sampling="daily") -> str: elif hostname == "zam347": data_path = f"/home/{user}/Data/toar_{sampling}/" elif hostname == "linux-aa9b": - data_path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" + data_path = f"/home/{user}/mlair/data/toar_{sampling}/" elif (len(hostname) > 2) and (hostname[:2] == "jr"): data_path = f"/p/project/cjjsc42/{user}/DATA/toar_{sampling}/" elif (len(hostname) > 2) and (hostname[:2] in ['jw', 'ju'] or hostname[:5] in ['hdfml']): data_path = f"/p/project/deepacf/intelliaq/{user}/DATA/toar_{sampling}/" elif runner_regex.match(hostname) is not None: - data_path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" + data_path = f"/home/{user}/mlair/data/toar_{sampling}/" else: data_path = os.path.join(os.getcwd(), "data", sampling) # raise OSError(f"unknown host '{hostname}'") diff --git a/mlair/data_handling/__init__.py b/mlair/data_handler/__init__.py similarity index 59% rename from mlair/data_handling/__init__.py rename to mlair/data_handler/__init__.py index 
cb5aa5db0f29cf51d32ed54e810fa9b363d80cc6..451868b838ab7a0d165942e36b5ec6aa03e42721 100644 --- a/mlair/data_handling/__init__.py +++ b/mlair/data_handler/__init__.py @@ -10,6 +10,6 @@ __date__ = '2020-04-17' from .bootstraps import BootStraps -from .data_preparation_join import DataPrepJoin -from .data_generator import DataGenerator -from .data_distributor import Distributor +from .iterator import KerasIterator, DataCollection +from .advanced_data_handler import DefaultDataPreparation, AbstractDataPreparation +from .data_preparation_neighbors import DataPreparationNeighbors diff --git a/mlair/data_handler/advanced_data_handler.py b/mlair/data_handler/advanced_data_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..57a9667f2a42575faa02d50e439252738a8dc8bb --- /dev/null +++ b/mlair/data_handler/advanced_data_handler.py @@ -0,0 +1,356 @@ + +__author__ = 'Lukas Leufen' +__date__ = '2020-07-08' + + +from mlair.helpers import to_list, remove_items +import numpy as np +import xarray as xr +import pickle +import os +import pandas as pd +import datetime as dt +import shutil +import inspect +import copy + +from typing import Union, List, Tuple, Dict +import logging +from functools import reduce +from mlair.data_handler.station_preparation import StationPrep +from mlair.helpers.join import EmptyQueryResult + + +number = Union[float, int] +num_or_list = Union[number, List[number]] + + +class DummyDataSingleStation: # pragma: no cover + + def __init__(self, name, number_of_samples=None): + self.name = name + self.number_of_samples = number_of_samples if number_of_samples is not None else np.random.randint(100, 150) + + def get_X(self): + X1 = np.random.randint(0, 10, size=(self.number_of_samples, 14, 5)) # samples, window, variables + datelist = pd.date_range(dt.datetime.today().date(), periods=self.number_of_samples, freq="H").tolist() + return xr.DataArray(X1, dims=['datetime', 'window', 'variables'], coords={"datetime": datelist, + "window": range(14), + "variables": range(5)}) + + def get_Y(self): + Y1 = np.round(0.5 * np.random.randn(self.number_of_samples, 5, 1), 1) # samples, window, variables + datelist = pd.date_range(dt.datetime.today().date(), periods=self.number_of_samples, freq="H").tolist() + return xr.DataArray(Y1, dims=['datetime', 'window', 'variables'], coords={"datetime": datelist, + "window": range(5), + "variables": range(1)}) + + def __str__(self): + return self.name + + +class AbstractDataPreparation: + + _requirements = [] + + def __init__(self, *args, **kwargs): + pass + + @classmethod + def build(cls, *args, **kwargs): + """Return initialised class.""" + return cls(*args, **kwargs) + + @classmethod + def requirements(cls): + """Return requirements and own arguments without duplicates.""" + return list(set(cls._requirements + cls.own_args())) + + @classmethod + def own_args(cls, *args): + return remove_items(inspect.getfullargspec(cls).args, ["self"] + list(args)) + + @classmethod + def transformation(cls, *args, **kwargs): + return None + + def get_X(self, upsampling=False, as_numpy=False): + raise NotImplementedError + + def get_Y(self, upsampling=False, as_numpy=False): + raise NotImplementedError + + def get_data(self, upsampling=False, as_numpy=False): + return self.get_X(upsampling, as_numpy), self.get_Y(upsampling, as_numpy) + + def get_coordinates(self) -> Union[None, Dict]: + return None + + +class DefaultDataPreparation(AbstractDataPreparation): + + _requirements = remove_items(inspect.getfullargspec(StationPrep).args, ["self", "station"]) 
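(For illustration, not part of this patch: the AbstractDataPreparation interface above only fixes get_X()/get_Y() plus the requirements()/own_args() bookkeeping. A minimal, hypothetical handler that satisfies this contract could look like the sketch below, assuming it sits next to the definitions above; the class name and the pre-computed arrays are invented for illustration.)

class PrecomputedDataPreparation(AbstractDataPreparation):
    """Hypothetical handler serving pre-computed arrays; for illustration only."""

    _requirements = []  # claims no further constructor arguments from the shared kwargs

    def __init__(self, name, X, Y):
        super().__init__()
        self.name = name
        self._X, self._Y = X, Y  # lists of arrays prepared elsewhere

    def get_X(self, upsampling=False, as_numpy=False):
        return self._X  # upsampling/as_numpy are ignored in this toy sketch

    def get_Y(self, upsampling=False, as_numpy=False):
        return self._Y

Because requirements() merges _requirements with the class's own constructor arguments, DefaultDataPreparation.build() below can be fed one flat kwargs dict and route each key either to StationPrep or to the handler itself.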
+ + def __init__(self, id_class, data_path, min_length=0, + extreme_values: num_or_list = None, extremes_on_right_tail_only: bool = False, name_affix=None): + super().__init__() + self.id_class = id_class + self.interpolation_dim = "datetime" + self.min_length = min_length + self._X = None + self._Y = None + self._X_extreme = None + self._Y_extreme = None + _name_affix = str(f"{str(self.id_class)}_{name_affix}" if name_affix is not None else id(self)) + self._save_file = os.path.join(data_path, f"data_preparation_{_name_affix}.pickle") + self._collection = self._create_collection() + self.harmonise_X() + self.multiply_extremes(extreme_values, extremes_on_right_tail_only, dim=self.interpolation_dim) + self._store(fresh_store=True) + + @classmethod + def build(cls, station, **kwargs): + sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls._requirements if k in kwargs} + sp = StationPrep(station, **sp_keys) + dp_args = {k: copy.deepcopy(kwargs[k]) for k in cls.own_args("id_class") if k in kwargs} + return cls(sp, **dp_args) + + def _create_collection(self): + return [self.id_class] + + @classmethod + def requirements(cls): + return remove_items(super().requirements(), "id_class") + + def _reset_data(self): + self._X, self._Y, self._X_extreme, self._Y_extreme = None, None, None, None + + def _cleanup(self): + directory = os.path.dirname(self._save_file) + if os.path.exists(directory) is False: + os.makedirs(directory) + if os.path.exists(self._save_file): + shutil.rmtree(self._save_file, ignore_errors=True) + + def _store(self, fresh_store=False): + self._cleanup() if fresh_store is True else None + data = {"X": self._X, "Y": self._Y, "X_extreme": self._X_extreme, "Y_extreme": self._Y_extreme} + with open(self._save_file, "wb") as f: + pickle.dump(data, f) + logging.debug(f"save pickle data to {self._save_file}") + self._reset_data() + + def _load(self): + try: + with open(self._save_file, "rb") as f: + data = pickle.load(f) + logging.debug(f"load pickle data from {self._save_file}") + self._X, self._Y = data["X"], data["Y"] + self._X_extreme, self._Y_extreme = data["X_extreme"], data["Y_extreme"] + except FileNotFoundError: + pass + + def get_data(self, upsampling=False, as_numpy=True): + self._load() + X = self.get_X(upsampling, as_numpy) + Y = self.get_Y(upsampling, as_numpy) + self._reset_data() + return X, Y + + def __repr__(self): + return ";".join(list(map(lambda x: str(x), self._collection))) + + def get_X_original(self): + X = [] + for data in self._collection: + X.append(data.get_X()) + return X + + def get_Y_original(self): + Y = self._collection[0].get_Y() + return Y + + @staticmethod + def _to_numpy(d): + return list(map(lambda x: np.copy(x), d)) + + def get_X(self, upsampling=False, as_numpy=True): + no_data = (self._X is None) + self._load() if no_data is True else None + X = self._X if upsampling is False else self._X_extreme + self._reset_data() if no_data is True else None + return self._to_numpy(X) if as_numpy is True else X + + def get_Y(self, upsampling=False, as_numpy=True): + no_data = (self._Y is None) + self._load() if no_data is True else None + Y = self._Y if upsampling is False else self._Y_extreme + self._reset_data() if no_data is True else None + return self._to_numpy([Y]) if as_numpy is True else Y + + def harmonise_X(self): + X_original, Y_original = self.get_X_original(), self.get_Y_original() + dim = self.interpolation_dim + intersect = reduce(np.intersect1d, map(lambda x: x.coords[dim].values, X_original)) + if len(intersect) < max(self.min_length, 1): + X, Y 
= None, None + else: + X = list(map(lambda x: x.sel({dim: intersect}), X_original)) + Y = Y_original.sel({dim: intersect}) + self._X, self._Y = X, Y + + def get_observation(self): + return self.id_class.observation.copy().squeeze() + + def get_transformation_Y(self): + return self.id_class.get_transformation_information() + + def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, + timedelta: Tuple[int, str] = (1, 'm'), dim="datetime"): + """ + Multiply extremes. + + This method extracts extreme values from self._Y which are defined in the argument extreme_values. One can + also decide to extract only extremes on the right tail of the distribution. When extreme_values is a list of + floats/ints, all values larger than extreme_values (and smaller than negative extreme_values; extraction is performed in standardised + space) are extracted iteratively. If for example extreme_values = [1., 2.] then a value of 1.5 would be + extracted once (for the 0th entry in the list), while a 2.5 would be extracted twice (once for each entry). Timedelta is + used to mark those extracted values by adding one minute to each timestamp. As TOAR data are hourly, one can + identify those "artificial" data points later easily. Extreme inputs and labels are stored in + self._X_extreme and self._Y_extreme, respectively. + + :param extreme_values: user definition of extreme values + :param extremes_on_right_tail_only: if False also multiply values which are smaller than -extreme_values, + if True only extract values larger than extreme_values + :param timedelta: used as arguments for np.timedelta64 in order to mark extreme values on datetime + """ + # check if X or Y is None + if (self._X is None) or (self._Y is None): + logging.debug(f"{str(self.id_class)} has no data for X or Y, skip multiply extremes") + return + if extreme_values is None: + logging.debug(f"No extreme values given, skip multiply extremes") + self._X_extreme, self._Y_extreme = self._X, self._Y + return + + # check type of inputs + extreme_values = to_list(extreme_values) + for i in extreme_values: + if not isinstance(i, number.__args__): + raise TypeError(f"Elements of list extreme_values have to be {number.__args__}, but at least element " + f"{i} is type {type(i)}") + + for extr_val in sorted(extreme_values): + # check if some extreme values are already extracted + if (self._X_extreme is None) or (self._Y_extreme is None): + X = self._X + Y = self._Y + else: # one extr value iteration is done already: self._Y_extreme is NOT None...
+ X = self._X_extreme + Y = self._Y_extreme + + # extract extremes based on occurrence in labels + other_dims = remove_items(list(Y.dims), dim) + if extremes_on_right_tail_only: + extreme_idx = (Y > extr_val).any(dim=other_dims) + else: + extreme_idx = xr.concat([(Y < -extr_val).any(dim=other_dims[0]), + (Y > extr_val).any(dim=other_dims[0])], + dim=other_dims[1]).any(dim=other_dims[1]) + + extremes_X = list(map(lambda x: x.sel(**{dim: extreme_idx}), X)) + self._add_timedelta(extremes_X, dim, timedelta) + # extremes_X = list(map(lambda x: x.coords[dim].values + np.timedelta64(*timedelta), extremes_X)) + + extremes_Y = Y.sel(**{dim: extreme_idx}) + extremes_Y.coords[dim].values += np.timedelta64(*timedelta) + + self._Y_extreme = xr.concat([Y, extremes_Y], dim=dim) + self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=dim), X, extremes_X)) + + @staticmethod + def _add_timedelta(data, dim, timedelta): + for d in data: + d.coords[dim].values += np.timedelta64(*timedelta) + + @classmethod + def transformation(cls, set_stations, **kwargs): + sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls._requirements if k in kwargs} + transformation_dict = sp_keys.pop("transformation") + if transformation_dict is None: + return + scope = transformation_dict.pop("scope") + method = transformation_dict.pop("method") + if transformation_dict.pop("mean", None) is not None: + return + mean, std = None, None + for station in set_stations: + try: + sp = StationPrep(station, transformation={"method": method}, **sp_keys) + mean = sp.mean.copy(deep=True) if mean is None else mean.combine_first(sp.mean) + std = sp.std.copy(deep=True) if std is None else std.combine_first(sp.std) + except (AttributeError, EmptyQueryResult): + continue + if mean is None: + return None + mean_estimated = mean.mean("Stations") + std_estimated = std.mean("Stations") + return {"scope": scope, "method": method, "mean": mean_estimated, "std": std_estimated} + + def get_coordinates(self): + return self.id_class.get_coordinates() + + +def run_data_prep(): + + from .data_preparation_neighbors import DataPreparationNeighbors + data = DummyDataSingleStation("main_class") + data.get_X() + data.get_Y() + + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + data_prep = DataPreparationNeighbors(DummyDataSingleStation("main_class"), + path, + neighbors=[DummyDataSingleStation("neighbor1"), + DummyDataSingleStation("neighbor2")], + extreme_values=[1., 1.2]) + data_prep.get_data(upsampling=False) + + +def create_data_prep(): + + from .data_preparation_neighbors import DataPreparationNeighbors + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + station_type = None + network = 'UBA' + sampling = 'daily' + target_dim = 'variables' + target_var = 'o3' + interpolation_dim = 'datetime' + window_history_size = 7 + window_lead_time = 3 + central_station = StationPrep("DEBW011", path, {'o3': 'dma8eu', 'temp': 'maximum'}, {},station_type, network, sampling, target_dim, + target_var, interpolation_dim, window_history_size, window_lead_time) + neighbor1 = StationPrep("DEBW013", path, {'o3': 'dma8eu', 'temp-rea-miub': 'maximum'}, {},station_type, network, sampling, target_dim, + target_var, interpolation_dim, window_history_size, window_lead_time) + neighbor2 = StationPrep("DEBW034", path, {'o3': 'dma8eu', 'temp': 'maximum'}, {}, station_type, network, sampling, target_dim, + target_var, interpolation_dim, window_history_size, window_lead_time) + + data_prep = [] + 
data_prep.append(DataPreparationNeighbors(central_station, path, neighbors=[neighbor1, neighbor2])) + data_prep.append(DataPreparationNeighbors(neighbor1, path, neighbors=[central_station, neighbor2])) + data_prep.append(DataPreparationNeighbors(neighbor2, path, neighbors=[neighbor1, central_station])) + return data_prep + + +if __name__ == "__main__": + from mlair.data_handler.station_preparation import StationPrep + from mlair.data_handler.iterator import KerasIterator, DataCollection + data_prep = create_data_prep() + data_collection = DataCollection(data_prep) + for data in data_collection: + print(data) + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata", "keras") + keras_it = KerasIterator(data_collection, 100, path, upsampling=True) + keras_it[2] + diff --git a/mlair/data_handler/bootstraps.py b/mlair/data_handler/bootstraps.py new file mode 100644 index 0000000000000000000000000000000000000000..91603b41822b92e28fbd077c502d84707fff746f --- /dev/null +++ b/mlair/data_handler/bootstraps.py @@ -0,0 +1,130 @@ +""" +Collections of bootstrap methods and classes. + +How to use +---------- + +test + +""" + +__author__ = 'Felix Kleinert, Lukas Leufen' +__date__ = '2020-02-07' + + +import os +from collections import Iterator, Iterable +from itertools import chain + +import numpy as np +import xarray as xr + +from mlair.data_handler.advanced_data_handler import AbstractDataPreparation + + +class BootstrapIterator(Iterator): + + _position: int = None + + def __init__(self, data: "BootStraps"): + assert isinstance(data, BootStraps) + self._data = data + self._dimension = data.bootstrap_dimension + self._collection = self._data.bootstraps() + self._position = 0 + + def __next__(self): + """Return next element or stop iteration.""" + try: + index, dimension = self._collection[self._position] + nboot = self._data.number_of_bootstraps + _X, _Y = self._data.data.get_data(as_numpy=False) + _X = list(map(lambda x: x.expand_dims({'boots': range(nboot)}, axis=-1), _X)) + _Y = _Y.expand_dims({"boots": range(nboot)}, axis=-1) + single_variable = _X[index].sel({self._dimension: [dimension]}) + shuffled_variable = self.shuffle(single_variable.values) + shuffled_data = xr.DataArray(shuffled_variable, coords=single_variable.coords, dims=single_variable.dims) + _X[index] = shuffled_data.combine_first(_X[index]).reindex_like(_X[index]) + self._position += 1 + except IndexError: + raise StopIteration() + _X, _Y = self._to_numpy(_X), self._to_numpy(_Y) + return self._reshape(_X), self._reshape(_Y), (index, dimension) + + @staticmethod + def _reshape(d): + if isinstance(d, list): + return list(map(lambda x: np.rollaxis(x, -1, 0).reshape(x.shape[0] * x.shape[-1], *x.shape[1:-1]), d)) + else: + shape = d.shape + return np.rollaxis(d, -1, 0).reshape(shape[0] * shape[-1], *shape[1:-1]) + + @staticmethod + def _to_numpy(d): + if isinstance(d, list): + return list(map(lambda x: x.values, d)) + else: + return d.values + + @staticmethod + def shuffle(data: np.ndarray) -> np.ndarray: + """ + Shuffle randomly from given data (draw elements with replacement). + + :param data: data to shuffle + :return: shuffled data as numpy array + """ + size = data.shape + return np.random.choice(data.reshape(-1, ), size=size) + + +class BootStraps(Iterable): + """ + Main class to perform bootstrap operations. 
+ + This class requires a data handler following the definition of the AbstractDataPreparation, the number of bootstraps + to create and the dimension along which this bootstrapping is performed (default dimension is `variables`). + + When iterating over this class, it returns the bootstrapped X, Y and a tuple with (position of variable in X, name of + this variable). The tuple is interesting if X consists of multiple input streams X_i (e.g. two or more stations) + because it shows which variable of which input X_i has been bootstrapped. All bootstrap combinations can be + retrieved by calling the .bootstraps() method. Furthermore, by calling the .get_orig_prediction() method this class + imitates the original prediction by repeating it according to the set number of bootstraps. + """ + def __init__(self, data: AbstractDataPreparation, number_of_bootstraps: int = 10, + bootstrap_dimension: str = "variables"): + """ + Create iterable class to be ready to iter. + + :param data: a data generator object to get data / history + :param number_of_bootstraps: the number of bootstrap realisations + """ + self.data = data + self.number_of_bootstraps = number_of_bootstraps + self.bootstrap_dimension = bootstrap_dimension + + def __iter__(self): + return BootstrapIterator(self) + + def __len__(self): + return len(self.bootstraps()) + + def bootstraps(self): + l = [] + for i, x in enumerate(self.data.get_X(as_numpy=False)): + l.append(list(map(lambda y: (i, y), x.indexes['variables']))) + return list(chain(*l)) + + def get_orig_prediction(self, path: str, file_name: str, prediction_name: str = "CNN") -> np.ndarray: + """ + Repeat predictions from given file(_name) in path by the number of boots. + + :param path: path to file + :param file_name: file name + :param prediction_name: name of the prediction to select from loaded file (default CNN) + :return: repeated predictions + """ + file = os.path.join(path, file_name) + prediction = xr.open_dataarray(file).sel(type=prediction_name).squeeze() + vals = np.tile(prediction.data, (self.number_of_bootstraps, 1)) + return vals[~np.isnan(vals).any(axis=1), :] diff --git a/mlair/data_handler/data_preparation_neighbors.py b/mlair/data_handler/data_preparation_neighbors.py new file mode 100644 index 0000000000000000000000000000000000000000..0c95b242e1046618403ebb6592407ef8b680e890 --- /dev/null +++ b/mlair/data_handler/data_preparation_neighbors.py @@ -0,0 +1,64 @@ + +__author__ = 'Lukas Leufen' +__date__ = '2020-07-17' + + +from mlair.helpers import to_list +from mlair.data_handler.station_preparation import StationPrep +from mlair.data_handler.advanced_data_handler import DefaultDataPreparation +import os + +from typing import Union, List + +number = Union[float, int] +num_or_list = Union[number, List[number]] + + +class DataPreparationNeighbors(DefaultDataPreparation): + + def __init__(self, id_class, data_path, neighbors=None, min_length=0, + extreme_values: num_or_list = None, extremes_on_right_tail_only: bool = False): + self.neighbors = to_list(neighbors) if neighbors is not None else [] + super().__init__(id_class, data_path, min_length=min_length, extreme_values=extreme_values, + extremes_on_right_tail_only=extremes_on_right_tail_only) + + @classmethod + def build(cls, station, **kwargs): + sp_keys = {k: kwargs[k] for k in cls._requirements if k in kwargs} + sp = StationPrep(station, **sp_keys) + n_list = [] + for neighbor in kwargs.get("neighbors", []): + n_list.append(StationPrep(neighbor, **sp_keys)) + else: + kwargs["neighbors"] = n_list if len(n_list) > 0 else None + dp_args =
{k: kwargs[k] for k in cls.own_args("id_class") if k in kwargs} + return cls(sp, **dp_args) + + def _create_collection(self): + return [self.id_class] + self.neighbors + + def get_coordinates(self, include_neighbors=False): + neighbors = list(map(lambda n: n.get_coordinates(), self.neighbors)) if include_neighbors is True else [] + return [super(DataPreparationNeighbors, self).get_coordinates()].append(neighbors) + + +if __name__ == "__main__": + + a = DataPreparationNeighbors + requirements = a.requirements() + + kwargs = {"path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata"), + "station_type": None, + "network": 'UBA', + "sampling": 'daily', + "target_dim": 'variables', + "target_var": 'o3', + "time_dim": 'datetime', + "window_history_size": 7, + "window_lead_time": 3, + "neighbors": ["DEBW034"], + "data_path": os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata"), + "statistics_per_var": {'o3': 'dma8eu', 'temp': 'maximum'}, + "transformation": None,} + a_inst = a.build("DEBW011", **kwargs) + print(a_inst) diff --git a/mlair/data_handler/iterator.py b/mlair/data_handler/iterator.py new file mode 100644 index 0000000000000000000000000000000000000000..49569405a587920da795820d48f8d968a8142cc7 --- /dev/null +++ b/mlair/data_handler/iterator.py @@ -0,0 +1,213 @@ + +__author__ = 'Lukas Leufen' +__date__ = '2020-07-07' + +from collections import Iterator, Iterable +import keras +import numpy as np +import math +import os +import shutil +import pickle +from typing import Tuple, List + + +class StandardIterator(Iterator): + + _position: int = None + + def __init__(self, collection: list): + assert isinstance(collection, list) + self._collection = collection + self._position = 0 + + def __next__(self): + """Return next element or stop iteration.""" + try: + value = self._collection[self._position] + self._position += 1 + except IndexError: + raise StopIteration() + return value + + +class DataCollection(Iterable): + + def __init__(self, collection: list = None): + if collection is None: + collection = [] + assert isinstance(collection, list) + self._collection = collection + self._mapping = {} + self._set_mapping() + + def __len__(self): + return len(self._collection) + + def __iter__(self) -> Iterator: + return StandardIterator(self._collection) + + def __getitem__(self, index): + if isinstance(index, int): + return self._collection[index] + else: + return self._collection[self._mapping[str(index)]] + + def add(self, element): + self._collection.append(element) + self._mapping[str(element)] = len(self._collection) + + def _set_mapping(self): + for i, e in enumerate(self._collection): + self._mapping[str(e)] = i + + def keys(self): + return list(self._mapping.keys()) + + +class KerasIterator(keras.utils.Sequence): + + def __init__(self, collection: DataCollection, batch_size: int, batch_path: str, shuffle_batches: bool = False, + model=None, upsampling=False, name=None): + self._collection = collection + batch_path = os.path.join(batch_path, str(name if name is not None else id(self))) + self._path = os.path.join(batch_path, "%i.pickle") + self.batch_size = batch_size + self.model = model + self.shuffle = shuffle_batches + self.upsampling = upsampling + self.indexes: list = [] + self._cleanup_path(batch_path) + self._prepare_batches() + + def __len__(self) -> int: + return len(self.indexes) + + def __getitem__(self, index: int) -> Tuple[np.ndarray, np.ndarray]: + """Get batch for given index.""" + return self.__data_generation(self.indexes[index]) + + def 
_get_model_rank(self): + if self.model is not None: + mod_out = self.model.output_shape + if isinstance(mod_out, tuple): # only one output branch: (None, ahead) + mod_rank = 1 + elif isinstance(mod_out, list): # multiple output branches, e.g.: [(None, ahead), (None, ahead)] + mod_rank = len(mod_out) + else: # pragma: no cover + raise TypeError("model output shape must either be tuple or list.") + return mod_rank + else: # no model provided, assume to use single output + return 1 + + def __data_generation(self, index: int) -> Tuple[np.ndarray, np.ndarray]: + """Load pickle data from disk.""" + file = self._path % index + with open(file, "rb") as f: + data = pickle.load(f) + return data["X"], data["Y"] + + @staticmethod + def _concatenate(new: List[np.ndarray], old: List[np.ndarray]) -> List[np.ndarray]: + """Concatenate two lists of data along axis=0.""" + return list(map(lambda n1, n2: np.concatenate((n1, n2), axis=0), old, new)) + + def _get_batch(self, data_list: List[np.ndarray], b: int) -> List[np.ndarray]: + """Get batch according to batch size from data list.""" + return list(map(lambda data: data[b * self.batch_size:(b+1) * self.batch_size, ...], data_list)) + + def _permute_data(self, X, Y): + p = np.random.permutation(len(X[0])) # equiv to .shape[0] + X = list(map(lambda x: x[p], X)) + Y = list(map(lambda x: x[p], Y)) + return X, Y + + def _prepare_batches(self) -> None: + """ + Prepare all batches as locally stored files. + + Walk through all elements of collection and split (or merge) data according to the batch size. Too long data + sets are divided into multiple batches. Not fully filled batches are merged with data from the next collection + element. If data is remaining after the last element, it is saved as smaller batch. All batches are enumerated + beginning from 0. A list with all batch numbers is stored in class's parameter indexes. 
+ """ + index = 0 + remaining = None + mod_rank = self._get_model_rank() + for data in self._collection: + X = data.get_X(upsampling=self.upsampling) + Y = [data.get_Y(upsampling=self.upsampling)[0] for _ in range(mod_rank)] + if self.upsampling: + X, Y = self._permute_data(X, Y) + if remaining is not None: + X, Y = self._concatenate(X, remaining[0]), self._concatenate(Y, remaining[1]) + length = X[0].shape[0] + batches = self._get_number_of_mini_batches(length) + for b in range(batches): + batch_X, batch_Y = self._get_batch(X, b), self._get_batch(Y, b) + self._save_to_pickle(X=batch_X, Y=batch_Y, index=index) + index += 1 + if (batches * self.batch_size) < length: # keep remaining to concatenate with next data element + remaining = (self._get_batch(X, batches), self._get_batch(Y, batches)) + else: + remaining = None + if remaining is not None: # add remaining as smaller batch + self._save_to_pickle(X=remaining[0], Y=remaining[1], index=index) + index += 1 + self.indexes = np.arange(0, index).tolist() + + def _save_to_pickle(self, X: List[np.ndarray], Y: List[np.ndarray], index: int) -> None: + """Save data as pickle file with variables X and Y and given index as <index>.pickle .""" + data = {"X": X, "Y": Y} + file = self._path % index + with open(file, "wb") as f: + pickle.dump(data, f) + + def _get_number_of_mini_batches(self, number_of_samples: int) -> int: + """Return number of mini batches as the floored ration of number of samples to batch size.""" + return math.floor(number_of_samples / self.batch_size) + + @staticmethod + def _cleanup_path(path: str, create_new: bool = True) -> None: + """First remove existing path, second create empty path if enabled.""" + if os.path.exists(path): + shutil.rmtree(path) + if create_new is True: + os.makedirs(path) + + def on_epoch_end(self) -> None: + """Randomly shuffle indexes if enabled.""" + if self.shuffle is True: + np.random.shuffle(self.indexes) + + +class DummyData: # pragma: no cover + + def __init__(self, number_of_samples=np.random.randint(100, 150)): + self.number_of_samples = number_of_samples + + def get_X(self): + X1 = np.random.randint(0, 10, size=(self.number_of_samples, 14, 5)) # samples, window, variables + X2 = np.random.randint(21, 30, size=(self.number_of_samples, 10, 2)) # samples, window, variables + X3 = np.random.randint(-5, 0, size=(self.number_of_samples, 1, 2)) # samples, window, variables + return [X1, X2, X3] + + def get_Y(self): + Y1 = np.random.randint(0, 10, size=(self.number_of_samples, 5, 1)) # samples, window, variables + Y2 = np.random.randint(21, 30, size=(self.number_of_samples, 5, 1)) # samples, window, variables + return [Y1, Y2] + + +if __name__ == "__main__": + + collection = [] + for _ in range(3): + collection.append(DummyData(50)) + + data_collection = DataCollection(collection=collection) + + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + iterator = KerasIterator(data_collection, 25, path, shuffle=True) + + for data in data_collection: + print(data) \ No newline at end of file diff --git a/mlair/data_handling/data_preparation.py b/mlair/data_handler/station_preparation.py similarity index 56% rename from mlair/data_handling/data_preparation.py rename to mlair/data_handler/station_preparation.py index 1dce5c87c2b076621ee08ae0f18906fd47d95e95..ff8496ab30a3b6392ea2314ef2526c80e0f57591 100644 --- a/mlair/data_handling/data_preparation.py +++ b/mlair/data_handler/station_preparation.py @@ -1,13 +1,13 @@ """Data Preparation class to handle data processing for machine 
learning.""" -__author__ = 'Lukas Leufen' -__date__ = '2020-06-29' +__author__ = 'Lukas Leufen, Felix Kleinert' +__date__ = '2020-07-20' import datetime as dt import logging import os from functools import reduce -from typing import Union, List, Iterable, Tuple +from typing import Union, List, Iterable, Tuple, Dict import numpy as np import pandas as pd @@ -24,61 +24,178 @@ number = Union[float, int] num_or_list = Union[number, List[number]] data_or_none = Union[xr.DataArray, None] +# defaults +DEFAULT_STATION_TYPE = "background" +DEFAULT_NETWORK = "AIRBASE" +DEFAULT_VAR_ALL_DICT = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', + 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', 'cloudcover': 'average_values', + 'pblheight': 'maximum'} +DEFAULT_WINDOW_LEAD_TIME = 3 +DEFAULT_WINDOW_HISTORY_SIZE = 13 +DEFAULT_TIME_DIM = "datetime" +DEFAULT_TARGET_VAR = "o3" +DEFAULT_TARGET_DIM = "variables" +DEFAULT_SAMPLING = "daily" +DEFAULT_INTERPOLATION_METHOD = "linear" + + +class AbstractStationPrep(object): + def __init__(self): #, path, station, statistics_per_var, transformation, **kwargs): + pass + + def get_X(self): + raise NotImplementedError -class AbstractDataPrep(object): - """ - This class prepares data to be used in neural networks. - - The instance searches for local stored data, that meet the given demands. If no local data is found, the DataPrep - instance will load data from TOAR database and store this data locally to use the next time. For the moment, there - is only support for daily aggregated time series. The aggregation can be set manually and differ for each variable. - - After data loading, different data pre-processing steps can be executed to prepare the data for further - applications. Especially the following methods can be used for the pre-processing step: - - - interpolate: interpolate between data points by using xarray's interpolation method - - standardise: standardise data to mean=1 and std=1, centralise to mean=0, additional methods like normalise on \ - interval [0, 1] are not implemented yet. - - make window history: represent the history (time steps before) for training/ testing; X - - make labels: create target vector with given leading time steps for training/ testing; y - - remove Nans jointly from desired input and output, only keeps time steps where no NaNs are present in X AND y. \ - Use this method after the creation of the window history and labels to clean up the data cube. - - To create a DataPrep instance, it is needed to specify the stations by id (e.g. "DEBW107"), its network (e.g. UBA, - "Umweltbundesamt") and the variables to use. Further options can be set in the instance. + def get_Y(self): + raise NotImplementedError - * `statistics_per_var`: define a specific statistic to extract from the TOAR database for each variable. - * `start`: define a start date for the data cube creation. Default: Use the first entry in time series - * `end`: set the end date for the data cube. Default: Use last date in time series. - * `store_data_locally`: store recently downloaded data on local disk. 
Default: True - * set further parameters for xarray's interpolation methods to modify the interpolation scheme - """ +class StationPrep(AbstractStationPrep): - def __init__(self, path: str, station: Union[str, List[str]], variables: List[str], **kwargs): - """Construct instance.""" - self.path = os.path.abspath(path) + def __init__(self, station, data_path, statistics_per_var, station_type=DEFAULT_STATION_TYPE, + network=DEFAULT_NETWORK, sampling=DEFAULT_SAMPLING, target_dim=DEFAULT_TARGET_DIM, + target_var=DEFAULT_TARGET_VAR, time_dim=DEFAULT_TIME_DIM, + window_history_size=DEFAULT_WINDOW_HISTORY_SIZE, window_lead_time=DEFAULT_WINDOW_LEAD_TIME, + interpolation_limit: int = 0, interpolation_method: str = DEFAULT_INTERPOLATION_METHOD, + overwrite_local_data: bool = False, transformation=None, store_data_locally: bool = True, + min_length: int = 0, start=None, end=None, **kwargs): + super().__init__() # path, station, statistics_per_var, transformation, **kwargs) self.station = helpers.to_list(station) - self.variables = variables - self.mean: data_or_none = None - self.std: data_or_none = None - self.history: data_or_none = None - self.label: data_or_none = None - self.observation: data_or_none = None - self.extremes_history: data_or_none = None - self.extremes_label: data_or_none = None - self.kwargs = kwargs + self.path = os.path.abspath(data_path) + self.statistics_per_var = statistics_per_var + self.transformation = self.setup_transformation(transformation) + + self.station_type = station_type + self.network = network + self.sampling = sampling + self.target_dim = target_dim + self.target_var = target_var + self.time_dim = time_dim + self.window_history_size = window_history_size + self.window_lead_time = window_lead_time + + self.interpolation_limit = interpolation_limit + self.interpolation_method = interpolation_method + + self.overwrite_local_data = overwrite_local_data + self.store_data_locally = store_data_locally + self.min_length = min_length + self.start = start + self.end = end + + # internal self.data = None self.meta = None + self.variables = kwargs.get('variables', list(statistics_per_var.keys())) + self.history = None + self.label = None + self.observation = None + + # internal for transformation + self.mean = None + self.std = None + self.max = None + self.min = None self._transform_method = None - self.statistics_per_var = kwargs.get("statistics_per_var", None) - self.sampling = kwargs.get("sampling", "daily") - if self.statistics_per_var is not None or self.sampling == "hourly": - self.load_data() + + self.kwargs = kwargs + # self.kwargs["overwrite_local_data"] = overwrite_local_data + + # self.make_samples() + self.setup_samples() + + def __str__(self): + return self.station[0] + + def __len__(self): + assert len(self.get_X()) == len(self.get_Y()) + return len(self.get_X()) + + @property + def shape(self): + return self.data.shape, self.get_X().shape, self.get_Y().shape + + def __repr__(self): + return f"StationPrep(station={self.station}, data_path='{self.path}', " \ + f"statistics_per_var={self.statistics_per_var}, " \ + f"station_type='{self.station_type}', network='{self.network}', " \ + f"sampling='{self.sampling}', target_dim='{self.target_dim}', target_var='{self.target_var}', " \ + f"time_dim='{self.time_dim}', window_history_size={self.window_history_size}, " \ + f"window_lead_time={self.window_lead_time}, interpolation_limit={self.interpolation_limit}, " \ + f"interpolation_method='{self.interpolation_method}', 
overwrite_local_data={self.overwrite_local_data}, " \ + f"transformation={self._print_transformation_as_string}, **{self.kwargs})" + + @property + def _print_transformation_as_string(self): + str_name = '' + if self.transformation is None: + str_name = f'{None}' else: - raise NotImplementedError("Either select hourly data or provide statistics_per_var.") + for k, v in self.transformation.items(): + if v is not None: + try: + v_pr = f"xr.DataArray.from_dict({v.to_dict()})" + except AttributeError: + v_pr = f"'{v}'" + str_name += f"'{k}':{v_pr}, " + str_name = f"{{{str_name}}}" + return str_name + + def get_transposed_history(self) -> xr.DataArray: + """Return history. - def load_data(self, source_name=""): + :return: history with dimensions datetime, window, Stations, variables. + """ + return self.history.transpose("datetime", "window", "Stations", "variables").copy() + + def get_transposed_label(self) -> xr.DataArray: + """Return label. + + :return: label with dimensions datetime*, window*, Stations, variables. + """ + return self.label.squeeze("Stations").transpose("datetime", "window").copy() + + def get_X(self): + return self.get_transposed_history() + + def get_Y(self): + return self.get_transposed_label() + + def get_coordinates(self): + coords = self.meta.loc[["station_lon", "station_lat"]].astype(float) + return coords.rename(index={"station_lon": "lon", "station_lat": "lat"}).to_dict()[str(self)] + + def call_transform(self, inverse=False): + self.transform(dim=self.time_dim, method=self.transformation["method"], + mean=self.transformation['mean'], std=self.transformation["std"], + min_val=self.transformation["min"], max_val=self.transformation["max"], + inverse=inverse + ) + + def set_transformation(self, transformation: dict): + if self._transform_method is not None: + self.call_transform(inverse=True) + self.transformation = self.setup_transformation(transformation) + self.call_transform() + self.make_samples() + + def setup_samples(self): + """ + Setup samples. This method prepares and creates samples X, and labels Y. + """ + self.load_data() + self.interpolate(dim=self.time_dim, method=self.interpolation_method, limit=self.interpolation_limit) + if self.transformation is not None: + self.call_transform() + self.make_samples() + + def make_samples(self): + self.make_history_window(self.target_dim, self.window_history_size, self.time_dim) + self.make_labels(self.target_dim, self.target_var, self.time_dim, self.window_lead_time) + self.make_observation(self.target_dim, self.target_var, self.time_dim) + self.remove_nan(self.time_dim) + + def read_data_from_disk(self, source_name=""): """ Load data and meta data either from local disk (preferred) or download new data by using a custom download method. @@ -90,7 +207,7 @@ class AbstractDataPrep(object): check_path_and_create(self.path) file_name = self._set_file_name() meta_file = self._set_meta_file_name() - if self.kwargs.get('overwrite_local_data', False): + if self.overwrite_local_data is True: logging.debug(f"overwrite_local_data is true, therefore reload {file_name}{source_name}") if os.path.exists(file_name): os.remove(file_name) @@ -114,24 +231,111 @@ class AbstractDataPrep(object): data = self._slice_prep(data) self.data = self.check_for_negative_concentrations(data) - def download_data(self, file_name, meta_file) -> [xr.DataArray, pd.DataFrame]: + def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]: """ - Download data and meta. 
+ Download data from TOAR database using the JOIN interface. + + Data is transformed to a xarray dataset. If class attribute store_data_locally is true, data is additionally + stored locally using given names for file and meta file. :param file_name: name of file to save data to (containing full path) :param meta_file: name of the meta data file (also containing full path) - """ - raise NotImplementedError + + :return: downloaded data and its meta data + """ + df_all = {} + df, meta = join.download_join(station_name=self.station, stat_var=self.statistics_per_var, + station_type=self.station_type, network_name=self.network, sampling=self.sampling) + df_all[self.station[0]] = df + # convert df_all to xarray + xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()} + xarr = xr.Dataset(xarr).to_array(dim='Stations') + if self.store_data_locally is True: + # save locally as nc/csv file + xarr.to_netcdf(path=file_name) + meta.to_csv(meta_file) + return xarr, meta + + def download_data(self, file_name, meta_file): + data, meta = self.download_data_from_join(file_name, meta_file) + return data, meta def check_station_meta(self): """ - Placeholder function to implement some additional station meta data check if desired. + Search for the entries in meta data and compare the value with the requested values. - Ideally, this method should raise a FileNotFoundError if a value mismatch to load fresh data from a source. If - this method is not required for your application just inherit and add the `pass` command inside the method. The - NotImplementedError is more a reminder that you could use it. + Will raise a FileNotFoundError if the values mismatch. """ - raise NotImplementedError + if self.station_type is not None: + check_dict = {"station_type": self.station_type, "network_name": self.network} + for (k, v) in check_dict.items(): + if v is None: + continue + if self.meta.at[k, self.station[0]] != v: + logging.debug(f"meta data does not agree with given request for {k}: {v} (requested) != " + f"{self.meta.at[k, self.station[0]]} (local). Raise FileNotFoundError to trigger new " + f"grapping from web.") + raise FileNotFoundError + + def check_for_negative_concentrations(self, data: xr.DataArray, minimum: int = 0) -> xr.DataArray: + """ + Set all negative concentrations to zero. + + Names of all concentrations are extracted from https://join.fz-juelich.de/services/rest/surfacedata/ + #2.1 Parameters. Currently, this check is applied on "benzene", "ch4", "co", "ethane", "no", "no2", "nox", + "o3", "ox", "pm1", "pm10", "pm2p5", "propane", "so2", and "toluene". + + :param data: data array containing variables to check + :param minimum: minimum value, by default this should be 0 + + :return: corrected data + """ + chem_vars = ["benzene", "ch4", "co", "ethane", "no", "no2", "nox", "o3", "ox", "pm1", "pm10", "pm2p5", + "propane", "so2", "toluene"] + used_chem_vars = list(set(chem_vars) & set(self.variables)) + data.loc[..., used_chem_vars] = data.loc[..., used_chem_vars].clip(min=minimum) + return data + + def shift(self, dim: str, window: int) -> xr.DataArray: + """ + Shift data multiple times to represent history (if window <= 0) or lead time (if window > 0). 
+ + :param dim: dimension along shift is applied + :param window: number of steps to shift (corresponds to the window length) + + :return: shifted data + """ + start = 1 + end = 1 + if window <= 0: + start = window + else: + end = window + 1 + res = [] + for w in range(start, end): + res.append(self.data.shift({dim: -w})) + window_array = self.create_index_array('window', range(start, end), squeeze_dim=self.target_dim) + res = xr.concat(res, dim=window_array) + return res + + @staticmethod + def create_index_array(index_name: str, index_value: Iterable[int], squeeze_dim: str) -> xr.DataArray: + """ + Create an 1D xr.DataArray with given index name and value. + + :param index_name: name of dimension + :param index_value: values of this dimension + + :return: this array + """ + ind = pd.DataFrame({'val': index_value}, index=index_value) + # res = xr.Dataset.from_dataframe(ind).to_array().rename({'index': index_name}).squeeze(dim=squeez/e_dim, drop=True) + res = xr.Dataset.from_dataframe(ind).to_array(squeeze_dim).rename({'index': index_name}).squeeze( + dim=squeeze_dim, + drop=True + ) + res.name = index_name + return res def _set_file_name(self): all_vars = sorted(self.statistics_per_var.keys()) @@ -141,11 +345,6 @@ class AbstractDataPrep(object): all_vars = sorted(self.statistics_per_var.keys()) return os.path.join(self.path, f"{''.join(self.station)}_{'_'.join(all_vars)}_meta.csv") - def __repr__(self): - """Represent class attributes.""" - return f"AbstractDataPrep(path='{self.path}', station={self.station}, variables={self.variables}, " \ - f"**{self.kwargs})" - def interpolate(self, dim: str, method: str = 'linear', limit: int = None, use_coordinate: Union[bool, str] = True, **kwargs): """ @@ -187,126 +386,6 @@ class AbstractDataPrep(object): self.data = self.data.interpolate_na(dim=dim, method=method, limit=limit, use_coordinate=use_coordinate, **kwargs) - @staticmethod - def check_inverse_transform_params(mean: data_or_none, std: data_or_none, method: str) -> None: - """ - Support inverse_transformation method. - - Validate if all required statistics are available for given method. E.g. centering requires mean only, whereas - normalisation requires mean and standard deviation. Will raise an AttributeError on missing requirements. - - :param mean: data with all mean values - :param std: data with all standard deviation values - :param method: name of transformation method - """ - msg = "" - if method in ['standardise', 'centre'] and mean is None: - msg += "mean, " - if method == 'standardise' and std is None: - msg += "std, " - if len(msg) > 0: - raise AttributeError(f"Inverse transform {method} can not be executed because following is None: {msg}") - - def inverse_transform(self) -> None: - """ - Perform inverse transformation. - - Will raise an AssertionError, if no transformation was performed before. Checks first, if all required - statistics are available for inverse transformation. Class attributes data, mean and std are overwritten by - new data afterwards. Thereby, mean, std, and the private transform method are set to None to indicate, that the - current data is not transformed. 
- """ - - def f_inverse(data, mean, std, method_inverse): - if method_inverse == 'standardise': - return statistics.standardise_inverse(data, mean, std), None, None - elif method_inverse == 'centre': - return statistics.centre_inverse(data, mean), None, None - elif method_inverse == 'normalise': - raise NotImplementedError - else: - raise NotImplementedError - - if self._transform_method is None: - raise AssertionError("Inverse transformation method is not set. Data cannot be inverse transformed.") - self.check_inverse_transform_params(self.mean, self.std, self._transform_method) - self.data, self.mean, self.std = f_inverse(self.data, self.mean, self.std, self._transform_method) - self._transform_method = None - - def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False, mean=None, - std=None) -> None: - """ - Transform data according to given transformation settings. - - This function transforms a xarray.dataarray (along dim) or pandas.DataFrame (along axis) either with mean=0 - and std=1 (`method=standardise`) or centers the data with mean=0 and no change in data scale - (`method=centre`). Furthermore, this sets an internal instance attribute for later inverse transformation. This - method will raise an AssertionError if an internal transform method was already set ('inverse=False') or if the - internal transform method, internal mean and internal standard deviation weren't set ('inverse=True'). - - :param string/int dim: This param is not used for inverse transformation. - | for xarray.DataArray as string: name of dimension which should be standardised - | for pandas.DataFrame as int: axis of dimension which should be standardised - :param method: Choose the transformation method from 'standardise' and 'centre'. 'normalise' is not implemented - yet. This param is not used for inverse transformation. - :param inverse: Switch between transformation and inverse transformation. - - :return: xarray.DataArrays or pandas.DataFrames: - #. mean: Mean of data - #. std: Standard deviation of data - #. data: Standardised data - """ - - def f(data): - if method == 'standardise': - return statistics.standardise(data, dim) - elif method == 'centre': - return statistics.centre(data, dim) - elif method == 'normalise': - # use min/max of data or given min/max - raise NotImplementedError - else: - raise NotImplementedError - - def f_apply(data): - if method == "standardise": - return mean, std, statistics.standardise_apply(data, mean, std) - elif method == "centre": - return mean, None, statistics.centre_apply(data, mean) - else: - raise NotImplementedError - - if not inverse: - if self._transform_method is not None: - raise AssertionError(f"Transform method is already set. Therefore, data was already transformed with " - f"{self._transform_method}. Please perform inverse transformation of data first.") - self.mean, self.std, self.data = locals()["f" if mean is None else "f_apply"](self.data) - self._transform_method = method - else: - self.inverse_transform() - - def get_transformation_information(self, variable: str) -> Tuple[data_or_none, data_or_none, str]: - """ - Extract transformation statistics and method. - - Get mean and standard deviation for given variable and the transformation method if set. If a transformation - depends only on particular statistics (e.g. only mean is required for centering), the remaining statistics are - returned with None as fill value. - - :param variable: Variable for which the information on transformation is requested. 
- - :return: mean, standard deviation and transformation method - """ - try: - mean = self.mean.sel({'variables': variable}).values - except AttributeError: - mean = None - try: - std = self.std.sel({'variables': variable}).values - except AttributeError: - std = None - return mean, std, self._transform_method - def make_history_window(self, dim_name_of_inputs: str, window: int, dim_name_of_shift: str) -> None: """ Create a xr.DataArray containing history data. @@ -324,28 +403,6 @@ class AbstractDataPrep(object): window = -abs(window) self.history = self.shift(dim_name_of_shift, window).sel({dim_name_of_inputs: self.variables}) - def shift(self, dim: str, window: int) -> xr.DataArray: - """ - Shift data multiple times to represent history (if window <= 0) or lead time (if window > 0). - - :param dim: dimension along shift is applied - :param window: number of steps to shift (corresponds to the window length) - - :return: shifted data - """ - start = 1 - end = 1 - if window <= 0: - start = window - else: - end = window + 1 - res = [] - for w in range(start, end): - res.append(self.data.shift({dim: -w})) - window_array = self.create_index_array('window', range(start, end)) - res = xr.concat(res, dim=window_array) - return res - def make_labels(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str, window: int) -> None: """ @@ -390,8 +447,7 @@ class AbstractDataPrep(object): intersect = reduce(np.intersect1d, (non_nan_history.coords[dim].values, non_nan_label.coords[dim].values, non_nan_observation.coords[dim].values)) - min_length = self.kwargs.get("min_length", 0) - if len(intersect) < max(min_length, 1): + if len(intersect) < max(self.min_length, 1): self.history = None self.label = None self.observation = None @@ -400,21 +456,6 @@ class AbstractDataPrep(object): self.label = self.label.sel({dim: intersect}) self.observation = self.observation.sel({dim: intersect}) - @staticmethod - def create_index_array(index_name: str, index_value: Iterable[int]) -> xr.DataArray: - """ - Create an 1D xr.DataArray with given index name and value. - - :param index_name: name of dimension - :param index_value: values of this dimension - - :return: this array - """ - ind = pd.DataFrame({'val': index_value}, index=index_value) - res = xr.Dataset.from_dataframe(ind).to_array().rename({'index': index_name}).squeeze(dim='variable', drop=True) - res.name = index_name - return res - def _slice_prep(self, data: xr.DataArray, coord: str = 'datetime') -> xr.DataArray: """ Set start and end date for slicing and execute self._slice(). @@ -424,8 +465,8 @@ class AbstractDataPrep(object): :return: sliced data """ - start = self.kwargs.get('start', data.coords[coord][0].values) - end = self.kwargs.get('end', data.coords[coord][-1].values) + start = self.start if self.start is not None else data.coords[coord][0].values + end = self.end if self.end is not None else data.coords[coord][-1].values return self._slice(data, start, end, coord) @staticmethod @@ -461,98 +502,200 @@ class AbstractDataPrep(object): data.loc[..., used_chem_vars] = data.loc[..., used_chem_vars].clip(min=minimum) return data - def get_transposed_history(self) -> xr.DataArray: - """Return history. + @staticmethod + def setup_transformation(transformation: Dict): + """ + Set up transformation by extracting all relevant information. - :return: history with dimensions datetime, window, Stations, variables. + Extract all information from transformation dictionary. Possible keys are method, mean, std, min, max. 
+ * If a transformation should be applied on base of existing values, these need to be provided in the respective + keys "mean" and "std" (again only if required for given method). + + :param transformation: the transformation dictionary as described above. + + :return: updated transformation dictionary """ - return self.history.transpose("datetime", "window", "Stations", "variables").copy() + if transformation is None: + return + elif not isinstance(transformation, dict): + raise TypeError(f"`transformation' must be either `None' or dict like e.g. `{{'method': 'standardise'}}," + f" but transformation is of type {type(transformation)}.") + transformation = transformation.copy() + method = transformation.get("method", None) + mean = transformation.get("mean", None) + std = transformation.get("std", None) + max_val = transformation.get("max", None) + min_val = transformation.get("min", None) + + transformation["method"] = method + transformation["mean"] = mean + transformation["std"] = std + transformation["max"] = max_val + transformation["min"] = min_val + return transformation + + def load_data(self): + try: + self.read_data_from_disk() + except FileNotFoundError: + self.download_data() + self.load_data() - def get_transposed_label(self) -> xr.DataArray: - """Return label. + def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False, mean=None, + std=None, min_val=None, max_val=None) -> None: + """ + Transform data according to given transformation settings. + + This function transforms a xarray.dataarray (along dim) or pandas.DataFrame (along axis) either with mean=0 + and std=1 (`method=standardise`) or centers the data with mean=0 and no change in data scale + (`method=centre`). Furthermore, this sets an internal instance attribute for later inverse transformation. This + method will raise an AssertionError if an internal transform method was already set ('inverse=False') or if the + internal transform method, internal mean and internal standard deviation weren't set ('inverse=True'). + + :param string/int dim: This param is not used for inverse transformation. + | for xarray.DataArray as string: name of dimension which should be standardised + | for pandas.DataFrame as int: axis of dimension which should be standardised + :param method: Choose the transformation method from 'standardise' and 'centre'. 'normalise' is not implemented + yet. This param is not used for inverse transformation. + :param inverse: Switch between transformation and inverse transformation. + :param mean: Used for transformation (if required by 'method') based on external data. If 'None' the mean is + calculated over the data in this class instance. + :param std: Used for transformation (if required by 'method') based on external data. If 'None' the std is + calculated over the data in this class instance. + :param min_val: Used for transformation (if required by 'method') based on external data. If 'None' min_val is + extracted from the data in this class instance. + :param max_val: Used for transformation (if required by 'method') based on external data. If 'None' max_val is + extracted from the data in this class instance. - :return: label with dimensions datetime, window, Stations, variables. + :return: xarray.DataArrays or pandas.DataFrames: + #. mean: Mean of data + #. std: Standard deviation of data + #. 
data: Standardised data """ - return self.label.squeeze("Stations").transpose("datetime", "window").copy() - def get_extremes_history(self) -> xr.DataArray: - """Return extremes history. + def f(data): + if method == 'standardise': + return statistics.standardise(data, dim) + elif method == 'centre': + return statistics.centre(data, dim) + elif method == 'normalise': + # use min/max of data or given min/max + raise NotImplementedError + else: + raise NotImplementedError + + def f_apply(data): + if method == "standardise": + return mean, std, statistics.standardise_apply(data, mean, std) + elif method == "centre": + return mean, None, statistics.centre_apply(data, mean) + else: + raise NotImplementedError + + if not inverse: + if self._transform_method is not None: + raise AssertionError(f"Transform method is already set. Therefore, data was already transformed with " + f"{self._transform_method}. Please perform inverse transformation of data first.") + # apply transformation on local data instance (f) if mean is None, else apply by using mean (and std) from + # external data. + self.mean, self.std, self.data = locals()["f" if mean is None else "f_apply"](self.data) + + # set transform method to find correct method for inverse transformation. + self._transform_method = method + else: + self.inverse_transform() - :return: extremes history with dimensions datetime, window, Stations, variables. + @staticmethod + def check_inverse_transform_params(mean: data_or_none, std: data_or_none, method: str) -> None: """ - return self.extremes_history.transpose("datetime", "window", "Stations", "variables").copy() + Support inverse_transformation method. + + Validate if all required statistics are available for given method. E.g. centering requires mean only, whereas + normalisation requires mean and standard deviation. Will raise an AttributeError on missing requirements. - def get_extremes_label(self) -> xr.DataArray: - """Return extremes label. + :param mean: data with all mean values + :param std: data with all standard deviation values + :param method: name of transformation method + """ + msg = "" + if method in ['standardise', 'centre'] and mean is None: + msg += "mean, " + if method == 'standardise' and std is None: + msg += "std, " + if len(msg) > 0: + raise AttributeError(f"Inverse transform {method} can not be executed because following is None: {msg}") - :return: extremes label with dimensions datetime, window, Stations, variables. + def inverse_transform(self) -> None: """ - return self.extremes_label.squeeze("Stations").transpose("datetime", "window").copy() + Perform inverse transformation. - def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, - timedelta: Tuple[int, str] = (1, 'm')): + Will raise an AssertionError, if no transformation was performed before. Checks first, if all required + statistics are available for inverse transformation. Class attributes data, mean and std are overwritten by + new data afterwards. Thereby, mean, std, and the private transform method are set to None to indicate, that the + current data is not transformed. """ - Multiply extremes. - This method extracts extreme values from self.labels which are defined in the argument extreme_values. One can - also decide only to extract extremes on the right tail of the distribution. 
When extreme_values is a list of - floats/ints all values larger (and smaller than negative extreme_values; extraction is performed in standardised - space) than are extracted iteratively. If for example extreme_values = [1.,2.] then a value of 1.5 would be - extracted once (for 0th entry in list), while a 2.5 would be extracted twice (once for each entry). Timedelta is - used to mark those extracted values by adding one min to each timestamp. As TOAR Data are hourly one can - identify those "artificial" data points later easily. Extreme inputs and labels are stored in - self.extremes_history and self.extreme_labels, respectively. + def f_inverse(data, mean, std, method_inverse): + if method_inverse == 'standardise': + return statistics.standardise_inverse(data, mean, std), None, None + elif method_inverse == 'centre': + return statistics.centre_inverse(data, mean), None, None + elif method_inverse == 'normalise': + raise NotImplementedError + else: + raise NotImplementedError + + if self._transform_method is None: + raise AssertionError("Inverse transformation method is not set. Data cannot be inverse transformed.") + self.check_inverse_transform_params(self.mean, self.std, self._transform_method) + self.data, self.mean, self.std = f_inverse(self.data, self.mean, self.std, self._transform_method) + self._transform_method = None + # update X and Y + self.make_samples() - :param extreme_values: user definition of extreme - :param extremes_on_right_tail_only: if False also multiply values which are smaller then -extreme_values, - if True only extract values larger than extreme_values - :param timedelta: used as arguments for np.timedelta in order to mark extreme values on datetime + def get_transformation_information(self, variable: str = None) -> Tuple[data_or_none, data_or_none, str]: """ - # check if labels or history is None - if (self.label is None) or (self.history is None): - logging.debug(f"{self.station} has `None' labels, skip multiply extremes") - return + Extract transformation statistics and method. + + Get mean and standard deviation for given variable and the transformation method if set. If a transformation + depends only on particular statistics (e.g. only mean is required for centering), the remaining statistics are + returned with None as fill value. + + :param variable: Variable for which the information on transformation is requested. 
- # check type if inputs - extreme_values = helpers.to_list(extreme_values) - for i in extreme_values: - if not isinstance(i, number.__args__): - raise TypeError(f"Elements of list extreme_values have to be {number.__args__}, but at least element " - f"{i} is type {type(i)}") - - for extr_val in sorted(extreme_values): - # check if some extreme values are already extracted - if (self.extremes_label is None) or (self.extremes_history is None): - # extract extremes based on occurance in labels - if extremes_on_right_tail_only: - extreme_label_idx = (self.label > extr_val).any(axis=0).values.reshape(-1, ) - else: - extreme_label_idx = np.concatenate(((self.label < -extr_val).any(axis=0).values.reshape(-1, 1), - (self.label > extr_val).any(axis=0).values.reshape(-1, 1)), - axis=1).any(axis=1) - extremes_label = self.label[..., extreme_label_idx] - extremes_history = self.history[..., extreme_label_idx, :] - extremes_label.datetime.values += np.timedelta64(*timedelta) - extremes_history.datetime.values += np.timedelta64(*timedelta) - self.extremes_label = extremes_label # .squeeze('Stations').transpose('datetime', 'window') - self.extremes_history = extremes_history # .transpose('datetime', 'window', 'Stations', 'variables') - else: # one extr value iteration is done already: self.extremes_label is NOT None... - if extremes_on_right_tail_only: - extreme_label_idx = (self.extremes_label > extr_val).any(axis=0).values.reshape(-1, ) - else: - extreme_label_idx = np.concatenate( - ((self.extremes_label < -extr_val).any(axis=0).values.reshape(-1, 1), - (self.extremes_label > extr_val).any(axis=0).values.reshape(-1, 1) - ), axis=1).any(axis=1) - # check on existing extracted extremes to minimise computational costs for comparison - extremes_label = self.extremes_label[..., extreme_label_idx] - extremes_history = self.extremes_history[..., extreme_label_idx, :] - extremes_label.datetime.values += np.timedelta64(*timedelta) - extremes_history.datetime.values += np.timedelta64(*timedelta) - self.extremes_label = xr.concat([self.extremes_label, extremes_label], dim='datetime') - self.extremes_history = xr.concat([self.extremes_history, extremes_history], dim='datetime') + :return: mean, standard deviation and transformation method + """ + variable = self.target_var if variable is None else variable + try: + mean = self.mean.sel({'variables': variable}).values + except AttributeError: + mean = None + try: + std = self.std.sel({'variables': variable}).values + except AttributeError: + std = None + return mean, std, self._transform_method if __name__ == "__main__": - dp = AbstractDataPrep('data/', 'dummy', 'DEBW107', ['o3', 'temp'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}) - print(dp) + # dp = AbstractDataPrep('data/', 'dummy', 'DEBW107', ['o3', 'temp'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}) + # print(dp) + statistics_per_var = {'o3': 'dma8eu', 'temp-rea-miub': 'maximum'} + sp = StationPrep(data_path='/home/felix/PycharmProjects/mlt_new/data/', station='DEBY122', + statistics_per_var=statistics_per_var, station_type='background', + network='UBA', sampling='daily', target_dim='variables', target_var='o3', + time_dim='datetime', window_history_size=7, window_lead_time=3, + interpolation_limit=0 + ) # transformation={'method': 'standardise'}) + # sp.set_transformation({'method': 'standardise', 'mean': sp.mean+2, 'std': sp.std+1}) + sp2 = StationPrep(data_path='/home/felix/PycharmProjects/mlt_new/data/', station='DEBY122', + statistics_per_var=statistics_per_var, 
station_type='background', + network='UBA', sampling='daily', target_dim='variables', target_var='o3', + time_dim='datetime', window_history_size=7, window_lead_time=3, + transformation={'method': 'standardise'}) + sp2.transform(inverse=True) + sp.get_X() + sp.get_Y() + print(len(sp)) + print(sp.shape) + print(sp) diff --git a/mlair/data_handling/bootstraps.py b/mlair/data_handling/bootstraps.py deleted file mode 100644 index 4e72b2b81476d04aec819cc6be0fdfd585e5eaf9..0000000000000000000000000000000000000000 --- a/mlair/data_handling/bootstraps.py +++ /dev/null @@ -1,383 +0,0 @@ -""" -Collections of bootstrap methods and classes. - -How to use ----------- - -test - -""" - -__author__ = 'Felix Kleinert, Lukas Leufen' -__date__ = '2020-02-07' - - -import logging -import os -import re -from typing import List, Union, Pattern, Tuple - -import dask.array as da -import keras -import numpy as np -import xarray as xr - -from mlair import helpers -from mlair.data_handling.data_generator import DataGenerator - - -class BootStrapGenerator(keras.utils.Sequence): - """ - Generator that returns bootstrapped history objects for given boot index while iteration. - - generator for bootstraps as keras sequence inheritance. Initialise with number of boots, the original history, the - shuffled data, all used variables and the current shuffled variable. While iterating over this generator, it returns - the bootstrapped history for given boot index (this is the iterator index) in the same format like the original - history ready to use. Note, that in some cases some samples can contain nan values (in these cases the entire data - row is null, not only single entries). - """ - - def __init__(self, number_of_boots: int, history: xr.DataArray, shuffled: xr.DataArray, variables: List[str], - shuffled_variable: str): - """ - Set up the generator. - - :param number_of_boots: number of bootstrap realisations - :param history: original history (the ground truth) - :param shuffled: the shuffled history - :param variables: list with all variables of interest - :param shuffled_variable: name of the variable that shall be bootstrapped - """ - self.number_of_boots = number_of_boots - self.variables = variables - self.history_orig = history - self.history = history.sel(variables=helpers.remove_items(self.variables, shuffled_variable)) - self.shuffled = shuffled.sel(variables=shuffled_variable) - - def __len__(self) -> int: - """ - Return number of bootstraps. - - :return: number of bootstraps - """ - return self.number_of_boots - - def __getitem__(self, index: int) -> xr.DataArray: - """ - Return bootstrapped history for given bootstrap index in same index structure like the original history object. - - :param index: boot index e [0, nboots-1] - :return: bootstrapped history ready to use - """ - logging.debug(f"boot: {index}") - boot_hist = self.history.copy() - boot_hist = boot_hist.combine_first(self.__get_shuffled(index)) - return boot_hist.reindex_like(self.history_orig) - - def __get_shuffled(self, index: int) -> xr.DataArray: - """ - Return shuffled data for given boot index from shuffled attribute. - - :param index: boot index e [0, nboots-1] - :return: shuffled data - """ - shuffled_var = self.shuffled.sel(boots=index).expand_dims("variables").drop("boots") - return shuffled_var.transpose("datetime", "window", "Stations", "variables") - - -class CreateShuffledData: - """ - Verify and create shuffled data for all data contained in given data generator class. 
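# Illustrative sketch of the bootstrap replacement performed by the removed
# BootStrapGenerator.__getitem__/__get_shuffled: for one boot index, a single variable of
# the history is swapped for its shuffled counterpart while all other variables stay
# untouched. Toy data, standalone xarray/numpy only.
import numpy as np
import xarray as xr

rng = np.random.default_rng(0)
dims = ("datetime", "window", "Stations", "variables")
coords = {"datetime": np.arange(5), "window": np.arange(3),
          "Stations": ["DEBW107"], "variables": ["o3", "temp"]}
history = xr.DataArray(rng.random((5, 3, 1, 2)), dims=dims, coords=coords)
shuffled = xr.DataArray(rng.random((5, 3, 1, 2, 4)), dims=dims + ("boots",),
                        coords={**coords, "boots": np.arange(4)})

def bootstrapped_history(index, shuffled_variable="o3"):
    keep = [v for v in history.coords["variables"].values if v != shuffled_variable]
    boot_hist = history.sel(variables=keep)                                   # untouched variables
    boot_var = shuffled.sel(variables=[shuffled_variable]).isel(boots=index, drop=True)
    return boot_hist.combine_first(boot_var).reindex_like(history)            # same layout as history

assert bootstrapped_history(0).shape == history.shape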
- - Starts automatically on initialisation, no further calls are required. Check and new creations are all performed - inside bootstrap_path. - """ - - def __init__(self, data: DataGenerator, number_of_bootstraps: int, bootstrap_path: str): - """ - Shuffled data is automatically created in initialisation. - - :param data: data to shuffle - :param number_of_bootstraps: - :param bootstrap_path: Path to find and store the bootstraps - """ - self.data = data - self.number_of_bootstraps = number_of_bootstraps - self.bootstrap_path = bootstrap_path - self.create_shuffled_data() - - def create_shuffled_data(self) -> None: - """ - Create shuffled data. - - Use original test data, add dimension 'boots' with length number of bootstraps and insert randomly selected - variables. If there is a suitable local file for requested window size and number of bootstraps, no additional - file will be created inside this function. - """ - logging.info("create / check shuffled bootstrap data") - variables_str = '_'.join(sorted(self.data.variables)) - window = self.data.window_history_size - for station in self.data.stations: - valid, nboot = self.valid_bootstrap_file(station, variables_str, window) - if not valid: - logging.info(f'create bootstap data for {station}') - hist = self.data.get_data_generator(station).get_transposed_history() - file_path = self._set_file_path(station, variables_str, window, nboot) - hist = hist.expand_dims({'boots': range(nboot)}, axis=-1) - shuffled_variable = [] - chunks = (100, *hist.shape[1:3], hist.shape[-1]) - for i, var in enumerate(hist.coords['variables']): - single_variable = hist.sel(variables=var).values - shuffled_variable.append(self.shuffle(single_variable, chunks=chunks)) - shuffled_variable_da = da.stack(shuffled_variable, axis=-2).rechunk("auto") - shuffled_data = xr.DataArray(shuffled_variable_da, coords=hist.coords, dims=hist.dims) - shuffled_data.to_netcdf(file_path) - - def _set_file_path(self, station: str, variables: str, window: int, nboots: int) -> str: - """ - Set file name. - - Set file name following naming convention <station>_<var1>_<var2>_..._hist<window>_nboots<nboots>_shuffled.nc - and create joined path using bootstrap_path attribute set on initialisation. - - :param station: station name - :param variables: variables already preprocessed as single string with all variables seperated by underscore - :param window: window length - :param nboots: number of boots - :return: full file path - """ - file_name = f"{station}_{variables}_hist{window}_nboots{nboots}_shuffled.nc" - return os.path.join(self.bootstrap_path, file_name) - - def valid_bootstrap_file(self, station: str, variables: str, window: int) -> [bool, Union[None, int]]: - """ - Compare local bootstrap file with given settings for station, variables, window and number of bootstraps. - - If a match was found, this method returns a tuple (True, None). In any other case, it returns (False, - max_nboot), where max_nboot is the highest boot number found in the local storage. A match is defined so that - the window length is ge than given window size form args and the number of boots is also ge than the given - number of boots from this class. Furthermore, this functions deletes local files, if the match the station - pattern but don't fit the window and bootstrap condition. This is performed, because it is assumed, that the - corresponding file will be created with a longer or at the least same window size and numbers of bootstraps. 
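# Illustrative sketch of the file-name convention and the validity check described above: a
# locally stored shuffled file can be reused if its window and boot numbers are at least as
# large as requested. Standalone, uses only `re`.
import re

requested_window, requested_nboots = 13, 20
file_name = "DEBW107_no_no2_o3_temp_hist13_nboots20_shuffled.nc"
regex = re.compile(r"DEBW107_no_no2_o3_temp_hist(\d+)_nboots(\d+)_shuffled")
match = regex.match(file_name)
window_file, nboot_file = int(match.group(1)), int(match.group(2))
assert (window_file >= requested_window) and (nboot_file >= requested_nboots)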
- - :param station: name of the station to validate - :param variables: all variables already merged in single string seperated by underscore - :param window: required window size - :return: tuple containing information if valid file was found first and second the number of boots that needs to - be used for the new boot creation (this is only relevant, if no valid file was found - otherwise the return - statement is anyway None). - """ - regex = re.compile(rf"{station}_{variables}_hist(\d+)_nboots(\d+)_shuffled") - max_nboot = self.number_of_bootstraps - for file in os.listdir(self.bootstrap_path): - match = regex.match(file) - if match: - window_file = int(match.group(1)) - nboot_file = int(match.group(2)) - max_nboot = max([max_nboot, nboot_file]) - if (window_file >= window) and (nboot_file >= self.number_of_bootstraps): - return True, None - else: - os.remove(os.path.join(self.bootstrap_path, file)) - return False, max_nboot - - @staticmethod - def shuffle(data: da.array, chunks: Tuple) -> da.core.Array: - """ - Shuffle randomly from given data (draw elements with replacement). - - :param data: data to shuffle - :param chunks: chunk size for dask - :return: shuffled data as dask core array (not computed yet) - """ - size = data.shape - return da.random.choice(data.reshape(-1, ), size=size, chunks=chunks) - - -class BootStraps: - """ - Main class to perform bootstrap operations. - - This class requires a DataGenerator object and a path, where to find and store all data related to the bootstrap - operation. In initialisation, this class will automatically call the class CreateShuffleData to set up the shuffled - data sets. How to use BootStraps: - - * call .get_generator(<station>, <variable>) to get a generator for given station and variable combination that \ - iterates over all bootstrap realisations (as keras sequence) - * call .get_labels(<station>) to get the measured observations in the same format as bootstrap predictions - * call .get_bootstrap_predictions(<station>, <variable>) to get the bootstrapped predictions - * call .get_orig_prediction(<station>) to get the non-bootstrapped predictions (referred as original predictions) - """ - - def __init__(self, data: DataGenerator, bootstrap_path: str, number_of_bootstraps: int = 10): - """ - Automatically check and create (if needed) shuffled data on initialisation. - - :param data: a data generator object to get data / history - :param bootstrap_path: path to find and store the bootstrap data - :param number_of_bootstraps: the number of bootstrap realisations - """ - self.data = data - self.number_of_bootstraps = number_of_bootstraps - self.bootstrap_path = bootstrap_path - CreateShuffledData(data, number_of_bootstraps, bootstrap_path) - - @property - def stations(self) -> List[str]: - """ - Station property inherits directly from data generator object. - - :return: list with all stations - """ - return self.data.stations - - @property - def variables(self) -> List[str]: - """ - Variables property inherits directly from data generator object. - - :return: list with all variables - """ - return self.data.variables - - @property - def window_history_size(self) -> int: - """ - Window history size property inherits directly from data generator object. - - :return: the window history size - """ - return self.data.window_history_size - - def get_generator(self, station: str, variable: str) -> BootStrapGenerator: - """ - Return the actual generator to use for the bootstrap evaluation. 
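# Illustrative sketch of the static `shuffle` step shown above: draw with replacement from
# the flattened data and keep the original shape, lazily with dask. Standalone toy data.
import dask.array as da
import numpy as np

data = np.arange(24.0).reshape(2, 3, 4)                  # e.g. (datetime, window, variables)
shuffled = da.random.choice(data.reshape(-1), size=data.shape, chunks=(1, 3, 4))
assert shuffled.compute().shape == data.shape            # values resampled with replacement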
- - The generator requires information on station and bootstrapped variable. There is only a loop on the bootstrap - realisation and not on stations or variables. - - :param station: name of the station - :param variable: name of the variable to bootstrap - :return: BootStrapGenerator class ready to use. - """ - hist, _ = self.data[station] - shuffled_data = self._load_shuffled_data(station, self.variables).reindex_like(hist) - return BootStrapGenerator(self.number_of_bootstraps, hist, shuffled_data, self.variables, variable) - - def get_labels(self, station: str) -> np.ndarray: - """ - Repeat labels for given key by the number of boots and returns as single array. - - :param station: name of station - :return: repeated labels as single array - """ - labels = self.data[station][1] - return np.tile(labels.data, (self.number_of_bootstraps, 1)) - - def get_orig_prediction(self, path: str, file_name: str, prediction_name: str = "CNN") -> np.ndarray: - """ - Repeat predictions from given file(_name) in path by the number of boots. - - :param path: path to file - :param file_name: file name - :param prediction_name: name of the prediction to select from loaded file (default CNN) - :return: repeated predictions - """ - file = os.path.join(path, file_name) - prediction = xr.open_dataarray(file).sel(type=prediction_name).squeeze() - vals = np.tile(prediction.data, (self.number_of_bootstraps, 1)) - return vals[~np.isnan(vals).any(axis=1), :] - - def _load_shuffled_data(self, station: str, variables: List[str]) -> xr.DataArray: - """ - Load shuffled data from bootstrap path. - - Data is stored as '<station>_<var1>_<var2>_..._hist<histsize>_nboots<nboots>_shuffled.nc', e.g. - 'DEBW107_cloudcover_no_no2_temp_u_v_hist13_nboots20_shuffled.nc' - - :param station: name of station - :param variables: list of variables - :return: shuffled data as xarray - """ - file_name = self._get_shuffled_data_file(station, variables) - shuffled_data = xr.open_dataarray(file_name, chunks=100) - return shuffled_data - - def _get_shuffled_data_file(self, station: str, variables: List[str]) -> str: - """ - Look for data file using regular expressions and returns found file or raise FileNotFoundError. - - :param station: name of station - :param variables: name of variables - :return: found file with complete path - """ - files = os.listdir(self.bootstrap_path) - regex = self._create_file_regex(station, variables) - file = self._filter_files(regex, files, self.window_history_size, self.number_of_bootstraps) - if file: - return os.path.join(self.bootstrap_path, file) - else: - raise FileNotFoundError(f"Could not find a file to match pattern {regex}") - - @staticmethod - def _create_file_regex(station: str, variables: List[str]) -> Pattern: - """ - Create regex for given station and variables. - - With this regex, it is possible to look for shuffled data with pattern: - `<station>(_<var>)*_hist(<hist>)_nboots(<nboots>)_shuffled.nc` - - :param station: station name to use as prefix - :param variables: variables to add after station - :return: compiled regular expression - """ - var_regex = "".join([rf"(_\w+)*_{v}(_\w+)*" for v in sorted(variables)]) - regex = re.compile(rf"{station}{var_regex}_hist(\d+)_nboots(\d+)_shuffled\.nc") - return regex - - @staticmethod - def _filter_files(regex: Pattern, files: List[str], window: int, nboot: int) -> Union[str, None]: - """ - Filter list of files by regex. 
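# Illustrative sketch of how labels and original predictions are repeated per bootstrap
# realisation (see get_labels / get_orig_prediction above): np.tile stacks the same block
# once per bootstrap.
import numpy as np

labels = np.array([[1.0, 2.0, 3.0],
                   [4.0, 5.0, 6.0]])                     # (samples, lead time)
number_of_bootstraps = 3
repeated = np.tile(labels, (number_of_bootstraps, 1))
assert repeated.shape == (6, 3)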
- - Regex has to be structured to match the following string structure - `<station>(_<var>)*_hist(<hist>)_nboots(<nboots>)_shuffled.nc`. Hist and nboots values have to be included as - group. All matches are compared to given window and nboot parameters. A valid file must have the same value (or - larger) than these parameters and contain all variables. - - :param regex: compiled regular expression pattern following the style from method description - :param files: list of file names to filter - :param window: minimum length of window to look for - :param nboot: minimal number of boots to search - :return: matching file name or None, if no valid file was found - """ - for f in files: - match = regex.match(f) - if match: - last = match.lastindex - if (int(match.group(last - 1)) >= window) and (int(match.group(last)) >= nboot): - return f - - -if __name__ == "__main__": - - from mlair.run_modules.experiment_setup import ExperimentSetup - from mlair.run_modules.run_environment import RunEnvironment - from mlair.run_modules.pre_processing import PreProcessing - - formatter = '%(asctime)s - %(levelname)s: %(message)s [%(filename)s:%(funcName)s:%(lineno)s]' - logging.basicConfig(format=formatter, level=logging.INFO) - - with RunEnvironment() as run_env: - ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013'], - station_type='background', trainable=True, window_history_size=9) - PreProcessing() - - data = run_env.data_store.get("generator", "general.test") - path = run_env.data_store.get("bootstrap_path", "general") - number_bootstraps = 10 - - boots = BootStraps(data, path, number_bootstraps) - for b in boots.boot_strap_generator(): - a, c = b - logging.info(f"len is {len(boots.get_boot_strap_meta())}") diff --git a/mlair/data_handling/data_distributor.py b/mlair/data_handling/data_distributor.py deleted file mode 100644 index bba5f2636f802e2d6843ef4a5ba5e6537c70dd61..0000000000000000000000000000000000000000 --- a/mlair/data_handling/data_distributor.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Data Distribution Module. - -How to use ----------- - -Create distributor object from a generator object and parse it to the fit generator method. Provide the number of -steps per epoch with distributor's length method. - -.. code-block:: python - - model = YourKerasModel() - data_generator = DataGenerator(*args, **kwargs) - data_distributor = Distributor(data_generator, model, **kwargs) - history = model.fit_generator(generator=data_distributor.distribute_on_batches(), - steps_per_epoch=len(data_distributor), - epochs=10,) - -Additionally, a validation data set can be parsed using the length and distribute methods. -""" - -from __future__ import generator_stop - -__author__ = "Lukas Leufen, Felix Kleinert" -__date__ = '2019-12-05' - -import math - -import keras -import numpy as np - -from mlair.data_handling.data_generator import DataGenerator - - -class Distributor(keras.utils.Sequence): - """Distribute data generator elements according to mini batch size.""" - - def __init__(self, generator: DataGenerator, model: keras.models, batch_size: int = 256, - permute_data: bool = False, upsampling: bool = False): - """ - Set up distributor. 
- - :param generator: The generator object must be iterable and return inputs and targets on each iteration - :param model: a keras model with one or more output branches - :param batch_size: batch size to use - :param permute_data: data is randomly permuted if enabled on each train step - :param upsampling: upsample data with upsample extremes data from generator object and shuffle data or use only - the standard input data. - """ - self.generator = generator - self.model = model - self.batch_size = batch_size - self.do_data_permutation = permute_data - self.upsampling = upsampling - - def _get_model_rank(self): - mod_out = self.model.output_shape - if isinstance(mod_out, tuple): - # only one output branch: (None, ahead) - mod_rank = 1 - elif isinstance(mod_out, list): - # multiple output branches, e.g.: [(None, ahead), (None, ahead)] - mod_rank = len(mod_out) - else: # pragma: no cover - raise TypeError("model output shape must either be tuple or list.") - return mod_rank - - def _get_number_of_mini_batches(self, values): - return math.ceil(values.shape[0] / self.batch_size) - - def _permute_data(self, x, y): - """ - Permute inputs x and labels y if permutation is enabled in instance. - - :param x: inputs - :param y: labels - :return: permuted or original data - """ - if self.do_data_permutation: - p = np.random.permutation(len(x)) # equiv to .shape[0] - x = x[p] - y = y[p] - return x, y - - def distribute_on_batches(self, fit_call=True): - """ - Create generator object to distribute mini batches. - - Split data from given generator object (usually for single station) according to the given batch size. Also - perform upsampling if enabled and random shuffling (either if data permutation is enabled or if upsampling is - enabled). Lastly multiply targets if provided model has multiple output branches. - - :param fit_call: switch to exit while loop after first iteration. This is used to determine the length of all - distributed mini batches. For default, fit_call is True to obtain infinite loop for training. - :return: yields next mini batch - """ - while True: - for k, v in enumerate(self.generator): - # get rank of output - mod_rank = self._get_model_rank() - # get data - x_total = np.copy(v[0]) - y_total = np.copy(v[1]) - if self.upsampling: - try: - s = self.generator.get_data_generator(k) - x_total = np.concatenate([x_total, np.copy(s.get_extremes_history())], axis=0) - y_total = np.concatenate([y_total, np.copy(s.get_extremes_label())], axis=0) - except AttributeError: # no extremes history / labels available, copy will fail - pass - # get number of mini batches - num_mini_batches = self._get_number_of_mini_batches(x_total) - # permute order for mini-batches - x_total, y_total = self._permute_data(x_total, y_total) - for prev, curr in enumerate(range(1, num_mini_batches + 1)): - x = x_total[prev * self.batch_size:curr * self.batch_size, ...] - y = [y_total[prev * self.batch_size:curr * self.batch_size, ...] for _ in range(mod_rank)] - if x is not None: # pragma: no branch - yield x, y - if (k + 1) == len(self.generator) and curr == num_mini_batches and not fit_call: - return - - def __len__(self) -> int: - """ - Total number of distributed mini batches. 
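# Illustrative sketch of the mini-batch logic of the removed Distributor: optional
# permutation, math.ceil(samples / batch_size) batches, and duplication of the targets once
# per output branch (model rank). Standalone numpy only.
import math
import numpy as np

x_total = np.random.random((10, 8, 1, 5))                # (samples, window, station, variables)
y_total = np.random.random((10, 3))                      # (samples, lead time)
batch_size, mod_rank = 4, 2                              # two output branches

p = np.random.permutation(len(x_total))                  # as in _permute_data
x_total, y_total = x_total[p], y_total[p]
num_mini_batches = math.ceil(x_total.shape[0] / batch_size)
for prev, curr in enumerate(range(1, num_mini_batches + 1)):
    x = x_total[prev * batch_size:curr * batch_size]
    y = [y_total[prev * batch_size:curr * batch_size] for _ in range(mod_rank)]
    print(x.shape, len(y))                               # last batch keeps the remaining 2 samples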
- - :return: the length of the distribute on batches object - """ - num_batch = 0 - for _ in self.distribute_on_batches(fit_call=False): - num_batch += 1 - return num_batch diff --git a/mlair/data_handling/data_generator.py b/mlair/data_handling/data_generator.py deleted file mode 100644 index 0088d00a95bf4d741bd3c71d6c0fcb011915d94f..0000000000000000000000000000000000000000 --- a/mlair/data_handling/data_generator.py +++ /dev/null @@ -1,366 +0,0 @@ -"""Data Generator class to handle large arrays for machine learning.""" - -__author__ = 'Felix Kleinert, Lukas Leufen' -__date__ = '2019-11-07' - -import logging -import os -import pickle -from typing import Union, List, Tuple, Any, Dict - -import dask.array as da -import keras -import xarray as xr - -from mlair import helpers -from mlair.data_handling.data_preparation import AbstractDataPrep -from mlair.helpers.join import EmptyQueryResult - -number = Union[float, int] -num_or_list = Union[number, List[number]] -data_or_none = Union[xr.DataArray, None] - - -class DataGenerator(keras.utils.Sequence): - """ - This class is a generator to handle large arrays for machine learning. - - .. code-block:: python - - data_generator = DataGenerator(**args, **kwargs) - - Data generator item can be called manually by position (integer) or station id (string). Methods also accept lists - with exactly one entry of integer or string. - - .. code-block:: - - # select generator elements by position index - first_element = data_generator.get_data_generator([0]) # 1st element - n_element = data_generator.get_data_generator([4]) # 5th element - - # select by name - station_xy = data_generator.get_data_generator(["station_xy"]) # will raise KeyError if not available - - If used as iterator or directly called by get item method, the data generator class returns transposed labels and - history object from underlying data preparation class DataPrep. - - .. code-block:: python - - # select history and label by position - hist, labels = data_generator[0] - # by name - hist, labels = data_generator["station_xy"] - # as iterator - for (hist, labels) in data_generator: - pass - - This class can also be used with keras' fit_generator and predict_generator. Individual stations are the iterables. - """ - - def __init__(self, data_path: str, stations: Union[str, List[str]], variables: List[str], - interpolation_dim: str, target_dim: str, target_var: str, station_type: str = None, - interpolation_method: str = "linear", limit_nan_fill: int = 1, window_history_size: int = 7, - window_lead_time: int = 4, transformation: Dict = None, extreme_values: num_or_list = None, - data_preparation=None, **kwargs): - """ - Set up data generator. - - :param data_path: path to data - :param stations: list with all stations to include - :param variables: list with all used variables - :param interpolation_dim: dimension along which interpolation is applied - :param target_dim: dimension of target variable - :param target_var: name of target variable - :param station_type: TOAR station type classification (background, traffic) - :param interpolation_method: method of interpolation - :param limit_nan_fill: maximum gab in data to fill by interpolation - :param window_history_size: length of the history window - :param window_lead_time: lenght of the label window - :param transformation: transformation method to apply on data - :param extreme_values: set up the extreme value upsampling - :param kwargs: additional kwargs that are used in either DataPrep (transformation, start / stop period, ...) 
- or extreme values - """ - self.data_path = os.path.abspath(data_path) - self.data_path_tmp = os.path.join(os.path.abspath(data_path), "tmp") - if not os.path.exists(self.data_path_tmp): - os.makedirs(self.data_path_tmp) - self.stations = helpers.to_list(stations) - self.variables = variables - self.interpolation_dim = interpolation_dim - self.target_dim = target_dim - self.target_var = target_var - self.station_type = station_type - self.interpolation_method = interpolation_method - self.limit_nan_fill = limit_nan_fill - self.window_history_size = window_history_size - self.window_lead_time = window_lead_time - self.extreme_values = extreme_values - self.DataPrep = data_preparation if data_preparation is not None else AbstractDataPrep - self.kwargs = kwargs - self.transformation = self.setup_transformation(transformation) - - def __repr__(self): - """Display all class attributes.""" - return f"DataGenerator(path='{self.data_path}', stations={self.stations}, " \ - f"variables={self.variables}, station_type={self.station_type}, " \ - f"interpolation_dim='{self.interpolation_dim}', target_dim='{self.target_dim}', " \ - f"target_var='{self.target_var}', **{self.kwargs})" - - def __len__(self): - """Return the number of stations.""" - return len(self.stations) - - def __iter__(self) -> "DataGenerator": - """ - Define the __iter__ part of the iterator protocol to iterate through this generator. - - Sets the private attribute `_iterator` to 0. - """ - self._iterator = 0 - return self - - def __next__(self) -> Tuple[xr.DataArray, xr.DataArray]: - """ - Get the data generator, and return the history and label data of this generator. - - This is the implementation of the __next__ method of the iterator protocol. - """ - if self._iterator < self.__len__(): - data = self.get_data_generator() - self._iterator += 1 - if data.history is not None and data.label is not None: # pragma: no branch - return data.get_transposed_history(), data.get_transposed_label() - else: - self.__next__() # pragma: no cover - else: - raise StopIteration - - def __getitem__(self, item: Union[str, int]) -> Tuple[xr.DataArray, xr.DataArray]: - """ - Define the get item method for this generator. - - Retrieve data from generator and return history and labels. - - :param item: station key to choose the data generator. - :return: The generator's time series of history data and its labels - """ - data = self.get_data_generator(key=item) - return data.get_transposed_history(), data.get_transposed_label() - - def setup_transformation(self, transformation: Dict): - """ - Set up transformation by extracting all relevant information. - - Extract all information from transformation dictionary. Possible keys are scope. method, mean, and std. Scope - can either be station or data. Station scope means, that data transformation is performed for each station - independently (somehow like batch normalisation), whereas data scope means a transformation applied on the - entire data set. - - * If using data scope, mean and standard deviation (each only if required by transformation method) can either - be calculated accurate or as an estimate (faster implementation). This must be set in dictionary either - as "mean": "accurate" or "mean": "estimate". In both cases, the required statistics are calculated and saved. - After this calculations, the mean key is overwritten by the actual values to use. - * If using station scope, no additional information is required. 
- * If a transformation should be applied on base of existing values, these need to be provided in the respective - keys "mean" and "std" (again only if required for given method). - - :param transformation: the transformation dictionary as described above. - - :return: updated transformation dictionary - """ - if transformation is None: - return - transformation = transformation.copy() - scope = transformation.get("scope", "station") - method = transformation.get("method", "standardise") - mean = transformation.get("mean", None) - std = transformation.get("std", None) - if scope == "data": - if isinstance(mean, str): - if mean == "accurate": - mean, std = self.calculate_accurate_transformation(method) - elif mean == "estimate": - mean, std = self.calculate_estimated_transformation(method) - else: - raise ValueError(f"given mean attribute must either be equal to strings 'accurate' or 'estimate' or" - f"be an array with already calculated means. Given was: {mean}") - elif scope == "station": - mean, std = None, None - else: - raise ValueError(f"Scope argument can either be 'station' or 'data'. Given was: {scope}") - transformation["method"] = method - transformation["mean"] = mean - transformation["std"] = std - return transformation - - def calculate_accurate_transformation(self, method: str) -> Tuple[data_or_none, data_or_none]: - """ - Calculate accurate transformation statistics. - - Use all stations of this generator and calculate mean and standard deviation on entire data set using dask. - Because there can be much data, this can take a while. - - :param method: name of transformation method - - :return: accurate calculated mean and std (depending on transformation) - """ - tmp = [] - mean = None - std = None - for station in self.stations: - try: - data = self.DataPrep(self.data_path, station, self.variables, station_type=self.station_type, - **self.kwargs) - chunks = (1, 100, data.data.shape[2]) - tmp.append(da.from_array(data.data.data, chunks=chunks)) - except EmptyQueryResult: - continue - tmp = da.concatenate(tmp, axis=1) - if method in ["standardise", "centre"]: - mean = da.nanmean(tmp, axis=1).compute() - mean = xr.DataArray(mean.flatten(), coords={"variables": sorted(self.variables)}, dims=["variables"]) - if method == "standardise": - std = da.nanstd(tmp, axis=1).compute() - std = xr.DataArray(std.flatten(), coords={"variables": sorted(self.variables)}, dims=["variables"]) - else: - raise NotImplementedError - return mean, std - - def calculate_estimated_transformation(self, method): - """ - Calculate estimated transformation statistics. - - Use all stations of this generator and calculate mean and standard deviation first for each station separately. - Afterwards, calculate the average mean and standard devation as estimated statistics. Because this method does - not consider the length of each data set, the estimated mean distinguishes from the real data mean. Furthermore, - the estimated standard deviation is assumed to be the mean (also not weighted) of all deviations. But this is - mathematically not true, but still a rough and faster estimation of the true standard deviation. Do not use this - method for further statistical calculation. However, in the scope of data preparation for machine learning, this - approach is decent ("it is just scaling"). 
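# Illustrative sketch of the difference between the "accurate" and the "estimate" statistics
# discussed above: the estimate averages per-station means without weighting by station
# length, so it deviates from the true mean of the pooled data.
import numpy as np

station_a = np.array([1.0, 2.0, 3.0, 4.0])               # four samples
station_b = np.array([10.0, 20.0])                       # two samples
accurate_mean = np.concatenate([station_a, station_b]).mean()      # 6.67, length-weighted
estimated_mean = np.mean([station_a.mean(), station_b.mean()])     # 8.75, unweighted mean of means
print(accurate_mean, estimated_mean)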
- - :param method: name of transformation method - - :return: accurate calculated mean and std (depending on transformation) - """ - data = [[]] * len(self.variables) - coords = {"variables": self.variables, "Stations": range(0)} - mean = xr.DataArray(data, coords=coords, dims=["variables", "Stations"]) - std = xr.DataArray(data, coords=coords, dims=["variables", "Stations"]) - for station in self.stations: - try: - data = self.DataPrep(self.data_path, station, self.variables, station_type=self.station_type, - **self.kwargs) - data.transform("datetime", method=method) - mean = mean.combine_first(data.mean) - std = std.combine_first(data.std) - data.transform("datetime", method=method, inverse=True) - except EmptyQueryResult: - continue - return mean.mean("Stations") if mean.shape[1] > 0 else None, std.mean("Stations") if std.shape[1] > 0 else None - - def get_data_generator(self, key: Union[str, int] = None, load_local_tmp_storage: bool = True, - save_local_tmp_storage: bool = True) -> AbstractDataPrep: - """ - Create DataPrep object and preprocess data for given key. - - Select data for given key, create a DataPrep object and - * apply transformation (optional) - * interpolate - * make history, labels, and observation - * remove nans - * upsample extremes (optional). - Processed data can be stored locally in a .pickle file. If load local tmp storage is enabled, the get data - generator tries first to load data from local pickle file and only creates a new DataPrep object if it couldn't - load this data from disk. - - :param key: station key to choose the data generator. - :param load_local_tmp_storage: say if data should be processed from scratch or loaded as already processed data - from tmp pickle file to save computational time (but of course more disk space required). - :param save_local_tmp_storage: save processed data as temporal file locally (default True) - - :return: preprocessed data as a DataPrep instance - """ - station = self.get_station_key(key) - try: - if not load_local_tmp_storage: - raise FileNotFoundError - data = self._load_pickle_data(station, self.variables) - except FileNotFoundError: - logging.debug(f"load not pickle data for {station}") - data = self.DataPrep(self.data_path, station, self.variables, station_type=self.station_type, - **self.kwargs) - if self.transformation is not None: - data.transform("datetime", **helpers.remove_items(self.transformation, "scope")) - data.interpolate(self.interpolation_dim, method=self.interpolation_method, limit=self.limit_nan_fill) - data.make_history_window(self.target_dim, self.window_history_size, self.interpolation_dim) - data.make_labels(self.target_dim, self.target_var, self.interpolation_dim, self.window_lead_time) - data.make_observation(self.target_dim, self.target_var, self.interpolation_dim) - data.remove_nan(self.interpolation_dim) - if self.extreme_values is not None: - kwargs = {"extremes_on_right_tail_only": self.kwargs.get("extremes_on_right_tail_only", False)} - data.multiply_extremes(self.extreme_values, **kwargs) - if save_local_tmp_storage: - self._save_pickle_data(data) - return data - - def _save_pickle_data(self, data: Any): - """ - Save given data locally as .pickle in self.data_path_tmp with name '<station>_<var1>_<var2>_..._<varX>.pickle'. 
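# Illustrative sketch of the local pickle cache used by get_data_generator above: processed
# stations are stored under '<station>_<var1>_..._<start>_<end>_.pickle' and reloaded instead
# of being prepared again. Standalone; writes a placeholder object to a temp directory.
import os
import pickle
import tempfile

station, variables, start, end = "DEBW107", ["temp", "o3"], "1997-01-01", "2017-12-31"
file = os.path.join(tempfile.gettempdir(),
                    f"{station}_{'_'.join(sorted(variables))}_{start}_{end}_.pickle")
with open(file, "wb") as f:
    pickle.dump({"history": [1, 2, 3]}, f)               # placeholder for the prepared data
with open(file, "rb") as f:
    assert pickle.load(f) == {"history": [1, 2, 3]}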
- - :param data: any data, that should be saved - """ - date = f"{self.kwargs.get('start')}_{self.kwargs.get('end')}" - vars = '_'.join(sorted(data.variables)) - station = ''.join(data.station) - file = os.path.join(self.data_path_tmp, f"{station}_{vars}_{date}_.pickle") - with open(file, "wb") as f: - pickle.dump(data, f) - logging.debug(f"save pickle data to {file}") - - def _load_pickle_data(self, station: Union[str, List[str]], variables: List[str]) -> Any: - """ - Load locally saved data from self.data_path_tmp and name '<station>_<var1>_<var2>_..._<varX>.pickle'. - - :param station: station to load - :param variables: list of variables to load - :return: loaded data - """ - date = f"{self.kwargs.get('start')}_{self.kwargs.get('end')}" - vars = '_'.join(sorted(variables)) - station = ''.join(station) - file = os.path.join(self.data_path_tmp, f"{station}_{vars}_{date}_.pickle") - with open(file, "rb") as f: - data = pickle.load(f) - logging.debug(f"load pickle data from {file}") - return data - - def get_station_key(self, key: Union[None, str, int, List[Union[None, str, int]]]) -> str: - """ - Return a valid station key or raise KeyError if this wasn't possible. - - :param key: station key to choose the data generator. - :return: station key (id from database) - """ - # extract value if given as list - if isinstance(key, list): - if len(key) == 1: - key = key[0] - else: - raise KeyError(f"More than one key was given: {key}") - # return station name either from key or the recent element from iterator - if key is None: - return self.stations[self._iterator] - else: - if isinstance(key, int): - if key < self.__len__(): - return self.stations[key] - else: - raise KeyError(f"{key} is not in range(0, {self.__len__()})") - elif isinstance(key, str): - if key in self.stations: - return key - else: - raise KeyError(f"{key} is not in stations") - else: - raise KeyError(f"Key has to be from Union[str, int]. Given was {key} ({type(key)})") diff --git a/mlair/data_handling/data_preparation_join.py b/mlair/data_handling/data_preparation_join.py deleted file mode 100644 index 516be5b3d4cebdbca4e9328f4886988008efbeb8..0000000000000000000000000000000000000000 --- a/mlair/data_handling/data_preparation_join.py +++ /dev/null @@ -1,124 +0,0 @@ -"""Data Preparation class to handle data processing for machine learning.""" - -__author__ = 'Felix Kleinert, Lukas Leufen' -__date__ = '2019-10-16' - -import datetime as dt -import inspect -import logging -from typing import Union, List - -import pandas as pd -import xarray as xr - -from mlair import helpers -from mlair.helpers import join -from mlair.data_handling.data_preparation import AbstractDataPrep - -# define a more general date type for type hinting -date = Union[dt.date, dt.datetime] -str_or_list = Union[str, List[str]] -number = Union[float, int] -num_or_list = Union[number, List[number]] -data_or_none = Union[xr.DataArray, None] - - -class DataPrepJoin(AbstractDataPrep): - """ - This class prepares data to be used in neural networks. - - The instance searches for local stored data, that meet the given demands. If no local data is found, the DataPrep - instance will load data from TOAR database and store this data locally to use the next time. For the moment, there - is only support for daily aggregated time series. The aggregation can be set manually and differ for each variable. - - After data loading, different data pre-processing steps can be executed to prepare the data for further - applications. 
Especially the following methods can be used for the pre-processing step: - - - interpolate: interpolate between data points by using xarray's interpolation method - - standardise: standardise data to mean=1 and std=1, centralise to mean=0, additional methods like normalise on \ - interval [0, 1] are not implemented yet. - - make window history: represent the history (time steps before) for training/ testing; X - - make labels: create target vector with given leading time steps for training/ testing; y - - remove Nans jointly from desired input and output, only keeps time steps where no NaNs are present in X AND y. \ - Use this method after the creation of the window history and labels to clean up the data cube. - - To create a DataPrep instance, it is needed to specify the stations by id (e.g. "DEBW107"), its network (e.g. UBA, - "Umweltbundesamt") and the variables to use. Further options can be set in the instance. - - * `statistics_per_var`: define a specific statistic to extract from the TOAR database for each variable. - * `start`: define a start date for the data cube creation. Default: Use the first entry in time series - * `end`: set the end date for the data cube. Default: Use last date in time series. - * `store_data_locally`: store recently downloaded data on local disk. Default: True - * set further parameters for xarray's interpolation methods to modify the interpolation scheme - - """ - - def __init__(self, path: str, station: Union[str, List[str]], variables: List[str], network: str = None, - station_type: str = None, **kwargs): - self.network = network - self.station_type = station_type - params = helpers.remove_items(inspect.getfullargspec(AbstractDataPrep.__init__).args, "self") - kwargs = {**{k: v for k, v in locals().items() if k in params and v is not None}, **kwargs} - super().__init__(**kwargs) - - def download_data(self, file_name, meta_file): - """ - Download data and meta from join. - - :param file_name: name of file to save data to (containing full path) - :param meta_file: name of the meta data file (also containing full path) - """ - data, meta = self.download_data_from_join(file_name, meta_file) - return data, meta - - def check_station_meta(self): - """ - Search for the entries in meta data and compare the value with the requested values. - - Will raise a FileNotFoundError if the values mismatch. - """ - if self.station_type is not None: - check_dict = {"station_type": self.station_type, "network_name": self.network} - for (k, v) in check_dict.items(): - if v is None: - continue - if self.meta.at[k, self.station[0]] != v: - logging.debug(f"meta data does not agree with given request for {k}: {v} (requested) != " - f"{self.meta.at[k, self.station[0]]} (local). Raise FileNotFoundError to trigger new " - f"grapping from web.") - raise FileNotFoundError - - def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]: - """ - Download data from TOAR database using the JOIN interface. - - Data is transformed to a xarray dataset. If class attribute store_data_locally is true, data is additionally - stored locally using given names for file and meta file. 
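# Illustrative sketch of the JOIN download post-processing described above (its body follows
# in the next hunk): per-station DataFrames are converted to DataArrays and stacked along a
# new 'Stations' dimension. Toy data instead of a real JOIN response.
import pandas as pd
import xarray as xr

df = pd.DataFrame({"o3": [30.0, 32.5], "temp": [12.1, 14.0]},
                  index=pd.date_range("2020-01-01", periods=2, freq="D", name="datetime"))
df_all = {"DEBW107": df}
xarr = {k: xr.DataArray(v, dims=["datetime", "variables"]) for k, v in df_all.items()}
xarr = xr.Dataset(xarr).to_array(dim="Stations")
assert xarr.dims == ("Stations", "datetime", "variables")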
- - :param file_name: name of file to save data to (containing full path) - :param meta_file: name of the meta data file (also containing full path) - - :return: downloaded data and its meta data - """ - df_all = {} - df, meta = join.download_join(station_name=self.station, stat_var=self.statistics_per_var, - station_type=self.station_type, network_name=self.network, sampling=self.sampling) - df_all[self.station[0]] = df - # convert df_all to xarray - xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()} - xarr = xr.Dataset(xarr).to_array(dim='Stations') - if self.kwargs.get('store_data_locally', True): - # save locally as nc/csv file - xarr.to_netcdf(path=file_name) - meta.to_csv(meta_file) - return xarr, meta - - def __repr__(self): - """Represent class attributes.""" - return f"Dataprep(path='{self.path}', network='{self.network}', station={self.station}, " \ - f"variables={self.variables}, station_type={self.station_type}, **{self.kwargs})" - - -if __name__ == "__main__": - dp = DataPrepJoin('data/', 'dummy', 'DEBW107', ['o3', 'temp'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}) - print(dp) diff --git a/mlair/helpers/__init__.py b/mlair/helpers/__init__.py index 546713b3f18f2cb64c1527b57d1e9e2138e927aa..9e2f612c86dc0477693567210493fbdcf3002954 100644 --- a/mlair/helpers/__init__.py +++ b/mlair/helpers/__init__.py @@ -3,4 +3,4 @@ from .testing import PyTestRegex, PyTestAllEqual from .time_tracking import TimeTracking, TimeTrackingWrapper from .logger import Logger -from .helpers import remove_items, float_round, dict_to_xarray, to_list +from .helpers import remove_items, float_round, dict_to_xarray, to_list, extract_value diff --git a/mlair/helpers/helpers.py b/mlair/helpers/helpers.py index 968ee5385f5a44cdbbce5653a864875011874150..b12d9028747aa677802c4a99e35852b514128e4c 100644 --- a/mlair/helpers/helpers.py +++ b/mlair/helpers/helpers.py @@ -92,3 +92,10 @@ def remove_items(obj: Union[List, Dict], items: Any): return remove_from_dict(obj, items) else: raise TypeError(f"{inspect.stack()[0][3]} does not support type {type(obj)}.") + + +def extract_value(encapsulated_value): + try: + return extract_value(encapsulated_value[0]) + except TypeError: + return encapsulated_value diff --git a/mlair/model_modules/linear_model.py b/mlair/model_modules/linear_model.py index e556f0358a2a5e5247f7b6cc7d416af25a8a664d..341c787e3060fd7e7cc3ff468ba40add9b9936d2 100644 --- a/mlair/model_modules/linear_model.py +++ b/mlair/model_modules/linear_model.py @@ -42,21 +42,27 @@ class OrdinaryLeastSquaredModel: return self.ordinary_least_squared_model(self.x, self.y) def _set_x_y_from_generator(self): - data_x = None - data_y = None + data_x, data_y = None, None for item in self.generator: - x = self.reshape_xarray_to_numpy(item[0]) - y = item[1].values - data_x = np.concatenate((data_x, x), axis=0) if data_x is not None else x - data_y = np.concatenate((data_y, y), axis=0) if data_y is not None else y - self.x = data_x - self.y = data_y + x, y = item.get_data(as_numpy=True) + x = self.flatten(x) + data_x = self._concatenate(x, data_x) + data_y = self._concatenate(y, data_y) + self.x, self.y = np.concatenate(data_x, axis=1), data_y[0] + + def _concatenate(self, new, old): + return list(map(lambda n1, n2: np.concatenate((n1, n2), axis=0), old, new)) if old is not None else new def predict(self, data): """Apply OLS model on data.""" - data = sm.add_constant(self.reshape_xarray_to_numpy(data), has_constant="add") + data = 
sm.add_constant(np.concatenate(self.flatten(data), axis=1), has_constant="add") return np.atleast_2d(self.model.predict(data)) + @staticmethod + def flatten(data): + shapes = list(map(lambda x: x.shape, data)) + return list(map(lambda x, shape: x.reshape(shape[0], -1), data, shapes)) + @staticmethod def reshape_xarray_to_numpy(data): """Reshape xarray data to numpy data and flatten.""" diff --git a/mlair/model_modules/model_class.py b/mlair/model_modules/model_class.py index b1779ecd7c087519e8cb8e78b2c9998214d12758..56e7b4c347a69781854a9cf8ad9a719f7d6ac8b9 100644 --- a/mlair/model_modules/model_class.py +++ b/mlair/model_modules/model_class.py @@ -139,7 +139,7 @@ class AbstractModelClass(ABC): the corresponding loss function. """ - def __init__(self) -> None: + def __init__(self, shape_inputs, shape_outputs) -> None: """Predefine internal attributes for model and loss.""" self.__model = None self.model_name = self.__class__.__name__ @@ -153,6 +153,8 @@ class AbstractModelClass(ABC): 'target_tensors': None } self.__compile_options = self.__allowed_compile_options + self.shape_inputs = shape_inputs + self.shape_outputs = self.__extract_from_tuple(shape_outputs) def __getattr__(self, name: str) -> Any: """ @@ -273,6 +275,11 @@ class AbstractModelClass(ABC): raise ValueError( f"Got different values or arguments for same argument: self.{allow_k}={new_v_attr.__class__} and '{allow_k}': {new_v_dic.__class__}") + @staticmethod + def __extract_from_tuple(tup): + """Return element of tuple if it contains only a single element.""" + return tup[0] if isinstance(tup, tuple) and len(tup) == 1 else tup + @staticmethod def __compare_keras_optimizers(first, second): if first.__class__ == second.__class__ and first.__module__ == 'keras.optimizers': @@ -340,24 +347,19 @@ class MyLittleModel(AbstractModelClass): Dense layer. """ - def __init__(self, window_history_size, window_lead_time, channels): + def __init__(self, shape_inputs: list, shape_outputs: list): """ Sets model and loss depending on the given arguments. - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param regularizer: <not used here> - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer + :param shape_inputs: list of input shapes (expect len=1 with shape=(window_hist, station, variables)) + :param shape_outputs: list of output shapes (expect len=1 with shape=(window_forecast)) """ - super().__init__() + assert len(shape_inputs) == 1 + assert len(shape_outputs) == 1 + super().__init__(shape_inputs[0], shape_outputs[0]) # settings - self.window_history_size = window_history_size - self.window_lead_time = window_lead_time - self.channels = channels self.dropout_rate = 0.1 self.regularizer = keras.regularizers.l2(0.1) self.activation = keras.layers.PReLU @@ -370,17 +372,10 @@ class MyLittleModel(AbstractModelClass): def set_model(self): """ Build the model. 
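# Illustrative sketch of the reworked OLS input handling above: every input branch is
# flattened to (samples, features) and the branches are concatenated into one design matrix;
# the new `extract_value` helper unwraps nested single-element containers. Standalone numpy.
import numpy as np

def extract_value(encapsulated_value):
    try:
        return extract_value(encapsulated_value[0])
    except TypeError:
        return encapsulated_value

assert extract_value([[3]]) == 3

x_branches = [np.random.random((6, 8, 1, 5)), np.random.random((6, 4, 1, 3))]   # two input branches
flat = [x.reshape(x.shape[0], -1) for x in x_branches]                          # as in `flatten`
design_matrix = np.concatenate(flat, axis=1)
assert design_matrix.shape == (6, 8 * 1 * 5 + 4 * 1 * 3)                        # (6, 52)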
- - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer - :return: built keras model """ # add 1 to window_size to include current time step t0 - x_input = keras.layers.Input(shape=(self.window_history_size + 1, 1, self.channels)) + x_input = keras.layers.Input(shape=self.shape_inputs) x_in = keras.layers.Conv2D(32, (1, 1), padding='same', name='{}_Conv_1x1'.format("major"))(x_input) x_in = self.activation(name='{}_conv_act'.format("major"))(x_in) x_in = keras.layers.Flatten(name='{}'.format("major"))(x_in) @@ -391,16 +386,16 @@ class MyLittleModel(AbstractModelClass): x_in = self.activation()(x_in) x_in = keras.layers.Dense(16, name='{}_Dense_16'.format("major"))(x_in) x_in = self.activation()(x_in) - x_in = keras.layers.Dense(self.window_lead_time, name='{}_Dense'.format("major"))(x_in) + x_in = keras.layers.Dense(self.shape_outputs, name='{}_Dense'.format("major"))(x_in) out_main = self.activation()(x_in) self.model = keras.Model(inputs=x_input, outputs=[out_main]) def set_compile_options(self): self.initial_lr = 1e-2 - self.optimizer = keras.optimizers.SGD(lr=self.initial_lr, momentum=0.9) + self.optimizer = keras.optimizers.adam(lr=self.initial_lr) self.lr_decay = mlair.model_modules.keras_extensions.LearningRateDecay(base_lr=self.initial_lr, drop=.94, epochs_drop=10) - self.compile_options = {"loss": keras.losses.mean_squared_error, "metrics": ["mse", "mae"]} + self.compile_options = {"loss": [keras.losses.mean_squared_error], "metrics": ["mse", "mae"]} class MyBranchedModel(AbstractModelClass): @@ -412,24 +407,19 @@ class MyBranchedModel(AbstractModelClass): Dense layer. """ - def __init__(self, window_history_size, window_lead_time, channels): + def __init__(self, shape_inputs: list, shape_outputs: list): """ Sets model and loss depending on the given arguments. - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param regularizer: <not used here> - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer + :param shape_inputs: list of input shapes (expect len=1 with shape=(window_hist, station, variables)) + :param shape_outputs: list of output shapes (expect len=1 with shape=(window_forecast)) """ - super().__init__() + assert len(shape_inputs) == 1 + assert len(shape_outputs) == 1 + super().__init__(shape_inputs[0], shape_outputs[0]) # settings - self.window_history_size = window_history_size - self.window_lead_time = window_lead_time - self.channels = channels self.dropout_rate = 0.1 self.regularizer = keras.regularizers.l2(0.1) self.activation = keras.layers.PReLU @@ -442,32 +432,25 @@ class MyBranchedModel(AbstractModelClass): def set_model(self): """ Build the model. 
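# Illustrative sketch of the new shape-driven model construction above: instead of
# window_history_size/channels, the model receives lists of input and output shapes, builds
# its input layer directly from shape_inputs and its output width from shape_outputs.
# Minimal standalone keras model, not the MLAir class itself.
import keras

shape_inputs, shape_outputs = [(15, 1, 5)], [(3,)]       # (window_hist + 1, station, variables), (lead time,)
assert len(shape_inputs) == 1 and len(shape_outputs) == 1
n_out = shape_outputs[0][0]                              # what __extract_from_tuple yields for (3,)

x_input = keras.layers.Input(shape=shape_inputs[0])
x = keras.layers.Flatten()(x_input)
x = keras.layers.Dense(16, activation="relu")(x)
out = keras.layers.Dense(n_out)(x)
model = keras.Model(inputs=x_input, outputs=[out])
model.compile(optimizer=keras.optimizers.Adam(lr=1e-2),
              loss=[keras.losses.mean_squared_error], metrics=["mse", "mae"])
print(model.output_shape)                                # (None, 3)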
- - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer - :return: built keras model """ # add 1 to window_size to include current time step t0 - x_input = keras.layers.Input(shape=(self.window_history_size + 1, 1, self.channels)) + x_input = keras.layers.Input(shape=self.shape_inputs) x_in = keras.layers.Conv2D(32, (1, 1), padding='same', name='{}_Conv_1x1'.format("major"))(x_input) x_in = self.activation(name='{}_conv_act'.format("major"))(x_in) x_in = keras.layers.Flatten(name='{}'.format("major"))(x_in) x_in = keras.layers.Dropout(self.dropout_rate, name='{}_Dropout_1'.format("major"))(x_in) x_in = keras.layers.Dense(64, name='{}_Dense_64'.format("major"))(x_in) x_in = self.activation()(x_in) - out_minor_1 = keras.layers.Dense(self.window_lead_time, name='{}_Dense'.format("minor_1"))(x_in) + out_minor_1 = keras.layers.Dense(self.shape_outputs, name='{}_Dense'.format("minor_1"))(x_in) out_minor_1 = self.activation(name="minor_1")(out_minor_1) x_in = keras.layers.Dense(32, name='{}_Dense_32'.format("major"))(x_in) x_in = self.activation()(x_in) - out_minor_2 = keras.layers.Dense(self.window_lead_time, name='{}_Dense'.format("minor_2"))(x_in) + out_minor_2 = keras.layers.Dense(self.shape_outputs, name='{}_Dense'.format("minor_2"))(x_in) out_minor_2 = self.activation(name="minor_2")(out_minor_2) x_in = keras.layers.Dense(16, name='{}_Dense_16'.format("major"))(x_in) x_in = self.activation()(x_in) - x_in = keras.layers.Dense(self.window_lead_time, name='{}_Dense'.format("major"))(x_in) + x_in = keras.layers.Dense(self.shape_outputs, name='{}_Dense'.format("major"))(x_in) out_main = self.activation(name="main")(x_in) self.model = keras.Model(inputs=x_input, outputs=[out_minor_1, out_minor_2, out_main]) @@ -482,24 +465,19 @@ class MyBranchedModel(AbstractModelClass): class MyTowerModel(AbstractModelClass): - def __init__(self, window_history_size, window_lead_time, channels): + def __init__(self, shape_inputs: list, shape_outputs: list): """ Sets model and loss depending on the given arguments. - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param regularizer: <not used here> - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer + :param shape_inputs: list of input shapes (expect len=1 with shape=(window_hist, station, variables)) + :param shape_outputs: list of output shapes (expect len=1 with shape=(window_forecast)) """ - super().__init__() + assert len(shape_inputs) == 1 + assert len(shape_outputs) == 1 + super().__init__(shape_inputs[0], shape_outputs[0]) # settings - self.window_history_size = window_history_size - self.window_lead_time = window_lead_time - self.channels = channels self.dropout_rate = 1e-2 self.regularizer = keras.regularizers.l2(0.1) self.initial_lr = 1e-2 @@ -515,13 +493,6 @@ class MyTowerModel(AbstractModelClass): def set_model(self): """ Build the model. 
- - :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer - :return: built keras model """ activation = self.activation conv_settings_dict1 = { @@ -555,9 +526,7 @@ class MyTowerModel(AbstractModelClass): ########################################## inception_model = InceptionModelBase() - X_input = keras.layers.Input( - shape=( - self.window_history_size + 1, 1, self.channels)) # add 1 to window_size to include current time step t0 + X_input = keras.layers.Input(shape=self.shape_inputs) X_in = inception_model.inception_block(X_input, conv_settings_dict1, pool_settings_dict1, regularizer=self.regularizer, @@ -579,7 +548,7 @@ class MyTowerModel(AbstractModelClass): # out_main = flatten_tail(X_in, 'Main', activation=activation, bound_weight=True, dropout_rate=self.dropout_rate, # reduction_filter=64, inner_neurons=64, output_neurons=self.window_lead_time) - out_main = flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=self.window_lead_time, + out_main = flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=self.shape_outputs, output_activation='linear', reduction_filter=64, name='Main', bound_weight=True, dropout_rate=self.dropout_rate, kernel_regularizer=self.regularizer @@ -594,24 +563,19 @@ class MyTowerModel(AbstractModelClass): class MyPaperModel(AbstractModelClass): - def __init__(self, window_history_size, window_lead_time, channels): + def __init__(self, shape_inputs: list, shape_outputs: list): """ Sets model and loss depending on the given arguments. 
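# A minimal sketch of a custom model written against the new AbstractModelClass interface from
# this diff, which now takes shape_inputs/shape_outputs instead of window_history_size,
# window_lead_time and channels. The class name, layer sizes and the __init__ wiring
# (calling set_model and set_compile_options) are illustrative assumptions that follow the
# pattern of MyLittleModel above; this is not part of the patch itself.
import keras

from mlair.model_modules.model_class import AbstractModelClass


class MyTinyModel(AbstractModelClass):

    def __init__(self, shape_inputs: list, shape_outputs: list):
        assert len(shape_inputs) == 1 and len(shape_outputs) == 1
        super().__init__(shape_inputs[0], shape_outputs[0])
        self.set_model()
        self.set_compile_options()

    def set_model(self):
        # shape_inputs is the per-sample input shape, e.g. (window_history_size + 1, 1, channels)
        x_input = keras.layers.Input(shape=self.shape_inputs)
        x_in = keras.layers.Flatten()(x_input)
        x_in = keras.layers.Dense(16, activation="relu")(x_in)
        # shape_outputs has been reduced to a plain integer (the forecast window) by the base class
        out_main = keras.layers.Dense(self.shape_outputs)(x_in)
        self.model = keras.Model(inputs=x_input, outputs=[out_main])

    def set_compile_options(self):
        self.optimizer = keras.optimizers.SGD(lr=1e-2, momentum=0.9)
        self.compile_options = {"loss": [keras.losses.mean_squared_error], "metrics": ["mse"]}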
- :param activation: activation function - :param window_history_size: number of historical time steps included in the input data - :param channels: number of variables used in input data - :param regularizer: <not used here> - :param dropout_rate: dropout rate used in the model [0, 1) - :param window_lead_time: number of time steps to forecast in the output layer + :param shape_inputs: list of input shapes (expect len=1 with shape=(window_hist, station, variables)) + :param shape_outputs: list of output shapes (expect len=1 with shape=(window_forecast)) """ - super().__init__() + assert len(shape_inputs) == 1 + assert len(shape_outputs) == 1 + super().__init__(shape_inputs[0], shape_outputs[0]) # settings - self.window_history_size = window_history_size - self.window_lead_time = window_lead_time - self.channels = channels self.dropout_rate = .3 self.regularizer = keras.regularizers.l2(0.001) self.initial_lr = 1e-3 @@ -676,9 +640,7 @@ class MyPaperModel(AbstractModelClass): ########################################## inception_model = InceptionModelBase() - X_input = keras.layers.Input( - shape=( - self.window_history_size + 1, 1, self.channels)) # add 1 to window_size to include current time step t0 + X_input = keras.layers.Input(shape=self.shape_inputs) pad_size = PadUtils.get_padding_for_same(first_kernel) # X_in = adv_pad.SymmetricPadding2D(padding=pad_size)(X_input) @@ -696,7 +658,7 @@ class MyPaperModel(AbstractModelClass): padding=self.padding) # out_minor1 = flatten_tail(X_in, 'minor_1', False, self.dropout_rate, self.window_lead_time, # self.activation, 32, 64) - out_minor1 = flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=self.window_lead_time, + out_minor1 = flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=self.shape_outputs, output_activation='linear', reduction_filter=32, name='minor_1', bound_weight=False, dropout_rate=self.dropout_rate, kernel_regularizer=self.regularizer @@ -714,7 +676,7 @@ class MyPaperModel(AbstractModelClass): # batch_normalisation=True) ############################################# - out_main = flatten_tail(X_in, inner_neurons=64 * 2, activation=activation, output_neurons=self.window_lead_time, + out_main = flatten_tail(X_in, inner_neurons=64 * 2, activation=activation, output_neurons=self.shape_outputs, output_activation='linear', reduction_filter=64 * 2, name='Main', bound_weight=False, dropout_rate=self.dropout_rate, kernel_regularizer=self.regularizer diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index ff5c2bc3ee2ef0923ac50f91ce5acd6807e1eb2e..5cc449aac88ebab58689656820769fe7751f6098 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -19,7 +19,7 @@ import xarray as xr from matplotlib.backends.backend_pdf import PdfPages from mlair import helpers -from mlair.data_handling import DataGenerator +from mlair.data_handler.iterator import DataCollection from mlair.helpers import TimeTrackingWrapper logging.getLogger('matplotlib').setLevel(logging.WARNING) @@ -236,12 +236,10 @@ class PlotStationMap(AbstractPlotClass): import cartopy.crs as ccrs if generators is not None: - for color, gen in generators.items(): - for k, v in enumerate(gen): - station_coords = gen.get_data_generator(k).meta.loc[['station_lon', 'station_lat']] - # station_names = gen.get_data_generator(k).meta.loc[['station_id']] - IDx, IDy = float(station_coords.loc['station_lon'].values), float( - 
station_coords.loc['station_lat'].values) + for color, data_collection in generators.items(): + for station in data_collection: + coords = station.get_coordinates() + IDx, IDy = coords["lon"], coords["lat"] self._ax.plot(IDx, IDy, mfc=color, mec='k', marker='s', markersize=6, transform=ccrs.PlateCarree()) def _plot(self, generators: Dict): @@ -713,6 +711,8 @@ class PlotBootstrapSkillScore(AbstractPlotClass): """ data = helpers.dict_to_xarray(data, "station").sortby(self._x_name) self._labels = [str(i) + "d" for i in data.coords["ahead"].values] + if "station" not in data.dims: + data = data.expand_dims("station") return data.to_dataframe("data").reset_index(level=[0, 1, 2]) def _label_add(self, score_only: bool): @@ -785,8 +785,8 @@ class PlotTimeSeries: def _plot(self, plot_folder): pdf_pages = self._create_pdf_pages(plot_folder) - start, end = self._get_time_range(self._load_data(self._stations[0])) for pos, station in enumerate(self._stations): + start, end = self._get_time_range(self._load_data(self._stations[0])) data = self._load_data(station) fig, axes, factor = self._create_subplots(start, end) nan_list = [] @@ -896,11 +896,12 @@ class PlotAvailability(AbstractPlotClass): """ - def __init__(self, generators: Dict[str, DataGenerator], plot_folder: str = ".", sampling="daily", - summary_name="data availability"): + def __init__(self, generators: Dict[str, DataCollection], plot_folder: str = ".", sampling="daily", + summary_name="data availability", time_dimension="datetime"): """Initialise.""" # create standard Gantt plot for all stations (currently in single pdf file with single page) super().__init__(plot_folder, "data_availability") + self.dim = time_dimension self.sampling = self._get_sampling(sampling) plot_dict = self._prepare_data(generators) lgd = self._plot(plot_dict) @@ -923,34 +924,30 @@ class PlotAvailability(AbstractPlotClass): elif sampling == "hourly": return "h" - def _prepare_data(self, generators: Dict[str, DataGenerator]): + def _prepare_data(self, generators: Dict[str, DataCollection]): plt_dict = {} - for subset, generator in generators.items(): - stations = generator.stations - for station in stations: - station_data = generator.get_data_generator(station) - labels = station_data.get_transposed_label().resample(datetime=self.sampling, skipna=True).mean() + for subset, data_collection in generators.items(): + for station in data_collection: + labels = station.get_Y(as_numpy=False).resample({self.dim: self.sampling}, skipna=True).mean() labels_bool = labels.sel(window=1).notnull() - group = (labels_bool != labels_bool.shift(datetime=1)).cumsum() + group = (labels_bool != labels_bool.shift({self.dim: 1})).cumsum() plot_data = pd.DataFrame({"avail": labels_bool.values, "group": group.values}, - index=labels.datetime.values) + index=labels.coords[self.dim].values) t = plot_data.groupby("group").apply(lambda x: (x["avail"].head(1)[0], x.index[0], x.shape[0])) t2 = [i[1:] for i in t if i[0]] - if plt_dict.get(station) is None: - plt_dict[station] = {subset: t2} + if plt_dict.get(str(station)) is None: + plt_dict[str(station)] = {subset: t2} else: - plt_dict[station].update({subset: t2}) + plt_dict[str(station)].update({subset: t2}) return plt_dict - def _summarise_data(self, generators: Dict[str, DataGenerator], summary_name: str): + def _summarise_data(self, generators: Dict[str, DataCollection], summary_name: str): plt_dict = {} - for subset, generator in generators.items(): + for subset, data_collection in generators.items(): all_data = None - stations = 
generator.stations - for station in stations: - station_data = generator.get_data_generator(station) - labels = station_data.get_transposed_label().resample(datetime=self.sampling, skipna=True).mean() + for station in data_collection: + labels = station.get_Y(as_numpy=False).resample({self.dim: self.sampling}, skipna=True).mean() labels_bool = labels.sel(window=1).notnull() if all_data is None: all_data = labels_bool @@ -959,8 +956,9 @@ class PlotAvailability(AbstractPlotClass): all_data = np.logical_or(tmp, labels_bool).combine_first( all_data) # apply logical on merge and fill missing with all_data - group = (all_data != all_data.shift(datetime=1)).cumsum() - plot_data = pd.DataFrame({"avail": all_data.values, "group": group.values}, index=all_data.datetime.values) + group = (all_data != all_data.shift({self.dim: 1})).cumsum() + plot_data = pd.DataFrame({"avail": all_data.values, "group": group.values}, + index=all_data.coords[self.dim].values) t = plot_data.groupby("group").apply(lambda x: (x["avail"].head(1)[0], x.index[0], x.shape[0])) t2 = [i[1:] for i in t if i[0]] if plt_dict.get(summary_name) is None: diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index d93b8c02641acf3127cd63d0814709cc1f56cee2..407465ad4cd99b85c3c5b37eb2aef6e9e71c6424 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -13,12 +13,12 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_HPC_LOGIN_LIST, DEFAULT_HPC_HOST_LIST, DEFAULT_CREATE_NEW_MODEL, DEFAULT_TRAINABLE, \ DEFAULT_FRACTION_OF_TRAINING, DEFAULT_EXTREME_VALUES, DEFAULT_EXTREMES_ON_RIGHT_TAIL_ONLY, DEFAULT_PERMUTE_DATA, \ DEFAULT_BATCH_SIZE, DEFAULT_EPOCHS, DEFAULT_TARGET_VAR, DEFAULT_TARGET_DIM, DEFAULT_WINDOW_LEAD_TIME, \ - DEFAULT_DIMENSIONS, DEFAULT_INTERPOLATION_DIM, DEFAULT_INTERPOLATION_METHOD, DEFAULT_LIMIT_NAN_FILL, \ + DEFAULT_DIMENSIONS, DEFAULT_TIME_DIM, DEFAULT_INTERPOLATION_METHOD, DEFAULT_INTERPOLATION_LIMIT, \ DEFAULT_TRAIN_START, DEFAULT_TRAIN_END, DEFAULT_TRAIN_MIN_LENGTH, DEFAULT_VAL_START, DEFAULT_VAL_END, \ DEFAULT_VAL_MIN_LENGTH, DEFAULT_TEST_START, DEFAULT_TEST_END, DEFAULT_TEST_MIN_LENGTH, DEFAULT_TRAIN_VAL_MIN_LENGTH, \ DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \ DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST -from mlair.data_handling import DataPrepJoin +from mlair.data_handler.advanced_data_handler import DefaultDataPreparation from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.model_class import MyLittleModel as VanillaModel @@ -50,8 +50,6 @@ class ExperimentSetup(RunEnvironment): * `plot_path` [.] * `forecast_path` [.] * `stations` [.] - * `network` [.] - * `station_type` [.] * `statistics_per_var` [.] * `variables` [.] * `start` [.] @@ -66,7 +64,7 @@ class ExperimentSetup(RunEnvironment): # interpolation self._set_param("dimensions", dimensions, default={'new_index': ['datetime', 'Stations']}) - self._set_param("interpolation_dim", interpolation_dim, default='datetime') + self._set_param("time_dim", time_dim, default='datetime') self._set_param("interpolation_method", interpolation_method, default='linear') self._set_param("limit_nan_fill", limit_nan_fill, default=1) @@ -116,10 +114,6 @@ class ExperimentSetup(RunEnvironment): investigations are stored outside this structure. :param stations: list of stations or single station to use in experiment. 
If not provided, stations are set to :py:const:`default stations <DEFAULT_STATIONS>`. - :param network: name of network to restrict to use only stations from this measurement network. Default is - `AIRBASE` . - :param station_type: restrict network type to one of TOAR's categories (background, traffic, industrial). Default is - `None` to use all categories. :param variables: list of all variables to use. Valid names can be found in `Section 2.1 Parameters <https://join.fz-juelich.de/services/rest/surfacedata/>`_. If not provided, this parameter is filled with keys from ``statistics_per_var``. @@ -140,7 +134,7 @@ class ExperimentSetup(RunEnvironment): :param window_lead_time: number of time steps to predict by model (default 3). Time steps `t_0+1` to `t_0+w` are predicted. :param dimensions: - :param interpolation_dim: + :param time_dim: :param interpolation_method: :param limit_nan_fill: :param train_start: @@ -209,8 +203,6 @@ class ExperimentSetup(RunEnvironment): def __init__(self, experiment_date=None, stations: Union[str, List[str]] = None, - network: str = None, - station_type: str = None, variables: Union[str, List[str]] = None, statistics_per_var: Dict = None, start: str = None, @@ -220,16 +212,16 @@ class ExperimentSetup(RunEnvironment): target_dim=None, window_lead_time: int = None, dimensions=None, - interpolation_dim=None, + time_dim=None, interpolation_method=None, - limit_nan_fill=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, + interpolation_limit=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None, use_all_stations_on_all_data_sets=None, trainable: bool = None, fraction_of_train: float = None, experiment_path=None, plot_path: str = None, forecast_path: str = None, overwrite_local_data = None, sampling: str = "daily", create_new_model = None, bootstrap_path=None, permute_data_on_training = None, transformation=None, train_min_length=None, val_min_length=None, test_min_length=None, extreme_values: list = None, extremes_on_right_tail_only: bool = None, evaluate_bootstraps=None, plot_list=None, number_of_bootstraps=None, - create_new_bootstraps=None, data_path: str = None, login_nodes=None, hpc_hosts=None, model=None, - batch_size=None, epochs=None, data_preparation=None): + create_new_bootstraps=None, data_path: str = None, batch_path: str = None, login_nodes=None, + hpc_hosts=None, model=None, batch_size=None, epochs=None, data_preparation=None, **kwargs): # create run framework super().__init__() @@ -265,6 +257,9 @@ class ExperimentSetup(RunEnvironment): logging.info(f"Experiment path is: {experiment_path}") path_config.check_path_and_create(self.data_store.get("experiment_path")) + # batch path (temporary) + self._set_param("batch_path", batch_path, default=os.path.join(experiment_path, "batch_data")) + # set model path self._set_param("model_path", None, os.path.join(experiment_path, "model")) path_config.check_path_and_create(self.data_store.get("model_path")) @@ -285,8 +280,6 @@ class ExperimentSetup(RunEnvironment): # setup for data self._set_param("stations", stations, default=DEFAULT_STATIONS) - self._set_param("network", network, default=DEFAULT_NETWORK) - self._set_param("station_type", station_type, default=DEFAULT_STATION_TYPE) self._set_param("statistics_per_var", statistics_per_var, default=DEFAULT_VAR_ALL_DICT) self._set_param("variables", variables, default=list(self.data_store.get("statistics_per_var").keys())) self._set_param("start", start, default=DEFAULT_START) @@ 
-297,7 +290,7 @@ class ExperimentSetup(RunEnvironment): self._set_param("sampling", sampling) self._set_param("transformation", transformation, default=DEFAULT_TRANSFORMATION) self._set_param("transformation", None, scope="preprocessing") - self._set_param("data_preparation", data_preparation, default=DataPrepJoin) + self._set_param("data_preparation", data_preparation, default=DefaultDataPreparation) # target self._set_param("target_var", target_var, default=DEFAULT_TARGET_VAR) @@ -306,9 +299,9 @@ class ExperimentSetup(RunEnvironment): # interpolation self._set_param("dimensions", dimensions, default=DEFAULT_DIMENSIONS) - self._set_param("interpolation_dim", interpolation_dim, default=DEFAULT_INTERPOLATION_DIM) + self._set_param("time_dim", time_dim, default=DEFAULT_TIME_DIM) self._set_param("interpolation_method", interpolation_method, default=DEFAULT_INTERPOLATION_METHOD) - self._set_param("limit_nan_fill", limit_nan_fill, default=DEFAULT_LIMIT_NAN_FILL) + self._set_param("interpolation_limit", interpolation_limit, default=DEFAULT_INTERPOLATION_LIMIT) # train set parameters self._set_param("start", train_start, default=DEFAULT_TRAIN_START, scope="train") @@ -344,6 +337,7 @@ class ExperimentSetup(RunEnvironment): self._set_param("number_of_bootstraps", number_of_bootstraps, default=DEFAULT_NUMBER_OF_BOOTSTRAPS, scope="general.postprocessing") self._set_param("plot_list", plot_list, default=DEFAULT_PLOT_LIST, scope="general.postprocessing") + self._set_param("neighbors", ["DEBW030"]) # TODO: just for testing # check variables, statistics and target variable self._check_target_var() @@ -352,6 +346,15 @@ class ExperimentSetup(RunEnvironment): # set model architecture class self._set_param("model_class", model, VanillaModel) + # set remaining kwargs + if len(kwargs) > 0: + for k, v in kwargs.items(): + if len(self.data_store.search_name(k)) == 0: + self._set_param(k, v) + else: + raise KeyError(f"Given argument {k} with value {v} cannot be set for this experiment due to a " + f"conflict with an existing entry with same naming: {k}={self.data_store.get(k)}") + def _set_param(self, param: str, value: Any, default: Any = None, scope: str = "general") -> None: """Set given parameter and log in debug.""" if value is None and default is not None: @@ -391,6 +394,7 @@ class ExperimentSetup(RunEnvironment): if not set(target_var).issubset(stat.keys()): raise ValueError(f"Could not find target variable {target_var} in statistics_per_var.") + if __name__ == "__main__": formatter = '%(asctime)s - %(levelname)s: %(message)s [%(filename)s:%(funcName)s:%(lineno)s]' logging.basicConfig(format=formatter, level=logging.DEBUG) diff --git a/mlair/run_modules/model_setup.py b/mlair/run_modules/model_setup.py index 9b282c50c7ebccb740fe98b5159eb086aa8828c9..3dc56f01c4f37ce9fc53086d837386af81e5f53d 100644 --- a/mlair/run_modules/model_setup.py +++ b/mlair/run_modules/model_setup.py @@ -34,8 +34,6 @@ class ModelSetup(RunEnvironment): * `trainable` [.] * `create_new_model` [.] * `generator` [train] - * `window_lead_time` [.] - * `window_history_size` [.] * `model_class` [.] 
Optional objects @@ -73,7 +71,7 @@ class ModelSetup(RunEnvironment): def _run(self): # set channels depending on inputs - self._set_channels() + self._set_shapes() # build model graph using settings from my_model_settings() self.build_model() @@ -94,10 +92,12 @@ class ModelSetup(RunEnvironment): # report settings self.report_model() - def _set_channels(self): - """Set channels as number of variables of train generator.""" - channels = self.data_store.get("generator", "train")[0][0].shape[-1] - self.data_store.set("channels", channels, self.scope) + def _set_shapes(self): + """Set input and output shapes from train collection.""" + shape = list(map(lambda x: x.shape[1:], self.data_store.get("data_collection", "train")[0].get_X())) + self.data_store.set("shape_inputs", shape, self.scope) + shape = list(map(lambda y: y.shape[1:], self.data_store.get("data_collection", "train")[0].get_Y())) + self.data_store.set("shape_outputs", shape, self.scope) def compile_model(self): """ @@ -134,8 +134,8 @@ class ModelSetup(RunEnvironment): logging.info('no weights to reload...') def build_model(self): - """Build model using window_history_size, window_lead_time and channels from data store.""" - args_list = ["window_history_size", "window_lead_time", "channels"] + """Build model using input and output shapes from data store.""" + args_list = ["shape_inputs", "shape_outputs"] args = self.data_store.create_args_dict(args_list, self.scope) model = self.data_store.get("model_class") self.model = model(**args) @@ -165,7 +165,7 @@ class ModelSetup(RunEnvironment): v = ",".join(self._clean_name(str(u)) for u in v) if "<" in str(v): v = self._clean_name(str(v)) - df.loc[k] = v + df.loc[k] = str(v) df.sort_index(inplace=True) column_format = "ll" path = os.path.join(self.data_store.get("experiment_path"), "latex_report") diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index d390ecf05b2e3144b15edba0e30da7eb2b7e430c..d4f409ec503ba0ae37bdd1d1bec4b0207eec453c 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -13,9 +13,9 @@ import numpy as np import pandas as pd import xarray as xr -from mlair.data_handling import BootStraps, Distributor, DataGenerator, DataPrepJoin +from mlair.data_handler import BootStraps, KerasIterator from mlair.helpers.datastore import NameNotFoundInDataStore -from mlair.helpers import TimeTracking, statistics +from mlair.helpers import TimeTracking, statistics, extract_value from mlair.model_modules.linear_model import OrdinaryLeastSquaredModel from mlair.model_modules.model_class import AbstractModelClass from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotStationMap, PlotClimatologicalSkillScore, \ @@ -42,7 +42,7 @@ class PostProcessing(RunEnvironment): * `model_path` [.] * `target_var` [.] * `sampling` [.] - * `window_lead_time` [.] 
+ * `output_shape` [model] * `evaluate_bootstraps` [postprocessing] and if enabled: * `create_new_bootstraps` [postprocessing] @@ -65,14 +65,16 @@ class PostProcessing(RunEnvironment): self.model: keras.Model = self._load_model() self.ols_model = None self.batch_size: int = self.data_store.get_default("batch_size", "model", 64) - self.test_data: DataGenerator = self.data_store.get("generator", "test") - self.test_data_distributed = Distributor(self.test_data, self.model, self.batch_size) - self.train_data: DataGenerator = self.data_store.get("generator", "train") - self.val_data: DataGenerator = self.data_store.get("generator", "val") - self.train_val_data: DataGenerator = self.data_store.get("generator", "train_val") + self.test_data = self.data_store.get("data_collection", "test") + batch_path = self.data_store.get("batch_path", scope="test") + self.test_data_distributed = KerasIterator(self.test_data, self.batch_size, model=self.model, name="test", batch_path=batch_path) + self.train_data = self.data_store.get("data_collection", "train") + self.val_data = self.data_store.get("data_collection", "val") + self.train_val_data = self.data_store.get("data_collection", "train_val") self.plot_path: str = self.data_store.get("plot_path") self.target_var = self.data_store.get("target_var") self._sampling = self.data_store.get("sampling") + self.window_lead_time = extract_value(self.data_store.get("shape_outputs", "model")) self.skill_scores = None self.bootstrap_skill_scores = None self._run() @@ -141,34 +143,29 @@ class PostProcessing(RunEnvironment): bootstrap_path = self.data_store.get("bootstrap_path") forecast_path = self.data_store.get("forecast_path") number_of_bootstraps = self.data_store.get("number_of_bootstraps", "postprocessing") - - # set bootstrap class - bootstraps = BootStraps(self.test_data, bootstrap_path, number_of_bootstraps) - - # create bootstrapped predictions for all stations and variables and save it to disk dims = ["index", "ahead", "type"] - for station in bootstraps.stations: - with TimeTracking(name=station): - logging.info(station) - for var in bootstraps.variables: - station_bootstrap = bootstraps.get_generator(station, var) - - # make bootstrap predictions - bootstrap_predictions = self.model.predict_generator(generator=station_bootstrap, - workers=2, - use_multiprocessing=True) - if isinstance(bootstrap_predictions, list): # if model is branched model - bootstrap_predictions = bootstrap_predictions[-1] - # save bootstrap predictions separately for each station and variable combination - bootstrap_predictions = np.expand_dims(bootstrap_predictions, axis=-1) - shape = bootstrap_predictions.shape - coords = (range(shape[0]), range(1, shape[1] + 1)) - tmp = xr.DataArray(bootstrap_predictions, coords=(*coords, [var]), dims=dims) - file_name = os.path.join(forecast_path, f"bootstraps_{var}_{station}.nc") - tmp.to_netcdf(file_name) + for station in self.test_data: + logging.info(str(station)) + X, Y = None, None + bootstraps = BootStraps(station, number_of_bootstraps) + for boot in bootstraps: + X, Y, (index, dimension) = boot + # make bootstrap predictions + bootstrap_predictions = self.model.predict(X) + if isinstance(bootstrap_predictions, list): # if model is branched model + bootstrap_predictions = bootstrap_predictions[-1] + # save bootstrap predictions separately for each station and variable combination + bootstrap_predictions = np.expand_dims(bootstrap_predictions, axis=-1) + shape = bootstrap_predictions.shape + coords = (range(shape[0]), range(1, shape[1] + 
1)) + var = f"{index}_{dimension}" + tmp = xr.DataArray(bootstrap_predictions, coords=(*coords, [var]), dims=dims) + file_name = os.path.join(forecast_path, f"bootstraps_{station}_{var}.nc") + tmp.to_netcdf(file_name) + else: # store also true labels for each station - labels = np.expand_dims(bootstraps.get_labels(station), axis=-1) - file_name = os.path.join(forecast_path, f"bootstraps_labels_{station}.nc") + labels = np.expand_dims(Y, axis=-1) + file_name = os.path.join(forecast_path, f"bootstraps_{station}_labels.nc") labels = xr.DataArray(labels, coords=(*coords, ["obs"]), dims=dims) labels.to_netcdf(file_name) @@ -186,42 +183,50 @@ class PostProcessing(RunEnvironment): # extract all requirements from data store bootstrap_path = self.data_store.get("bootstrap_path") forecast_path = self.data_store.get("forecast_path") - window_lead_time = self.data_store.get("window_lead_time") number_of_bootstraps = self.data_store.get("number_of_bootstraps", "postprocessing") - bootstraps = BootStraps(self.test_data, bootstrap_path, number_of_bootstraps) - + forecast_file = f"forecasts_norm_%s_test.nc" + bootstraps = BootStraps(self.test_data[0], number_of_bootstraps).bootstraps() skill_scores = statistics.SkillScores(None) score = {} - for station in self.test_data.stations: + for station in self.test_data: logging.info(station) # get station labels - file_name = os.path.join(forecast_path, f"bootstraps_labels_{station}.nc") + file_name = os.path.join(forecast_path, f"bootstraps_{str(station)}_labels.nc") labels = xr.open_dataarray(file_name) shape = labels.shape # get original forecasts - orig = bootstraps.get_orig_prediction(forecast_path, f"forecasts_norm_{station}_test.nc").reshape(shape) + orig = self.get_orig_prediction(forecast_path, forecast_file % str(station), number_of_bootstraps) + orig = orig.reshape(shape) coords = (range(shape[0]), range(1, shape[1] + 1), ["orig"]) orig = xr.DataArray(orig, coords=coords, dims=["index", "ahead", "type"]) # calculate skill scores for each variable - skill = pd.DataFrame(columns=range(1, window_lead_time + 1)) - for boot in self.test_data.variables: - file_name = os.path.join(forecast_path, f"bootstraps_{boot}_{station}.nc") + skill = pd.DataFrame(columns=range(1, self.window_lead_time + 1)) + for boot_set in bootstraps: + boot_var = f"{boot_set[0]}_{boot_set[1]}" + file_name = os.path.join(forecast_path, f"bootstraps_{station}_{boot_var}.nc") boot_data = xr.open_dataarray(file_name) boot_data = boot_data.combine_first(labels).combine_first(orig) boot_scores = [] - for ahead in range(1, window_lead_time + 1): + for ahead in range(1, self.window_lead_time + 1): data = boot_data.sel(ahead=ahead) boot_scores.append( - skill_scores.general_skill_score(data, forecast_name=boot, reference_name="orig")) - skill.loc[boot] = np.array(boot_scores) + skill_scores.general_skill_score(data, forecast_name=boot_var, reference_name="orig")) + skill.loc[boot_var] = np.array(boot_scores) # collect all results in single dictionary - score[station] = xr.DataArray(skill, dims=["boot_var", "ahead"]) + score[str(station)] = xr.DataArray(skill, dims=["boot_var", "ahead"]) return score + @staticmethod + def get_orig_prediction(path, file_name, number_of_bootstraps, prediction_name="CNN"): + file = os.path.join(path, file_name) + prediction = xr.open_dataarray(file).sel(type=prediction_name).squeeze() + vals = np.tile(prediction.data, (number_of_bootstraps, 1)) + return vals[~np.isnan(vals).any(axis=1), :] + def _load_model(self) -> keras.models: """ Load NN model either from 
data store or from local path. @@ -259,12 +264,13 @@ class PostProcessing(RunEnvironment): path = self.data_store.get("forecast_path") plot_list = self.data_store.get("plot_list", "postprocessing") + time_dimension = self.data_store.get("time_dim") if self.bootstrap_skill_scores is not None and "PlotBootstrapSkillScore" in plot_list: PlotBootstrapSkillScore(self.bootstrap_skill_scores, plot_folder=self.plot_path, model_setup="CNN") if "PlotConditionalQuantiles" in plot_list: - PlotConditionalQuantiles(self.test_data.stations, data_pred_path=path, plot_folder=self.plot_path) + PlotConditionalQuantiles(self.test_data.keys(), data_pred_path=path, plot_folder=self.plot_path) if "PlotStationMap" in plot_list: if self.data_store.get("hostname")[:2] in self.data_store.get("hpc_hosts") or self.data_store.get( "hostname")[:6] in self.data_store.get("hpc_hosts"): @@ -273,7 +279,7 @@ class PostProcessing(RunEnvironment): else: PlotStationMap(generators={'b': self.test_data}, plot_folder=self.plot_path) if "PlotMonthlySummary" in plot_list: - PlotMonthlySummary(self.test_data.stations, path, r"forecasts_%s_test.nc", self.target_var, + PlotMonthlySummary(self.test_data.keys(), path, r"forecasts_%s_test.nc", self.target_var, plot_folder=self.plot_path) if "PlotClimatologicalSkillScore" in plot_list: PlotClimatologicalSkillScore(self.skill_scores[1], plot_folder=self.plot_path, model_setup="CNN") @@ -282,16 +288,16 @@ class PostProcessing(RunEnvironment): if "PlotCompetitiveSkillScore" in plot_list: PlotCompetitiveSkillScore(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN") if "PlotTimeSeries" in plot_list: - PlotTimeSeries(self.test_data.stations, path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, + PlotTimeSeries(self.test_data.keys(), path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, sampling=self._sampling) if "PlotAvailability" in plot_list: avail_data = {"train": self.train_data, "val": self.val_data, "test": self.test_data} - PlotAvailability(avail_data, plot_folder=self.plot_path) + PlotAvailability(avail_data, plot_folder=self.plot_path, time_dimension=time_dimension) def calculate_test_score(self): """Evaluate test score of model and save locally.""" - test_score = self.model.evaluate_generator(generator=self.test_data_distributed.distribute_on_batches(), - use_multiprocessing=False, verbose=0, steps=1) + test_score = self.model.evaluate_generator(generator=self.test_data_distributed, + use_multiprocessing=True, verbose=0, steps=1) path = self.data_store.get("model_path") with open(os.path.join(path, "test_scores.txt"), "a") as f: for index, item in enumerate(test_score): @@ -311,24 +317,26 @@ class PostProcessing(RunEnvironment): be found inside `forecast_path`. 
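# A short usage sketch for the forecast files written below ("forecasts_<station>_test.nc",
# dims index/ahead/type); the forecast_path value and the station id are placeholders, and the
# "CNN"/"obs" type labels follow the create_forecast_arrays call further down.
import os

import xarray as xr

forecast_path = "TestExperiment/forecasts"  # assumed location inside the experiment folder
station = "DEBW107"                         # placeholder station id
forecast = xr.open_dataarray(os.path.join(forecast_path, f"forecasts_{station}_test.nc"))
bias = (forecast.sel(type="CNN") - forecast.sel(type="obs")).mean(dim="index")
print(bias.values)  # mean difference between NN forecast and observation per lead time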
""" logging.debug("start make_prediction") - for i, _ in enumerate(self.test_data): - data = self.test_data.get_data_generator(i) - input_data = data.get_transposed_history() + time_dimension = self.data_store.get("time_dim") + for i, data in enumerate(self.test_data): + input_data = data.get_X() + target_data = data.get_Y(as_numpy=False) + observation_data = data.get_observation() # get scaling parameters - mean, std, transformation_method = data.get_transformation_information(variable=self.target_var) + mean, std, transformation_method = data.get_transformation_Y() for normalised in [True, False]: # create empty arrays nn_prediction, persistence_prediction, ols_prediction, observation = self._create_empty_prediction_arrays( - data, count=4) + target_data, count=4) # nn forecast nn_prediction = self._create_nn_forecast(input_data, nn_prediction, mean, std, transformation_method, normalised) # persistence - persistence_prediction = self._create_persistence_forecast(data, persistence_prediction, mean, std, + persistence_prediction = self._create_persistence_forecast(observation_data, persistence_prediction, mean, std, transformation_method, normalised) # ols @@ -336,11 +344,12 @@ class PostProcessing(RunEnvironment): normalised) # observation - observation = self._create_observation(data, observation, mean, std, transformation_method, normalised) + observation = self._create_observation(target_data, observation, mean, std, transformation_method, normalised) # merge all predictions - full_index = self.create_fullindex(data.data.indexes['datetime'], self._get_frequency()) - all_predictions = self.create_forecast_arrays(full_index, list(data.label.indexes['window']), + full_index = self.create_fullindex(observation_data.indexes[time_dimension], self._get_frequency()) + all_predictions = self.create_forecast_arrays(full_index, list(target_data.indexes['window']), + time_dimension, CNN=nn_prediction, persi=persistence_prediction, obs=observation, @@ -349,7 +358,7 @@ class PostProcessing(RunEnvironment): # save all forecasts locally path = self.data_store.get("forecast_path") prefix = "forecasts_norm" if normalised else "forecasts" - file = os.path.join(path, f"{prefix}_{data.station[0]}_test.nc") + file = os.path.join(path, f"{prefix}_{str(data)}_test.nc") all_predictions.to_netcdf(file) def _get_frequency(self) -> str: @@ -358,14 +367,14 @@ class PostProcessing(RunEnvironment): return getter.get(self._sampling, None) @staticmethod - def _create_observation(data: DataPrepJoin, _, mean: xr.DataArray, std: xr.DataArray, transformation_method: str, + def _create_observation(data, _, mean: xr.DataArray, std: xr.DataArray, transformation_method: str, normalised: bool) -> xr.DataArray: """ Create observation as ground truth from given data. Inverse transformation is applied to the ground truth to get the output in the original space. 
- :param data: transposed observation from DataPrep + :param data: observation :param mean: mean of target value transformation :param std: standard deviation of target value transformation :param transformation_method: target values transformation method @@ -373,10 +382,9 @@ class PostProcessing(RunEnvironment): :return: filled data array with observation """ - obs = data.label.copy() if not normalised: - obs = statistics.apply_inverse_transformation(obs, mean, std, transformation_method) - return obs + data = statistics.apply_inverse_transformation(data, mean, std, transformation_method) + return data def _create_ols_forecast(self, input_data: xr.DataArray, ols_prediction: xr.DataArray, mean: xr.DataArray, std: xr.DataArray, transformation_method: str, normalised: bool) -> xr.DataArray: @@ -397,12 +405,11 @@ class PostProcessing(RunEnvironment): tmp_ols = self.ols_model.predict(input_data) if not normalised: tmp_ols = statistics.apply_inverse_transformation(tmp_ols, mean, std, transformation_method) - tmp_ols = np.expand_dims(tmp_ols, axis=1) target_shape = ols_prediction.values.shape ols_prediction.values = np.swapaxes(tmp_ols, 2, 0) if target_shape != tmp_ols.shape else tmp_ols return ols_prediction - def _create_persistence_forecast(self, data: DataPrepJoin, persistence_prediction: xr.DataArray, mean: xr.DataArray, + def _create_persistence_forecast(self, data, persistence_prediction: xr.DataArray, mean: xr.DataArray, std: xr.DataArray, transformation_method: str, normalised: bool) -> xr.DataArray: """ Create persistence forecast with given data. @@ -410,7 +417,7 @@ class PostProcessing(RunEnvironment): Persistence is deviated from the value at t=0 and applied to all following time steps (t+1, ..., t+window). Inverse transformation is applied to the forecast to get the output in the original space. - :param data: DataPrep + :param data: observation :param persistence_prediction: empty array in right shape to fill with data :param mean: mean of target value transformation :param std: standard deviation of target value transformation @@ -419,12 +426,10 @@ class PostProcessing(RunEnvironment): :return: filled data array with persistence predictions """ - tmp_persi = data.observation.copy().sel({'window': 0}) + tmp_persi = data.copy() if not normalised: tmp_persi = statistics.apply_inverse_transformation(tmp_persi, mean, std, transformation_method) - window_lead_time = self.data_store.get("window_lead_time") - persistence_prediction.values = np.expand_dims(np.tile(tmp_persi.squeeze('Stations'), (window_lead_time, 1)), - axis=1) + persistence_prediction.values = np.tile(tmp_persi, (self.window_lead_time, 1)).T return persistence_prediction def _create_nn_forecast(self, input_data: xr.DataArray, nn_prediction: xr.DataArray, mean: xr.DataArray, @@ -449,18 +454,20 @@ class PostProcessing(RunEnvironment): if not normalised: tmp_nn = statistics.apply_inverse_transformation(tmp_nn, mean, std, transformation_method) if isinstance(tmp_nn, list): - nn_prediction.values = np.swapaxes(np.expand_dims(tmp_nn[-1], axis=1), 2, 0) + nn_prediction.values = tmp_nn[-1] elif tmp_nn.ndim == 3: - nn_prediction.values = np.swapaxes(np.expand_dims(tmp_nn[-1, ...], axis=1), 2, 0) + nn_prediction.values = tmp_nn[-1, ...] 
elif tmp_nn.ndim == 2: - nn_prediction.values = np.swapaxes(np.expand_dims(tmp_nn, axis=1), 2, 0) + nn_prediction.values = tmp_nn else: raise NotImplementedError(f"Number of dimension of model output must be 2 or 3, but not {tmp_nn.dims}.") return nn_prediction @staticmethod - def _create_empty_prediction_arrays(generator, count=1): - return [generator.label.copy() for _ in range(count)] + def _create_empty_prediction_arrays(target_data, count=1): + """ + Create array to collect all predictions. Expand target data by a station dimension. """ + return [target_data.copy() for _ in range(count)] @staticmethod def create_fullindex(df: Union[xr.DataArray, pd.DataFrame, pd.DatetimeIndex], freq: str) -> pd.DataFrame: @@ -488,7 +495,7 @@ class PostProcessing(RunEnvironment): return index @staticmethod - def create_forecast_arrays(index: pd.DataFrame, ahead_names: List[Union[str, int]], **kwargs): + def create_forecast_arrays(index: pd.DataFrame, ahead_names: List[Union[str, int]], time_dimension, **kwargs): """ Combine different forecast types into single xarray. @@ -503,12 +510,8 @@ class PostProcessing(RunEnvironment): res = xr.DataArray(np.full((len(index.index), len(ahead_names), len(keys)), np.nan), coords=[index.index, ahead_names, keys], dims=['index', 'ahead', 'type']) for k, v in kwargs.items(): - try: - match_index = np.stack(set(res.index.values) & set(v.index.values)) - res.loc[match_index, :, k] = v.loc[match_index] - except AttributeError: # v is xarray type and has no attribute .index - match_index = np.stack(set(res.index.values) & set(v.indexes['datetime'].values)) - res.loc[match_index, :, k] = v.sel({'datetime': match_index}).squeeze('Stations').transpose() + match_index = np.stack(set(res.index.values) & set(v.indexes[time_dimension].values)) + res.loc[match_index, :, k] = v.loc[match_index] return res def _get_external_data(self, station: str) -> Union[xr.DataArray, None]: @@ -521,12 +524,15 @@ class PostProcessing(RunEnvironment): :param station: name of station to load external data. 
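# A toy illustration of the empty array layout assembled by create_forecast_arrays above
# (index x ahead x type); the dates, lead times and the single "obs" key are made up.
import numpy as np
import pandas as pd
import xarray as xr

full_index = pd.DataFrame(index=pd.date_range("2020-01-01", periods=4, freq="D"))
ahead_names = [1, 2, 3]
keys = ["obs"]
res = xr.DataArray(np.full((len(full_index.index), len(ahead_names), len(keys)), np.nan),
                   coords=[full_index.index, ahead_names, keys],
                   dims=["index", "ahead", "type"])
print(res.shape)  # (4, 3, 1)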
""" try: - data = self.train_val_data.get_data_generator(station) - mean, std, transformation_method = data.get_transformation_information(variable=self.target_var) - external_data = self._create_observation(data, None, mean, std, transformation_method, normalised=False) - external_data = external_data.squeeze("Stations").sel(window=1).drop(["window", "Stations", "variables"]) - return external_data.rename({'datetime': 'index'}) - except KeyError: + data = self.train_val_data[station] + # target_data = data.get_Y(as_numpy=False) + observation = data.get_observation() + mean, std, transformation_method = data.get_transformation_Y() + # external_data = self._create_observation(target_data, None, mean, std, transformation_method, normalised=False) + # external_data = external_data.squeeze("Stations").sel(window=1).drop(["window", "Stations", "variables"]) + external_data = self._create_observation(observation, None, mean, std, transformation_method, normalised=False) + return external_data.rename({external_data.dims[0]: 'index'}) + except IndexError: return None def calculate_skill_scores(self) -> Tuple[Dict, Dict]: @@ -540,15 +546,14 @@ class PostProcessing(RunEnvironment): :return: competitive and climatological skill scores """ path = self.data_store.get("forecast_path") - window_lead_time = self.data_store.get("window_lead_time") skill_score_competitive = {} skill_score_climatological = {} - for station in self.test_data.stations: - file = os.path.join(path, f"forecasts_{station}_test.nc") + for station in self.test_data: + file = os.path.join(path, f"forecasts_{str(station)}_test.nc") data = xr.open_dataarray(file) skill_score = statistics.SkillScores(data) external_data = self._get_external_data(station) - skill_score_competitive[station] = skill_score.skill_scores(window_lead_time) + skill_score_competitive[station] = skill_score.skill_scores(self.window_lead_time) skill_score_climatological[station] = skill_score.climatological_skill_scores(external_data, - window_lead_time) + self.window_lead_time) return skill_score_competitive, skill_score_climatological diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 243daf20e4e99331fb32ed89769dbf584c235110..b4185df2f6699cb20ac96e32661433e7a6164abc 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -5,22 +5,17 @@ __date__ = '2019-11-25' import logging import os -from typing import Tuple, Dict, List +from typing import Tuple import numpy as np import pandas as pd -from mlair.data_handling import DataGenerator +from mlair.data_handler import DataCollection from mlair.helpers import TimeTracking from mlair.configuration import path_config from mlair.helpers.join import EmptyQueryResult from mlair.run_modules.run_environment import RunEnvironment -DEFAULT_ARGS_LIST = ["data_path", "stations", "variables", "interpolation_dim", "target_dim", "target_var"] -DEFAULT_KWARGS_LIST = ["limit_nan_fill", "window_history_size", "window_lead_time", "statistics_per_var", "min_length", - "station_type", "overwrite_local_data", "start", "end", "sampling", "transformation", - "extreme_values", "extremes_on_right_tail_only", "network", "data_preparation"] - class PreProcessing(RunEnvironment): """ @@ -59,10 +54,11 @@ class PreProcessing(RunEnvironment): self._run() def _run(self): - args = self.data_store.create_args_dict(DEFAULT_ARGS_LIST, scope="preprocessing") - kwargs = self.data_store.create_args_dict(DEFAULT_KWARGS_LIST, scope="preprocessing") stations = 
self.data_store.get("stations") - valid_stations = self.check_valid_stations(args, kwargs, stations, load_tmp=False, save_tmp=False, name="all") + data_preparation = self.data_store.get("data_preparation") + _, valid_stations = self.validate_station(data_preparation, stations, "preprocessing", overwrite_local_data=True) + if len(valid_stations) == 0: + raise ValueError("Couldn't find any valid data according to given parameters. Abort experiment run.") self.data_store.set("stations", valid_stations) self.split_train_val_test() self.report_pre_processing() @@ -70,16 +66,14 @@ class PreProcessing(RunEnvironment): def report_pre_processing(self): """Log some metrics on data and create latex report.""" logging.debug(20 * '##') - n_train = len(self.data_store.get('generator', 'train')) - n_val = len(self.data_store.get('generator', 'val')) - n_test = len(self.data_store.get('generator', 'test')) + n_train = len(self.data_store.get('data_collection', 'train')) + n_val = len(self.data_store.get('data_collection', 'val')) + n_test = len(self.data_store.get('data_collection', 'test')) n_total = n_train + n_val + n_test logging.debug(f"Number of all stations: {n_total}") logging.debug(f"Number of training stations: {n_train}") logging.debug(f"Number of val stations: {n_val}") logging.debug(f"Number of test stations: {n_test}") - logging.debug(f"TEST SHAPE OF GENERATOR CALL: {self.data_store.get('generator', 'test')[0][0].shape}" - f"{self.data_store.get('generator', 'test')[0][1].shape}") self.create_latex_report() def create_latex_report(self): @@ -121,11 +115,12 @@ class PreProcessing(RunEnvironment): set_names = ["train", "val", "test"] df = pd.DataFrame(columns=meta_data + set_names) for set_name in set_names: - data: DataGenerator = self.data_store.get("generator", set_name) - for station in data.stations: - df.loc[station, set_name] = data.get_data_generator(station).get_transposed_label().shape[0] - if df.loc[station, meta_data].isnull().any(): - df.loc[station, meta_data] = data.get_data_generator(station).meta.loc[meta_data].values.flatten() + data = self.data_store.get("data_collection", set_name) + for station in data: + station_name = str(station.id_class) + df.loc[station_name, set_name] = station.get_Y()[0].shape[0] + if df.loc[station_name, meta_data].isnull().any(): + df.loc[station_name, meta_data] = station.id_class.meta.loc[meta_data].values.flatten() df.loc["# Samples", set_name] = df.loc[:, set_name].sum() df.loc["# Stations", set_name] = df.loc[:, set_name].count() df[meta_round] = df[meta_round].astype(float).round(precision) @@ -147,7 +142,7 @@ class PreProcessing(RunEnvironment): Split data into subsets. Currently: train, val, test and train_val (actually this is only the merge of train and val, but as an separate - generator). IMPORTANT: Do not change to order of the execution of create_set_split. The train subset needs + data_collection). IMPORTANT: Do not change to order of the execution of create_set_split. The train subset needs always to be executed at first, to set a proper transformation. """ fraction_of_training = self.data_store.get("fraction_of_training") @@ -184,40 +179,20 @@ class PreProcessing(RunEnvironment): return train_index, val_index, test_index, train_val_index def create_set_split(self, index_list: slice, set_name: str) -> None: - """ - Create subsets and store in data store. - - Create the subset for given split index and stores the DataGenerator with given set name in data store as - `generator`. 
Check for all valid stations using the default (kw)args for given scope and create the - DataGenerator for all valid stations. Also set all transformation information, if subset is training set. Make - sure, that the train set is executed first, and all other subsets afterwards. - - :param index_list: list of all stations to use for the set. If attribute use_all_stations_on_all_data_sets=True, - this list is ignored. - :param set_name: name to load/save all information from/to data store. - """ - args = self.data_store.create_args_dict(DEFAULT_ARGS_LIST, scope=set_name) - kwargs = self.data_store.create_args_dict(DEFAULT_KWARGS_LIST, scope=set_name) - stations = args["stations"] + # get set stations + stations = self.data_store.get("stations", scope=set_name) if self.data_store.get("use_all_stations_on_all_data_sets"): set_stations = stations else: set_stations = stations[index_list] logging.debug(f"{set_name.capitalize()} stations (len={len(set_stations)}): {set_stations}") - # validate set - set_stations = self.check_valid_stations(args, kwargs, set_stations, load_tmp=False, name=set_name) - self.data_store.set("stations", set_stations, scope=set_name) - # create set generator and store - set_args = self.data_store.create_args_dict(DEFAULT_ARGS_LIST, scope=set_name) - data_set = DataGenerator(**set_args, **kwargs) - self.data_store.set("generator", data_set, scope=set_name) - # extract transformation from train set - if set_name == "train": - self.data_store.set("transformation", data_set.transformation) + # create set data_collection and store + data_preparation = self.data_store.get("data_preparation") + collection, valid_stations = self.validate_station(data_preparation, set_stations, set_name) + self.data_store.set("stations", valid_stations, scope=set_name) + self.data_store.set("data_collection", collection, scope=set_name) - @staticmethod - def check_valid_stations(args: Dict, kwargs: Dict, all_stations: List[str], load_tmp=True, save_tmp=True, - name=None): + def validate_station(self, data_preparation, set_stations, set_name=None, overwrite_local_data=False): """ Check if all given stations in `all_stations` are valid. @@ -225,7 +200,7 @@ class PreProcessing(RunEnvironment): loading time are logged in debug mode. :param args: Dictionary with required parameters for DataGenerator class (`data_path`, `network`, `stations`, - `variables`, `interpolation_dim`, `target_dim`, `target_var`). + `variables`, `time_dim`, `target_dim`, `target_var`). :param kwargs: positional parameters for the DataGenerator class (e.g. `start`, `interpolation_method`, `window_lead_time`). :param all_stations: All stations to check. @@ -234,26 +209,31 @@ class PreProcessing(RunEnvironment): :return: Corrected list containing only valid station IDs. 
""" t_outer = TimeTracking() - t_inner = TimeTracking(start=False) - logging.info(f"check valid stations started{' (%s)' % name if name else ''}") + logging.info(f"check valid stations started{' (%s)' % (set_name if set_name is not None else 'all')}") + # calculate transformation using train data + if set_name == "train": + self.transformation(data_preparation, set_stations) + # start station check + collection = DataCollection() valid_stations = [] - - # all required arguments of the DataGenerator can be found in args, positional arguments in args and kwargs - data_gen = DataGenerator(**args, **kwargs) - for pos, station in enumerate(all_stations): - t_inner.run() - logging.info(f"check station {station} ({pos + 1} / {len(all_stations)})") + kwargs = self.data_store.create_args_dict(data_preparation.requirements(), scope=set_name) + for station in set_stations: try: - data = data_gen.get_data_generator(key=station, load_local_tmp_storage=load_tmp, - save_local_tmp_storage=save_tmp) - if data.history is None: - raise AttributeError + dp = data_preparation.build(station, name_affix=set_name, **kwargs) + collection.add(dp) valid_stations.append(station) - logging.debug( - f'{station}: history_shape = {data.history.transpose("datetime", "window", "Stations", "variables").shape}') - logging.debug(f"{station}: loading time = {t_inner}") except (AttributeError, EmptyQueryResult): continue - logging.info(f"run for {t_outer} to check {len(all_stations)} station(s). Found {len(valid_stations)}/" - f"{len(all_stations)} valid stations.") - return valid_stations + logging.info(f"run for {t_outer} to check {len(set_stations)} station(s). Found {len(collection)}/" + f"{len(set_stations)} valid stations.") + return collection, valid_stations + + def transformation(self, data_preparation, stations): + if hasattr(data_preparation, "transformation"): + kwargs = self.data_store.create_args_dict(data_preparation.requirements(), scope="train") + transformation_dict = data_preparation.transformation(stations, **kwargs) + if transformation_dict is not None: + self.data_store.set("transformation", transformation_dict) + + + diff --git a/mlair/run_modules/training.py b/mlair/run_modules/training.py index 23347a30b6e55c6903154128aab055d39045c965..f8909e15341f959455b1e8da0b0cb7502bdfa81b 100644 --- a/mlair/run_modules/training.py +++ b/mlair/run_modules/training.py @@ -11,7 +11,7 @@ from typing import Union import keras from keras.callbacks import Callback, History -from mlair.data_handling import Distributor +from mlair.data_handler import KerasIterator from mlair.model_modules.keras_extensions import CallbackHandler from mlair.plotting.training_monitoring import PlotModelHistory, PlotModelLearningRate from mlair.run_modules.run_environment import RunEnvironment @@ -65,9 +65,9 @@ class Training(RunEnvironment): """Set up and run training.""" super().__init__() self.model: keras.Model = self.data_store.get("model", "model") - self.train_set: Union[Distributor, None] = None - self.val_set: Union[Distributor, None] = None - self.test_set: Union[Distributor, None] = None + self.train_set: Union[KerasIterator, None] = None + self.val_set: Union[KerasIterator, None] = None + self.test_set: Union[KerasIterator, None] = None self.batch_size = self.data_store.get("batch_size") self.epochs = self.data_store.get("epochs") self.callbacks: CallbackHandler = self.data_store.get("callbacks", "model") @@ -104,9 +104,9 @@ class Training(RunEnvironment): :param mode: name of set, should be from ["train", "val", "test"] """ - gen = 
self.data_store.get("generator", mode) - kwargs = self.data_store.create_args_dict(["permute_data", "upsampling"], scope=mode) - setattr(self, f"{mode}_set", Distributor(gen, self.model, self.batch_size, **kwargs)) + collection = self.data_store.get("data_collection", mode) + kwargs = self.data_store.create_args_dict(["upsampling", "shuffle_batches", "batch_path"], scope=mode) + setattr(self, f"{mode}_set", KerasIterator(collection, self.batch_size, model=self.model, name=mode, **kwargs)) def set_generators(self) -> None: """ @@ -130,15 +130,15 @@ class Training(RunEnvironment): """ logging.info(f"Train with {len(self.train_set)} mini batches.") logging.info(f"Train with option upsampling={self.train_set.upsampling}.") - logging.info(f"Train with option data_permutation={self.train_set.do_data_permutation}.") + logging.info(f"Train with option shuffle={self.train_set.shuffle}.") checkpoint = self.callbacks.get_checkpoint() if not os.path.exists(checkpoint.filepath) or self._create_new_model: - history = self.model.fit_generator(generator=self.train_set.distribute_on_batches(), + history = self.model.fit_generator(generator=self.train_set, steps_per_epoch=len(self.train_set), epochs=self.epochs, verbose=2, - validation_data=self.val_set.distribute_on_batches(), + validation_data=self.val_set, validation_steps=len(self.val_set), callbacks=self.callbacks.get_callbacks(as_dict=False)) else: @@ -148,11 +148,11 @@ class Training(RunEnvironment): self.model = keras.models.load_model(checkpoint.filepath) hist: History = self.callbacks.get_callback_by_name("hist") initial_epoch = max(hist.epoch) + 1 - _ = self.model.fit_generator(generator=self.train_set.distribute_on_batches(), + _ = self.model.fit_generator(generator=self.train_set, steps_per_epoch=len(self.train_set), epochs=self.epochs, verbose=2, - validation_data=self.val_set.distribute_on_batches(), + validation_data=self.val_set, validation_steps=len(self.val_set), callbacks=self.callbacks.get_callbacks(as_dict=False), initial_epoch=initial_epoch) @@ -234,7 +234,7 @@ class Training(RunEnvironment): def report_training(self): data = {"mini batches": len(self.train_set), "upsampling extremes": self.train_set.upsampling, - "shuffling": self.train_set.do_data_permutation, + "shuffling": self.train_set.shuffle, "created new model": self._create_new_model, "epochs": self.epochs, "batch size": self.batch_size} diff --git a/mlair/run_script.py b/mlair/run_script.py index 55e20e1e6914de27fc9d13893edacc504ab554f7..00a28f686bf392f76787b56a48790999e9fa5c05 100644 --- a/mlair/run_script.py +++ b/mlair/run_script.py @@ -6,17 +6,15 @@ import inspect def run(stations=None, - station_type=None, trainable=None, create_new_model=None, window_history_size=None, experiment_date="testrun", - network=None, variables=None, statistics_per_var=None, start=None, end=None, target_var=None, target_dim=None, window_lead_time=None, dimensions=None, - interpolate_method=None, interpolate_dim=None, limit_nan_fill=None, + interpolation_method=None, interpolation_dim=None, interpolation_limit=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None, use_all_stations_on_all_data_sets=None, fraction_of_train=None, experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None, overwrite_local_data=None, @@ -29,15 +27,17 @@ def run(stations=None, model=None, batch_size=None, epochs=None, - data_preparation=None): + data_preparation=None, + **kwargs): params = inspect.getfullargspec(DefaultWorkflow).args - kwargs = {k: v 
for k, v in locals().items() if k in params and v is not None} + kwargs_default = {k: v for k, v in locals().items() if k in params and v is not None} - workflow = DefaultWorkflow(**kwargs) + workflow = DefaultWorkflow(**kwargs_default, **kwargs) workflow.run() if __name__ == "__main__": - - run() + from mlair.model_modules.model_class import MyBranchedModel + run(statistics_per_var={'o3': 'dma8eu', "temp": "maximum"}, trainable=True, + create_new_model=True, model=MyBranchedModel, station_type="background") diff --git a/mlair/workflows/abstract_workflow.py b/mlair/workflows/abstract_workflow.py index f187ff11e849960b4a63eddd5d11e2ce1ddf2a11..d3fe480fdfe09393fbf2051d8795735e9217a8ad 100644 --- a/mlair/workflows/abstract_workflow.py +++ b/mlair/workflows/abstract_workflow.py @@ -26,4 +26,4 @@ class Workflow: """Run workflow embedded in a run environment and according to the stage's ordering.""" with RunEnvironment(): for stage, kwargs in self._registry.items(): - stage(**kwargs) \ No newline at end of file + stage(**kwargs) diff --git a/mlair/workflows/default_workflow.py b/mlair/workflows/default_workflow.py index f42c0389d81f655fb0c8582a15e42acc853f757d..3dba7e6c5c5773fa4d74860b2cba67a5804123b7 100644 --- a/mlair/workflows/default_workflow.py +++ b/mlair/workflows/default_workflow.py @@ -14,17 +14,15 @@ class DefaultWorkflow(Workflow): the mentioned ordering.""" def __init__(self, stations=None, - station_type=None, trainable=None, create_new_model=None, window_history_size=None, experiment_date="testrun", - network=None, variables=None, statistics_per_var=None, start=None, end=None, target_var=None, target_dim=None, window_lead_time=None, dimensions=None, - interpolate_method=None, interpolate_dim=None, limit_nan_fill=None, + interpolation_method=None, time_dim=None, limit_nan_fill=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None, use_all_stations_on_all_data_sets=None, fraction_of_train=None, experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None, overwrite_local_data=None, @@ -37,13 +35,14 @@ class DefaultWorkflow(Workflow): model=None, batch_size=None, epochs=None, - data_preparation=None): + data_preparation=None, + **kwargs): super().__init__() # extract all given kwargs arguments params = remove_items(inspect.getfullargspec(self.__init__).args, "self") - kwargs = {k: v for k, v in locals().items() if k in params and v is not None} - self._setup(**kwargs) + kwargs_default = {k: v for k, v in locals().items() if k in params and v is not None} + self._setup(**kwargs_default, **kwargs) def _setup(self, **kwargs): """Set up default workflow.""" @@ -59,17 +58,15 @@ class DefaultWorkflowHPC(Workflow): Training and PostProcessing in exact the mentioned ordering.""" def __init__(self, stations=None, - station_type=None, trainable=None, create_new_model=None, window_history_size=None, experiment_date="testrun", - network=None, variables=None, statistics_per_var=None, start=None, end=None, target_var=None, target_dim=None, window_lead_time=None, dimensions=None, - interpolate_method=None, interpolate_dim=None, limit_nan_fill=None, + interpolation_method=None, time_dim=None, limit_nan_fill=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None, use_all_stations_on_all_data_sets=None, fraction_of_train=None, experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None, overwrite_local_data=None, @@ -82,13 +79,13 @@ class DefaultWorkflowHPC(Workflow): 
model=None, batch_size=None, epochs=None, - data_preparation=None): + data_preparation=None, **kwargs): super().__init__() # extract all given kwargs arguments params = remove_items(inspect.getfullargspec(self.__init__).args, "self") - kwargs = {k: v for k, v in locals().items() if k in params and v is not None} - self._setup(**kwargs) + kwargs_default = {k: v for k, v in locals().items() if k in params and v is not None} + self._setup(**kwargs_default, **kwargs) def _setup(self, **kwargs): """Set up default workflow.""" diff --git a/requirements.txt b/requirements.txt index 71bb1338effff38092510982d4a2c1f37f7b026a..7da29a05b748531fd4ec327ff17f432ff1ecaabb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -38,9 +38,9 @@ pydot==1.4.1 pyparsing==2.4.6 pyproj==2.5.0 pyshp==2.1.0 -pytest==5.3.5 -pytest-cov==2.8.1 -pytest-html==2.0.1 +pytest==6.0.0 +pytest-cov==2.10.0 +pytest-html==2.1.1 pytest-lazy-fixture==0.6.3 pytest-metadata==1.8.0 pytest-sugar diff --git a/test/test_configuration/test_path_config.py b/test/test_configuration/test_path_config.py index 128ddfceeed53920e6424d8d5d8f6addf5451c44..b97763632922fc2aaffaf267cfbc76ff99e25b6f 100644 --- a/test/test_configuration/test_path_config.py +++ b/test/test_configuration/test_path_config.py @@ -16,12 +16,12 @@ class TestPrepareHost: @mock.patch("getpass.getuser", return_value="testUser") @mock.patch("os.path.exists", return_value=True) def test_prepare_host(self, mock_host, mock_user, mock_path): - assert prepare_host() == "/home/testUser/machinelearningtools/data/toar_daily/" + assert prepare_host() == "/home/testUser/mlair/data/toar_daily/" assert prepare_host() == "/home/testUser/Data/toar_daily/" assert prepare_host() == "/home/testUser/Data/toar_daily/" assert prepare_host() == "/p/project/cjjsc42/testUser/DATA/toar_daily/" assert prepare_host() == "/p/project/deepacf/intelliaq/testUser/DATA/toar_daily/" - assert prepare_host() == '/home/testUser/machinelearningtools/data/toar_daily/' + assert prepare_host() == '/home/testUser/mlair/data/toar_daily/' @mock.patch("socket.gethostname", return_value="NotExistingHostName") @mock.patch("getpass.getuser", return_value="zombie21") @@ -48,7 +48,7 @@ class TestPrepareHost: @mock.patch("os.makedirs", side_effect=None) def test_os_path_exists(self, mock_host, mock_user, mock_path, mock_check): path = prepare_host() - assert path == "/home/testUser/machinelearningtools/data/toar_daily/" + assert path == "/home/testUser/mlair/data/toar_daily/" class TestSetExperimentName: diff --git a/test/test_data_handling/test_bootstraps.py b/test/test_data_handler/old_t_bootstraps.py similarity index 98% rename from test/test_data_handling/test_bootstraps.py rename to test/test_data_handler/old_t_bootstraps.py index 0d5f3a69b08fa646b66691e1265b9bfe05f114a5..9616ed3f457d74e44e8a9eae5a3ed862fa804011 100644 --- a/test/test_data_handling/test_bootstraps.py +++ b/test/test_data_handler/old_t_bootstraps.py @@ -7,9 +7,8 @@ import numpy as np import pytest import xarray as xr -from mlair.data_handling.bootstraps import BootStraps, CreateShuffledData, BootStrapGenerator -from mlair.data_handling.data_generator import DataGenerator -from mlair.data_handling import DataPrepJoin +from mlair.data_handler.bootstraps import BootStraps +from mlair.data_handler import DataPrepJoin @pytest.fixture diff --git a/test/test_data_handling/test_data_generator.py b/test/test_data_handler/old_t_data_generator.py similarity index 98% rename from test/test_data_handling/test_data_generator.py rename to
test/test_data_handler/old_t_data_generator.py index 413d25dd4ac2fe722600bc44f5b2307388e8307a..9198923e2f75601f2ce7e6dc18a663da647eaadb 100644 --- a/test/test_data_handling/test_data_generator.py +++ b/test/test_data_handler/old_t_data_generator.py @@ -6,8 +6,7 @@ import numpy as np import pytest import xarray as xr -from mlair.data_handling.data_generator import DataGenerator -from mlair.data_handling import DataPrepJoin +from mlair.data_handler import DataPrepJoin from mlair.helpers.join import EmptyQueryResult @@ -80,7 +79,7 @@ class TestDataGenerator: assert gen.stations == ['DEBW107'] assert gen.variables == ['o3', 'temp'] assert gen.station_type is None - assert gen.interpolation_dim == 'datetime' + assert gen.time_dim == 'datetime' assert gen.target_dim == 'variables' assert gen.target_var == 'o3' assert gen.interpolation_method == "linear" diff --git a/test/test_data_handling/test_data_preparation.py b/test/test_data_handler/old_t_data_preparation.py similarity index 99% rename from test/test_data_handling/test_data_preparation.py rename to test/test_data_handler/old_t_data_preparation.py index ebd351b020ce8a5902cbe7ed201876ce610b8f6a..586e17158a93880e2a98bf64189fa947299a64f3 100644 --- a/test/test_data_handling/test_data_preparation.py +++ b/test/test_data_handler/old_t_data_preparation.py @@ -8,8 +8,8 @@ import pandas as pd import pytest import xarray as xr -from mlair.data_handling.data_preparation import AbstractDataPrep -from mlair.data_handling import DataPrepJoin as DataPrep +from mlair.data_handler.data_preparation import AbstractDataPrep +from mlair.data_handler import DataPrepJoin as DataPrep from mlair.helpers.join import EmptyQueryResult diff --git a/test/test_data_handler/test_iterator.py b/test/test_data_handler/test_iterator.py new file mode 100644 index 0000000000000000000000000000000000000000..ff81fc7b89b2cede0f47cdf209e77e373cd0d656 --- /dev/null +++ b/test/test_data_handler/test_iterator.py @@ -0,0 +1,228 @@ + +from mlair.data_handler.iterator import DataCollection, StandardIterator, KerasIterator +from mlair.helpers.testing import PyTestAllEqual +from mlair.model_modules.model_class import MyLittleModel, MyBranchedModel + +import numpy as np +import pytest +import mock +import os +import shutil + + +class TestStandardIterator: + + @pytest.fixture + def collection(self): + return list(range(10)) + + def test_blank(self): + std_iterator = object.__new__(StandardIterator) + assert std_iterator._position is None + + def test_init(self, collection): + std_iterator = StandardIterator(collection) + assert std_iterator._collection == list(range(10)) + assert std_iterator._position == 0 + + def test_next(self, collection): + std_iterator = StandardIterator(collection) + for i in range(10): + assert i == next(std_iterator) + with pytest.raises(StopIteration): + next(std_iterator) + std_iterator = StandardIterator(collection) + for e, i in enumerate(iter(std_iterator)): + assert i == e + + +class TestDataCollection: + + @pytest.fixture + def collection(self): + return list(range(10)) + + def test_init(self, collection): + data_collection = DataCollection(collection) + assert data_collection._collection == collection + + def test_iter(self, collection): + data_collection = DataCollection(collection) + assert isinstance(iter(data_collection), StandardIterator) + for e, i in enumerate(data_collection): + assert i == e + + +class DummyData: + + def __init__(self, number_of_samples=np.random.randint(100, 150)): + self.number_of_samples = number_of_samples + + def get_X(self,
upsampling=False, as_numpy=True): + X1 = np.random.randint(0, 10, size=(self.number_of_samples, 14, 5)) # samples, window, variables + X2 = np.random.randint(21, 30, size=(self.number_of_samples, 10, 2)) # samples, window, variables + X3 = np.random.randint(-5, 0, size=(self.number_of_samples, 1, 2)) # samples, window, variables + return [X1, X2, X3] + + def get_Y(self, upsampling=False, as_numpy=True): + Y1 = np.random.randint(0, 10, size=(self.number_of_samples, 5, 1)) # samples, window, variables + Y2 = np.random.randint(21, 30, size=(self.number_of_samples, 5, 1)) # samples, window, variables + return [Y1, Y2] + + +class TestKerasIterator: + + @pytest.fixture + def collection(self): + coll = [] + for i in range(3): + coll.append(DummyData(50 + i)) + data_coll = DataCollection(collection=coll) + return data_coll + + @pytest.fixture + def path(self): + p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + shutil.rmtree(p, ignore_errors=True) if os.path.exists(p) else None + yield p + shutil.rmtree(p, ignore_errors=True) + + def test_init(self, collection, path): + iterator = KerasIterator(collection, 25, path) + assert isinstance(iterator._collection, DataCollection) + assert iterator._path == os.path.join(path, str(id(iterator)), "%i.pickle") + assert iterator.batch_size == 25 + assert iterator.shuffle is False + + def test_cleanup_path(self, path): + assert os.path.exists(path) is False + iterator = object.__new__(KerasIterator) + iterator._cleanup_path(path, create_new=False) + assert os.path.exists(path) is False + iterator._cleanup_path(path) + assert os.path.exists(path) is True + iterator._cleanup_path(path, create_new=False) + assert os.path.exists(path) is False + + def test_get_number_of_mini_batches(self): + iterator = object.__new__(KerasIterator) + iterator.batch_size = 36 + assert iterator._get_number_of_mini_batches(30) == 0 + assert iterator._get_number_of_mini_batches(40) == 1 + assert iterator._get_number_of_mini_batches(72) == 2 + + def test_len(self): + iterator = object.__new__(KerasIterator) + iterator.indexes = [0, 1, 2, 3, 4, 5] + assert len(iterator) == 6 + + def test_concatenate(self): + arr1 = DummyData(10).get_X() + arr2 = DummyData(50).get_X() + iterator = object.__new__(KerasIterator) + new_arr = iterator._concatenate(arr2, arr1) + test_arr = [np.concatenate((arr1[0], arr2[0]), axis=0), + np.concatenate((arr1[1], arr2[1]), axis=0), + np.concatenate((arr1[2], arr2[2]), axis=0)] + for i in range(3): + assert PyTestAllEqual([new_arr[i], test_arr[i]]) + + def test_get_batch(self): + arr = DummyData(20).get_X() + iterator = object.__new__(KerasIterator) + iterator.batch_size = 19 + batch1 = iterator._get_batch(arr, 0) + assert batch1[0].shape[0] == 19 + batch2 = iterator._get_batch(arr, 1) + assert batch2[0].shape[0] == 1 + + def test_save_to_pickle(self, path): + os.makedirs(path) + d = DummyData(20) + X, Y = d.get_X(), d.get_Y() + iterator = object.__new__(KerasIterator) + iterator._path = os.path.join(path, "%i.pickle") + assert os.path.exists(iterator._path % 2) is False + iterator._save_to_pickle(X=X, Y=Y, index=2) + assert os.path.exists(iterator._path % 2) is True + + def test_prepare_batches(self, collection, path): + iterator = object.__new__(KerasIterator) + iterator._collection = collection + iterator.batch_size = 50 + iterator.indexes = [] + iterator.model = None + iterator.upsampling = False + iterator._path = os.path.join(path, "%i.pickle") + os.makedirs(path) + iterator._prepare_batches() + assert len(os.listdir(path)) == 4 + 
assert len(iterator.indexes) == 4 + assert len(iterator) == 4 + assert iterator.indexes == [0, 1, 2, 3] + + def test_prepare_batches_no_remaining(self, path): + iterator = object.__new__(KerasIterator) + iterator._collection = DataCollection([DummyData(50)]) + iterator.batch_size = 50 + iterator.indexes = [] + iterator.model = None + iterator.upsampling = False + iterator._path = os.path.join(path, "%i.pickle") + os.makedirs(path) + iterator._prepare_batches() + assert len(os.listdir(path)) == 1 + assert len(iterator.indexes) == 1 + assert len(iterator) == 1 + assert iterator.indexes == [0] + + def test_data_generation(self, collection, path): + iterator = KerasIterator(collection, 50, path) + X, Y = iterator._KerasIterator__data_generation(0) + expected = next(iter(collection)) + assert PyTestAllEqual([X, expected.get_X()]) + assert PyTestAllEqual([Y, expected.get_Y()]) + + def test_getitem(self, collection, path): + iterator = KerasIterator(collection, 50, path) + X, Y = iterator[0] + expected = next(iter(collection)) + assert PyTestAllEqual([X, expected.get_X()]) + assert PyTestAllEqual([Y, expected.get_Y()]) + reversed(iterator.indexes) + X, Y = iterator[3] + assert PyTestAllEqual([X, expected.get_X()]) + assert PyTestAllEqual([Y, expected.get_Y()]) + + def test_on_epoch_end(self): + iterator = object.__new__(KerasIterator) + iterator.indexes = [0, 1, 2, 3, 4] + iterator.shuffle = False + iterator.on_epoch_end() + assert iterator.indexes == [0, 1, 2, 3, 4] + iterator.shuffle = True + while iterator.indexes == sorted(iterator.indexes): + iterator.on_epoch_end() + assert iterator.indexes != [0, 1, 2, 3, 4] + assert sorted(iterator.indexes) == [0, 1, 2, 3, 4] + + def test_get_model_rank_no_model(self): + iterator = object.__new__(KerasIterator) + iterator.model = None + assert iterator._get_model_rank() == 1 + + def test_get_model_rank_single_output_branch(self): + iterator = object.__new__(KerasIterator) + iterator.model = MyLittleModel(shape_inputs=[(14, 1, 2)], shape_outputs=[(3,)]) + assert iterator._get_model_rank() == 1 + + def test_get_model_rank_multiple_output_branch(self): + iterator = object.__new__(KerasIterator) + iterator.model = MyBranchedModel(shape_inputs=[(14, 1, 2)], shape_outputs=[(3,)]) + assert iterator._get_model_rank() == 3 + + def test_get_model_rank_error(self): + iterator = object.__new__(KerasIterator) + iterator.model = mock.MagicMock(return_value=1) + with pytest.raises(TypeError): + iterator._get_model_rank() diff --git a/test/test_data_handling/test_data_distributor.py b/test/test_data_handling/test_data_distributor.py deleted file mode 100644 index d01133b58c37567f557543e7a4663717d15d71c7..0000000000000000000000000000000000000000 --- a/test/test_data_handling/test_data_distributor.py +++ /dev/null @@ -1,121 +0,0 @@ -import math -import os - -import keras -import numpy as np -import pytest - -from mlair.data_handling.data_distributor import Distributor -from mlair.data_handling.data_generator import DataGenerator -from mlair.data_handling import DataPrepJoin -from test.test_modules.test_training import my_test_model - - -class TestDistributor: - - @pytest.fixture - def generator(self): - return DataGenerator(os.path.join(os.path.dirname(__file__), 'data'), 'DEBW107', ['o3', 'temp'], - 'datetime', 'variables', 'o3', statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, - data_preparation=DataPrepJoin) - - @pytest.fixture - def generator_two_stations(self): - return DataGenerator(os.path.join(os.path.dirname(__file__), 'data'), ['DEBW107', 'DEBW013'], - 
['o3', 'temp'], 'datetime', 'variables', 'o3', - statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, - data_preparation=DataPrepJoin) - - @pytest.fixture - def model(self): - return my_test_model(keras.layers.PReLU, 5, 3, 0.1, False) - - @pytest.fixture - def model_with_minor_branch(self): - return my_test_model(keras.layers.PReLU, 5, 3, 0.1, True) - - @pytest.fixture - def distributor(self, generator, model): - return Distributor(generator, model) - - def test_init_defaults(self, distributor): - assert distributor.batch_size == 256 - assert distributor.do_data_permutation is False - - def test_get_model_rank(self, distributor, model_with_minor_branch): - assert distributor._get_model_rank() == 1 - distributor.model = model_with_minor_branch - assert distributor._get_model_rank() == 2 - distributor.model = 1 - - def test_get_number_of_mini_batches(self, distributor): - values = np.zeros((2311, 19)) - assert distributor._get_number_of_mini_batches(values) == math.ceil(2311 / distributor.batch_size) - - def test_distribute_on_batches_single_loop(self, generator_two_stations, model): - d = Distributor(generator_two_stations, model) - for e in d.distribute_on_batches(fit_call=False): - assert e[0].shape[0] <= d.batch_size - - def test_distribute_on_batches_infinite_loop(self, generator_two_stations, model): - d = Distributor(generator_two_stations, model) - elements = [] - for i, e in enumerate(d.distribute_on_batches()): - if i < len(d): - elements.append(e[0]) - elif i == 2 * len(d): # check if all elements are repeated - assert np.testing.assert_array_equal(e[0], elements[i - len(d)]) is None - else: # break when 3rd iteration starts (is called as infinite loop) - break - - def test_len(self, distributor): - assert len(distributor) == math.ceil(len(distributor.generator[0][0]) / 256) - - def test_len_two_stations(self, generator_two_stations, model): - gen = generator_two_stations - d = Distributor(gen, model) - expected = math.ceil(len(gen[0][0]) / 256) + math.ceil(len(gen[1][0]) / 256) - assert len(d) == expected - - def test_permute_data_no_permutation(self, distributor): - x = np.array(range(20)).reshape(2, 10).T - y = np.array(range(10)).reshape(10, 1) - x_perm, y_perm = distributor._permute_data(x, y) - assert np.testing.assert_equal(x, x_perm) is None - assert np.testing.assert_equal(y, y_perm) is None - - def test_permute_data(self, distributor): - x = np.array(range(20)).reshape(2, 10).T - y = np.array(range(10)).reshape(10, 1) - distributor.do_data_permutation = True - x_perm, y_perm = distributor._permute_data(x, y) - assert x_perm[0, 0] == y_perm[0] - assert x_perm[0, 1] == y_perm[0] + 10 - assert x_perm[5, 0] == y_perm[5] - assert x_perm[5, 1] == y_perm[5] + 10 - assert x_perm[-1, 0] == y_perm[-1] - assert x_perm[-1, 1] == y_perm[-1] + 10 - # resort x_perm and compare if equal to x - x_perm.sort(axis=0) - y_perm.sort(axis=0) - assert np.testing.assert_equal(x, x_perm) is None - assert np.testing.assert_equal(y, y_perm) is None - - def test_distribute_on_batches_upsampling_no_extremes_given(self, generator, model): - d = Distributor(generator, model, upsampling=True) - gen_len = d.generator.get_data_generator(0, load_local_tmp_storage=False).get_transposed_label().shape[0] - num_mini_batches = math.ceil(gen_len / d.batch_size) - i = 0 - for i, e in enumerate(d.distribute_on_batches(fit_call=False)): - assert e[0].shape[0] <= d.batch_size - assert i + 1 == num_mini_batches - - def test_distribute_on_batches_upsampling(self, generator, model): - generator.extreme_values = [1] 
- d = Distributor(generator, model, upsampling=True) - gen_len = d.generator.get_data_generator(0, load_local_tmp_storage=False).get_transposed_label().shape[0] - extr_len = d.generator.get_data_generator(0, load_local_tmp_storage=False).get_extremes_label().shape[0] - i = 0 - for i, e in enumerate(d.distribute_on_batches(fit_call=False)): - assert e[0].shape[0] <= d.batch_size - assert i + 1 == math.ceil((gen_len + extr_len) / d.batch_size) diff --git a/test/test_model_modules/test_model_class.py b/test/test_model_modules/test_model_class.py index 6025516ba01abdcb35ea65b9c4570d5a8b0928b5..3e77fd17c4cd8151fe76816abf0bef323adb2e96 100644 --- a/test/test_model_modules/test_model_class.py +++ b/test/test_model_modules/test_model_class.py @@ -12,7 +12,7 @@ class Paddings: class AbstractModelSubClass(AbstractModelClass): def __init__(self): - super().__init__() + super().__init__(shape_inputs=(12, 1, 2), shape_outputs=3) self.test_attr = "testAttr" @@ -20,7 +20,7 @@ class TestAbstractModelClass: @pytest.fixture def amc(self): - return AbstractModelClass() + return AbstractModelClass(shape_inputs=(14, 1, 2), shape_outputs=(3,)) @pytest.fixture def amsc(self): @@ -31,6 +31,8 @@ class TestAbstractModelClass: # assert amc.loss is None assert amc.model_name == "AbstractModelClass" assert amc.custom_objects == {} + assert amc.shape_inputs == (14, 1, 2) + assert amc.shape_outputs == 3 def test_model_property(self, amc): amc.model = keras.Model() @@ -179,8 +181,10 @@ class TestAbstractModelClass: assert amc.compile == amc.model.compile def test_get_settings(self, amc, amsc): - assert amc.get_settings() == {"model_name": "AbstractModelClass"} - assert amsc.get_settings() == {"test_attr": "testAttr", "model_name": "AbstractModelSubClass"} + assert amc.get_settings() == {"model_name": "AbstractModelClass", "shape_inputs": (14, 1, 2), + "shape_outputs": 3} + assert amsc.get_settings() == {"test_attr": "testAttr", "model_name": "AbstractModelSubClass", + "shape_inputs": (12, 1, 2), "shape_outputs": 3} def test_custom_objects(self, amc): amc.custom_objects = {"Test": 123} @@ -200,7 +204,7 @@ class TestMyPaperModel: @pytest.fixture def mpm(self): - return MyPaperModel(window_history_size=6, window_lead_time=4, channels=9) + return MyPaperModel(shape_inputs=[(7, 1, 9)], shape_outputs=[(4,)]) def test_init(self, mpm): # check if loss number of loss functions fit to model outputs diff --git a/test/test_modules/test_experiment_setup.py b/test/test_run_modules/test_experiment_setup.py similarity index 94% rename from test/test_modules/test_experiment_setup.py rename to test/test_run_modules/test_experiment_setup.py index 0f1f7a0cb918b4a1ab4e776fe9f9a563eb244149..abd265f5815d974d6edb474e5a03ed08dc5843cc 100644 --- a/test/test_modules/test_experiment_setup.py +++ b/test/test_run_modules/test_experiment_setup.py @@ -51,8 +51,6 @@ class TestExperimentSetup: # setup for data default_stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] assert data_store.get("stations", "general") == default_stations - assert data_store.get("network", "general") == "AIRBASE" - assert data_store.get("station_type", "general") == "background" assert data_store.get("variables", "general") == list(default_statistics_per_var.keys()) assert data_store.get("statistics_per_var", "general") == default_statistics_per_var assert data_store.get("start", "general") == "1997-01-01" @@ -64,9 +62,9 @@ class TestExperimentSetup: assert data_store.get("window_lead_time", "general") == 3 # interpolation assert data_store.get("dimensions", 
"general") == {'new_index': ['datetime', 'Stations']} - assert data_store.get("interpolation_dim", "general") == "datetime" + assert data_store.get("time_dim", "general") == "datetime" assert data_store.get("interpolation_method", "general") == "linear" - assert data_store.get("limit_nan_fill", "general") == 1 + assert data_store.get("interpolation_limit", "general") == 1 # train parameters assert data_store.get("start", "general.train") == "1997-01-01" assert data_store.get("end", "general.train") == "2007-12-31" @@ -93,7 +91,7 @@ class TestExperimentSetup: stations=['DEBY053', 'DEBW059', 'DEBW027'], network="INTERNET", station_type="background", variables=["o3", "temp"], start="1999-01-01", end="2001-01-01", window_history_size=4, target_var="relhum", target_dim="target", window_lead_time=10, dimensions="dim1", - interpolation_dim="int_dim", interpolation_method="cubic", limit_nan_fill=5, train_start="2000-01-01", + time_dim="int_dim", interpolation_method="cubic", interpolation_limit=5, train_start="2000-01-01", train_end="2000-01-02", val_start="2000-01-03", val_end="2000-01-04", test_start="2000-01-05", test_end="2000-01-06", use_all_stations_on_all_data_sets=False, trainable=False, fraction_of_train=0.5, experiment_path=experiment_path, create_new_model=True, val_min_length=20) @@ -125,9 +123,9 @@ class TestExperimentSetup: assert data_store.get("window_lead_time", "general") == 10 # interpolation assert data_store.get("dimensions", "general") == "dim1" - assert data_store.get("interpolation_dim", "general") == "int_dim" + assert data_store.get("time_dim", "general") == "int_dim" assert data_store.get("interpolation_method", "general") == "cubic" - assert data_store.get("limit_nan_fill", "general") == 5 + assert data_store.get("interpolation_limit", "general") == 5 # train parameters assert data_store.get("start", "general.train") == "2000-01-01" assert data_store.get("end", "general.train") == "2000-01-02" diff --git a/test/test_modules/test_model_setup.py b/test/test_run_modules/test_model_setup.py similarity index 52% rename from test/test_modules/test_model_setup.py rename to test/test_run_modules/test_model_setup.py index 2b83d2549ea2f649091d2f16b67bf0d93789af52..1b3e43b2bbfda44f1a5b5463e876adc578360ff3 100644 --- a/test/test_modules/test_model_setup.py +++ b/test/test_run_modules/test_model_setup.py @@ -1,9 +1,11 @@ import os +import numpy as np +import shutil import pytest -from mlair.data_handling import DataPrepJoin -from mlair.data_handling.data_generator import DataGenerator +from mlair.data_handler import KerasIterator +from mlair.data_handler import DataCollection from mlair.helpers.datastore import EmptyScope from mlair.model_modules.keras_extensions import CallbackHandler from mlair.model_modules.model_class import AbstractModelClass, MyLittleModel @@ -29,29 +31,40 @@ class TestModelSetup: RunEnvironment().__del__() @pytest.fixture - def gen(self): - return DataGenerator(os.path.join(os.path.dirname(__file__), 'data'), 'DEBW107', ['o3', 'temp'], - 'datetime', 'variables', 'o3', statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, - data_preparation=DataPrepJoin) + def path(self): + p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") + shutil.rmtree(p, ignore_errors=True) if os.path.exists(p) else None + yield p + shutil.rmtree(p, ignore_errors=True) @pytest.fixture - def setup_with_gen(self, setup, gen): - setup.data_store.set("generator", gen, "general.train") - setup.data_store.set("window_history_size", gen.window_history_size, "general") - 
setup.data_store.set("window_lead_time", gen.window_lead_time, "general") - setup.data_store.set("channels", 2, "general") + def keras_iterator(self, path): + coll = [] + for i in range(3): + coll.append(DummyData(50 + i)) + data_coll = DataCollection(collection=coll) + KerasIterator(data_coll, 25, path) + return data_coll + + @pytest.fixture + def setup_with_gen(self, setup, keras_iterator): + setup.data_store.set("data_collection", keras_iterator, "train") + shape_inputs = [keras_iterator[0].get_X()[0].shape[1:]] + setup.data_store.set("shape_inputs", shape_inputs, "model") + shape_outputs = [keras_iterator[0].get_Y()[0].shape[1:]] + setup.data_store.set("shape_outputs", shape_outputs, "model") yield setup RunEnvironment().__del__() @pytest.fixture - def setup_with_gen_tiny(self, setup, gen): - setup.data_store.set("generator", gen, "general.train") + def setup_with_gen_tiny(self, setup, keras_iterator): + setup.data_store.set("data_collection", keras_iterator, "train") yield setup RunEnvironment().__del__() @pytest.fixture def setup_with_model(self, setup): - setup.model = AbstractModelClass() + setup.model = AbstractModelClass(shape_inputs=(12, 1), shape_outputs=2) setup.model.test_param = "42" yield setup RunEnvironment().__del__() @@ -89,14 +102,17 @@ class TestModelSetup: assert setup_with_gen.model is None setup_with_gen.build_model() assert isinstance(setup_with_gen.model, AbstractModelClass) - expected = {"window_history_size", "window_lead_time", "channels", "dropout_rate", "regularizer", "initial_lr", - "optimizer", "activation"} + expected = {"lr_decay", "model_name", "dropout_rate", "regularizer", "initial_lr", "optimizer", "activation", + "shape_inputs", "shape_outputs"} assert expected <= self.current_scope_as_set(setup_with_gen) - def test_set_channels(self, setup_with_gen_tiny): - assert len(setup_with_gen_tiny.data_store.search_name("channels")) == 0 - setup_with_gen_tiny._set_channels() - assert setup_with_gen_tiny.data_store.get("channels", setup_with_gen_tiny.scope) == 2 + def test_set_shapes(self, setup_with_gen_tiny): + assert len(setup_with_gen_tiny.data_store.search_name("shape_inputs")) == 0 + assert len(setup_with_gen_tiny.data_store.search_name("shape_outputs")) == 0 + setup_with_gen_tiny._set_shapes() + assert setup_with_gen_tiny.data_store.get("shape_inputs", setup_with_gen_tiny.scope) == [(14, 1, 5), (10, 1, 2), + (1, 1, 2)] + assert setup_with_gen_tiny.data_store.get("shape_outputs", setup_with_gen_tiny.scope) == [(5,), (3,)] def test_load_weights(self): pass @@ -109,3 +125,20 @@ class TestModelSetup: def test_init(self): pass + + +class DummyData: + + def __init__(self, number_of_samples=np.random.randint(100, 150)): + self.number_of_samples = number_of_samples + + def get_X(self, upsampling=False, as_numpy=True): + X1 = np.random.randint(0, 10, size=(self.number_of_samples, 14, 1, 5)) # samples, window, variables + X2 = np.random.randint(21, 30, size=(self.number_of_samples, 10, 1, 2)) # samples, window, variables + X3 = np.random.randint(-5, 0, size=(self.number_of_samples, 1, 1, 2)) # samples, window, variables + return [X1, X2, X3] + + def get_Y(self, upsampling=False, as_numpy=True): + Y1 = np.random.randint(0, 10, size=(self.number_of_samples, 5)) # samples, window + Y2 = np.random.randint(21, 30, size=(self.number_of_samples, 3)) # samples, window + return [Y1, Y2] \ No newline at end of file diff --git a/test/test_modules/test_partition_check.py b/test/test_run_modules/test_partition_check.py similarity index 95% rename from 
test/test_modules/test_partition_check.py rename to test/test_run_modules/test_partition_check.py index 1e576a8ce47c98e395468b76d3496dafa3cc0525..ba5b3d7ef127258eaa6c4f2a1a0b4d0b531eeac5 100644 --- a/test/test_modules/test_partition_check.py +++ b/test/test_run_modules/test_partition_check.py @@ -5,7 +5,6 @@ import mock from mlair.run_modules.experiment_setup import ExperimentSetup from mlair.run_modules.partition_check import PartitionCheck from mlair.run_modules.run_environment import RunEnvironment -from mlair.configuration import get_host class TestPartitionCheck: @@ -24,6 +23,7 @@ class TestPartitionCheck: @mock.patch("os.path.exists", return_value=False) @mock.patch("os.makedirs", side_effect=None) def obj_with_exp_setup_login(self, mock_host, mock_user, mock_path, mock_check): + RunEnvironment().__del__() ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, station_type="background") pre = object.__new__(PartitionCheck) @@ -37,6 +37,7 @@ class TestPartitionCheck: @mock.patch("os.path.exists", return_value=False) @mock.patch("os.makedirs", side_effect=None) def obj_with_exp_setup_compute(self, mock_host, mock_user, mock_path, mock_check): + RunEnvironment().__del__() ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, station_type="background") pre = object.__new__(PartitionCheck) @@ -71,5 +72,5 @@ class TestPartitionCheck: @mock.patch("os.path.exists", return_value=False) @mock.patch("os.makedirs", side_effect=None) def test_run_compute(self, mock_host, mock_user, mock_path, mock_check, obj_with_exp_setup_compute, caplog): - - assert obj_with_exp_setup_compute.__next__()._run() is None + obj = obj_with_exp_setup_compute.__next__() + assert obj._run() is None diff --git a/test/test_modules/test_post_processing.py b/test/test_run_modules/test_post_processing.py similarity index 100% rename from test/test_modules/test_post_processing.py rename to test/test_run_modules/test_post_processing.py diff --git a/test/test_modules/test_pre_processing.py b/test/test_run_modules/test_pre_processing.py similarity index 68% rename from test/test_modules/test_pre_processing.py rename to test/test_run_modules/test_pre_processing.py index a35e810c2d62ab746004442bffee51d85dc17ab2..97e73204068d334590ee98271080acddf29dfc5f 100644 --- a/test/test_modules/test_pre_processing.py +++ b/test/test_run_modules/test_pre_processing.py @@ -2,12 +2,11 @@ import logging import pytest -from mlair.data_handling import DataPrepJoin -from mlair.data_handling.data_generator import DataGenerator +from mlair.data_handler import DefaultDataPreparation, DataCollection, AbstractDataPreparation from mlair.helpers.datastore import NameNotFoundInScope from mlair.helpers import PyTestRegex from mlair.run_modules.experiment_setup import ExperimentSetup -from mlair.run_modules.pre_processing import PreProcessing, DEFAULT_ARGS_LIST, DEFAULT_KWARGS_LIST +from mlair.run_modules.pre_processing import PreProcessing from mlair.run_modules.run_environment import RunEnvironment @@ -29,7 +28,7 @@ class TestPreProcessing: def obj_with_exp_setup(self): ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, station_type="background", - data_preparation=DataPrepJoin) + data_preparation=DefaultDataPreparation) pre = object.__new__(PreProcessing) 
super(PreProcessing, pre).__init__() yield pre @@ -42,25 +41,26 @@ class TestPreProcessing: caplog.set_level(logging.INFO) with PreProcessing(): assert caplog.record_tuples[0] == ('root', 20, 'PreProcessing started') - assert caplog.record_tuples[1] == ('root', 20, 'check valid stations started (all)') + assert caplog.record_tuples[1] == ('root', 20, 'check valid stations started (preprocessing)') assert caplog.record_tuples[-1] == ('root', 20, PyTestRegex(r'run for \d+:\d+:\d+ \(hh:mm:ss\) to check 5 ' r'station\(s\). Found 5/5 valid stations.')) RunEnvironment().__del__() def test_run(self, obj_with_exp_setup): - assert obj_with_exp_setup.data_store.search_name("generator") == [] + assert obj_with_exp_setup.data_store.search_name("data_collection") == [] assert obj_with_exp_setup._run() is None - assert obj_with_exp_setup.data_store.search_name("generator") == sorted(["general.train", "general.val", - "general.train_val", "general.test"]) + assert obj_with_exp_setup.data_store.search_name("data_collection") == sorted(["general.train", "general.val", + "general.train_val", + "general.test"]) def test_split_train_val_test(self, obj_with_exp_setup): - assert obj_with_exp_setup.data_store.search_name("generator") == [] + assert obj_with_exp_setup.data_store.search_name("data_collection") == [] obj_with_exp_setup.split_train_val_test() data_store = obj_with_exp_setup.data_store - expected_params = ["generator", "start", "end", "stations", "permute_data", "min_length", "extreme_values", - "extremes_on_right_tail_only", "upsampling"] + expected_params = ["data_collection", "start", "end", "stations", "permute_data", "min_length", + "extreme_values", "extremes_on_right_tail_only", "upsampling"] assert data_store.search_scope("general.train") == sorted(expected_params) - assert data_store.search_name("generator") == sorted(["general.train", "general.val", "general.test", + assert data_store.search_name("data_collection") == sorted(["general.train", "general.val", "general.test", "general.train_val"]) def test_create_set_split_not_all_stations(self, caplog, obj_with_exp_setup): @@ -69,9 +69,9 @@ class TestPreProcessing: obj_with_exp_setup.create_set_split(slice(0, 2), "awesome") assert ('root', 10, "Awesome stations (len=2): ['DEBW107', 'DEBY081']") in caplog.record_tuples data_store = obj_with_exp_setup.data_store - assert isinstance(data_store.get("generator", "general.awesome"), DataGenerator) + assert isinstance(data_store.get("data_collection", "general.awesome"), DataCollection) with pytest.raises(NameNotFoundInScope): - data_store.get("generator", "general") + data_store.get("data_collection", "general") assert data_store.get("stations", "general.awesome") == ["DEBW107", "DEBY081"] def test_create_set_split_all_stations(self, caplog, obj_with_exp_setup): @@ -80,22 +80,22 @@ class TestPreProcessing: message = "Awesome stations (len=6): ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001']" assert ('root', 10, message) in caplog.record_tuples data_store = obj_with_exp_setup.data_store - assert isinstance(data_store.get("generator", "general.awesome"), DataGenerator) + assert isinstance(data_store.get("data_collection", "general.awesome"), DataCollection) with pytest.raises(NameNotFoundInScope): - data_store.get("generator", "general") + data_store.get("data_collection", "general") assert data_store.get("stations", "general.awesome") == ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] @pytest.mark.parametrize("name", (None, "tester")) - def test_check_valid_stations(self, 
caplog, obj_with_exp_setup, name): + def test_validate_station(self, caplog, obj_with_exp_setup, name): pre = obj_with_exp_setup caplog.set_level(logging.INFO) - args = pre.data_store.create_args_dict(DEFAULT_ARGS_LIST) - kwargs = pre.data_store.create_args_dict(DEFAULT_KWARGS_LIST) stations = pre.data_store.get("stations", "general") - valid_stations = pre.check_valid_stations(args, kwargs, stations, name=name) + data_preparation = pre.data_store.get("data_preparation") + collection, valid_stations = pre.validate_station(data_preparation, stations, set_name=name) + assert isinstance(collection, DataCollection) assert len(valid_stations) < len(stations) assert valid_stations == stations[:-1] - expected = 'check valid stations started (tester)' if name else 'check valid stations started' + expected = "check valid stations started" + ' (%s)' % (name if name else 'all') assert caplog.record_tuples[0] == ('root', 20, expected) assert caplog.record_tuples[-1] == ('root', 20, PyTestRegex(r'run for \d+:\d+:\d+ \(hh:mm:ss\) to check 6 ' r'station\(s\). Found 5/6 valid stations.')) @@ -107,3 +107,11 @@ class TestPreProcessing: assert dummy_list[val] == list(range(10, 13)) assert dummy_list[test] == list(range(13, 15)) assert dummy_list[train_val] == list(range(0, 13)) + + def test_transformation(self): + pre = object.__new__(PreProcessing) + data_preparation = AbstractDataPreparation + stations = ['DEBW107', 'DEBY081'] + assert pre.transformation(data_preparation, stations) is None + class data_preparation_no_trans: pass + assert pre.transformation(data_preparation_no_trans, stations) is None diff --git a/test/test_modules/test_run_environment.py b/test/test_run_modules/test_run_environment.py similarity index 100% rename from test/test_modules/test_run_environment.py rename to test/test_run_modules/test_run_environment.py diff --git a/test/test_modules/test_training.py b/test/test_run_modules/test_training.py similarity index 72% rename from test/test_modules/test_training.py rename to test/test_run_modules/test_training.py index b80570bb51ec5886f163842a3a40411148df3419..1fec8f4e56e2925bff0bc4af859dac1fe5fbb2b6 100644 --- a/test/test_modules/test_training.py +++ b/test/test_run_modules/test_training.py @@ -9,9 +9,7 @@ import mock import pytest from keras.callbacks import History -from mlair.data_handling import DataPrepJoin -from mlair.data_handling.data_distributor import Distributor -from mlair.data_handling.data_generator import DataGenerator +from mlair.data_handler import DataCollection, KerasIterator, DefaultDataPreparation from mlair.helpers import PyTestRegex from mlair.model_modules.flatten import flatten_tail from mlair.model_modules.inception_model import InceptionModelBase @@ -20,7 +18,7 @@ from mlair.run_modules.run_environment import RunEnvironment from mlair.run_modules.training import Training -def my_test_model(activation, window_history_size, channels, dropout_rate, add_minor_branch=False): +def my_test_model(activation, window_history_size, channels, output_size, dropout_rate, add_minor_branch=False): inception_model = InceptionModelBase() conv_settings_dict1 = { 'tower_1': {'reduction_filter': 8, 'tower_filter': 8 * 2, 'tower_kernel': (3, 1), 'activation': activation}, @@ -29,7 +27,6 @@ def my_test_model(activation, window_history_size, channels, dropout_rate, add_m X_input = keras.layers.Input(shape=(window_history_size + 1, 1, channels)) X_in = inception_model.inception_block(X_input, conv_settings_dict1, pool_settings_dict1) if add_minor_branch: - # out = [flatten_tail(X_in, 
'Minor_1', activation=activation)] out = [flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=4, output_activation='linear', reduction_filter=64, name='Minor_1', dropout_rate=dropout_rate, @@ -37,8 +34,7 @@ def my_test_model(activation, window_history_size, channels, dropout_rate, add_m else: out = [] X_in = keras.layers.Dropout(dropout_rate)(X_in) - # out.append(flatten_tail(X_in, 'Main', activation=activation)) - out.append(flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=4, + out.append(flatten_tail(X_in, inner_neurons=64, activation=activation, output_neurons=output_size, output_activation='linear', reduction_filter=64, name='Main', dropout_rate=dropout_rate, )) @@ -48,7 +44,7 @@ def my_test_model(activation, window_history_size, channels, dropout_rate, add_m class TestTraining: @pytest.fixture - def init_without_run(self, path: str, model: keras.Model, callbacks: CallbackHandler, model_path): + def init_without_run(self, path: str, model: keras.Model, callbacks: CallbackHandler, model_path, batch_path): obj = object.__new__(Training) super(Training, obj).__init__() obj.model = model @@ -62,15 +58,18 @@ class TestTraining: obj.lr_sc = lr obj.hist = hist obj.experiment_name = "TestExperiment" - obj.data_store.set("generator", mock.MagicMock(return_value="mock_train_gen"), "general.train") - obj.data_store.set("generator", mock.MagicMock(return_value="mock_val_gen"), "general.val") - obj.data_store.set("generator", mock.MagicMock(return_value="mock_test_gen"), "general.test") + obj.data_store.set("data_collection", mock.MagicMock(return_value="mock_train_gen"), "general.train") + obj.data_store.set("data_collection", mock.MagicMock(return_value="mock_val_gen"), "general.val") + obj.data_store.set("data_collection", mock.MagicMock(return_value="mock_test_gen"), "general.test") os.makedirs(path) obj.data_store.set("experiment_path", path, "general") + os.makedirs(batch_path) + obj.data_store.set("batch_path", batch_path, "general") os.makedirs(model_path) obj.data_store.set("model_path", model_path, "general") obj.data_store.set("model_name", os.path.join(model_path, "test_model.h5"), "general.model") obj.data_store.set("experiment_name", "TestExperiment", "general") + path_plot = os.path.join(path, "plots") os.makedirs(path_plot) obj.data_store.set("plot_path", path_plot, "general") @@ -109,14 +108,35 @@ class TestTraining: return os.path.join(path, "model") @pytest.fixture - def generator(self, path): - return DataGenerator(os.path.join(os.path.dirname(__file__), 'data'), ['DEBW107'], ['o3', 'temp'], 'datetime', - 'variables', 'o3', statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, - data_preparation=DataPrepJoin) + def batch_path(self, path): + return os.path.join(path, "batch") + + @pytest.fixture + def window_history_size(self): + return 7 + + @pytest.fixture + def window_lead_time(self): + return 2 @pytest.fixture - def model(self): - return my_test_model(keras.layers.PReLU, 7, 2, 0.1, False) + def statistics_per_var(self): + return {'o3': 'dma8eu', 'temp': 'maximum'} + + @pytest.fixture + def data_collection(self, path, window_history_size, window_lead_time, statistics_per_var): + data_prep = DefaultDataPreparation.build(['DEBW107'], data_path=os.path.join(os.path.dirname(__file__), 'data'), + statistics_per_var=statistics_per_var, station_type="background", + network="AIRBASE", sampling="daily", target_dim="variables", + target_var="o3", time_dim="datetime", + window_history_size=window_history_size, + 
window_lead_time=window_lead_time, name_affix="train") + return DataCollection([data_prep]) + + @pytest.fixture + def model(self, window_history_size, window_lead_time, statistics_per_var): + channels = len(list(statistics_per_var.keys())) + return my_test_model(keras.layers.PReLU, window_history_size, channels, window_lead_time, 0.1, False) @pytest.fixture def callbacks(self, path): @@ -130,29 +150,31 @@ class TestTraining: return clbk, hist, lr @pytest.fixture - def ready_to_train(self, generator: DataGenerator, init_without_run: Training): - init_without_run.train_set = Distributor(generator, init_without_run.model, init_without_run.batch_size) - init_without_run.val_set = Distributor(generator, init_without_run.model, init_without_run.batch_size) + def ready_to_train(self, data_collection: DataCollection, init_without_run: Training, batch_path: str): + batch_size = init_without_run.batch_size + model = init_without_run.model + init_without_run.train_set = KerasIterator(data_collection, batch_size, batch_path, model=model, name="train") + init_without_run.val_set = KerasIterator(data_collection, batch_size, batch_path, model=model, name="val") init_without_run.model.compile(optimizer=keras.optimizers.SGD(), loss=keras.losses.mean_absolute_error) return init_without_run @pytest.fixture - def ready_to_run(self, generator, init_without_run): + def ready_to_run(self, data_collection, init_without_run): obj = init_without_run - obj.data_store.set("generator", generator, "general.train") - obj.data_store.set("generator", generator, "general.val") - obj.data_store.set("generator", generator, "general.test") + obj.data_store.set("data_collection", data_collection, "general.train") + obj.data_store.set("data_collection", data_collection, "general.val") + obj.data_store.set("data_collection", data_collection, "general.test") obj.model.compile(optimizer=keras.optimizers.SGD(), loss=keras.losses.mean_absolute_error) return obj @pytest.fixture - def ready_to_init(self, generator, model, callbacks, path, model_path): + def ready_to_init(self, data_collection, model, callbacks, path, model_path, batch_path): os.makedirs(path) os.makedirs(model_path) obj = RunEnvironment() - obj.data_store.set("generator", generator, "general.train") - obj.data_store.set("generator", generator, "general.val") - obj.data_store.set("generator", generator, "general.test") + obj.data_store.set("data_collection", data_collection, "general.train") + obj.data_store.set("data_collection", data_collection, "general.val") + obj.data_store.set("data_collection", data_collection, "general.test") model.compile(optimizer=keras.optimizers.SGD(), loss=keras.losses.mean_absolute_error) obj.data_store.set("model", model, "general.model") obj.data_store.set("model_path", model_path, "general") @@ -167,6 +189,8 @@ class TestTraining: obj.data_store.set("experiment_path", path, "general") obj.data_store.set("trainable", True, "general") obj.data_store.set("create_new_model", True, "general") + os.makedirs(batch_path) + obj.data_store.set("batch_path", batch_path, "general") path_plot = os.path.join(path, "plots") os.makedirs(path_plot) obj.data_store.set("plot_path", path_plot, "general") @@ -177,6 +201,13 @@ class TestTraining: def test_init(self, ready_to_init): assert isinstance(Training(), Training) # just test, if nothing fails + def test_no_training(self, ready_to_init, caplog): + caplog.set_level(logging.INFO) + ready_to_init.data_store.set("trainable", False) + Training() + message = "No training has started, because trainable 
parameter was false." + assert caplog.record_tuples[-2] == ("root", 20, message) + def test_run(self, ready_to_run): assert ready_to_run._run() is None # just test, if nothing fails @@ -188,8 +219,8 @@ class TestTraining: def test_set_gen(self, init_without_run): assert init_without_run.train_set is None init_without_run._set_gen("train") - assert isinstance(init_without_run.train_set, Distributor) - assert init_without_run.train_set.generator.return_value == "mock_train_gen" + assert isinstance(init_without_run.train_set, KerasIterator) + assert init_without_run.train_set._collection.return_value == "mock_train_gen" def test_set_generators(self, init_without_run): sets = ["train", "val", "test"] @@ -197,7 +228,7 @@ class TestTraining: init_without_run.set_generators() assert not all([getattr(init_without_run, f"{obj}_set") is None for obj in sets]) assert all( - [getattr(init_without_run, f"{obj}_set").generator.return_value == f"mock_{obj}_gen" for obj in sets]) + [getattr(init_without_run, f"{obj}_set")._collection.return_value == f"mock_{obj}_gen" for obj in sets]) def test_train(self, ready_to_train, path): assert not hasattr(ready_to_train.model, "history")
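
Usage sketch (not part of the patch itself): the hunks above replace the old DataGenerator/Distributor pair with DataCollection plus the keras-Sequence-style KerasIterator, which is handed directly to fit_generator instead of going through distribute_on_batches(). The snippet below assembles these pieces the way the new tests and the Training run module do. ToyStation and the tiny keras network are made-up stand-ins, and the KerasIterator argument names (collection, batch_size, batch_path, model=..., name=..., shuffle_batches=...) are inferred from the _set_gen and test hunks rather than verified against the full implementation.

import os

import keras
import numpy as np

from mlair.data_handler import DataCollection, KerasIterator


class ToyStation:
    """Illustrative stand-in for a prepared data handler; KerasIterator only needs get_X/get_Y."""

    def __init__(self, number_of_samples=120):
        self.number_of_samples = number_of_samples

    def get_X(self, upsampling=False, as_numpy=True):
        # one input branch: samples, window, 1, variables
        return [np.random.rand(self.number_of_samples, 14, 1, 2)]

    def get_Y(self, upsampling=False, as_numpy=True):
        # one output branch: samples, lead times
        return [np.random.rand(self.number_of_samples, 3)]


# a single-branch toy network matching the shapes produced by ToyStation
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=(14, 1, 2)),
    keras.layers.Dense(3, activation="linear"),
])
model.compile(optimizer=keras.optimizers.SGD(), loss=keras.losses.mean_absolute_error)

# the collection simply wraps prepared stations; the iterator pickles its mini
# batches below batch_path while it is constructed (see test_prepare_batches above)
collection = DataCollection([ToyStation(), ToyStation(90)])
batch_path = os.path.join(os.getcwd(), "batch_data")
train_set = KerasIterator(collection, 32, batch_path, model=model, name="train", shuffle_batches=True)

# unlike the removed Distributor, the iterator itself goes into fit_generator;
# per-epoch shuffling happens in on_epoch_end instead of _permute_data
model.fit_generator(generator=train_set, steps_per_epoch=len(train_set), epochs=2, verbose=2)

Passing a branched network such as MyBranchedModel only changes the model argument; judging from test_get_model_rank_multiple_output_branch, the iterator appears to use the model rank to repeat the targets per output branch, so no manual target handling is needed.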