diff --git a/run.py b/run.py
index 8e4d9c46f4a39224335dd65b689f519943166d0f..556cd0b59ed023178fa7e6df1b5b03b9f6c5f157 100644
--- a/run.py
+++ b/run.py
@@ -17,7 +17,7 @@ def main(parser_args):
 
     with RunEnvironment():
         ExperimentSetup(parser_args, stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'],
-                        station_type='background', trainable=True)
+                        station_type='background', trainable=False, create_new_model=False)
         PreProcessing()
 
         ModelSetup()
diff --git a/src/data_handling/bootstraps.py b/src/data_handling/bootstraps.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c6c2c57f36bafd410987e721164ce2d52c98bfc
--- /dev/null
+++ b/src/data_handling/bootstraps.py
@@ -0,0 +1,205 @@
+__author__ = 'Felix Kleinert, Lukas Leufen'
+__date__ = '2020-02-07'
+
+
+from src.run_modules.run_environment import RunEnvironment
+from src.data_handling.data_generator import DataGenerator
+import numpy as np
+import logging
+import dask.array as da
+import xarray as xr
+import os
+import re
+from src import helpers
+
+
+class BootStrapGenerator:
+
+    def __init__(self, orig_generator, boots, chunksize, bootstrap_path):
+        self.orig_generator: DataGenerator = orig_generator
+        self.stations = self.orig_generator.stations
+        self.variables = self.orig_generator.variables
+        self.boots = boots
+        self.chunksize = chunksize
+        self.bootstrap_path = bootstrap_path
+        self._iterator = 0
+        self.bootstrap_meta = []
+
+    def __len__(self):
+        """
+        display the number of stations
+        """
+        return len(self.orig_generator)*self.boots*len(self.variables)
+
+    def get_labels(self, key):
+        _, label = self.orig_generator[key]
+        for _ in range(self.boots):
+            yield label
+
+    def get_generator(self):
+        """
+        This is the implementation of the __next__ method of the iterator protocol. Get the data generator, and return
+        the history and label data of this generator.
+        :return:
+        """
+        while True:
+            for i, data in enumerate(self.orig_generator):
+                station = self.orig_generator.get_station_key(i)
+                logging.info(f"station: {station}")
+                hist, label = data
+                len_of_label = len(label)
+                shuffled_data = self.load_boot_data(station)
+                for var in self.variables:
+                    logging.info(f"  var: {var}")
+                    for boot in range(self.boots):
+                        logging.debug(f"boot: {boot}")
+                        boot_hist = hist.sel(variables=helpers.list_pop(self.variables, var))
+                        shuffled_var = shuffled_data.sel(variables=var, boots=boot).expand_dims("variables").drop("boots").transpose("datetime", "window", "Stations", "variables")
+                        boot_hist = boot_hist.combine_first(shuffled_var)
+                        boot_hist = boot_hist.sortby("variables")
+                        self.bootstrap_meta.extend([[var, station]]*len_of_label)
+                        yield boot_hist, label
+            return
+
+    def get_orig_prediction(self, path, file_name, prediction_name="CNN"):
+        file = os.path.join(path, file_name)
+        data = xr.open_dataarray(file)
+        for _ in range(self.boots):
+            yield data.sel(type=prediction_name).squeeze()
+
+    def load_boot_data(self, station):
+        files = os.listdir(self.bootstrap_path)
+        regex = re.compile(rf"{station}_\w*\.nc")
+        file_name = os.path.join(self.bootstrap_path, list(filter(regex.search, files))[0])
+        shuffled_data = xr.open_dataarray(file_name, chunks=100)
+        return shuffled_data
+
+
+class BootStraps(RunEnvironment):
+
+    def __init__(self, data, bootstrap_path, number_bootstraps=10):
+
+        super().__init__()
+        self.data: DataGenerator = data
+        self.number_bootstraps = number_bootstraps
+        self.bootstrap_path = bootstrap_path
+        self.chunks = self.get_chunk_size()
+        self.create_shuffled_data()
+        self._boot_strap_generator = BootStrapGenerator(self.data, self.number_bootstraps, self.chunks, self.bootstrap_path)
+
+    def get_boot_strap_meta(self):
+        return self._boot_strap_generator.bootstrap_meta
+
+    def boot_strap_generator(self):
+        return self._boot_strap_generator.get_generator()
+
+    def get_boot_strap_generator_length(self):
+        return self._boot_strap_generator.__len__()
+
+    def get_labels(self, key):
+        labels_list = []
+        chunks = None
+        for labels in self._boot_strap_generator.get_labels(key):
+            if len(labels_list) == 0:
+                chunks = (100, labels.data.shape[1])
+            labels_list.append(da.from_array(labels.data, chunks=chunks))
+        labels_out = da.concatenate(labels_list, axis=0)
+        return labels_out.compute()
+
+    def get_orig_prediction(self, path, name):
+        labels_list = []
+        chunks = None
+        for labels in self._boot_strap_generator.get_orig_prediction(path, name):
+            if len(labels_list) == 0:
+                chunks = (100, labels.data.shape[1])
+            labels_list.append(da.from_array(labels.data, chunks=chunks))
+        labels_out = da.concatenate(labels_list, axis=0)
+        labels_out = labels_out.compute()
+        return labels_out[~np.isnan(labels_out).any(axis=1), :]
+
+    def get_chunk_size(self):
+        hist, _ = self.data[0]
+        return (100, *hist.shape[1:], self.number_bootstraps)
+
+    def create_shuffled_data(self):
+        """
+        Create shuffled data. Use original test data, add dimension 'boots' with length number of bootstraps and insert
+        randomly selected variables. If there is a suitable local file for requested window size and number of
+        bootstraps, no additional file will be created inside this function.
+        """
+        logging.info("create shuffled bootstrap data")
+        variables_str = '_'.join(sorted(self.data.variables))
+        window = self.data.window_history_size
+        for station in self.data.stations:
+            valid, nboot = self.valid_bootstrap_file(station, variables_str, window)
+            if not valid:
+                logging.info(f'create bootstap data for {station}')
+                hist, _ = self.data[station]
+                data = hist.copy()
+                file_name = f"{station}_{variables_str}_hist{window}_nboots{nboot}_shuffled.nc"
+                file_path = os.path.join(self.bootstrap_path, file_name)
+                data = data.expand_dims({'boots': range(nboot)}, axis=-1)
+                shuffled_variable = []
+                for i, var in enumerate(data.coords['variables']):
+                    single_variable = data.sel(variables=var).values
+                    shuffled_variable.append(self.shuffle_single_variable(single_variable, chunks=(100, *data.shape[1:3], data.shape[-1])))
+                shuffled_variable_da = da.stack(shuffled_variable, axis=-2, ).rechunk("auto")
+                shuffled_data = xr.DataArray(shuffled_variable_da, coords=data.coords, dims=data.dims)
+                shuffled_data.to_netcdf(file_path)
+
+    def valid_bootstrap_file(self, station, variables, window):
+        """
+        Compare local bootstrap file with given settings for station, variables, window and number of bootstraps. If a
+        match was found, this method returns a tuple (True, None). In any other case, it returns (False, max_nboot),
+        where max_nboot is the highest boot number found in the local storage. A match is defined so that the window
+        length is ge than given window size form args and the number of boots is also ge than the given number of boots
+        from this class. Furthermore, this functions deletes local files, if the match the station pattern but don't fit
+        the window and bootstrap condition. This is performed, because it is assumed, that the corresponding file will
+        be created with a longer or at least same window size and numbers of bootstraps.
+        :param station:
+        :param variables:
+        :param window:
+        :return:
+        """
+        regex = re.compile(rf"{station}_{variables}_hist(\d+)_nboots(\d+)_shuffled")
+        max_nboot = self.number_bootstraps
+        for file in os.listdir(self.bootstrap_path):
+            match = regex.match(file)
+            if match:
+                window_file = int(match.group(1))
+                nboot_file = int(match.group(2))
+                max_nboot = max([max_nboot, nboot_file])
+                if (window_file >= window) and (nboot_file >= self.number_bootstraps):
+                    return True, None
+                else:
+                    os.remove(os.path.join(self.bootstrap_path, file))
+        return False, max_nboot
+
+    @staticmethod
+    def shuffle_single_variable(data: da.array, chunks) -> np.ndarray:
+        size = data.shape
+        return da.random.choice(data.reshape(-1,), size=size, chunks=chunks)
+
+
+if __name__ == "__main__":
+
+    from src.run_modules.experiment_setup import ExperimentSetup
+    from src.run_modules.run_environment import RunEnvironment
+    from src.run_modules.pre_processing import PreProcessing
+
+    formatter = '%(asctime)s - %(levelname)s: %(message)s  [%(filename)s:%(funcName)s:%(lineno)s]'
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    with RunEnvironment() as run_env:
+        ExperimentSetup(stations=['DEBW107', 'DEBY081', 'DEBW013'],
+                        station_type='background', trainable=True, window_history_size=9)
+        PreProcessing()
+
+        data = run_env.data_store.get("generator", "general.test")
+        path = run_env.data_store.get("bootstrap_path", "general")
+        number_bootstraps = 10
+
+        boots = BootStraps(data, path, number_bootstraps)
+        for b in boots.boot_strap_generator():
+            a, c = b
+        logging.info(f"len is {len(boots.get_boot_strap_meta())}")
diff --git a/src/data_handling/data_generator.py b/src/data_handling/data_generator.py
index cf4a0e3d8bc2483e787dfea31c5c9a32fb437fe1..60e2da20dfb8d0c4bf9d6ae633a60591a27a9b2b 100644
--- a/src/data_handling/data_generator.py
+++ b/src/data_handling/data_generator.py
@@ -80,8 +80,7 @@ class DataGenerator(keras.utils.Sequence):
             data = self.get_data_generator()
             self._iterator += 1
             if data.history is not None and data.label is not None:  # pragma: no branch
-                return data.history.transpose("datetime", "window", "Stations", "variables"), \
-                    data.label.squeeze("Stations").transpose("datetime", "window")
+                return data.get_transposed_history(), data.get_transposed_label()
             else:
                 self.__next__()  # pragma: no cover
         else:
@@ -94,7 +93,7 @@ class DataGenerator(keras.utils.Sequence):
         :return: The generator's time series of history data and its labels
         """
         data = self.get_data_generator(key=item)
-        return data.get_transposed_history(), data.label.squeeze("Stations").transpose("datetime", "window")
+        return data.get_transposed_history(), data.get_transposed_label()
 
     def setup_transformation(self, transformation):
         if transformation is None:
diff --git a/src/data_handling/data_preparation.py b/src/data_handling/data_preparation.py
index 98b47a6df3825581564fa9aaef7be8698408760e..594a4733f73e3d18025414f2d382d1840799abdb 100644
--- a/src/data_handling/data_preparation.py
+++ b/src/data_handling/data_preparation.py
@@ -395,7 +395,10 @@ class DataPrep(object):
         return data
 
     def get_transposed_history(self):
-        return self.history.transpose("datetime", "window", "Stations", "variables")
+        return self.history.transpose("datetime", "window", "Stations", "variables").copy()
+
+    def get_transposed_label(self):
+        return self.label.squeeze("Stations").transpose("datetime", "window").copy()
 
 
 if __name__ == "__main__":
diff --git a/src/helpers.py b/src/helpers.py
index e1496c3b232db29194878892647c59581b6a70a3..621974bfdeae46e87ff62990b68ea326eb38c033 100644
--- a/src/helpers.py
+++ b/src/helpers.py
@@ -49,9 +49,10 @@ class TimeTracking(object):
     method. Duration can always be shown by printing the time tracking object or calling get_current_duration.
     """
 
-    def __init__(self, start=True):
+    def __init__(self, start=True, name="undefined job"):
         self.start = None
         self.end = None
+        self._name = name
         if start:
             self._start()
 
@@ -93,7 +94,7 @@ class TimeTracking(object):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.stop()
-        logging.info(f"undefined job finished after {self}")
+        logging.info(f"{self._name} finished after {self}")
 
 
 def prepare_host(create_new=True, sampling="daily"):
@@ -147,6 +148,13 @@ def set_experiment_name(experiment_date=None, experiment_path=None, sampling=Non
     return experiment_name, experiment_path
 
 
+def set_bootstrap_path(bootstrap_path, data_path, sampling):
+    if bootstrap_path is None:
+        bootstrap_path = os.path.join(data_path, "..", f"bootstrap_{sampling}")
+    check_path_and_create(bootstrap_path)
+    return bootstrap_path
+
+
 class PyTestRegex:
     """Assert that a given string meets some expectations."""
 
@@ -195,3 +203,13 @@ def float_round(number: float, decimals: int = 0, round_type: Callable = math.ce
 def dict_pop(dict: Dict, pop_keys):
     pop_keys = to_list(pop_keys)
     return {k: v for k, v in dict.items() if k not in pop_keys}
+
+
+def list_pop(list_full: list, pop_items):
+    pop_items = to_list(pop_items)
+    if len(pop_items) > 1:
+        return [e for e in list_full if e not in pop_items]
+    else:
+        list_pop = list_full.copy()
+        list_pop.remove(pop_items[0])
+        return list_pop
diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py
index a41c636b5ab17d2039f7976fca625e9c8e11ce6e..b39de8e957a110121c0e8812608d32aad3431570 100644
--- a/src/plotting/postprocessing_plotting.py
+++ b/src/plotting/postprocessing_plotting.py
@@ -479,6 +479,76 @@ class PlotCompetitiveSkillScore(RunEnvironment):
         plt.close()
 
 
+class PlotBootstrapSkillScore(RunEnvironment):
+    """
+    Create plot of climatological skill score after Murphy (1988) as box plot over all stations. A forecast time step
+    (called "ahead") is separately shown to highlight the differences for each prediction time step. Either each single
+    term is plotted (score_only=False) or only the resulting scores CASE I to IV are displayed (score_only=True,
+    default). Y-axis is adjusted following the data and not hard coded. The plot is saved under plot_folder path with
+    name skill_score_clim_{extra_name_tag}{model_setup}.pdf and resolution of 500dpi.
+    """
+
+    def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = ""):
+        """
+        Sets attributes and create plot
+        :param data: dictionary with station names as keys and 2D xarrays as values, consist on axis ahead and terms.
+        :param plot_folder: path to save the plot (default: current directory)
+        :param model_setup: architecture type to specify plot name (default "CNN")
+        """
+        super().__init__()
+        self._labels = None
+        self._data = self._prepare_data(data)
+        self._plot(plot_folder, model_setup)
+
+    def _prepare_data(self, data: Dict) -> pd.DataFrame:
+        """
+        Shrink given data, if only scores are relevant. In any case, transform data to a plot friendly format. Also set
+        plot labels depending on the lead time dimensions.
+        :param data: dictionary with station names as keys and 2D xarrays as values
+        :return: pre-processed data set
+        """
+        data = helpers.dict_to_xarray(data, "station")
+        self._labels = [str(i) + "d" for i in data.coords["ahead"].values]
+        return data.to_dataframe("data").reset_index(level=[0, 1, 2])
+
+    def _label_add(self, score_only: bool):
+        """
+        Adds the phrase "terms and " if score_only is disabled or empty string (if score_only=True).
+        :param score_only: if false all terms are relevant, otherwise only CASE I to IV
+        :return: additional label
+        """
+        return "" if score_only else "terms and "
+
+    def _plot(self, plot_folder,  model_setup):
+        """
+        Main plot function to plot climatological skill score.
+        :param plot_folder: path to save the plot
+        :param model_setup: architecture type to specify plot name
+        """
+        fig, ax = plt.subplots()
+        sns.boxplot(x="boot_var", y="data", hue="ahead", data=self._data, ax=ax, whis=1., palette="Blues_d",
+                    showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."})
+        ax.axhline(y=0, color="grey", linewidth=.5)
+        ax.set(ylabel=f"skill score", xlabel="", title="summary of all stations")
+        handles, _ = ax.get_legend_handles_labels()
+        ax.legend(handles, self._labels)
+        plt.tight_layout()
+        self._save(plot_folder, model_setup)
+
+    @staticmethod
+    def _save(plot_folder, model_setup):
+        """
+        Standard save method to store plot locally. The name of this plot is dynamic. It includes the model setup like
+        'CNN' and can additionally be adjusted using an extra name tag.
+        :param plot_folder: path to save the plot
+        :param model_setup: architecture type to specify plot name
+        """
+        plot_name = os.path.join(plot_folder, f"skill_score_bootstrap_{model_setup}.pdf")
+        logging.debug(f"... save plot to {plot_name}")
+        plt.savefig(plot_name, dpi=500)
+        plt.close('all')
+
+
 class PlotTimeSeries(RunEnvironment):
 
     def __init__(self, stations: List, data_path: str, name: str, window_lead_time: int = None, plot_folder: str = ".",
diff --git a/src/run_modules/experiment_setup.py b/src/run_modules/experiment_setup.py
index 4c3b8872575ea9929f1d4ba3f5a42e222ac2fff4..49e586d404c88e177946149542517140db1c6ff9 100644
--- a/src/run_modules/experiment_setup.py
+++ b/src/run_modules/experiment_setup.py
@@ -34,7 +34,7 @@ class ExperimentSetup(RunEnvironment):
                  limit_nan_fill=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None,
                  test_end=None, use_all_stations_on_all_data_sets=True, trainable=None, fraction_of_train=None,
                  experiment_path=None, plot_path=None, forecast_path=None, overwrite_local_data=None, sampling="daily",
-                 create_new_model=None, permute_data_on_training=None, transformation=None):
+                 create_new_model=None, bootstrap_path=None, permute_data_on_training=None, transformation=None):
 
         # create run framework
         super().__init__()
@@ -44,6 +44,9 @@ class ExperimentSetup(RunEnvironment):
         self._set_param("create_new_model", create_new_model, default=True)
         if self.data_store.get("create_new_model", "general"):
             trainable = True
+        data_path = self.data_store.get("data_path", "general")
+        bootstrap_path = helpers.set_bootstrap_path(bootstrap_path, data_path, sampling)
+        self._set_param("bootstrap_path", bootstrap_path)
         self._set_param("trainable", trainable, default=True)
         self._set_param("fraction_of_training", fraction_of_train, default=0.8)
         self._set_param("permute_data", permute_data_on_training, default=False, scope="general.train")
diff --git a/src/run_modules/model_setup.py b/src/run_modules/model_setup.py
index e3945a542d60b09dc9855bd28be87cdba729ed72..d2c8d93fc957ecb2990e99000cbd3588e2d83eef 100644
--- a/src/run_modules/model_setup.py
+++ b/src/run_modules/model_setup.py
@@ -10,8 +10,8 @@ import tensorflow as tf
 
 from src.model_modules.keras_extensions import HistoryAdvanced, CallbackHandler
 # from src.model_modules.model_class import MyBranchedModel as MyModel
-# from src.model_modules.model_class import MyLittleModel as MyModel
-from src.model_modules.model_class import MyTowerModel as MyModel
+from src.model_modules.model_class import MyLittleModel as MyModel
+# from src.model_modules.model_class import MyTowerModel as MyModel
 from src.run_modules.run_environment import RunEnvironment
 
 
diff --git a/src/run_modules/post_processing.py b/src/run_modules/post_processing.py
index 06203c879872891f57c719040482fe052824c65e..5c392a402da47251c51668e0b06a3067104a61e6 100644
--- a/src/run_modules/post_processing.py
+++ b/src/run_modules/post_processing.py
@@ -13,11 +13,12 @@ import xarray as xr
 from src import statistics
 from src.data_handling.data_distributor import Distributor
 from src.data_handling.data_generator import DataGenerator
+from src.data_handling.bootstraps import BootStraps
 from src.datastore import NameNotFoundInDataStore
 from src.helpers import TimeTracking
 from src.model_modules.linear_model import OrdinaryLeastSquaredModel
 from src.plotting.postprocessing_plotting import PlotMonthlySummary, PlotStationMap, PlotClimatologicalSkillScore, \
-    PlotCompetitiveSkillScore, PlotTimeSeries
+    PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore
 from src.plotting.postprocessing_plotting import plot_conditional_quantiles
 from src.run_modules.run_environment import RunEnvironment
 
@@ -37,6 +38,7 @@ class PostProcessing(RunEnvironment):
         self.target_var = self.data_store.get("target_var", "general")
         self._sampling = self.data_store.get("sampling", "general")
         self.skill_scores = None
+        self.bootstrap_skill_scores = None
         self._run()
 
     def _run(self):
@@ -48,9 +50,65 @@ class PostProcessing(RunEnvironment):
             self.make_prediction()
             logging.info("take a look on the next reported time measure. If this increases a lot, one should think to "
                          "skip make_prediction() whenever it is possible to save time.")
+        self.bootstrap_skill_scores = self.create_boot_straps()
         self.skill_scores = self.calculate_skill_scores()
         self.plot()
 
+    def create_boot_straps(self):
+
+        # forecast
+
+        bootstrap_path = self.data_store.get("bootstrap_path", "general")
+        forecast_path = self.data_store.get("forecast_path", "general")
+        window_lead_time = self.data_store.get("window_lead_time", "general")
+        bootstraps = BootStraps(self.test_data, bootstrap_path, 20)
+        with TimeTracking(name="boot predictions"):
+            bootstrap_predictions = self.model.predict_generator(generator=bootstraps.boot_strap_generator(),
+                                                                 steps=bootstraps.get_boot_strap_generator_length())
+        bootstrap_meta = np.array(bootstraps.get_boot_strap_meta())
+        variables = np.unique(bootstrap_meta[:, 0])
+        for station in np.unique(bootstrap_meta[:, 1]):
+            coords = None
+            for boot in variables:
+                ind = np.all(bootstrap_meta == [boot, station], axis=1)
+                length = sum(ind)
+                sel = bootstrap_predictions[ind].reshape((length, window_lead_time, 1))
+                coords = (range(length), range(1, window_lead_time + 1))
+                tmp = xr.DataArray(sel, coords=(*coords, [boot]), dims=["index", "ahead", "type"])
+                file_name = os.path.join(forecast_path, f"bootstraps_{boot}_{station}.nc")
+                tmp.to_netcdf(file_name)
+            labels = bootstraps.get_labels(station).reshape((length, window_lead_time, 1))
+            file_name = os.path.join(forecast_path, f"bootstraps_labels_{station}.nc")
+            labels = xr.DataArray(labels, coords=(*coords, ["obs"]), dims=["index", "ahead", "type"])
+            labels.to_netcdf(file_name)
+
+        # file_name = os.path.join(forecast_path, f"bootstraps_orig.nc")
+        # orig = xr.open_dataarray(file_name)
+
+
+        # calc skill scores
+        skill_scores = statistics.SkillScores(None)
+        score = {}
+        for station in np.unique(bootstrap_meta[:, 1]):
+            file_name = os.path.join(forecast_path, f"bootstraps_labels_{station}.nc")
+            labels = xr.open_dataarray(file_name)
+            shape = labels.shape
+            orig = bootstraps.get_orig_prediction(forecast_path,  f"forecasts_norm_{station}_test.nc").reshape(shape)
+            orig = xr.DataArray(orig, coords=(range(shape[0]), range(1, shape[1] + 1), ["orig"]), dims=["index", "ahead", "type"])
+            skill = pd.DataFrame(columns=range(1, window_lead_time + 1))
+            for boot in variables:
+                file_name = os.path.join(forecast_path, f"bootstraps_{boot}_{station}.nc")
+                boot_data = xr.open_dataarray(file_name)
+                boot_data = boot_data.combine_first(labels)
+                boot_data = boot_data.combine_first(orig)
+                boot_scores = []
+                for iahead in range(window_lead_time):
+                    data = boot_data.sel(ahead=iahead + 1)
+                    boot_scores.append(skill_scores.general_skill_score(data, forecast_name=boot, reference_name="orig"))
+                skill.loc[boot] = np.array(boot_scores)
+            score[station] = xr.DataArray(skill, dims=["boot_var", "ahead"])
+        return score
+
     def _load_model(self):
         try:
             model = self.data_store.get("best_model", "general")
@@ -75,6 +133,7 @@ class PostProcessing(RunEnvironment):
         PlotClimatologicalSkillScore(self.skill_scores[1], plot_folder=self.plot_path, score_only=False,
                                      extra_name_tag="all_terms_", model_setup="CNN")
         PlotCompetitiveSkillScore(self.skill_scores[0], plot_folder=self.plot_path, model_setup="CNN")
+        PlotBootstrapSkillScore(self.bootstrap_skill_scores, plot_folder=self.plot_path, model_setup="CNN")
         PlotTimeSeries(self.test_data.stations, path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, sampling=self._sampling)
 
     def calculate_test_score(self):
@@ -96,64 +155,72 @@ class PostProcessing(RunEnvironment):
         logging.debug("start make_prediction")
         for i, _ in enumerate(self.test_data):
             data = self.test_data.get_data_generator(i)
-
-            nn_prediction, persistence_prediction, ols_prediction = self._create_empty_prediction_arrays(data, count=3)
             input_data = data.get_transposed_history()
 
             # get scaling parameters
             mean, std, transformation_method = data.get_transformation_information(variable=self.target_var)
 
-            # nn forecast
-            nn_prediction = self._create_nn_forecast(input_data, nn_prediction, mean, std, transformation_method)
+            for normalised in [True, False]:
+                # create empty arrays
+                nn_prediction, persistence_prediction, ols_prediction = self._create_empty_prediction_arrays(data, count=3)
+
+                # nn forecast
+                nn_prediction = self._create_nn_forecast(input_data, nn_prediction, mean, std, transformation_method, normalised)
 
-            # persistence
-            persistence_prediction = self._create_persistence_forecast(input_data, persistence_prediction, mean, std, 
-                                                                       transformation_method)
+                # persistence
+                persistence_prediction = self._create_persistence_forecast(input_data, persistence_prediction, mean, std,
+                                                                           transformation_method, normalised)
 
-            # ols
-            ols_prediction = self._create_ols_forecast(input_data, ols_prediction, mean, std, transformation_method)
+                # ols
+                ols_prediction = self._create_ols_forecast(input_data, ols_prediction, mean, std, transformation_method, normalised)
 
-            # observation
-            observation = self._create_observation(data, None, mean, std, transformation_method)
+                # observation
+                observation = self._create_observation(data, None, mean, std, transformation_method, normalised)
 
-            # merge all predictions
-            full_index = self.create_fullindex(data.data.indexes['datetime'], self._get_frequency())
-            all_predictions = self.create_forecast_arrays(full_index, list(data.label.indexes['window']),
-                                                          CNN=nn_prediction,
-                                                          persi=persistence_prediction,
-                                                          obs=observation,
-                                                          OLS=ols_prediction)
+                # merge all predictions
+                full_index = self.create_fullindex(data.data.indexes['datetime'], self._get_frequency())
+                all_predictions = self.create_forecast_arrays(full_index, list(data.label.indexes['window']),
+                                                              CNN=nn_prediction,
+                                                              persi=persistence_prediction,
+                                                              obs=observation,
+                                                              OLS=ols_prediction)
 
-            # save all forecasts locally
-            path = self.data_store.get("forecast_path", "general")
-            file = os.path.join(path, f"forecasts_{data.station[0]}_test.nc")
-            all_predictions.to_netcdf(file)
+                # save all forecasts locally
+                path = self.data_store.get("forecast_path", "general")
+                prefix = "forecasts_norm" if normalised else "forecasts"
+                file = os.path.join(path, f"{prefix}_{data.station[0]}_test.nc")
+                all_predictions.to_netcdf(file)
 
     def _get_frequency(self):
         getter = {"daily": "1D", "hourly": "1H"}
         return getter.get(self._sampling, None)
 
     @staticmethod
-    def _create_observation(data, _, mean, std, transformation_method):
-        return statistics.apply_inverse_transformation(data.label.copy(), mean, std, transformation_method)
+    def _create_observation(data, _, mean, std, transformation_method, normalised):
+        obs = data.label.copy()
+        if not normalised:
+            obs = statistics.apply_inverse_transformation(obs, mean, std, transformation_method)
+        return obs
 
-    def _create_ols_forecast(self, input_data, ols_prediction, mean, std, transformation_method):
+    def _create_ols_forecast(self, input_data, ols_prediction, mean, std, transformation_method, normalised):
         tmp_ols = self.ols_model.predict(input_data)
-        tmp_ols = statistics.apply_inverse_transformation(tmp_ols, mean, std, transformation_method)
+        if not normalised:
+            tmp_ols = statistics.apply_inverse_transformation(tmp_ols, mean, std, transformation_method)
         tmp_ols = np.expand_dims(tmp_ols, axis=1)
         target_shape = ols_prediction.values.shape
         ols_prediction.values = np.swapaxes(tmp_ols, 2, 0) if target_shape != tmp_ols.shape else tmp_ols
         return ols_prediction
 
-    def _create_persistence_forecast(self, input_data, persistence_prediction, mean, std, transformation_method):
+    def _create_persistence_forecast(self, input_data, persistence_prediction, mean, std, transformation_method, normalised):
         tmp_persi = input_data.sel({'window': 0, 'variables': self.target_var})
-        tmp_persi = statistics.apply_inverse_transformation(tmp_persi, mean, std, transformation_method)
+        if not normalised:
+            tmp_persi = statistics.apply_inverse_transformation(tmp_persi, mean, std, transformation_method)
         window_lead_time = self.data_store.get("window_lead_time", "general")
         persistence_prediction.values = np.expand_dims(np.tile(tmp_persi.squeeze('Stations'), (window_lead_time, 1)),
                                                        axis=1)
         return persistence_prediction
 
-    def _create_nn_forecast(self, input_data, nn_prediction, mean, std, transformation_method):
+    def _create_nn_forecast(self, input_data, nn_prediction, mean, std, transformation_method, normalised):
         """
         create the nn forecast for given input data. Inverse transformation is applied to the forecast to get the output
         in the original space. Furthermore, only the output of the main branch is returned (not all minor branches, if
@@ -166,7 +233,8 @@ class PostProcessing(RunEnvironment):
         :return:
         """
         tmp_nn = self.model.predict(input_data)
-        tmp_nn = statistics.apply_inverse_transformation(tmp_nn, mean, std, transformation_method)
+        if not normalised:
+            tmp_nn = statistics.apply_inverse_transformation(tmp_nn, mean, std, transformation_method)
         if tmp_nn.ndim == 3:
             nn_prediction.values = np.swapaxes(np.expand_dims(tmp_nn[-1, ...], axis=1), 2, 0)
         elif tmp_nn.ndim == 2:
diff --git a/test/test_data_handling/test_bootstraps.py b/test/test_data_handling/test_bootstraps.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1edd7ca7f012ccdebc2c75119eb37c5bc56c125
--- /dev/null
+++ b/test/test_data_handling/test_bootstraps.py
@@ -0,0 +1,64 @@
+
+from src.data_handling.bootstraps import BootStraps
+
+import pytest
+import os
+
+import numpy as np
+
+
+class TestBootstraps:
+
+    @pytest.fixture
+    def path(self):
+        path = os.path.join(os.path.dirname(__file__), "data")
+        if not os.path.exists(path):
+            os.makedirs(path)
+        return path
+
+    @pytest.fixture
+    def boot_no_init(self, path):
+        obj = object.__new__(BootStraps)
+        super(BootStraps, obj).__init__()
+        obj.number_bootstraps = 50
+        obj.bootstrap_path = path
+        return obj
+
+    def test_valid_bootstrap_file(self, path, boot_no_init):
+        station = "TESTSTATION"
+        variables = "var1_var2_var3"
+        window = 5
+        # empty case
+        assert len(os.listdir(path)) == 0
+        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (False, 50)
+        # different cases, where files with bigger range are existing
+        os.mknod(os.path.join(path, f"{station}_{variables}_hist5_nboots50_shuffled.dat"))
+        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
+        os.mknod(os.path.join(path, f"{station}_{variables}_hist5_nboots100_shuffled.dat"))
+        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
+        os.mknod(os.path.join(path, f"{station}_{variables}_hist10_nboots50_shuffled.dat"))
+        os.mknod(os.path.join(path, f"{station}1_{variables}_hist10_nboots50_shuffled.dat"))
+        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (True, None)
+        #  need to reload data and therefore remove not fitting files for this station
+        assert boot_no_init.valid_bootstrap_file(station, variables, 20) == (False, 100)
+        assert len(os.listdir(path)) == 1
+        # reload because expanded boot number
+        os.mknod(os.path.join(path, f"{station}_{variables}_hist5_nboots50_shuffled.dat"))
+        boot_no_init.number_bootstraps = 60
+        assert boot_no_init.valid_bootstrap_file(station, variables, window) == (False, 60)
+        assert len(os.listdir(path)) == 1
+        # reload because of expanded window size, but use maximum boot number from file names
+        os.mknod(os.path.join(path, f"{station}_{variables}_hist5_nboots60_shuffled.dat"))
+        boot_no_init.number_bootstraps = 50
+        assert boot_no_init.valid_bootstrap_file(station, variables, 20) == (False, 60)
+
+    def test_shuffle_single_variale(self, boot_no_init):
+        data = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
+        res = boot_no_init.shuffle_single_variable(data)
+        assert res.shape == data.shape
+        assert res.max() == data.max()
+        assert res.min() == data.min()
+        assert set(np.unique(res)).issubset({1, 2, 3})
+
+    def test_create_shuffled_data(self):
+        pass
\ No newline at end of file
diff --git a/test/test_helpers.py b/test/test_helpers.py
index b807d2b8612b9ee006bff43f1ae4cfcfd2dd07e1..07ec244e078f977dca761274260275aab355c183 100644
--- a/test/test_helpers.py
+++ b/test/test_helpers.py
@@ -117,6 +117,14 @@ class TestTimeTracking:
         expression = PyTestRegex(r"undefined job finished after \d+:\d+:\d+ \(hh:mm:ss\)")
         assert caplog.record_tuples[-1] == ('root', 20, expression)
 
+    def test_name_enter_exit(self, caplog):
+        caplog.set_level(logging.INFO)
+        with TimeTracking(name="my job") as t:
+            assert t.start is not None
+            assert t.end is None
+        expression = PyTestRegex(r"my job finished after \d+:\d+:\d+ \(hh:mm:ss\)")
+        assert caplog.record_tuples[-1] == ('root', 20, expression)
+
 
 class TestPrepareHost: