diff --git a/docs/requirements_docs.txt b/docs/requirements_docs.txt
index a39acca8a7cf887237f595d7992960ac10233a85..8ccf3ba6515d31ebd2b35901d3c9e58734d653d8 100644
--- a/docs/requirements_docs.txt
+++ b/docs/requirements_docs.txt
@@ -3,4 +3,5 @@ sphinx-autoapi==1.3.0
 sphinx-autodoc-typehints==1.10.3
 sphinx-rtd-theme==0.4.3
 #recommonmark==0.6.0
-m2r2==0.2.5
\ No newline at end of file
+m2r2==0.2.5
+docutils<0.18
\ No newline at end of file
diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py
index 242dcd31d01134b75f70a12c2708d4c99bb94301..9b79dc6ac815ce6e7f55f67062fc1e6172544512 100644
--- a/mlair/configuration/defaults.py
+++ b/mlair/configuration/defaults.py
@@ -44,14 +44,19 @@ DEFAULT_TEST_END = "2017-12-31"
 DEFAULT_TEST_MIN_LENGTH = 90
 DEFAULT_TRAIN_VAL_MIN_LENGTH = 180
 DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS = True
-DEFAULT_EVALUATE_BOOTSTRAPS = True
-DEFAULT_CREATE_NEW_BOOTSTRAPS = False
-DEFAULT_NUMBER_OF_BOOTSTRAPS = 20
-DEFAULT_BOOTSTRAP_TYPE = "singleinput"
-DEFAULT_BOOTSTRAP_METHOD = "shuffle"
+DEFAULT_DO_UNCERTAINTY_ESTIMATE = True
+DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH = "1m"
+DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS = True
+DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS = 1000
+DEFAULT_EVALUATE_FEATURE_IMPORTANCE = True
+DEFAULT_FEATURE_IMPORTANCE_CREATE_NEW_BOOTSTRAPS = False
+DEFAULT_FEATURE_IMPORTANCE_N_BOOTS = 20
+DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE = "singleinput"
+DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD = "shuffle"
 DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries",
                      "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles",
-                     "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram"]
+                     "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram",
+                     "PlotSampleUncertaintyFromBootstrap"]
 DEFAULT_SAMPLING = "daily"
 DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA",
                        "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", "no": "", "no2": "", "o3": "",
diff --git a/mlair/data_handler/__init__.py b/mlair/data_handler/__init__.py
index 495b6e7c8604a839a084a2b78a54563c13eb06e6..d119977802038155af0d07d99c75c665622a148c 100644
--- a/mlair/data_handler/__init__.py
+++ b/mlair/data_handler/__init__.py
@@ -9,7 +9,7 @@ __author__ = 'Lukas Leufen, Felix Kleinert'
 __date__ = '2020-04-17'
 
 
-from .bootstraps import BootStraps
+from .input_bootstraps import Bootstraps
 from .iterator import KerasIterator, DataCollection
 from .default_data_handler import DefaultDataHandler
 from .abstract_data_handler import AbstractDataHandler
diff --git a/mlair/data_handler/bootstraps.py b/mlair/data_handler/input_bootstraps.py
similarity index 98%
rename from mlair/data_handler/bootstraps.py
rename to mlair/data_handler/input_bootstraps.py
index e03881484bfc9b8275ede8a4432072c74643994a..ab4c71f1c77ab12b0e751f6991fbf20cd55aa8ae 100644
--- a/mlair/data_handler/bootstraps.py
+++ b/mlair/data_handler/input_bootstraps.py
@@ -28,8 +28,8 @@ class BootstrapIterator(Iterator):
 
     _position: int = None
 
-    def __init__(self, data: "BootStraps", method):
-        assert isinstance(data, BootStraps)
+    def __init__(self, data: "Bootstraps", method):
+        assert isinstance(data, Bootstraps)
         self._data = data
         self._dimension = data.bootstrap_dimension
         self.boot_dim = "boots"
@@ -184,7 +184,7 @@ class MeanBootstraps:
         return np.ones_like(data) * self._mean
 
 
-class BootStraps(Iterable):
+class Bootstraps(Iterable):
     """
     Main class to perform bootstrap operations.
 
diff --git a/mlair/helpers/filter.py b/mlair/helpers/filter.py
index 543cff3624577ac617733b8b593c5f52f25196b3..36c93b04486fc9be013af2c4f34d2b3ee1bd84c2 100644
--- a/mlair/helpers/filter.py
+++ b/mlair/helpers/filter.py
@@ -768,6 +768,7 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass):
 
 
 def firwin_kzf(m, k):
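+    # cast to int: window length m and number of iterations k might be passed as floats (e.g. from a config),
+    # while np.ones/np.zeros and range below require integer arguments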
+    m, k = int(m), int(k)
     coef = np.ones(m)
     for i in range(1, k):
         t = np.zeros((m, m + i * (m - 1)))
diff --git a/mlair/helpers/helpers.py b/mlair/helpers/helpers.py
index 16b36921773a4af131065063de23963a56cb4c65..679f5a28fc564d56cd6f3794ee8fe8e1877b2b4c 100644
--- a/mlair/helpers/helpers.py
+++ b/mlair/helpers/helpers.py
@@ -118,8 +118,10 @@ def remove_items(obj: Union[List, Dict, Tuple], items: Any):
 
     def remove_from_list(list_obj, item_list):
         """Remove implementation for lists."""
-        if len(items) > 1:
+        if len(item_list) > 1:
             return [e for e in list_obj if e not in item_list]
+        elif len(item_list) == 0:
+            return list_obj  # nothing to remove, return unchanged
         else:
             list_obj = list_obj.copy()
             try:
diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py
index 87f780f9a6133edfcb2f9c71c2956b92f332e915..aa89da0ea66e263d076af9abd578ba125c260bec 100644
--- a/mlair/helpers/statistics.py
+++ b/mlair/helpers/statistics.py
@@ -287,7 +287,7 @@ class SkillScores:
         combination_strings = [f"{first}-{second}" for (first, second) in combinations]
         return combinations, combination_strings
 
-    def skill_scores(self) -> pd.DataFrame:
+    def skill_scores(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
         Calculate skill scores for all combinations of model names.
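+        In addition to the skill scores, a second DataFrame with the number of non-NaN samples used per combination
+        and lead time is returned.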
 
@@ -296,6 +296,7 @@ class SkillScores:
         ahead_names = list(self.external_data[self.ahead_dim].data)
         combinations, combination_strings = self.get_model_name_combinations()
         skill_score = pd.DataFrame(index=combination_strings)
+        count = pd.DataFrame(index=combination_strings)
         for iahead in ahead_names:
             data = self.external_data.sel({self.ahead_dim: iahead})
             skill_score[iahead] = [self.general_skill_score(data,
@@ -303,7 +304,12 @@ class SkillScores:
                                                             reference_name=second,
                                                             observation_name=self.observation_name)
                                    for (first, second) in combinations]
-        return skill_score
+            count[iahead] = [self.get_count(data,
+                                            forecast_name=first,
+                                            reference_name=second,
+                                            observation_name=self.observation_name)
+                             for (first, second) in combinations]
+        return skill_score, count
 
     def climatological_skill_scores(self, internal_data: Data, forecast_name: str) -> xr.DataArray:
         """
@@ -376,6 +382,21 @@ class SkillScores:
         skill_score = 1 - mse(observation, forecast) / mse(observation, reference)
         return skill_score.values
 
+    def get_count(self, data: Data, forecast_name: str, reference_name: str,
+                  observation_name: str = None) -> np.ndarray:
+        r"""
+        Count the number of valid (non-NaN) values underlying the skill score calculation.
+
+        :param data: internal data containing data for observation, forecast and reference
+        :param observation_name: name of observation
+        :param forecast_name: name of forecast
+        :param reference_name: name of reference
+
+        :return: number of valid values
+        """
+        data = data.dropna("index")
+        return data.count("index").max().values
+
     def skill_score_pre_calculations(self, data: Data, observation_name: str, forecast_name: str) -> Tuple[np.ndarray,
                                                                                                            np.ndarray,
                                                                                                            np.ndarray,
@@ -481,3 +502,58 @@ class SkillScores:
 
         return monthly_mean
 
+
+def create_single_bootstrap_realization(data: xr.DataArray, dim_name_time: str) -> xr.DataArray:
+    """
+    Return a bootstrapped realization of data.
+
+    :param data: data from which to draw ONE bootstrap realization
+    :param dim_name_time: name of time dimension
+    :return: bootstrapped realization of data
+    """
+
+    num_of_blocks = data.coords[dim_name_time].shape[0]
+    boot_idx = np.random.choice(num_of_blocks, size=num_of_blocks, replace=True)
+    return data.isel({dim_name_time: boot_idx})
+
+
+def calculate_average(data: xr.DataArray, **kwargs) -> xr.DataArray:
+    """
+    Calculate mean of data.
+
+    :param data: data for which to calculate mean
+    :return: mean of data
+    """
+    return data.mean(**kwargs)
+
+
+def create_n_bootstrap_realizations(data: xr.DataArray, dim_name_time: str, dim_name_model: str, n_boots: int = 1000,
+                                    dim_name_boots: str = 'boots') -> xr.DataArray:
+    """
+    Create n bootstrap realizations and calculate averages across realizations.
+
+    :param data: original data from which to create bootstrap realizations
+    :param dim_name_time: name of time dimension
+    :param dim_name_model: name of model dimension
+    :param n_boots: number of bootstrap realizations
+    :param dim_name_boots: name of bootstrap dimension
+    :return: bootstrap realizations averaged over the time dimension
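+
+    Example (sketch; `block_mse` stands for an error array with time dimension "index" and model dimension "type"):
+        boots = create_n_bootstrap_realizations(block_mse, dim_name_time="index",
+                                                dim_name_model="type", n_boots=100)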
+    """
+    res_dims = [dim_name_boots]
+    dims = list(data.dims)
+    coords = {dim_name_boots: range(n_boots), dim_name_model: data.coords[dim_name_model]}
+    if len(dims) > 1:
+        res_dims = res_dims + dims[1:]
+    res = xr.DataArray(np.nan, dims=res_dims, coords=coords)
+    for boot in range(n_boots):
+        res[boot] = (calculate_average(
+            create_single_bootstrap_realization(data, dim_name_time=dim_name_time),
+            dim=dim_name_time, skipna=True))
+    return res
diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py
index 68068533cd328b597f6009d91ce96a5cf3550c74..8d4ab2689b1eea24dc9d39d53b04e51405a3a874 100644
--- a/mlair/plotting/data_insight_plotting.py
+++ b/mlair/plotting/data_insight_plotting.py
@@ -21,6 +21,8 @@ from mlair.data_handler import DataCollection
 from mlair.helpers import TimeTrackingWrapper, to_list, remove_items
 from mlair.plotting.abstract_plot_class import AbstractPlotClass
 
+matplotlib.use("Agg")
+
 
 @TimeTrackingWrapper
 class PlotStationMap(AbstractPlotClass):  # pragma: no cover
@@ -632,7 +634,10 @@ class PlotPeriodogram(AbstractPlotClass):  # pragma: no cover
             self._plot_total(raw=True)
             self._plot_total(raw=False)
             if multiple > 1:
-                self._plot_difference(label_names)
+                self._plot_difference(label_names, plot_name_add="_last")
+                self._prepare_pgram(generator, pos, multiple, use_multiprocessing=use_multiprocessing,
+                                    use_last_input_value=False)
+                self._plot_difference(label_names, plot_name_add="_first")
 
     @staticmethod
     def _has_filter_dimension(g, pos):
@@ -649,7 +654,7 @@ class PlotPeriodogram(AbstractPlotClass):  # pragma: no cover
                 return check_data.coords[filter_dim].shape[0], check_data.coords[filter_dim].values.tolist()
 
     @TimeTrackingWrapper
-    def _prepare_pgram(self, generator, pos, multiple=1, use_multiprocessing=False):
+    def _prepare_pgram(self, generator, pos, multiple=1, use_multiprocessing=False, use_last_input_value=True):
         """
         Create periodogram data.
         """
@@ -663,7 +668,8 @@ class PlotPeriodogram(AbstractPlotClass):  # pragma: no cover
             plot_data_raw_single = dict()
             plot_data_mean_single = dict()
             self.f_index = np.logspace(-3, 0 if self._sampling == "daily" else np.log10(24), 1000)
-            raw_data_single = self._prepare_pgram_parallel_gen(generator, m, pos, use_multiprocessing)
+            raw_data_single = self._prepare_pgram_parallel_gen(generator, m, pos, use_multiprocessing,
+                                                               use_last_input_value=use_last_input_value)
             for var in raw_data_single.keys():
                 pgram_com = []
                 pgram_mean = 0
@@ -715,7 +721,7 @@ class PlotPeriodogram(AbstractPlotClass):  # pragma: no cover
                     raw_data_single[var_str] = raw_data_single[var_str] + [(f, pgram)]
         return raw_data_single
 
-    def _prepare_pgram_parallel_gen(self, generator, m, pos, use_multiprocessing):
+    def _prepare_pgram_parallel_gen(self, generator, m, pos, use_multiprocessing, use_last_input_value=True):
         """Implementation of data preprocessing using parallel generator element processing."""
         raw_data_single = dict()
         res = []
@@ -723,14 +729,15 @@ class PlotPeriodogram(AbstractPlotClass):  # pragma: no cover
             pool = multiprocessing.Pool(
                 min([psutil.cpu_count(logical=False), len(generator), 16]))  # use only physical cpus
             output = [
-                pool.apply_async(f_proc_2, args=(g, m, pos, self.variables_dim, self.time_dim, self.f_index))
+                pool.apply_async(f_proc_2, args=(g, m, pos, self.variables_dim, self.time_dim, self.f_index,
+                                                 use_last_input_value))
                 for g in generator]
             for i, p in enumerate(output):
                 res.append(p.get())
             pool.close()
         else:
             for g in generator:
-                res.append(f_proc_2(g, m, pos, self.variables_dim, self.time_dim, self.f_index))
+                res.append(f_proc_2(g, m, pos, self.variables_dim, self.time_dim, self.f_index, use_last_input_value))
         for res_dict in res:
             for k, v in res_dict.items():
                 if k not in raw_data_single.keys():
@@ -816,8 +823,8 @@ class PlotPeriodogram(AbstractPlotClass):  # pragma: no cover
         pdf_pages.close()
         plt.close('all')
 
-    def _plot_difference(self, label_names):
-        plot_name = f"{self.plot_name}_{self._sampling}_{self._add_text}_filter.pdf"
+    def _plot_difference(self, label_names, plot_name_add=""):
+        plot_name = f"{self.plot_name}_{self._sampling}_{self._add_text}_filter{plot_name_add}.pdf"
         plot_path = os.path.join(os.path.abspath(self.plot_folder), plot_name)
         logging.info(f"... plotting {plot_name}")
         pdf_pages = matplotlib.backends.backend_pdf.PdfPages(plot_path)
@@ -846,19 +853,19 @@ class PlotPeriodogram(AbstractPlotClass):  # pragma: no cover
         plt.close('all')
 
 
-def f_proc(var, d_var, f_index, time_dim="datetime"):  # pragma: no cover
+def f_proc(var, d_var, f_index, time_dim="datetime", use_last_value=True):  # pragma: no cover
     var_str = str(var)
     t = (d_var[time_dim] - d_var[time_dim][0]).astype("timedelta64[h]").values / np.timedelta64(1, "D")
     if len(d_var.shape) > 1:  # use only max value if dimensions are remaining (e.g. max(window) -> latest value)
         to_remove = remove_items(d_var.coords.dims, time_dim)
         for e in to_list(to_remove):
-            d_var = d_var.sel({e: d_var[e].max()})
+            d_var = d_var.sel({e: d_var[e].max() if use_last_value is True else d_var[e].min()})
     pgram = LombScargle(t, d_var.values.flatten(), nterms=1, normalization="psd").power(f_index)
     # f, pgram = LombScargle(t, d_var.values.flatten(), nterms=1, normalization="psd").autopower()
     return var_str, f_index, pgram
 
 
-def f_proc_2(g, m, pos, variables_dim, time_dim, f_index):  # pragma: no cover
+def f_proc_2(g, m, pos, variables_dim, time_dim, f_index, use_last_value):  # pragma: no cover
     raw_data_single = dict()
     if hasattr(g.id_class, "lazy"):
         g.id_class.load_lazy() if g.id_class.lazy is True else None
@@ -880,7 +887,7 @@ def f_proc_2(g, m, pos, variables_dim, time_dim, f_index):  # pragma: no cover
     d = d[pos] if isinstance(d, tuple) else d
     for var in d[variables_dim].values:
         d_var = d.loc[{variables_dim: var}].squeeze().dropna(time_dim)
-        var_str, f, pgram = f_proc(var, d_var, f_index)
+        var_str, f, pgram = f_proc(var, d_var, f_index, use_last_value=use_last_value)
         raw_data_single[var_str] = [(f, pgram)]
     if hasattr(g.id_class, "lazy"):
         g.id_class.clean_up() if g.id_class.lazy is True else None
diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py
index 7ba84656cc00a89a7ec88efcc30bf665e0849e2f..2afb5f08ec6401fa2e693c924cf263204ffbb75d 100644
--- a/mlair/plotting/postprocessing_plotting.py
+++ b/mlair/plotting/postprocessing_plotting.py
@@ -15,6 +15,7 @@ import pandas as pd
 import seaborn as sns
 import xarray as xr
 from matplotlib.backends.backend_pdf import PdfPages
+from matplotlib.offsetbox import AnchoredText
 
 from mlair import helpers
 from mlair.data_handler.iterator import DataCollection
@@ -30,7 +31,7 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
 
 @TimeTrackingWrapper
-class PlotMonthlySummary(AbstractPlotClass):
+class PlotMonthlySummary(AbstractPlotClass):  # pragma: no cover
     """
     Show a monthly summary over all stations for each lead time ("ahead") as box and whiskers plot.
 
@@ -137,7 +138,7 @@ class PlotMonthlySummary(AbstractPlotClass):
 
 
 @TimeTrackingWrapper
-class PlotConditionalQuantiles(AbstractPlotClass):
+class PlotConditionalQuantiles(AbstractPlotClass):  # pragma: no cover
     """
     Create cond.quantile plots as originally proposed by Murphy, Brown and Chen (1989) [But in log scale].
 
@@ -381,7 +382,7 @@ class PlotConditionalQuantiles(AbstractPlotClass):
 
 
 @TimeTrackingWrapper
-class PlotClimatologicalSkillScore(AbstractPlotClass):
+class PlotClimatologicalSkillScore(AbstractPlotClass):  # pragma: no cover
     """
     Create plot of climatological skill score after Murphy (1988) as box plot over all stations.
 
@@ -473,7 +474,7 @@ class PlotClimatologicalSkillScore(AbstractPlotClass):
 
 
 @TimeTrackingWrapper
-class PlotCompetitiveSkillScore(AbstractPlotClass):
+class PlotCompetitiveSkillScore(AbstractPlotClass):  # pragma: no cover
     """
     Create competitive skill score plot.
 
@@ -491,12 +492,12 @@ class PlotCompetitiveSkillScore(AbstractPlotClass):
 
     """
 
-    def __init__(self, data: pd.DataFrame, plot_folder=".", model_setup="NN"):
+    def __init__(self, data: Dict[str, pd.DataFrame], plot_folder=".", model_setup="NN"):
         """Initialise."""
         super().__init__(plot_folder, f"skill_score_competitive_{model_setup}")
         self._model_setup = model_setup
         self._labels = None
-        self._data = self._prepare_data(data)
+        self._data = self._prepare_data(helpers.remove_items(data, "total"))
         default_plot_name = self.plot_name
         # draw full detail plot
         self.plot_name = default_plot_name + "_full_detail"
@@ -590,7 +591,7 @@ class PlotCompetitiveSkillScore(AbstractPlotClass):
 
 
 @TimeTrackingWrapper
-class PlotBootstrapSkillScore(AbstractPlotClass):
+class PlotBootstrapSkillScore(AbstractPlotClass):  # pragma: no cover
     """
     Create plot of climatological skill score after Murphy (1988) as box plot over all stations.
 
@@ -839,7 +840,7 @@ class PlotBootstrapSkillScore(AbstractPlotClass):
 
 
 @TimeTrackingWrapper
-class PlotTimeSeries:
+class PlotTimeSeries:  # pragma: no cover
     """
     Create time series plot.
 
@@ -972,7 +973,7 @@ class PlotTimeSeries:
 
 
 @TimeTrackingWrapper
-class PlotSeparationOfScales(AbstractPlotClass):
+class PlotSeparationOfScales(AbstractPlotClass):  # pragma: no cover
 
     def __init__(self, collection: DataCollection, plot_folder: str = ".", time_dim="datetime", window_dim="window",
                  filter_dim="filter", target_dim="variables"):
@@ -998,6 +999,80 @@ class PlotSeparationOfScales(AbstractPlotClass):
             self._save()
 
 
+@TimeTrackingWrapper
+class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass):  # pragma: no cover
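+    """
+    Plot the distribution of bootstrap-estimated errors (e.g. MSE) for each model type as box-and-whisker plots.
+    Besides the default vertical orientation, a horizontal variant and versions with a square-root transformed error
+    measure (e.g. RMSE instead of MSE) are created.
+    """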
+
+    def __init__(self, data: xr.DataArray, plot_folder: str = ".", model_type_dim: str = "type",
+                 error_measure: str = "mse", error_unit: str = None, dim_name_boots: str = 'boots',
+                 block_length: str = None):
+        super().__init__(plot_folder, "sample_uncertainty_from_bootstrap")
+        default_name = self.plot_name
+        self.model_type_dim = model_type_dim
+        self.error_measure = error_measure
+        self.dim_name_boots = dim_name_boots
+        self.error_unit = error_unit
+        self.block_length = block_length
+        self.prepare_data(data)
+        self._plot(orientation="v")
+
+        self.plot_name = default_name + "_horizontal"
+        self._plot(orientation="h")
+
+        self._apply_root()
+
+        self.plot_name = default_name + "_sqrt"
+        self._plot(orientation="v")
+
+        self.plot_name = default_name + "_horizontal_sqrt"
+        self._plot(orientation="h")
+
+        self._data_table = None
+        self._n_boots = None
+
+    def prepare_data(self, data: xr.DataArray):
+        self._data_table = data.to_pandas()
+        if "persi" in self._data_table.columns:
+            self._data_table["persi"] = self._data_table.pop("persi")  # move persistence model to the last column
+        self._n_boots = self._data_table.shape[0]
+
+    def _apply_root(self):
+        self._data_table = np.sqrt(self._data_table)
+        self.error_measure = f"root {self.error_measure}"
+        self.error_unit = self.error_unit.replace("$^2$", "")
+
+    def _plot(self, orientation: str = "v"):
+        data_table = self._data_table
+        n_boots = self._n_boots
+        size = len(np.unique(data_table.columns))
+        if orientation == "v":
+            figsize, width = (size, 5), 0.4
+        elif orientation == "h":
+            figsize, width = (6, (1+.5*size)), 0.65
+        else:
+            raise ValueError(f"orientation must be `v' or `h' but is: {orientation}")
+        fig, ax = plt.subplots(figsize=figsize)
+        sns.boxplot(data=data_table, ax=ax, whis=1., color="white",
+                    showmeans=True, meanprops={"markersize": 6, "markeredgecolor": "k"},
+                    flierprops={"marker": "o", "markerfacecolor": "black", "markeredgecolor": "none", "markersize": 3},
+                    boxprops={'facecolor': 'none', 'edgecolor': 'k'},
+                    width=width, orient=orientation)
+        if orientation == "v":
+            ax.set_ylabel(f"{self.error_measure} (in {self.error_unit})")
+            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
+        elif orientation == "h":
+            ax.set_xlabel(f"{self.error_measure} (in {self.error_unit})")
+        else:
+            raise ValueError(f"orientation must be `v' or `h' but is: {orientation}")
+        text = f"n={n_boots}" if self.block_length is None else f"{self.block_length}, n={n_boots}"
+        text_box = AnchoredText(text, frameon=True, loc=1, pad=0.5)
+        plt.setp(text_box.patch, edgecolor='k', facecolor='w')
+        ax.add_artist(text_box)
+        plt.setp(ax.lines, color='k')
+        plt.tight_layout()
+        self._save()
+        plt.close("all")
+
+
 if __name__ == "__main__":
     stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087']
     path = "../../testrun_network/forecasts"
diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py
index 28277d5057698c01594431008b81d959d415c3e2..63be6eb4c6e8b5f8d3149df023e07d23805f077f 100644
--- a/mlair/run_modules/experiment_setup.py
+++ b/mlair/run_modules/experiment_setup.py
@@ -18,10 +18,12 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT,
     DEFAULT_WINDOW_DIM, DEFAULT_DIMENSIONS, DEFAULT_TIME_DIM, DEFAULT_INTERPOLATION_METHOD, DEFAULT_INTERPOLATION_LIMIT, \
     DEFAULT_TRAIN_START, DEFAULT_TRAIN_END, DEFAULT_TRAIN_MIN_LENGTH, DEFAULT_VAL_START, DEFAULT_VAL_END, \
     DEFAULT_VAL_MIN_LENGTH, DEFAULT_TEST_START, DEFAULT_TEST_END, DEFAULT_TEST_MIN_LENGTH, DEFAULT_TRAIN_VAL_MIN_LENGTH, \
-    DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \
-    DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM, \
+    DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_FEATURE_IMPORTANCE, DEFAULT_FEATURE_IMPORTANCE_CREATE_NEW_BOOTSTRAPS, \
+    DEFAULT_FEATURE_IMPORTANCE_N_BOOTS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM, \
     DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, DEFAULT_MAX_NUMBER_MULTIPROCESSING, \
-    DEFAULT_BOOTSTRAP_TYPE, DEFAULT_BOOTSTRAP_METHOD, DEFAULT_OVERWRITE_LAZY_DATA
+    DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE, DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD, DEFAULT_OVERWRITE_LAZY_DATA, \
+    DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH, DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS, \
+    DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS, DEFAULT_DO_UNCERTAINTY_ESTIMATE
 from mlair.data_handler import DefaultDataHandler
 from mlair.run_modules.run_environment import RunEnvironment
 from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel
@@ -212,14 +214,17 @@ class ExperimentSetup(RunEnvironment):
                  sampling: str = None,
                  create_new_model=None, bootstrap_path=None, permute_data_on_training=None, transformation=None,
                  train_min_length=None, val_min_length=None, test_min_length=None, extreme_values: list = None,
-                 extremes_on_right_tail_only: bool = None, evaluate_bootstraps=None, plot_list=None,
-                 number_of_bootstraps=None, create_new_bootstraps=None, bootstrap_method=None, bootstrap_type=None,
+                 extremes_on_right_tail_only: bool = None, evaluate_feature_importance: bool = None, plot_list=None,
+                 feature_importance_n_boots: int = None, feature_importance_create_new_bootstraps: bool = None,
+                 feature_importance_bootstrap_method=None, feature_importance_bootstrap_type=None,
                  data_path: str = None, batch_path: str = None, login_nodes=None,
                  hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None,
                  data_origin: Dict = None, competitors: list = None, competitor_path: str = None,
                  use_multiprocessing: bool = None, use_multiprocessing_on_debug: bool = None,
                  max_number_multiprocessing: int = None, start_script: Union[Callable, str] = None,
-                 overwrite_lazy_data: bool = None, **kwargs):
+                 overwrite_lazy_data: bool = None, uncertainty_estimate_block_length: str = None,
+                 uncertainty_estimate_evaluate_competitors: bool = None, uncertainty_estimate_n_boots: int = None,
+                 do_uncertainty_estimate: bool = None, **kwargs):
 
         # create run framework
         super().__init__()
@@ -349,15 +354,28 @@ class ExperimentSetup(RunEnvironment):
                         default=DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS)
 
         # set post-processing instructions
-        self._set_param("evaluate_bootstraps", evaluate_bootstraps, default=DEFAULT_EVALUATE_BOOTSTRAPS,
-                        scope="general.postprocessing")
-        create_new_bootstraps = max([self.data_store.get("train_model", "general"),
-                                     create_new_bootstraps or DEFAULT_CREATE_NEW_BOOTSTRAPS])
-        self._set_param("create_new_bootstraps", create_new_bootstraps, scope="general.postprocessing")
-        self._set_param("number_of_bootstraps", number_of_bootstraps, default=DEFAULT_NUMBER_OF_BOOTSTRAPS,
-                        scope="general.postprocessing")
-        self._set_param("bootstrap_method", bootstrap_method, default=DEFAULT_BOOTSTRAP_METHOD)
-        self._set_param("bootstrap_type", bootstrap_type, default=DEFAULT_BOOTSTRAP_TYPE)
+        self._set_param("do_uncertainty_estimate", do_uncertainty_estimate,
+                        default=DEFAULT_DO_UNCERTAINTY_ESTIMATE, scope="general.postprocessing")
+        self._set_param("block_length", uncertainty_estimate_block_length,
+                        default=DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH, scope="uncertainty_estimate")
+        self._set_param("evaluate_competitors", uncertainty_estimate_evaluate_competitors,
+                        default=DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS, scope="uncertainty_estimate")
+        self._set_param("n_boots", uncertainty_estimate_n_boots,
+                        default=DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS, scope="uncertainty_estimate")
+
+        self._set_param("evaluate_feature_importance", evaluate_feature_importance,
+                        default=DEFAULT_EVALUATE_FEATURE_IMPORTANCE, scope="general.postprocessing")
+        feature_importance_create_new_bootstraps = max([self.data_store.get("train_model", "general"),
+                                                        feature_importance_create_new_bootstraps or
+                                                        DEFAULT_FEATURE_IMPORTANCE_CREATE_NEW_BOOTSTRAPS])
+        self._set_param("create_new_bootstraps", feature_importance_create_new_bootstraps, scope="feature_importance")
+        self._set_param("n_boots", feature_importance_n_boots, default=DEFAULT_FEATURE_IMPORTANCE_N_BOOTS,
+                        scope="feature_importance")
+        self._set_param("bootstrap_method", feature_importance_bootstrap_method,
+                        default=DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD, scope="feature_importance")
+        self._set_param("bootstrap_type", feature_importance_bootstrap_type,
+                        default=DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE, scope="feature_importance")
+
         self._set_param("plot_list", plot_list, default=DEFAULT_PLOT_LIST, scope="general.postprocessing")
         self._set_param("neighbors", ["DEBW030"])  # TODO: just for testing
 
@@ -384,8 +402,10 @@ class ExperimentSetup(RunEnvironment):
                 if len(self.data_store.search_name(k)) == 0:
                     self._set_param(k, v)
                 else:
+                    s = ", ".join([f"{k}({s})={self.data_store.get(k, scope=s)}"
+                                   for s in self.data_store.search_name(k)])
                     raise KeyError(f"Given argument {k} with value {v} cannot be set for this experiment due to a "
-                                   f"conflict with an existing entry with same naming: {k}={self.data_store.get(k)}")
+                                   f"conflict with an existing entry with same naming: {s}")
 
     def _set_param(self, param: str, value: Any, default: Any = None, scope: str = "general",
                    apply: Callable = None) -> Any:
diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py
index fa6326050b611ed4e27d75867e920b7e70d2fa9f..25bd4465556a594393050f80f2a189f4f8039e1b 100644
--- a/mlair/run_modules/post_processing.py
+++ b/mlair/run_modules/post_processing.py
@@ -16,13 +16,14 @@ import pandas as pd
 import xarray as xr
 
 from mlair.configuration import path_config
-from mlair.data_handler import BootStraps, KerasIterator
+from mlair.data_handler import Bootstraps, KerasIterator
 from mlair.helpers.datastore import NameNotFoundInDataStore
 from mlair.helpers import TimeTracking, statistics, extract_value, remove_items, to_list, tables
 from mlair.model_modules.linear_model import OrdinaryLeastSquaredModel
 from mlair.model_modules import AbstractModelClass
 from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotClimatologicalSkillScore, \
-    PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotConditionalQuantiles, PlotSeparationOfScales
+    PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotConditionalQuantiles, \
+    PlotSeparationOfScales, PlotSampleUncertaintyFromBootstrap
 from mlair.plotting.data_insight_plotting import PlotStationMap, PlotAvailability, PlotAvailabilityHistogram, \
     PlotPeriodogram, PlotDataHistogram
 from mlair.run_modules.run_environment import RunEnvironment
@@ -48,7 +49,7 @@ class PostProcessing(RunEnvironment):
         * `target_var` [.]
         * `sampling` [.]
         * `output_shape` [model]
-        * `evaluate_bootstraps` [postprocessing] and if enabled:
+        * `evaluate_feature_importance` [postprocessing] and if enabled:
 
             * `create_new_bootstraps` [postprocessing]
             * `bootstrap_path` [postprocessing]
@@ -83,11 +84,17 @@ class PostProcessing(RunEnvironment):
         self._sampling = self.data_store.get("sampling")
         self.window_lead_time = extract_value(self.data_store.get("output_shape", "model"))
         self.skill_scores = None
-        self.bootstrap_skill_scores = None
+        self.feature_importance_skill_scores = None
+        self.uncertainty_estimate = None
         self.competitor_path = self.data_store.get("competitor_path")
         self.competitors = to_list(self.data_store.get_default("competitors", default=[]))
         self.forecast_indicator = "nn"
+        self.observation_indicator = "obs"
         self.ahead_dim = "ahead"
+        self.boot_var_dim = "boot_var"
+        self.uncertainty_estimate_boot_dim = "boots"
+        self.model_type_dim = "type"
+        self.index_dim = "index"
         self._run()
 
     def _run(self):
@@ -101,25 +108,132 @@ class PostProcessing(RunEnvironment):
         # calculate error metrics on test data
         self.calculate_test_score()
 
-        # bootstraps
-        if self.data_store.get("evaluate_bootstraps", "postprocessing"):
-            with TimeTracking(name="calculate bootstraps"):
-                create_new_bootstraps = self.data_store.get("create_new_bootstraps", "postprocessing")
-                bootstrap_method = self.data_store.get("bootstrap_method", "postprocessing")
-                bootstrap_type = self.data_store.get("bootstrap_type", "postprocessing")
-                self.bootstrap_postprocessing(create_new_bootstraps, bootstrap_type=bootstrap_type,
-                                              bootstrap_method=bootstrap_method)
+        # sample uncertainty
+        if self.data_store.get("do_uncertainty_estimate", "postprocessing"):
+            self.estimate_sample_uncertainty()
+
+        # feature importance bootstraps
+        if self.data_store.get("evaluate_feature_importance", "postprocessing"):
+            with TimeTracking(name="calculate feature importance using bootstraps"):
+                create_new_bootstraps = self.data_store.get("create_new_bootstraps", "feature_importance")
+                bootstrap_method = self.data_store.get("bootstrap_method", "feature_importance")
+                bootstrap_type = self.data_store.get("bootstrap_type", "feature_importance")
+                self.calculate_feature_importance(create_new_bootstraps, bootstrap_type=bootstrap_type,
+                                                  bootstrap_method=bootstrap_method)
+            if self.feature_importance_skill_scores is not None:
+                self.report_feature_importance_results(self.feature_importance_skill_scores)
 
         # skill scores and error metrics
         with TimeTracking(name="calculate skill scores"):
-            skill_score_competitive, skill_score_climatological, errors = self.calculate_error_metrics()
+            skill_score_competitive, _, skill_score_climatological, errors = self.calculate_error_metrics()
             self.skill_scores = (skill_score_competitive, skill_score_climatological)
         self.report_error_metrics(errors)
-        self.report_error_metrics(skill_score_climatological)
+        self.report_error_metrics({self.forecast_indicator: skill_score_climatological})
+        self.report_error_metrics({"skill_score": skill_score_competitive})
 
         # plotting
         self.plot()
 
+    def estimate_sample_uncertainty(self, separate_ahead=False):
+        """
+        Estimate sample uncertainty by using a bootstrap approach. Forecasts are split into individual blocks along
+        time and randomly drawn with replacement. The resulting spread of the error indicates how robust each analyzed
+        model is and which model might be superior compared to others.
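+
+        The result is stored in `self.uncertainty_estimate` as xr.DataArray with a dedicated bootstrap dimension and
+        the model type dimension preserved.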
+        """
+        n_boots = self.data_store.get_default("n_boots", default=1000, scope="uncertainty_estimate")
+        block_length = self.data_store.get_default("block_length", default="1m", scope="uncertainty_estimate")
+        evaluate_competitors = self.data_store.get_default("evaluate_competitors", default=True,
+                                                           scope="uncertainty_estimate")
+        block_mse = self.calculate_block_mse(evaluate_competitors=evaluate_competitors, separate_ahead=separate_ahead,
+                                             block_length=block_length)
+        self.uncertainty_estimate = statistics.create_n_bootstrap_realizations(
+            block_mse, dim_name_time=self.index_dim, dim_name_model=self.model_type_dim,
+            dim_name_boots=self.uncertainty_estimate_boot_dim, n_boots=n_boots)
+        self.report_sample_uncertainty()
+
+    def report_sample_uncertainty(self, percentiles: list = None):
+        """
+        Store raw results of the uncertainty estimate (netCDF) and calculate aggregate statistics, which are stored
+        as csv, markdown and latex.
+        """
+        report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report")
+        path_config.check_path_and_create(report_path)
+
+        # store raw results as nc
+        file_name = os.path.join(report_path, "uncertainty_estimate_raw_results.nc")
+        self.uncertainty_estimate.to_netcdf(path=file_name)
+
+        # store statistics
+        if percentiles is None:
+            percentiles = [.05, .1, .25, .5, .75, .9, .95]
+        df_descr = self.uncertainty_estimate.to_pandas().describe(percentiles=percentiles).astype("float32")
+        column_format = tables.create_column_format_for_tex(df_descr)
+        file_name = os.path.join(report_path, "uncertainty_estimate_statistics.%s")
+        tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df_descr)
+        tables.save_to_md(report_path, file_name % "md", df=df_descr)
+        df_descr.to_csv(file_name % "csv", sep=";")
+
+    def calculate_block_mse(self, evaluate_competitors=True, separate_ahead=False, block_length="1m"):
+        """
+        Transform data into blocks along the time axis. Block length can be any frequency like '1m' or '7d'. Data
+        are only split along the time axis, which means that a single block can vary strongly in the number of
+        stations and the amount of data contained. This is intended to analyze the robustness not only over time but
+        also against the number of observations and the diversity of stations.
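+
+        The returned xr.DataArray contains the MSE per block and model type, averaged over all stations (and over
+        lead times unless `separate_ahead` is True).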
+        """
+        path = self.data_store.get("forecast_path")
+        all_stations = self.data_store.get("stations")
+        start = self.data_store.get("start", "test")
+        end = self.data_store.get("end", "test")
+        index_dim = self.index_dim
+        coll_dim = "station"
+        collector = []
+        for station in all_stations:
+            # test data
+            external_data = self._get_external_data(station, path)
+            if external_data is not None:
+                pass
+            # competitors
+            if evaluate_competitors is True:
+                competitor = self.load_competitors(station)
+                combined = self._combine_forecasts(external_data, competitor, dim=self.model_type_dim)
+            else:
+                combined = external_data
+
+            if combined is None:
+                continue
+            else:
+                combined = self.create_full_time_dim(combined, index_dim, self._sampling, start, end)
+                # get squared errors
+                errors = self.create_error_array(combined)
+                # calc mse for each block (single station)
+                mse = errors.resample(indexer={index_dim: block_length}).mean(skipna=True)
+                collector.append(mse.assign_coords({coll_dim: station}))
+        # calc mse for each block (average over all stations)
+        mse_blocks = xr.concat(collector, dim=coll_dim).mean(dim=coll_dim, skipna=True)
+        # average also on ahead steps
+        if separate_ahead is False:
+            mse_blocks = mse_blocks.mean(dim=self.ahead_dim, skipna=True)
+        return mse_blocks
+
+    def create_error_array(self, data):
+        """Calculate squared error of all given time series in relation to observation."""
+        errors = data.drop_sel({self.model_type_dim: self.observation_indicator})
+        errors = (errors - data.sel({self.model_type_dim: self.observation_indicator})) ** 2
+        return errors
+
+    @staticmethod
+    def create_full_time_dim(data, dim, sampling, start, end):
+        """Ensure time dimension to be equidistant. Sometimes dates if missing values have been dropped."""
+        start_data = data.coords[dim].values[0]
+        freq = {"daily": "1D", "hourly": "1H"}.get(sampling)
+        datetime_index = pd.DataFrame(index=pd.date_range(start, end, freq=freq))
+        t = data.sel({dim: start_data}, drop=True)
+        res = xr.DataArray(coords=[datetime_index.index, *[t.coords[c] for c in t.coords]], dims=[dim, *t.coords])
+        res = res.transpose(*data.dims)
+        res.loc[data.coords] = data
+        return res
+
     def load_competitors(self, station_name: str) -> xr.DataArray:
         """
         Load all requested and available competitors for a given station. Forecasts must be available in the competitor
@@ -139,10 +253,10 @@ class PostProcessing(RunEnvironment):
             except (FileNotFoundError, KeyError):
                 logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.")
                 continue
-        return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None
+        return xr.concat(competing_predictions, self.model_type_dim) if len(competing_predictions) > 0 else None
 
-    def bootstrap_postprocessing(self, create_new_bootstraps: bool, _iter: int = 0, bootstrap_type="singleinput",
-                                 bootstrap_method="shuffle") -> None:
+    def calculate_feature_importance(self, create_new_bootstraps: bool, _iter: int = 0, bootstrap_type="singleinput",
+                                     bootstrap_method="shuffle") -> None:
         """
         Calculate skill scores of bootstrapped data.
 
@@ -155,26 +269,29 @@ class PostProcessing(RunEnvironment):
         :param _iter: internal counter to reduce unnecessary recursive calls (maximum number is 2, otherwise something
             went wrong).
         """
-        self.bootstrap_skill_scores = {}
+        if _iter == 0:
+            self.feature_importance_skill_scores = {}
         for boot_type in to_list(bootstrap_type):
-            self.bootstrap_skill_scores[boot_type] = {}
+            self.feature_importance_skill_scores[boot_type] = {}
             for boot_method in to_list(bootstrap_method):
                 try:
                     if create_new_bootstraps:
-                        self.create_bootstrap_forecast(bootstrap_type=boot_type, bootstrap_method=boot_method)
-                    boot_skill_score = self.calculate_bootstrap_skill_scores(bootstrap_type=boot_type,
-                                                                             bootstrap_method=boot_method)
-                    self.bootstrap_skill_scores[boot_type][boot_method] = boot_skill_score
+                        self.create_feature_importance_bootstrap_forecast(bootstrap_type=boot_type,
+                                                                          bootstrap_method=boot_method)
+                    boot_skill_score = self.calculate_feature_importance_skill_scores(bootstrap_type=boot_type,
+                                                                                      bootstrap_method=boot_method)
+                    self.feature_importance_skill_scores[boot_type][boot_method] = boot_skill_score
                 except FileNotFoundError:
                     if _iter != 0:
-                        raise RuntimeError(f"bootstrap_postprocessing ({boot_type}, {boot_type}) was called for the 2nd"
-                                           f" time. This means, that something internally goes wrong. Please check for "
-                                           f"possible errors")
-                    logging.info(f"Could not load all files for bootstrapping ({boot_type}, {boot_type}), restart "
-                                 f"bootstrap postprocessing with create_new_bootstraps=True.")
-                    self.bootstrap_postprocessing(True, _iter=1, bootstrap_type=boot_type, bootstrap_method=boot_method)
+                        raise RuntimeError(f"calculate_feature_importance ({boot_type}, {boot_method}) was called for "
+                                           f"the 2nd time. This means that something internally went wrong. Please "
+                                           f"check for possible errors.")
+                    logging.info(f"Could not load all files for feature importance ({boot_type}, {boot_method}), "
+                                 f"restart calculate_feature_importance with create_new_bootstraps=True.")
+                    self.calculate_feature_importance(True, _iter=1, bootstrap_type=boot_type,
+                                                      bootstrap_method=boot_method)
 
-    def create_bootstrap_forecast(self, bootstrap_type, bootstrap_method) -> None:
+    def create_feature_importance_bootstrap_forecast(self, bootstrap_type, bootstrap_method) -> None:
         """
         Create bootstrapped predictions for all stations and variables.
 
@@ -185,11 +302,11 @@ class PostProcessing(RunEnvironment):
         with TimeTracking(name=f"{inspect.stack()[0].function} ({bootstrap_type}, {bootstrap_method})"):
             # extract all requirements from data store
             forecast_path = self.data_store.get("forecast_path")
-            number_of_bootstraps = self.data_store.get("number_of_bootstraps", "postprocessing")
-            dims = ["index", self.ahead_dim, "type"]
+            number_of_bootstraps = self.data_store.get("n_boots", "feature_importance")
+            dims = [self.index_dim, self.ahead_dim, self.model_type_dim]
             for station in self.test_data:
                 X, Y = None, None
-                bootstraps = BootStraps(station, number_of_bootstraps, bootstrap_type=bootstrap_type,
+                bootstraps = Bootstraps(station, number_of_bootstraps, bootstrap_type=bootstrap_type,
                                         bootstrap_method=bootstrap_method)
                 for boot in bootstraps:
                     X, Y, (index, dimension) = boot
@@ -210,10 +327,10 @@ class PostProcessing(RunEnvironment):
                     # store also true labels for each station
                     labels = np.expand_dims(Y, axis=-1)
                     file_name = os.path.join(forecast_path, f"bootstraps_{station}_{bootstrap_method}_labels.nc")
-                    labels = xr.DataArray(labels, coords=(*coords, ["obs"]), dims=dims)
+                    labels = xr.DataArray(labels, coords=(*coords, [self.observation_indicator]), dims=dims)
                     labels.to_netcdf(file_name)
 
-    def calculate_bootstrap_skill_scores(self, bootstrap_type, bootstrap_method) -> Dict[str, xr.DataArray]:
+    def calculate_feature_importance_skill_scores(self, bootstrap_type, bootstrap_method) -> Dict[str, xr.DataArray]:
         """
         Calculate skill score of bootstrapped variables.
 
@@ -226,10 +343,11 @@ class PostProcessing(RunEnvironment):
         with TimeTracking(name=f"{inspect.stack()[0].function} ({bootstrap_type}, {bootstrap_method})"):
             # extract all requirements from data store
             forecast_path = self.data_store.get("forecast_path")
-            number_of_bootstraps = self.data_store.get("number_of_bootstraps", "postprocessing")
+            number_of_bootstraps = self.data_store.get("n_boots", "feature_importance")
             forecast_file = f"forecasts_norm_%s_test.nc"
+            reference_name = "orig"
 
-            bootstraps = BootStraps(self.test_data[0], number_of_bootstraps, bootstrap_type=bootstrap_type,
+            bootstraps = Bootstraps(self.test_data[0], number_of_bootstraps, bootstrap_type=bootstrap_type,
                                     bootstrap_method=bootstrap_method)
             number_of_bootstraps = bootstraps.number_of_bootstraps
             bootstrap_iter = bootstraps.bootstraps()
@@ -245,8 +363,8 @@ class PostProcessing(RunEnvironment):
                 # get original forecasts
                 orig = self.get_orig_prediction(forecast_path, forecast_file % str(station), number_of_bootstraps)
                 orig = orig.reshape(shape)
-                coords = (range(shape[0]), range(1, shape[1] + 1), ["orig"])
-                orig = xr.DataArray(orig, coords=coords, dims=["index", self.ahead_dim, "type"])
+                coords = (range(shape[0]), range(1, shape[1] + 1), [reference_name])
+                orig = xr.DataArray(orig, coords=coords, dims=[self.index_dim, self.ahead_dim, self.model_type_dim])
 
                 # calculate skill scores for each variable
                 skill = pd.DataFrame(columns=range(1, self.window_lead_time + 1))
@@ -261,11 +379,12 @@ class PostProcessing(RunEnvironment):
                     for ahead in range(1, self.window_lead_time + 1):
                         data = boot_data.sel({self.ahead_dim: ahead})
                         boot_scores.append(
-                            skill_scores.general_skill_score(data, forecast_name=boot_var, reference_name="orig"))
+                            skill_scores.general_skill_score(data, forecast_name=boot_var,
+                                                             reference_name=reference_name))
                     skill.loc[boot_var] = np.array(boot_scores)
 
                 # collect all results in single dictionary
-                score[str(station)] = xr.DataArray(skill, dims=["boot_var", self.ahead_dim])
+                score[str(station)] = xr.DataArray(skill, dims=[self.boot_var_dim, self.ahead_dim])
             return score
 
     def get_orig_prediction(self, path, file_name, number_of_bootstraps, prediction_name=None):
@@ -342,8 +461,8 @@ class PostProcessing(RunEnvironment):
                           f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}")
 
         try:
-            if (self.bootstrap_skill_scores is not None) and ("PlotBootstrapSkillScore" in plot_list):
-                for boot_type, boot_data in self.bootstrap_skill_scores.items():
+            if (self.feature_importance_skill_scores is not None) and ("PlotBootstrapSkillScore" in plot_list):
+                for boot_type, boot_data in self.feature_importance_skill_scores.items():
                     for boot_method, boot_skill_score in boot_data.items():
                         try:
                             PlotBootstrapSkillScore(boot_skill_score, plot_folder=self.plot_path,
@@ -453,6 +572,17 @@ class PostProcessing(RunEnvironment):
             logging.error(f"Could not create plot PlotDataHistogram due to the following error: {e}"
                           f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}")
 
+        try:
+            if "PlotSampleUncertaintyFromBootstrap" in plot_list and self.uncertainty_estimate is not None:
+                block_length = self.data_store.get_default("block_length", default="1m", scope="uncertainty_estimate")
+                PlotSampleUncertaintyFromBootstrap(
+                    data=self.uncertainty_estimate, plot_folder=self.plot_path, model_type_dim=self.model_type_dim,
+                    dim_name_boots=self.uncertainty_estimate_boot_dim, error_measure="mean squared error",
+                    error_unit=r"ppb$^2$", block_length=block_length)
+        except Exception as e:
+            logging.error(f"Could not create plot PlotSampleUncertaintyFromBootstrap due to the following error: {e}"
+                          f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}")
+
     def calculate_test_score(self):
         """Evaluate test score of model and save locally."""
 
@@ -512,10 +642,11 @@ class PostProcessing(RunEnvironment):
                 full_index = self.create_fullindex(observation_data.indexes[time_dimension], self._get_frequency())
                 prediction_dict = {self.forecast_indicator: nn_prediction,
                                    "persi": persistence_prediction,
-                                   "obs": observation,
+                                   self.observation_indicator: observation,
                                    "ols": ols_prediction}
                 all_predictions = self.create_forecast_arrays(full_index, list(target_data.indexes[window_dim]),
                                                               time_dimension, ahead_dim=self.ahead_dim,
+                                                              index_dim=self.index_dim, type_dim=self.model_type_dim,
                                                               **prediction_dict)
 
                 # save all forecasts locally
@@ -545,7 +676,7 @@ class PostProcessing(RunEnvironment):
         with xr.open_dataarray(file) as da:
             data = da.load()
         forecast = data.sel(type=[self.forecast_indicator])
-        forecast.coords["type"] = [competitor_name]
+        forecast.coords[self.model_type_dim] = [competitor_name]
         return forecast
 
     def _create_observation(self, data, _, transformation_func: Callable, normalised: bool) -> xr.DataArray:
@@ -675,7 +806,7 @@ class PostProcessing(RunEnvironment):
 
     @staticmethod
     def create_forecast_arrays(index: pd.DataFrame, ahead_names: List[Union[str, int]], time_dimension,
-                               ahead_dim="ahead", **kwargs):
+                               ahead_dim="ahead", index_dim="index", type_dim="type", **kwargs):
         """
         Combine different forecast types into single xarray.
 
@@ -688,7 +819,7 @@ class PostProcessing(RunEnvironment):
         """
         keys = list(kwargs.keys())
         res = xr.DataArray(np.full((len(index.index), len(ahead_names), len(keys)), np.nan),
-                           coords=[index.index, ahead_names, keys], dims=['index', ahead_dim, 'type'])
+                           coords=[index.index, ahead_names, keys], dims=[index_dim, ahead_dim, type_dim])
         for k, v in kwargs.items():
             intersection = set(res.index.values) & set(v.indexes[time_dimension].values)
             match_index = np.array(list(intersection))
@@ -727,18 +858,19 @@ class PostProcessing(RunEnvironment):
         except (IndexError, KeyError, FileNotFoundError):
             return None
 
-    @staticmethod
-    def _combine_forecasts(forecast, competitor, dim="type"):
+    def _combine_forecasts(self, forecast, competitor, dim=None):
         """
         Combine forecast and competitor if both are xarray. If competitor is None, this returns forecasts and vise
         versa.
         """
+        if dim is None:
+            dim = self.model_type_dim
         try:
             return xr.concat([forecast, competitor], dim=dim)
         except (TypeError, AttributeError):
             return forecast if competitor is None else competitor
 
-    def calculate_error_metrics(self) -> Tuple[Dict, Dict, Dict]:
+    def calculate_error_metrics(self) -> Tuple[Dict, Dict, Dict, Dict]:
         """
         Calculate error metrics and skill scores of NN forecast.
 
@@ -751,6 +883,7 @@ class PostProcessing(RunEnvironment):
         path = self.data_store.get("forecast_path")
         all_stations = self.data_store.get("stations")
         skill_score_competitive = {}
+        skill_score_competitive_count = {}
         skill_score_climatological = {}
         errors = {}
         for station in all_stations:
@@ -758,24 +891,61 @@ class PostProcessing(RunEnvironment):
 
             # test errors
             if external_data is not None:
-                errors[station] = statistics.calculate_error_metrics(*map(lambda x: external_data.sel(type=x),
-                                                                          [self.forecast_indicator, "obs"]),
-                                                                     dim="index")
-            # skill score
+                model_type_list = external_data.coords[self.model_type_dim].values.tolist()
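+                # calculate error metrics for each model type in the forecast file, excluding the observations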
+                for model_type in remove_items(model_type_list, self.observation_indicator):
+                    if model_type not in errors.keys():
+                        errors[model_type] = {}
+                    errors[model_type][station] = statistics.calculate_error_metrics(
+                        *map(lambda x: external_data.sel(**{self.model_type_dim: x}),
+                             [model_type, self.observation_indicator]), dim=self.index_dim)
+
+            # load competitors
             competitor = self.load_competitors(station)
-            combined = self._combine_forecasts(external_data, competitor, dim="type")
-            model_list = remove_items(list(combined.type.values), "obs") if combined is not None else None
+            combined = self._combine_forecasts(external_data, competitor, dim=self.model_type_dim)
+            if combined is not None:
+                model_list = remove_items(combined.coords[self.model_type_dim].values.tolist(),
+                                          self.observation_indicator)
+            else:
+                model_list = None
+
+            # test errors of competitors
+            for model_type in remove_items(model_list or [], list(errors.keys())):
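+                # error metrics require observations, skip if they are missing in the combined data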
+                if self.observation_indicator not in combined.coords[self.model_type_dim]:
+                    continue
+                if model_type not in errors.keys():
+                    errors[model_type] = {}
+                errors[model_type][station] = statistics.calculate_error_metrics(
+                    *map(lambda x: combined.sel(**{self.model_type_dim: x}),
+                         [model_type, self.observation_indicator]), dim=self.index_dim)
+
+            # skill score
             skill_score = statistics.SkillScores(combined, models=model_list, ahead_dim=self.ahead_dim)
             if external_data is not None:
-                skill_score_competitive[station] = skill_score.skill_scores()
+                skill_score_competitive[station], skill_score_competitive_count[station] = skill_score.skill_scores()
 
             internal_data = self._get_internal_data(station, path)
             if internal_data is not None:
                 skill_score_climatological[station] = skill_score.climatological_skill_scores(
                     internal_data, forecast_name=self.forecast_indicator)
 
-        errors.update({"total": self.calculate_average_errors(errors)})
-        return skill_score_competitive, skill_score_climatological, errors
+        for model_type in errors.keys():
+            errors[model_type].update({"total": self.calculate_average_errors(errors[model_type])})
+        skill_score_competitive.update({"total": self.calculate_average_skill_scores(skill_score_competitive,
+                                                                                     skill_score_competitive_count)})
+        return skill_score_competitive, skill_score_competitive_count, skill_score_climatological, errors
+
+    @staticmethod
+    def calculate_average_skill_scores(scores, counts):
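+        """Average skill scores over all stations, weighting each station by its number of samples."""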
+        avg_skill_score = 0
+        n_total = None
+        for vals in counts.values():
+            n_total = vals if n_total is None else n_total.add(vals, fill_value=0)
+        for station, station_scores in scores.items():
+            n = counts.get(station)
+            avg_skill_score = station_scores.mul(n.div(n_total, fill_value=0), fill_value=0).add(avg_skill_score,
+                                                                                                 fill_value=0)
+        return avg_skill_score
 
     @staticmethod
     def calculate_average_errors(errors):
@@ -788,28 +958,55 @@ class PostProcessing(RunEnvironment):
                 avg_error[error_metric] = new_val
         return avg_error
 
+    def report_feature_importance_results(self, results):
+        """Create a csv file containing all results from feature importance."""
+        report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report")
+        path_config.check_path_and_create(report_path)
+        res = [[self.model_type_dim, "method", "station", self.boot_var_dim, self.ahead_dim, "vals"]]
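+        # results are expected to be nested as {boot_type: {boot_method: {station: xr.DataArray}}}; every DataArray
+        # is indexed by the bootstrapped variable and the forecast ahead step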
+        for boot_type, d0 in results.items():
+            for boot_method, d1 in d0.items():
+                for station_name, vals in d1.items():
+                    for boot_var in vals.coords[self.boot_var_dim].values.tolist():
+                        for ahead in vals.coords[self.ahead_dim].values.tolist():
+                            res.append([boot_type, boot_method, station_name, boot_var, ahead,
+                                        float(vals.sel({self.boot_var_dim: boot_var, self.ahead_dim: ahead}))])
+        col_names = res.pop(0)
+        df = pd.DataFrame(res, columns=col_names)
+        file_name = "feature_importance_skill_score_report_raw.csv"
+        df.to_csv(os.path.join(report_path, file_name), sep=";")
+
     def report_error_metrics(self, errors):
         report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report")
         path_config.check_path_and_create(report_path)
-        metric_collection = {}
-        for station, station_errors in errors.items():
-            if isinstance(station_errors, xr.DataArray):
-                dim = station_errors.dims[0]
-                sel_index = [sel for sel in station_errors.coords[dim] if "CASE" in str(sel)]
-                station_errors = {str(i.values): station_errors.sel(**{dim: i}) for i in sel_index}
-            for metric, vals in station_errors.items():
-                if metric == "n":
-                    continue
-                pd_vals = pd.DataFrame.from_dict({station: vals}).T
-                pd_vals.columns = [f"{metric}(t+{x})" for x in vals.coords["ahead"].values]
-                mc = metric_collection.get(metric, pd.DataFrame())
-                mc = mc.append(pd_vals)
-                metric_collection[metric] = mc
-        for metric, error_df in metric_collection.items():
-            df = error_df.sort_index()
-            if "total" in df.index:
-                df.reindex(df.index.drop(["total"]).to_list() + ["total"], )
-            column_format = tables.create_column_format_for_tex(df)
-            file_name = f"error_report_{metric}.%s".replace(' ', '_')
-            tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df)
-            tables.save_to_md(report_path, file_name % "md", df=df)
+        for model_type in errors.keys():
+            metric_collection = {}
+            for station, station_errors in errors[model_type].items():
+                if isinstance(station_errors, xr.DataArray):
+                    dim = station_errors.dims[0]
+                    sel_index = [sel for sel in station_errors.coords[dim] if "CASE" in str(sel)]
+                    station_errors = {str(i.values): station_errors.sel(**{dim: i}) for i in sel_index}
+                elif isinstance(station_errors, pd.DataFrame):
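+                    # convert each DataFrame row into a DataArray over the ahead columns, so it can be handled like
+                    # the xarray case above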
+                    sel_index = station_errors.index.tolist()
+                    ahead = station_errors.columns.values
+                    station_errors = {k: xr.DataArray(station_errors[station_errors.index == k].values.flatten(),
+                                                      dims=["ahead"], coords={"ahead": ahead}).astype(float)
+                                      for k in sel_index}
+                for metric, vals in station_errors.items():
+                    if metric == "n":
+                        metric = "count"
+                    pd_vals = pd.DataFrame.from_dict({station: vals}).T
+                    pd_vals.columns = [f"{metric}(t+{x})" for x in vals.coords["ahead"].values]
+                    mc = metric_collection.get(metric, pd.DataFrame())
+                    mc = mc.append(pd_vals)
+                    metric_collection[metric] = mc
+            for metric, error_df in metric_collection.items():
+                df = error_df.sort_index()
+                if "total" in df.index:
+                    df.reindex(df.index.drop(["total"]).to_list() + ["total"], )
+                column_format = tables.create_column_format_for_tex(df)
+                if model_type == "skill_score":
+                    file_name = f"error_report_{model_type}_{metric}.%s".replace(' ', '_')
+                else:
+                    file_name = f"error_report_{metric}_{model_type}.%s".replace(' ', '_')
+                tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df)
+                tables.save_to_md(report_path, file_name % "md", df=df)
diff --git a/mlair/workflows/default_workflow.py b/mlair/workflows/default_workflow.py
index 5894555a6af52299efcd8d88d76c0d3791a1599e..961979cb774e928bda96d4cd1a3a7b0f8565e968 100644
--- a/mlair/workflows/default_workflow.py
+++ b/mlair/workflows/default_workflow.py
@@ -31,7 +31,6 @@ class DefaultWorkflow(Workflow):
                  permute_data_on_training=None, extreme_values=None, extremes_on_right_tail_only=None,
                  transformation=None,
                  train_min_length=None, val_min_length=None, test_min_length=None,
-                 evaluate_bootstraps=None, number_of_bootstraps=None, create_new_bootstraps=None,
                  plot_list=None,
                  model=None,
                  batch_size=None,
diff --git a/run.py b/run.py
index 1d607acfc8a789080a38da9ee546e516be845797..82bb0e2814d403b5be602eaebd1bc44b6cf6d6f9 100644
--- a/run.py
+++ b/run.py
@@ -28,8 +28,8 @@ def main(parser_args):
         # stations=["DEBW087","DEBW013", "DEBW107",  "DEBW076"],
         stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"],
         train_model=False, create_new_model=True, network="UBA",
-        evaluate_bootstraps=False,  # plot_list=["PlotCompetitiveSkillScore"],
-        competitors=["test_model", "test_model2"], # model=chosen_model,
+        evaluate_feature_importance=False,  # plot_list=["PlotCompetitiveSkillScore"],
+        competitors=["test_model", "test_model2"],
         competitor_path=os.path.join(os.getcwd(), "data", "comp_test"),
         **parser_args.__dict__, start_script=__file__)
     workflow.run()
diff --git a/run_climate_filter.py b/run_climate_filter.py
index 5577375e2fc135676f71151791c1d564dcb25a2e..7f3fcbaaba94506faeadd884073620496155f8ea 100644
--- a/run_climate_filter.py
+++ b/run_climate_filter.py
@@ -65,9 +65,9 @@ def main(parser_args):
         train_model=False, create_new_model=True,
         epochs=1,
         model=NN,
-        evaluate_bootstraps=False,
-        bootstrap_type=["singleinput", "branch", "variable"],
-        bootstrap_method=["shuffle", "zero_mean"],
+        evaluate_feature_importance=False,
+        feature_importance_bootstrap_type=["singleinput", "branch", "variable"],
+        feature_importance_bootstrap_method=["shuffle", "zero_mean"],
         plot_list=DEFAULT_PLOT_LIST,
         # competitors=["IntelliO3-ts-v1", "MFCN_64_32_16_climFIR", "FCN_1449_512_32_4"],  # "FCN_1449_16_8_4",],
         lazy_preprocessing=True,
diff --git a/run_mixed_sampling.py b/run_mixed_sampling.py
index 819ef51129854b4539632ef91a55e33a2607eb55..784f653fbfb2eb4c78e6e858acf67cd0ae47a593 100644
--- a/run_mixed_sampling.py
+++ b/run_mixed_sampling.py
@@ -20,7 +20,7 @@ data_origin = {'o3': '', 'no': '', 'no2': '',
 def main(parser_args):
     args = dict(stations=["DEBW107", "DEBW013"],
                 network="UBA",
-                evaluate_bootstraps=False, plot_list=[],
+                evaluate_feature_importance=False, plot_list=[],
                 data_origin=data_origin, data_handler=DataHandlerMixedSampling,
                 interpolation_limit=(3, 1), overwrite_local_data=False,
                 sampling=("hourly", "daily"),
diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py
index b6bdd9556f73ff711003b01c3a2b65a1c20c66d3..8644181185203186bb6c8549e8faa99e75a31a81 100644
--- a/test/test_configuration/test_defaults.py
+++ b/test/test_configuration/test_defaults.py
@@ -62,10 +62,18 @@ class TestAllDefaults:
         assert DEFAULT_HPC_LOGIN_LIST == ["ju", "hdfmll"]
 
     def test_postprocessing_parameters(self):
-        assert DEFAULT_EVALUATE_BOOTSTRAPS is True
-        assert DEFAULT_CREATE_NEW_BOOTSTRAPS is False
-        assert DEFAULT_NUMBER_OF_BOOTSTRAPS == 20
+        assert DEFAULT_DO_UNCERTAINTY_ESTIMATE is True
+        assert DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH == "1m"
+        assert DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS is True
+        assert DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS == 1000
+        assert DEFAULT_EVALUATE_FEATURE_IMPORTANCE is True
+        assert DEFAULT_FEATURE_IMPORTANCE_CREATE_NEW_BOOTSTRAPS is False
+        assert DEFAULT_FEATURE_IMPORTANCE_N_BOOTS == 20
+        assert DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE == "singleinput"
+        assert DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD == "shuffle"
         assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore",
                                      "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore",
                                      "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram",
-                                     "PlotDataHistogram", "PlotPeriodogram"]
+                                     "PlotDataHistogram", "PlotPeriodogram", "PlotSampleUncertaintyFromBootstrap"]
diff --git a/test/test_data_handler/old_t_bootstraps.py b/test/test_data_handler/old_t_bootstraps.py
index 21c18c6c2d6f6a6a38a41250f00d3d14a29ed457..e21af9f614d4cfaaadf946cdab72cc69cd5b19a7 100644
--- a/test/test_data_handler/old_t_bootstraps.py
+++ b/test/test_data_handler/old_t_bootstraps.py
@@ -7,7 +7,7 @@ import numpy as np
 import pytest
 import xarray as xr
 
-from mlair.data_handler.bootstraps import BootStraps
+from mlair.data_handler.input_bootstraps import Bootstraps
 from src.data_handler import DataPrepJoin
 
 
@@ -171,22 +171,22 @@ class TestBootStraps:
 
     @pytest.fixture
     def bootstrap(self, orig_generator, data_path):
-        return BootStraps(orig_generator, data_path, 20)
+        return Bootstraps(orig_generator, data_path, 20)
 
     @pytest.fixture
     @mock.patch("mlair.data_handling.bootstraps.CreateShuffledData", return_value=None)
     def bootstrap_no_shuffling(self, mock_create_shuffle_data, orig_generator, data_path):
         shutil.rmtree(data_path)
-        return BootStraps(orig_generator, data_path, 20)
+        return Bootstraps(orig_generator, data_path, 20)
 
     def test_init_no_shuffling(self, bootstrap_no_shuffling, data_path):
-        assert isinstance(bootstrap_no_shuffling, BootStraps)
+        assert isinstance(bootstrap_no_shuffling, Bootstraps)
         assert bootstrap_no_shuffling.number_of_bootstraps == 20
         assert bootstrap_no_shuffling.bootstrap_path == data_path
 
     def test_init_with_shuffling(self, orig_generator, data_path, caplog):
         caplog.set_level(logging.INFO)
-        BootStraps(orig_generator, data_path, 20)
+        Bootstraps(orig_generator, data_path, 20)
         assert caplog.record_tuples[0] == ('root', logging.INFO, "create / check shuffled bootstrap data")
 
     def test_stations(self, bootstrap_no_shuffling, orig_generator):
@@ -213,9 +213,9 @@ class TestBootStraps:
 
     @mock.patch("mlair.data_handling.data_generator.DataGenerator._load_pickle_data", side_effect=FileNotFoundError)
     def test_get_generator_different_generator(self, mock_load_pickle, data_path, orig_generator):
-        BootStraps(orig_generator, data_path, 20)  # to create
+        Bootstraps(orig_generator, data_path, 20)  # to create
         orig_generator.window_history_size = 4
-        bootstrap = BootStraps(orig_generator, data_path, 20)
+        bootstrap = Bootstraps(orig_generator, data_path, 20)
         station = bootstrap.stations[0]
         var = bootstrap.variables[0]
         var_others = bootstrap.variables[1:]
diff --git a/test/test_helpers/test_statistics.py b/test/test_helpers/test_statistics.py
index 2a77f0806b886bee1a3961ff0a972e8f0ee62873..f5148cdc293939d5711afb57c2fa009c47b6c86d 100644
--- a/test/test_helpers/test_statistics.py
+++ b/test/test_helpers/test_statistics.py
@@ -4,7 +4,8 @@ import pytest
 import xarray as xr
 
 from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, \
-    centre_apply, apply_inverse_transformation, min_max, min_max_inverse, min_max_apply, log, log_inverse, log_apply
+    centre_apply, apply_inverse_transformation, min_max, min_max_inverse, min_max_apply, log, log_inverse, log_apply, \
+    create_single_bootstrap_realization, calculate_average, create_n_bootstrap_realizations
 
 lazy = pytest.lazy_fixture
 
@@ -221,3 +222,36 @@ class TestLog:
         data_ref, opts = log(data_orig, dim)
         data_test = log_apply(data_orig, opts["mean"], opts["std"])
         assert np.testing.assert_array_almost_equal(data_ref, data_test) is None
+
+
+class TestCreateBootstrapRealizations:
+
+    @pytest.fixture
+    def data(self):
+        return xr.DataArray(np.array(range(20)).reshape(2, -1).T,
+                            dims=['time', 'model'],
+                            coords={'time': range(10), 'model': ['m1', 'm2']})
+
+    def test_create_single_bootstrap_realization(self, data):
+        np.random.seed(42)
+        proc_data = create_single_bootstrap_realization(data, "time")
+        assert isinstance(proc_data, xr.DataArray)
+        assert (proc_data.coords['time'].values == np.array([6, 3, 7, 4, 6, 9, 2, 6, 7, 4])).all()
+        # check if all time index values of proc_data are from data
+        assert np.in1d(proc_data.indexes['time'].values, data.indexes['time'].values).all()
+
+    def test_calculate_average(self, data):
+        assert isinstance(data, xr.DataArray)
+        assert calculate_average(data) == data.mean()
+        assert (calculate_average(data, axis=0) == data.mean(axis=0)).all()
+
+    def test_create_n_bootstrap_realizations(self, data):
+        boot_data = create_n_bootstrap_realizations(data, dim_name_time='time', dim_name_model='model',
+                                                    n_boots=1000, dim_name_boots='boots')
+        assert isinstance(boot_data, xr.DataArray)
+        assert boot_data.shape == (1000, 2)
+
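+        # a 1-dimensional input (single model, squeezed) is also supported and returns one value per bootstrap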
+        boot_data = create_n_bootstrap_realizations(data.sel(model='m1').squeeze(), dim_name_time='time',
+                                                    dim_name_model='model', n_boots=1000, dim_name_boots='boots')
+        assert isinstance(boot_data, xr.DataArray)
+        assert boot_data.shape == (1000,)