From 690fca578908175b2cd930e7645376a1642a9f49 Mon Sep 17 00:00:00 2001
From: "v.gramlich1" <v.gramlich@fz-juelich.de>
Date: Mon, 21 Jun 2021 16:21:37 +0200
Subject: [PATCH] Trying to make bins and rates_cap more flexible, inserting
 default values.

---
 mlair/configuration/defaults.py            |  5 ++--
 mlair/data_handler/default_data_handler.py | 32 ++++++++++++++++++++++
 mlair/run_modules/experiment_setup.py      | 11 +++++---
 mlair/run_modules/pre_processing.py        | 14 ++++++----
 4 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py
index 2b817f53..c6e61782 100644
--- a/mlair/configuration/defaults.py
+++ b/mlair/configuration/defaults.py
@@ -55,9 +55,8 @@ DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA
                        "pm10": "", "so2": ""}
 DEFAULT_USE_MULTIPROCESSING = True
 DEFAULT_USE_MULTIPROCESSING_ON_DEBUG = False
-
-DEFAULT_BINS = 10
-DEFAULT_RATES_CAP = 20
+DEFAULT_OVERSAMPLING_BINS = 10
+DEFAULT_OVERSAMPLING_RATES_CAP = 20
 
 
 def get_defaults():
diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py
index 11461ad7..0c6d2ddc 100644
--- a/mlair/data_handler/default_data_handler.py
+++ b/mlair/data_handler/default_data_handler.py
@@ -166,6 +166,38 @@ class DefaultDataHandler(AbstractDataHandler):
     def apply_transformation(self, data, base="target", dim=0, inverse=False):
         return self.id_class.apply_transformation(data, dim=dim, base=base, inverse=inverse)
 
+    def apply_oversampling(self, bin_edges, oversampling_rates):
+        self._load()
+        if (self._X is None) or (self._Y is None):
+            logging.debug(f"{str(self.id_class)} has no data for X or Y, skip multiply extremes")
+            return
+        Y = self._Y
+        X = self._X
+        for i_bin in range(len(bin_edges)-1):
+            bin_start = bin_edges[i_bin]
+            if i_bin == len(bin_edges) - 1:
+                bin_end = bin_edges[i_bin+1]+1
+            else:
+                bin_end = bin_edges[i_bin + 1]
+            rate = oversampling_rates[i_bin]
+
+            # extract extremes based on occurrence in labels
+            other_dims = remove_items(list(Y.dims), self.time_dim)
+            extreme_idx = xr.concat([(Y >= bin_start).any(dim=other_dims[0]),
+                                     (Y < bin_end).any(dim=other_dims[0])],
+                                    dim=other_dims[0]).all(dim=other_dims[0])
+
+            extremes_X = list(map(lambda x: x.sel(**{self.time_dim: extreme_idx}), X))
+            self._add_timedelta(extremes_X, dim, timedelta)
+            # extremes_X = list(map(lambda x: x.coords[dim].values + np.timedelta64(*timedelta), extremes_X))
+
+            extremes_Y = Y.sel(**{dim: extreme_idx})
+            extremes_Y.coords[dim].values += np.timedelta64(*timedelta)
+
+            self._Y_extreme = xr.concat([Y, extremes_Y], dim=dim)
+            self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=dim), X, extremes_X))
+
+
     def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False,
                           timedelta: Tuple[int, str] = (1, 'm'), dim=DEFAULT_TIME_DIM):
         """
diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py
index e28eb76d..b249491a 100644
--- a/mlair/run_modules/experiment_setup.py
+++ b/mlair/run_modules/experiment_setup.py
@@ -19,7 +19,7 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, \
     DEFAULT_VAL_MIN_LENGTH, DEFAULT_TEST_START, DEFAULT_TEST_END, DEFAULT_TEST_MIN_LENGTH, DEFAULT_TRAIN_VAL_MIN_LENGTH, \
     DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \
     DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM, \
-    DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG
+    DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, DEFAULT_OVERSAMPLING_BINS, DEFAULT_OVERSAMPLING_RATES_CAP
 from mlair.data_handler import DefaultDataHandler
 from mlair.run_modules.run_environment import RunEnvironment
 from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel
@@ -183,6 +183,9 @@ class ExperimentSetup(RunEnvironment):
     :param use_multiprocessing: Enable parallel preprocessing (postprocessing not implemented yet) by setting this
         parameter to `True` (default). If set to `False` the computation is performed in an serial approach.
         Multiprocessing is disabled when running in debug mode and cannot be switched on.
+    :param oversampling_bins: Sets the number of classes in which the training data is split. The training samples are then
+        oversampled according to the frequency of the different classes.
+    :param oversampling_rates_cap: Sets the maximum oversampling rate that is applied to a class
 
     """
 
@@ -216,7 +219,7 @@
                  hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None,
                  data_origin: Dict = None, competitors: list = None, competitor_path: str = None,
                  use_multiprocessing: bool = None, use_multiprocessing_on_debug: bool = None,
-                 bins=None, rates_cap=None, **kwargs):
+                 oversampling_bins=None, oversampling_rates_cap=None, **kwargs):
 
         # create run framework
         super().__init__()
@@ -362,8 +365,8 @@
         self._set_param("model_class", model, VanillaModel)
 
         # set params for oversampling
-        self._set_param("bins", bins, default=DEFAULT_BINS)
-        self._set_param("rates_cap", rates_cap, default=DEFAULT_RATES_CAP)
+        self._set_param("oversampling_bins", oversampling_bins, default=DEFAULT_OVERSAMPLING_BINS)
+        self._set_param("oversampling_rates_cap", oversampling_rates_cap, default=DEFAULT_OVERSAMPLING_RATES_CAP)
 
         # set remaining kwargs
         if len(kwargs) > 0:
diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py
index 92cdac47..4e41e847 100644
--- a/mlair/run_modules/pre_processing.py
+++ b/mlair/run_modules/pre_processing.py
@@ -74,8 +74,8 @@ class PreProcessing(RunEnvironment):
     def apply_oversampling(self):
         #if request for oversampling=True/False
         data = self.data_store.get('data_collection', 'train')
-        bins = self.data_store.get_default('bins')
-        rates_cap = self.data_store.get_default('rates_cap')
+        bins = self.data_store.get('oversampling_bins')
+        rates_cap = self.data_store.get('oversampling_rates_cap')
         histogram = np.array(bins)
         #get min and max of the whole data
         total_min = 0
@@ -83,9 +83,10 @@
         for station in data:
             total_min = np.minimum(np.amin(station.get_Y(as_numpy=True)), total_min)
             total_max = np.maximum(np.amax(station.get_Y(as_numpy=True)), total_max)
+        bin_edges = []
         for station in data:
             # Create histogram for each station
-            hist, _ = np.histogram(station.get_Y(as_numpy=True), bins=bins, range=(total_min,total_max))
+            hist, bin_edges = np.histogram(station.get_Y(as_numpy=True), bins=bins, range=(total_min,total_max))
             # Add up histograms
             histogram = histogram + hist
         # Scale down to most frequent class=1
@@ -94,8 +95,11 @@
         oversampling_rates = 1 / histogram
         oversampling_rates_capped = np.minimum(oversampling_rates, rates_cap)
         # Add to datastore
-        self.data_store.set('oversampling_rates', oversampling_rates, 'training')
-        self.data_store.set('oversampling_rates_capped', oversampling_rates_capped, 'training')
+        self.data_store.set('oversampling_rates', oversampling_rates, 'train')
+        self.data_store.set('oversampling_rates_capped', oversampling_rates_capped, 'train')
+        self.data_store.set('oversampling_bin_edges', bin_edges)
+        for station in data:
+            station.apply_oversampling(bin_edges, oversampling_rates_capped)
 
     def report_pre_processing(self):
         """Log some metrics on data and create latex report."""
-- 
GitLab