Commit 61486654 authored by leufen1

kzf per variable seems to be faster than over variables, check on HPC

parent de191bed
7 merge requests: !319 add all changes of dev into release v1.4.0 branch, !318 Resolve "release v1.4.0", !283 Merge latest develop into falcos issue, !279 include Develop, !278 Felix issue295 transformation parameters in data handler, !274 Resolve "implement lazy data preprocessing", !259 Draft: Resolve "WRF-Datahandler should inherit from SingleStationDatahandler"
Pipeline #63126 passed
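The timing question in the commit message compares the two implementations touched below: kz_filter loops over the "variables" dimension and filters each variable separately, while the new kz_filter_new applies a single rolling window across all variables at once. A minimal way to compare the two on a given data set could look like the following sketch; the compare_kz_variants helper, the kz filter instance, and the data array are illustrative assumptions, not part of this commit.

import time

def compare_kz_variants(kz, data, wl=3, itr=3):
    """Time the per-variable filter (kz_filter) against the all-variables variant (kz_filter_new)."""
    start = time.perf_counter()
    kz.kz_filter(data, wl, itr)          # loops over the "variables" dimension
    per_variable = time.perf_counter() - start

    start = time.perf_counter()
    kz.kz_filter_new(data, wl, itr)      # one rolling window over all variables at once
    over_variables = time.perf_counter() - start

    print(f"per variable: {per_variable:.2f}s, over variables: {over_variables:.2f}s")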
@@ -114,6 +114,12 @@ class DataHandlerMixedSamplingWithFilterSingleStation(DataHandlerMixedSamplingSi
        self._data = list(map(self.load_and_interpolate, [0, 1]))  # load input (0) and target (1) data
        self.set_inputs_and_targets()
        self.apply_kz_filter()
        # lazy data loading on the first call if possible:
        # * store the kz-filtered data locally in the data path under a separate folder, e.g. kzf_data
        # * create a checksum for the file name and reuse the stored data whenever the checksum matches (this would
        #   replace all previous steps and save a lot of computation time)
        # lazy creation of subsets by reusing as much as possible:
        # * start here when using preprocessed data, then select the new start and end
        if self.do_transformation is True:
            self.call_transform()
        self.make_samples()
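The caching idea sketched in the comments above (store the filtered data under a checksum-derived name and reuse it whenever the checksum matches) could look roughly like the following sketch; the function names, the data_path argument, and the choice of the filter settings as checksum input are assumptions for illustration, not part of this commit.

import hashlib
import os
import xarray as xr

def kzf_cache_file(data_path: str, settings: dict) -> str:
    """Build a cache file name from a checksum over the filter settings."""
    checksum = hashlib.md5(str(sorted(settings.items())).encode()).hexdigest()
    return os.path.join(data_path, "kzf_data", f"{checksum}.nc")

def load_cached_kzf(data_path: str, settings: dict):
    """Return the cached kz-filtered data if a file with a matching checksum exists, else None."""
    cache_file = kzf_cache_file(data_path, settings)
    return xr.open_dataarray(cache_file) if os.path.exists(cache_file) else None

def store_cached_kzf(data: xr.DataArray, data_path: str, settings: dict) -> None:
    """Store kz-filtered data under the checksum-derived name so later runs can skip the filter step."""
    cache_file = kzf_cache_file(data_path, settings)
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    data.to_netcdf(cache_file)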
@@ -11,8 +11,10 @@ import pandas as pd
from typing import Union, Tuple, Dict, List
from matplotlib import pyplot as plt
import itertools
import gc
import warnings
from mlair.helpers import to_list
from mlair.helpers import to_list, TimeTracking, TimeTrackingWrapper
Data = Union[xr.DataArray, pd.DataFrame]
@@ -608,6 +610,48 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass):
else:
return None
    @TimeTrackingWrapper
    def kz_filter_new(self, df, wl, itr):
        """
        Extract the low-frequency component of the time series.

        If the filter method is mean, max, or min, this method calls construct and rechunks before the actual
        calculation to improve performance. If the filter method is median or percentile, this approach is not
        applicable and, depending on the data and window size, the method can become slow.

        Args:
            df: data to filter
            wl (int): window length
            itr (int): number of iterations
        """
        warnings.filterwarnings("ignore")
        df_itr = df.__deepcopy__()
        try:
            kwargs = {"min_periods": int(0.7 * wl),
                      "center": True,
                      self.filter_dim: wl}
            for _ in np.arange(0, itr):
                rolling = df_itr.chunk().rolling(**kwargs)
                if self.method not in ["percentile", "median"]:
                    # expand the window into an extra "construct" dimension and rechunk for a faster reduction
                    rolling = rolling.construct("construct").chunk("auto")
                if self.method == "median":
                    df_mv_avg_tmp = rolling.median()
                elif self.method == "percentile":
                    df_mv_avg_tmp = rolling.quantile(self.percentile)
                elif self.method == "max":
                    df_mv_avg_tmp = rolling.max("construct")
                elif self.method == "min":
                    df_mv_avg_tmp = rolling.min("construct")
                else:
                    df_mv_avg_tmp = rolling.mean("construct")
                df_itr = df_mv_avg_tmp.compute()
                # free the intermediate results before the next iteration
                del df_mv_avg_tmp, rolling
                gc.collect()
            return df_itr
        except ValueError:
            raise ValueError
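The construct/rechunk shortcut described in the docstring of kz_filter_new can be shown in isolation. This is a standalone sketch with made-up dimensions and window settings (and assumes dask is installed), not code from the repository.

import numpy as np
import xarray as xr

# toy data with a time dimension; "datetime" stands in for the handler's filter_dim
da = xr.DataArray(np.random.rand(1000, 3), dims=("datetime", "variables"))
rolling = da.chunk().rolling(datetime=29, min_periods=20, center=True)

# mean/max/min: expand the window into an extra "construct" dimension, rechunk,
# and reduce over that dimension, which dask can parallelize well
fast = rolling.construct("construct").chunk("auto").mean("construct").compute()

# median/percentile: no construct-based shortcut, reduce the rolling object directly
slow = rolling.median().compute()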
    @TimeTrackingWrapper
    def kz_filter(self, df, wl, itr):
        """
        Extract the low-frequency component of the time series.
@@ -639,7 +683,7 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass):
                    else:
                        df_mv_avg_tmp = rolling.mean()
                    df_itr_var = df_mv_avg_tmp.compute()
                df_itr = df_itr.drop_sel(variables=var).combine_first(df_itr_var)
                df_itr.loc[{"variables": [var]}] = df_itr_var
            return df_itr
        except ValueError:
            raise ValueError
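The one-line change in kz_filter above replaces a drop/recombine round trip with a direct slice assignment. In isolation, the two variants behave like this (toy data and variable names, purely illustrative):

import numpy as np
import xarray as xr

da = xr.DataArray(np.zeros((4, 2)), dims=("datetime", "variables"),
                  coords={"variables": ["o3", "temp"]})
filtered = xr.ones_like(da.sel(variables=["o3"]))

# old behaviour: drop the variable and merge the filtered slice back in
combined = da.drop_sel(variables="o3").combine_first(filtered)

# new behaviour: write the filtered slice directly into the existing array
da.loc[{"variables": ["o3"]}] = filtered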