From 614866546c57b9ac85eb9b173066dd703ba711e1 Mon Sep 17 00:00:00 2001 From: leufen1 <l.leufen@fz-juelich.de> Date: Tue, 16 Mar 2021 14:44:24 +0100 Subject: [PATCH] kzf per variable seems to be faster than over variables, check on HPC --- .../data_handler_mixed_sampling.py | 6 +++ mlair/helpers/statistics.py | 48 ++++++++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/mlair/data_handler/data_handler_mixed_sampling.py b/mlair/data_handler/data_handler_mixed_sampling.py index 8159abda..c56499dc 100644 --- a/mlair/data_handler/data_handler_mixed_sampling.py +++ b/mlair/data_handler/data_handler_mixed_sampling.py @@ -114,6 +114,12 @@ class DataHandlerMixedSamplingWithFilterSingleStation(DataHandlerMixedSamplingSi self._data = list(map(self.load_and_interpolate, [0, 1])) # load input (0) and target (1) data self.set_inputs_and_targets() self.apply_kz_filter() + # lazy data loading on first time if possible + # * store the kz data locally in data path under a different folder, e.g. kzf_data + # * create a checksum for the name and reuse this data always if checksum fits (this will replace all previous + # steps and save a lot of computation time)
+ # lazy create of subsets by reusing as much as possible + # * start here when using preprocessed data, select new start and end if self.do_transformation is True: self.call_transform() self.make_samples() diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index 57d7802e..0b73bc27 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -11,8 +11,10 @@ import pandas as pd from typing import Union, Tuple, Dict, List from matplotlib import pyplot as plt import itertools +import gc +import warnings -from mlair.helpers import to_list +from mlair.helpers import to_list, TimeTracking, TimeTrackingWrapper Data = Union[xr.DataArray, pd.DataFrame] @@ -608,6 +610,48 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass): else: return None + @TimeTrackingWrapper + def kz_filter_new(self, df, wl, itr): + """ + It passes the low frequency time series. + + If filter method is from mean, max, min this method will call construct and rechunk before the actual + calculation to improve performance. If filter method is either median or percentile this approach is not + applicable and depending on the data and window size, this method can become slow. 
+ + Args: + wl(int): a window length + itr(int): a number of iteration + """ + warnings.filterwarnings("ignore") + df_itr = df.__deepcopy__() + try: + kwargs = {"min_periods": int(0.7 * wl), + "center": True, + self.filter_dim: wl} + for i in np.arange(0, itr): + print(i) + rolling = df_itr.chunk().rolling(**kwargs) + if self.method not in ["percentile", "median"]: + rolling = rolling.construct("construct").chunk("auto") + if self.method == "median": + df_mv_avg_tmp = rolling.median() + elif self.method == "percentile": + df_mv_avg_tmp = rolling.quantile(self.percentile) + elif self.method == "max": + df_mv_avg_tmp = rolling.max("construct") + elif self.method == "min": + df_mv_avg_tmp = rolling.min("construct") + else: + df_mv_avg_tmp = rolling.mean("construct") + df_itr = df_mv_avg_tmp.compute() + del df_mv_avg_tmp, rolling + gc.collect() + return df_itr + except ValueError: + raise ValueError + + @TimeTrackingWrapper def kz_filter(self, df, wl, itr): """ It passes the low frequency time series. @@ -639,7 +683,7 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass): else: df_mv_avg_tmp = rolling.mean() df_itr_var = df_mv_avg_tmp.compute() - df_itr = df_itr.drop_sel(variables=var).combine_first(df_itr_var) + df_itr.loc[{"variables": [var]}] = df_itr_var return df_itr except ValueError: raise ValueError -- GitLab