diff --git a/mlair/data_handler/data_handler_mixed_sampling.py b/mlair/data_handler/data_handler_mixed_sampling.py index 8159abda17207d78113a66e0a9f4da5a38e123a8..c56499dc543fb3805b02a7353aa32598703d9ec3 100644 --- a/mlair/data_handler/data_handler_mixed_sampling.py +++ b/mlair/data_handler/data_handler_mixed_sampling.py @@ -114,6 +114,12 @@ class DataHandlerMixedSamplingWithFilterSingleStation(DataHandlerMixedSamplingSi self._data = list(map(self.load_and_interpolate, [0, 1])) # load input (0) and target (1) data self.set_inputs_and_targets() self.apply_kz_filter() + # lazy data loading on first time if possible + # * store the kz data locally in the data path under a different folder, e.g. kzf_data + # * create a checksum for the name and always reuse this data if the checksum fits (this will replace all previous + # steps and save a lot of computation time). + # lazy creation of subsets by reusing as much as possible + # * start here when using preprocessed data, select new start and end if self.do_transformation is True: self.call_transform() self.make_samples() diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index 57d7802e58c69f39e17aaa244871b68dfaba00b7..0b73bc27bf5621b4e6fb88bbd449d4002cd7f9ba 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -11,8 +11,10 @@ import pandas as pd from typing import Union, Tuple, Dict, List from matplotlib import pyplot as plt import itertools +import gc +import warnings -from mlair.helpers import to_list +from mlair.helpers import to_list, TimeTracking, TimeTrackingWrapper Data = Union[xr.DataArray, pd.DataFrame] @@ -608,6 +610,48 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass): else: return None + @TimeTrackingWrapper + def kz_filter_new(self, df, wl, itr): + """ + Pass the low-frequency part of the time series. + + If the filter method is one of mean, max, or min, this method will call construct and rechunk before the actual + calculation to improve performance.
If the filter method is either median or percentile, this approach is not + applicable and, depending on the data and window size, this method can become slow. + + Args: + wl(int): a window length + itr(int): a number of iterations + """ + warnings.filterwarnings("ignore") + df_itr = df.__deepcopy__() + try: + kwargs = {"min_periods": int(0.7 * wl), + "center": True, + self.filter_dim: wl} + for i in np.arange(0, itr): + print(i) + rolling = df_itr.chunk().rolling(**kwargs) + if self.method not in ["percentile", "median"]: + rolling = rolling.construct("construct").chunk("auto") + if self.method == "median": + df_mv_avg_tmp = rolling.median() + elif self.method == "percentile": + df_mv_avg_tmp = rolling.quantile(self.percentile) + elif self.method == "max": + df_mv_avg_tmp = rolling.max("construct") + elif self.method == "min": + df_mv_avg_tmp = rolling.min("construct") + else: + df_mv_avg_tmp = rolling.mean("construct") + df_itr = df_mv_avg_tmp.compute() + del df_mv_avg_tmp, rolling + gc.collect() + return df_itr + except ValueError: + raise ValueError + + @TimeTrackingWrapper def kz_filter(self, df, wl, itr): """ It passes the low frequency time series. @@ -639,7 +683,7 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass): else: df_mv_avg_tmp = rolling.mean() df_itr_var = df_mv_avg_tmp.compute() - df_itr = df_itr.drop_sel(variables=var).combine_first(df_itr_var) + df_itr.loc[{"variables": [var]}] = df_itr_var return df_itr except ValueError: raise ValueError