Commit 61486654 authored by leufen1

kzf per variable seems to be faster than over variables, check on HPC

parent de191bed
7 merge requests: !319 add all changes of dev into release v1.4.0 branch, !318 Resolve "release v1.4.0", !283 Merge latest develop into falcos issue, !279 include Develop, !278 Felix issue295 transformation parameters in data handler, !274 Resolve "implement lazy data preprocessing", !259 Draft: Resolve "WRF-Datahandler should inherit from SingleStationDatahandler"
Pipeline #63126 passed
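The timing question in the commit message compares the two implementations touched below: kz_filter loops over the "variables" dimension and filters each variable separately, while the new kz_filter_new applies a single rolling window across all variables at once. A minimal way to compare the two on a given data set could look like the following sketch; the compare_kz_variants helper, the kz filter instance, and the data array are illustrative assumptions, not part of this commit.

import time

def compare_kz_variants(kz, data, wl=3, itr=3):
    """Time the per-variable filter (kz_filter) against the all-variables variant (kz_filter_new)."""
    start = time.perf_counter()
    kz.kz_filter(data, wl, itr)          # loops over the "variables" dimension
    per_variable = time.perf_counter() - start

    start = time.perf_counter()
    kz.kz_filter_new(data, wl, itr)      # one rolling window over all variables at once
    over_variables = time.perf_counter() - start

    print(f"per variable: {per_variable:.2f}s, over variables: {over_variables:.2f}s")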
@@ -114,6 +114,12 @@ class DataHandlerMixedSamplingWithFilterSingleStation(DataHandlerMixedSamplingSi
        self._data = list(map(self.load_and_interpolate, [0, 1]))  # load input (0) and target (1) data
        self.set_inputs_and_targets()
        self.apply_kz_filter()
        # lazy data loading on the first call if possible:
        # * store the kz-filtered data locally in the data path under a separate folder, e.g. kzf_data
        # * create a checksum for the file name and reuse the stored data whenever the checksum matches (this would
        #   replace all previous steps and save a lot of computation time)
        # lazy creation of subsets by reusing as much as possible:
        # * start here when using preprocessed data, then select the new start and end
        if self.do_transformation is True:
            self.call_transform()
        self.make_samples()
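The caching idea sketched in the comments above (store the filtered data under a checksum-derived name and reuse it whenever the checksum matches) could look roughly like the following sketch; the function names, the data_path argument, and the choice of the filter settings as checksum input are assumptions for illustration, not part of this commit.

import hashlib
import os
import xarray as xr

def kzf_cache_file(data_path: str, settings: dict) -> str:
    """Build a cache file name from a checksum over the filter settings."""
    checksum = hashlib.md5(str(sorted(settings.items())).encode()).hexdigest()
    return os.path.join(data_path, "kzf_data", f"{checksum}.nc")

def load_cached_kzf(data_path: str, settings: dict):
    """Return the cached kz-filtered data if a file with a matching checksum exists, else None."""
    cache_file = kzf_cache_file(data_path, settings)
    return xr.open_dataarray(cache_file) if os.path.exists(cache_file) else None

def store_cached_kzf(data: xr.DataArray, data_path: str, settings: dict) -> None:
    """Store kz-filtered data under the checksum-derived name so later runs can skip the filter step."""
    cache_file = kzf_cache_file(data_path, settings)
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    data.to_netcdf(cache_file)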
@@ -11,8 +11,10 @@ import pandas as pd
from typing import Union, Tuple, Dict, List
from matplotlib import pyplot as plt
import itertools
import gc
import warnings
from mlair.helpers import to_list
from mlair.helpers import to_list, TimeTracking, TimeTrackingWrapper
Data = Union[xr.DataArray, pd.DataFrame]
@@ -608,6 +610,48 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass):
else:
return None
    @TimeTrackingWrapper
    def kz_filter_new(self, df, wl, itr):
        """
        Extract the low-frequency component of the time series.

        If the filter method is mean, max, or min, this method calls construct and rechunks before the actual
        calculation to improve performance. If the filter method is median or percentile, this approach is not
        applicable and, depending on the data and window size, the method can become slow.

        Args:
            df: data to filter
            wl (int): window length
            itr (int): number of iterations
        """
        warnings.filterwarnings("ignore")
        df_itr = df.__deepcopy__()
        try:
            kwargs = {"min_periods": int(0.7 * wl),
                      "center": True,
                      self.filter_dim: wl}
            for _ in np.arange(0, itr):
                rolling = df_itr.chunk().rolling(**kwargs)
                if self.method not in ["percentile", "median"]:
                    # expand the window into an extra "construct" dimension and rechunk for a faster reduction
                    rolling = rolling.construct("construct").chunk("auto")
                if self.method == "median":
                    df_mv_avg_tmp = rolling.median()
                elif self.method == "percentile":
                    df_mv_avg_tmp = rolling.quantile(self.percentile)
                elif self.method == "max":
                    df_mv_avg_tmp = rolling.max("construct")
                elif self.method == "min":
                    df_mv_avg_tmp = rolling.min("construct")
                else:
                    df_mv_avg_tmp = rolling.mean("construct")
                df_itr = df_mv_avg_tmp.compute()
                # free the intermediate results before the next iteration
                del df_mv_avg_tmp, rolling
                gc.collect()
            return df_itr
        except ValueError:
            raise ValueError
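The construct/rechunk shortcut described in the docstring of kz_filter_new can be shown in isolation. This is a standalone sketch with made-up dimensions and window settings (and assumes dask is installed), not code from the repository.

import numpy as np
import xarray as xr

# toy data with a time dimension; "datetime" stands in for the handler's filter_dim
da = xr.DataArray(np.random.rand(1000, 3), dims=("datetime", "variables"))
rolling = da.chunk().rolling(datetime=29, min_periods=20, center=True)

# mean/max/min: expand the window into an extra "construct" dimension, rechunk,
# and reduce over that dimension, which dask can parallelize well
fast = rolling.construct("construct").chunk("auto").mean("construct").compute()

# median/percentile: no construct-based shortcut, reduce the rolling object directly
slow = rolling.median().compute()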
    @TimeTrackingWrapper
    def kz_filter(self, df, wl, itr):
        """
        Extract the low-frequency component of the time series.
@@ -639,7 +683,7 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass):
                    else:
                        df_mv_avg_tmp = rolling.mean()
                    df_itr_var = df_mv_avg_tmp.compute()
                df_itr = df_itr.drop_sel(variables=var).combine_first(df_itr_var)
                df_itr.loc[{"variables": [var]}] = df_itr_var
            return df_itr
        except ValueError:
            raise ValueError
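The one-line change in kz_filter above replaces a drop/recombine round trip with a direct slice assignment. In isolation, the two variants behave like this (toy data and variable names, purely illustrative):

import numpy as np
import xarray as xr

da = xr.DataArray(np.zeros((4, 2)), dims=("datetime", "variables"),
                  coords={"variables": ["o3", "temp"]})
filtered = xr.ones_like(da.sel(variables=["o3"]))

# old behaviour: drop the variable and merge the filtered slice back in
combined = da.drop_sel(variables="o3").combine_first(filtered)

# new behaviour: write the filtered slice directly into the existing array
da.loc[{"variables": ["o3"]}] = filtered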