diff --git a/HPC_setup/requirements_HDFML_additionals.txt b/HPC_setup/requirements_HDFML_additionals.txt
index 7d6163a6d676cd54588ccd2ab8fe85e1375e31c6..b2a29fbfb353f24d8c99d8429693022ea1fd406f 100644
--- a/HPC_setup/requirements_HDFML_additionals.txt
+++ b/HPC_setup/requirements_HDFML_additionals.txt
@@ -2,6 +2,7 @@ absl-py==0.11.0
 appdirs==1.4.4
 astor==0.8.1
 attrs==20.3.0
+bottleneck==1.3.2
 cached-property==1.5.2
 certifi==2020.12.5
 cftime==1.4.1
diff --git a/HPC_setup/requirements_JUWELS_additionals.txt b/HPC_setup/requirements_JUWELS_additionals.txt
index 7d6163a6d676cd54588ccd2ab8fe85e1375e31c6..b2a29fbfb353f24d8c99d8429693022ea1fd406f 100644
--- a/HPC_setup/requirements_JUWELS_additionals.txt
+++ b/HPC_setup/requirements_JUWELS_additionals.txt
@@ -2,6 +2,7 @@ absl-py==0.11.0
 appdirs==1.4.4
 astor==0.8.1
 attrs==20.3.0
+bottleneck==1.3.2
 cached-property==1.5.2
 certifi==2020.12.5
 cftime==1.4.1
diff --git a/mlair/data_handler/data_handler_mixed_sampling.py b/mlair/data_handler/data_handler_mixed_sampling.py
index b359a26df539e95736650a9d8fc5116ea68db8ee..86e6f856b7bf061287261ae711063d71ed7c8963 100644
--- a/mlair/data_handler/data_handler_mixed_sampling.py
+++ b/mlair/data_handler/data_handler_mixed_sampling.py
@@ -226,7 +226,7 @@ class DataHandlerSeparationOfScalesSingleStation(DataHandlerMixedSamplingWithFil
                 res_filter.append(data_filter.shift({dim: -w * delta}))
             res_filter = xr.concat(res_filter, dim=window_array).chunk()
             res.append(res_filter)
-        res = xr.concat(res, dim="filter")
+        res = xr.concat(res, dim="filter").compute()
         return res
 
     def estimate_filter_width(self):
diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py
index 3631597aedb90b3411163a42490e9c023bad706a..3e99357c36d556f093701325964500bf8d46c698 100644
--- a/mlair/helpers/statistics.py
+++ b/mlair/helpers/statistics.py
@@ -11,8 +11,10 @@ import pandas as pd
 from typing import Union, Tuple, Dict, List
 from matplotlib import pyplot as plt
 import itertools
+import gc
+import warnings
 
-from mlair.helpers import to_list
+from mlair.helpers import to_list, TimeTracking, TimeTrackingWrapper
 
 Data = Union[xr.DataArray, pd.DataFrame]
 
@@ -438,7 +440,7 @@ class SkillScores:
         """Calculate CASE IV."""
         AI, BI, CI, data, suffix = self.skill_score_pre_calculations(internal_data, observation_name, forecast_name)
         monthly_mean_external = self.create_monthly_mean_from_daily_data(external_data, index=data.index)
-        data = xr.concat([data, monthly_mean_external], dim="type")
+        data = xr.concat([data, monthly_mean_external], dim="type").dropna(dim="index")
         mean, sigma = suffix["mean"], suffix["sigma"]
         mean_external = monthly_mean_external.mean()
         sigma_external = np.sqrt(monthly_mean_external.var())
@@ -608,6 +610,48 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass):
         else:
             return None
 
+    @TimeTrackingWrapper
+    def kz_filter_new(self, df, wl, itr):
+        """
+        It passes the low frequency time series.
+
+        If filter method is from mean, max, min this method will call construct and rechunk before the actual
+        calculation to improve performance. If filter method is either median or percentile this approach is not
+        applicable and depending on the data and window size, this method can become slow.
+
+        Args:
+            wl(int): a window length
+            itr(int): a number of iteration
+        """
+        warnings.filterwarnings("ignore")
+        df_itr = df.__deepcopy__()
+        try:
+            kwargs = {"min_periods": int(0.7 * wl),
+                      "center": True,
+                      self.filter_dim: wl}
+            for i in np.arange(0, itr):
+                print(i)
+                rolling = df_itr.chunk().rolling(**kwargs)
+                if self.method not in ["percentile", "median"]:
+                    rolling = rolling.construct("construct").chunk("auto")
+                if self.method == "median":
+                    df_mv_avg_tmp = rolling.median()
+                elif self.method == "percentile":
+                    df_mv_avg_tmp = rolling.quantile(self.percentile)
+                elif self.method == "max":
+                    df_mv_avg_tmp = rolling.max("construct")
+                elif self.method == "min":
+                    df_mv_avg_tmp = rolling.min("construct")
+                else:
+                    df_mv_avg_tmp = rolling.mean("construct")
+                df_itr = df_mv_avg_tmp.compute()
+                del df_mv_avg_tmp, rolling
+                gc.collect()
+            return df_itr
+        except ValueError:
+            raise ValueError
+
     @TimeTrackingWrapper
     def kz_filter(self, df, wl, itr):
         """
         It passes the low frequency time series.
@@ -616,15 +660,18 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass):
             wl(int): a window length
             itr(int): a number of iteration
         """
+        import warnings
+        warnings.filterwarnings("ignore")
         df_itr = df.__deepcopy__()
         try:
-            kwargs = {"min_periods": 1,
+            kwargs = {"min_periods": int(0.7 * wl),
                       "center": True,
                       self.filter_dim: wl}
             iter_vars = df_itr.coords["variables"].values
             for var in iter_vars:
-                df_itr_var = df_itr.sel(variables=[var]).chunk()
+                df_itr_var = df_itr.sel(variables=[var])
                 for _ in np.arange(0, itr):
+                    df_itr_var = df_itr_var.chunk()
                     rolling = df_itr_var.rolling(**kwargs)
                     if self.method == "median":
                         df_mv_avg_tmp = rolling.median()
@@ -637,7 +684,7 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass):
                     else:
                         df_mv_avg_tmp = rolling.mean()
                     df_itr_var = df_mv_avg_tmp.compute()
-                df_itr = df_itr.drop_sel(variables=var).combine_first(df_itr_var)
+                df_itr.loc[{"variables": [var]}] = df_itr_var
             return df_itr
         except ValueError:
             raise ValueError
diff --git a/requirements.txt b/requirements.txt
index af742fdea75902515cfc180cdbe43f80cef25614..85655e237f8e10e98f77c379be6acd0a7bb65d46 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ absl-py==0.11.0
 appdirs==1.4.4
 astor==0.8.1
 attrs==20.3.0
+bottleneck==1.3.2
 cached-property==1.5.2
 certifi==2020.12.5
 cftime==1.4.1
diff --git a/requirements_gpu.txt b/requirements_gpu.txt
index 7dd443a45df25a9e990888ab2ff061388ce36436..cc189496bdf4e1e1ee86902a1953c2058d58c8e4 100644
--- a/requirements_gpu.txt
+++ b/requirements_gpu.txt
@@ -2,6 +2,7 @@ absl-py==0.11.0
 appdirs==1.4.4
 astor==0.8.1
 attrs==20.3.0
+bottleneck==1.3.2
 cached-property==1.5.2
 certifi==2020.12.5
 cftime==1.4.1