diff --git a/.coveragerc b/.coveragerc
index 69c1dcd3f1ca5068733a54fdb231bab80170169d..bc1fedc1454539088f93a014376f198ada7985e5 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,6 +1,9 @@
 # .coveragerc to control coverage.py
 [run]
 branch = True
+omit =
+    # do not test keras legacy
+    mlair/keras_legacy

 [report]
diff --git a/mlair/data_handler/data_handler_mixed_sampling.py b/mlair/data_handler/data_handler_mixed_sampling.py
index 3de749d02375243269f9eb51c08400840fd0656a..6c256f86c6289578030bd0f116315fd724eb9b7a 100644
--- a/mlair/data_handler/data_handler_mixed_sampling.py
+++ b/mlair/data_handler/data_handler_mixed_sampling.py
@@ -7,7 +7,7 @@ from mlair.data_handler.data_handler_with_filter import DataHandlerFirFilterSing
 from mlair.data_handler.data_handler_with_filter import DataHandlerClimateFirFilter, DataHandlerFirFilter
 from mlair.data_handler import DefaultDataHandler
 from mlair import helpers
-from mlair.helpers import to_list
+from mlair.helpers import to_list, sort_like
 from mlair.configuration.defaults import DEFAULT_SAMPLING, DEFAULT_INTERPOLATION_LIMIT, DEFAULT_INTERPOLATION_METHOD
 from mlair.helpers.filter import filter_width_kzf

@@ -244,11 +244,11 @@ class DataHandlerMixedSamplingWithClimateFirFilter(DataHandlerClimateFirFilter):
     def build(cls, station: str, **kwargs):
         sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler.requirements() if k in kwargs}
         filter_add_unfiltered = kwargs.get("filter_add_unfiltered", False)
-        sp_keys = cls.build_update_kwargs(sp_keys, dh_type="filtered")
+        sp_keys = cls.build_update_transformation(sp_keys, dh_type="filtered")
         sp = cls.data_handler(station, **sp_keys)
         if filter_add_unfiltered is True:
             sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler_unfiltered.requirements() if k in kwargs}
-            sp_keys = cls.build_update_kwargs(sp_keys, dh_type="unfiltered")
+            sp_keys = cls.build_update_transformation(sp_keys, dh_type="unfiltered")
             sp_unfiltered = cls.data_handler_unfiltered(station, **sp_keys)
         else:
             sp_unfiltered = None
@@ -256,7 +256,7 @@ class DataHandlerMixedSamplingWithClimateFirFilter(DataHandlerClimateFirFilter):
         return cls(sp, data_handler_class_unfiltered=sp_unfiltered, **dp_args)

     @classmethod
-    def build_update_kwargs(cls, kwargs_dict, dh_type="filtered"):
+    def build_update_transformation(cls, kwargs_dict, dh_type="filtered"):
         if "transformation" in kwargs_dict:
             trafo_opts = kwargs_dict.get("transformation")
             if isinstance(trafo_opts, dict):
@@ -306,11 +306,15 @@ class DataHandlerMixedSamplingWithClimateAndFirFilter(DataHandlerMixedSamplingWi
     # _requirements = list(set(data_handler.requirements() + data_handler_unfiltered.requirements()))
     # DEFAULT_FILTER_ADD_UNFILTERED = False
     data_handler_climate_fir = DataHandlerMixedSamplingWithClimateFirFilterSingleStation
-    data_handler_fir = DataHandlerMixedSamplingWithFirFilterSingleStation
+    data_handler_fir = (DataHandlerMixedSamplingWithFirFilterSingleStation,
+                        DataHandlerMixedSamplingWithClimateFirFilterSingleStation)
+    data_handler_fir_pos = None
     data_handler = None
     data_handler_unfiltered = DataHandlerMixedSamplingSingleStation
-    _requirements = list(set(data_handler_climate_fir.requirements() + data_handler_fir.requirements() +
-                             data_handler_unfiltered.requirements()))
+    _requirements = list(set(data_handler_climate_fir.requirements() + data_handler_fir[0].requirements() +
+                             data_handler_fir[1].requirements() + data_handler_unfiltered.requirements()))
+    chem_indicator = "chem"
+    meteo_indicator = "meteo"

     def __init__(self, data_handler_class_chem, data_handler_class_meteo, data_handler_class_chem_unfiltered,
                  data_handler_class_meteo_unfiltered, chem_vars, meteo_vars, *args, **kwargs):
@@ -338,7 +342,7 @@ class DataHandlerMixedSamplingWithClimateAndFirFilter(DataHandlerMixedSamplingWi
         chem_vars = cls.data_handler_climate_fir.chem_vars
         chem = set(variables).intersection(chem_vars)
         meteo = set(variables).difference(chem_vars)
-        return to_list(chem), to_list(meteo)
+        return sort_like(to_list(chem), variables), sort_like(to_list(meteo), variables)

     @classmethod
     def build(cls, station: str, **kwargs):
@@ -349,43 +353,60 @@ class DataHandlerMixedSamplingWithClimateAndFirFilter(DataHandlerMixedSamplingWi

         if len(chem_vars) > 0:
             sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler_climate_fir.requirements() if k in kwargs}
-            sp_keys = cls.build_update_kwargs(sp_keys, dh_type="filtered_chem")
-            sp_keys.update({"variables": chem_vars})
-            cls.adjust_window_opts("chem", "window_history_size", sp_keys)
-            cls.adjust_window_opts("chem", "window_history_offset", sp_keys)
+            sp_keys = cls.build_update_transformation(sp_keys, dh_type="filtered_chem")
+
+            cls.prepare_build(sp_keys, chem_vars, cls.chem_indicator)
             sp_chem = cls.data_handler_climate_fir(station, **sp_keys)
             if filter_add_unfiltered is True:
                 sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler_unfiltered.requirements() if k in kwargs}
-                sp_keys = cls.build_update_kwargs(sp_keys, dh_type="unfiltered_chem")
-                sp_keys.update({"variables": chem_vars})
-                cls.adjust_window_opts("chem", "window_history_size", sp_keys)
-                cls.adjust_window_opts("chem", "window_history_offset", sp_keys)
+                sp_keys = cls.build_update_transformation(sp_keys, dh_type="unfiltered_chem")
+                cls.prepare_build(sp_keys, chem_vars, cls.chem_indicator)
                 sp_chem_unfiltered = cls.data_handler_unfiltered(station, **sp_keys)
         if len(meteo_vars) > 0:
-            sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler_fir.requirements() if k in kwargs}
-            sp_keys = cls.build_update_kwargs(sp_keys, dh_type="filtered_meteo")
-            sp_keys.update({"variables": meteo_vars})
-            cls.adjust_window_opts("meteo", "window_history_size", sp_keys)
-            cls.adjust_window_opts("meteo", "window_history_offset", sp_keys)
-            sp_meteo = cls.data_handler_fir(station, **sp_keys)
+            if cls.data_handler_fir_pos is None:
+                if "extend_length_opts" in kwargs:
+                    if isinstance(kwargs["extend_length_opts"], dict) and cls.meteo_indicator not in kwargs["extend_length_opts"].keys():
+                        cls.data_handler_fir_pos = 0  # use faster fir version without climate estimate
+                    else:
+                        cls.data_handler_fir_pos = 1  # use slower fir version with climate estimate
+                else:
+                    cls.data_handler_fir_pos = 0  # use faster fir version without climate estimate
+            sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler_fir[cls.data_handler_fir_pos].requirements() if k in kwargs}
+            sp_keys = cls.build_update_transformation(sp_keys, dh_type="filtered_meteo")
+            cls.prepare_build(sp_keys, meteo_vars, cls.meteo_indicator)
+            sp_meteo = cls.data_handler_fir[cls.data_handler_fir_pos](station, **sp_keys)
             if filter_add_unfiltered is True:
                 sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler_unfiltered.requirements() if k in kwargs}
-                sp_keys = cls.build_update_kwargs(sp_keys, dh_type="unfiltered_meteo")
-                sp_keys.update({"variables": meteo_vars})
-                cls.adjust_window_opts("meteo", "window_history_size", sp_keys)
-                cls.adjust_window_opts("meteo", "window_history_offset", sp_keys)
+                sp_keys = cls.build_update_transformation(sp_keys, dh_type="unfiltered_meteo")
+                cls.prepare_build(sp_keys, meteo_vars, cls.meteo_indicator)
                 sp_meteo_unfiltered = cls.data_handler_unfiltered(station, **sp_keys)

         dp_args = {k: copy.deepcopy(kwargs[k]) for k in cls.own_args("id_class") if k in kwargs}
         return cls(sp_chem, sp_meteo, sp_chem_unfiltered, sp_meteo_unfiltered, chem_vars, meteo_vars, **dp_args)

+    @classmethod
+    def prepare_build(cls, kwargs, var_list, var_type):
+        kwargs.update({"variables": var_list})
+        for k in list(kwargs.keys()):
+            v = kwargs[k]
+            if isinstance(v, dict):
+                if len(set(v.keys()).intersection({cls.chem_indicator, cls.meteo_indicator})) > 0:
+                    try:
+                        new_v = kwargs.pop(k)
+                        kwargs[k] = new_v[var_type]
+                    except KeyError:
+                        pass
+
     @staticmethod
     def adjust_window_opts(key: str, parameter_name: str, kwargs: dict):
-        if parameter_name in kwargs:
-            window_opt = kwargs.pop(parameter_name)
-            if isinstance(window_opt, dict):
-                window_opt = window_opt[key]
-            kwargs[parameter_name] = window_opt
+        try:
+            if parameter_name in kwargs:
+                window_opt = kwargs.pop(parameter_name)
+                if isinstance(window_opt, dict):
+                    window_opt = window_opt[key]
+                kwargs[parameter_name] = window_opt
+        except KeyError:
+            pass

     def _create_collection(self):
         collection = super()._create_collection()
@@ -402,39 +423,39 @@ class DataHandlerMixedSamplingWithClimateAndFirFilter(DataHandlerMixedSamplingWi
             return

         chem_vars, meteo_vars = cls._split_chem_and_meteo_variables(**kwargs)
-
+        transformation_chem, transformation_meteo = None, None
         # chem transformation
-        kwargs_chem = copy.deepcopy(kwargs)
-        kwargs_chem["variables"] = chem_vars
-        cls.adjust_window_opts("chem", "window_history_size", kwargs_chem)
-        cls.adjust_window_opts("chem", "window_history_offset", kwargs_chem)
-        dh_transformation = (cls.data_handler_climate_fir, cls.data_handler_unfiltered)
-        transformation_chem = super().transformation(set_stations, tmp_path=tmp_path,
-                                                     dh_transformation=dh_transformation, **kwargs_chem)
+        if len(chem_vars) > 0:
+            kwargs_chem = copy.deepcopy(kwargs)
+            cls.prepare_build(kwargs_chem, chem_vars, cls.chem_indicator)
+            dh_transformation = (cls.data_handler_climate_fir, cls.data_handler_unfiltered)
+            transformation_chem = super().transformation(set_stations, tmp_path=tmp_path,
+                                                         dh_transformation=dh_transformation, **kwargs_chem)

         # meteo transformation
-        kwargs_meteo = copy.deepcopy(kwargs)
-        kwargs_meteo["variables"] = meteo_vars
-        cls.adjust_window_opts("meteo", "window_history_size", kwargs_meteo)
-        cls.adjust_window_opts("meteo", "window_history_offset", kwargs_meteo)
-        dh_transformation = (cls.data_handler_fir, cls.data_handler_unfiltered)
-        transformation_meteo = super().transformation(set_stations, tmp_path=tmp_path,
-                                                      dh_transformation=dh_transformation, **kwargs_meteo)
+        if len(meteo_vars) > 0:
+            kwargs_meteo = copy.deepcopy(kwargs)
+            cls.prepare_build(kwargs_meteo, meteo_vars, cls.meteo_indicator)
+            dh_transformation = (cls.data_handler_fir[cls.data_handler_fir_pos or 0], cls.data_handler_unfiltered)
+            transformation_meteo = super().transformation(set_stations, tmp_path=tmp_path,
+                                                          dh_transformation=dh_transformation, **kwargs_meteo)

         # combine all transformations
         transformation_res = {}
-        if isinstance(transformation_chem, dict):
-            if len(transformation_chem) > 0:
-                transformation_res["filtered_chem"] = transformation_chem.pop("filtered")
-                transformation_res["unfiltered_chem"] = transformation_chem.pop("unfiltered")
-        else:  # if no unfiltered chem branch
-            transformation_res["filtered_chem"] = transformation_chem
-        if isinstance(transformation_meteo, dict):
-            if len(transformation_meteo) > 0:
-                transformation_res["filtered_meteo"] = transformation_meteo.pop("filtered")
-                transformation_res["unfiltered_meteo"] = transformation_meteo.pop("unfiltered")
-        else:  # if no unfiltered meteo branch
-            transformation_res["filtered_meteo"] = transformation_meteo
+        if transformation_chem is not None:
+            if isinstance(transformation_chem, dict):
+                if len(transformation_chem) > 0:
+                    transformation_res["filtered_chem"] = transformation_chem.pop("filtered")
+                    transformation_res["unfiltered_chem"] = transformation_chem.pop("unfiltered")
+            else:  # if no unfiltered chem branch
+                transformation_res["filtered_chem"] = transformation_chem
+        if transformation_meteo is not None:
+            if isinstance(transformation_meteo, dict):
+                if len(transformation_meteo) > 0:
+                    transformation_res["filtered_meteo"] = transformation_meteo.pop("filtered")
+                    transformation_res["unfiltered_meteo"] = transformation_meteo.pop("unfiltered")
+            else:  # if no unfiltered meteo branch
+                transformation_res["filtered_meteo"] = transformation_meteo
         return transformation_res if len(transformation_res) > 0 else None

     def get_X_original(self):
@@ -451,76 +472,3 @@ class DataHandlerMixedSamplingWithClimateAndFirFilter(DataHandlerMixedSamplingWi
             return X
         else:
             return super().get_X_original()
-
-
-#
-# class DataHandlerSeparationOfScalesSingleStation(DataHandlerMixedSamplingWithKzFilterSingleStation):
-#     """
-#     Data handler using mixed sampling for input and target. Inputs are temporal filtered and depending on the
-#     separation frequency of a filtered time series the time step delta for input data is adjusted (see image below).
-#
-#     .. image:: ../../../../../_source/_plots/separation_of_scales.png
-#         :width: 400
-#
-#     """
-#
-#     _requirements = DataHandlerMixedSamplingWithKzFilterSingleStation.requirements()
-#     _hash = DataHandlerMixedSamplingWithKzFilterSingleStation._hash + ["time_delta"]
-#
-#     def __init__(self, *args, time_delta=np.sqrt, **kwargs):
-#         assert isinstance(time_delta, Callable)
-#         self.time_delta = time_delta
-#         super().__init__(*args, **kwargs)
-#
-#     def make_history_window(self, dim_name_of_inputs: str, window: int, dim_name_of_shift: str) -> None:
-#         """
-#         Create a xr.DataArray containing history data.
-#
-#         Shift the data window+1 times and return a xarray which has a new dimension 'window' containing the shifted
-#         data. This is used to represent history in the data. Results are stored in history attribute.
-#
-#         :param dim_name_of_inputs: Name of dimension which contains the input variables
-#         :param window: number of time steps to look back in history
-#             Note: window will be treated as negative value. This should be in agreement with looking back on
-#             a time line. Nonetheless positive values are allowed but they are converted to its negative
-#             expression
-#         :param dim_name_of_shift: Dimension along shift will be applied
-#         """
-#         window = -abs(window)
-#         data = self.input_data
-#         self.history = self.stride(data, dim_name_of_shift, window, offset=self.window_history_offset)
-#
-#     def stride(self, data: xr.DataArray, dim: str, window: int, offset: int = 0) -> xr.DataArray:
-#         time_deltas = np.round(self.time_delta(self.cutoff_period)).astype(int)
-#         start, end = window, 1
-#         res = []
-#         _range = list(map(lambda x: x + offset, range(start, end)))
-#         window_array = self.create_index_array(self.window_dim, _range, squeeze_dim=self.target_dim)
-#         for delta, filter_name in zip(np.append(time_deltas, 1), data.coords["filter"]):
-#             res_filter = []
-#             data_filter = data.sel({"filter": filter_name})
-#             for w in _range:
-#                 res_filter.append(data_filter.shift({dim: -(w - offset) * delta - offset}))
-#             res_filter = xr.concat(res_filter, dim=window_array).chunk()
-#             res.append(res_filter)
-#         res = xr.concat(res, dim="filter").compute()
-#         return res
-#
-#     def estimate_filter_width(self):
-#         """
-#         Attention: this method returns the maximum value of
-#         * either estimated filter width f = 0.5 / (len * sqrt(itr)) -> T = 1 / f or
-#         * time delta method applied on the estimated filter width mupliplied by window_history_size
-#         to provide a sufficiently wide filter width.
-#         """
-#         est = self.kz_filter_length[0] * np.sqrt(self.kz_filter_iter[0]) * 2
-#         return int(max([self.time_delta(est) * self.window_history_size, est]))
-#
-#
-# class DataHandlerSeparationOfScales(DefaultDataHandler):
-#     """Data handler using mixed sampling for input and target. Inputs are temporal filtered and different time step
-#     sizes are applied in relation to frequencies."""
-#
-#     data_handler = DataHandlerSeparationOfScalesSingleStation
-#     data_handler_transformation = DataHandlerSeparationOfScalesSingleStation
-#     _requirements = data_handler.requirements()
diff --git a/mlair/data_handler/data_handler_single_station.py b/mlair/data_handler/data_handler_single_station.py
index 3e962fa8c4fa96637dea86db98e29a180deeac5b..a202e49411b01513a5fa8303e2134645370487f3 100644
--- a/mlair/data_handler/data_handler_single_station.py
+++ b/mlair/data_handler/data_handler_single_station.py
@@ -32,6 +32,12 @@ data_or_none = Union[xr.DataArray, None]


 class DataHandlerSingleStation(AbstractDataHandler):
+    """
+    :param window_history_offset: used to shift t0 according to the specified value.
+    :param window_history_end: used to set the last time step that is used to create a sample. A negative value
+        indicates that not all values up to t0 are used, a positive value indicates usage of values at t>t0. Default
+        is 0.
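+
+    Example (illustrative): window_history_end=-2 moves the most recent time step of each input sample from t0 to
+    t0-2, while window_history_size still controls the length of the history window.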
+ """ DEFAULT_STATION_TYPE = "background" DEFAULT_NETWORK = "AIRBASE" DEFAULT_VAR_ALL_DICT = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', @@ -40,6 +46,7 @@ class DataHandlerSingleStation(AbstractDataHandler): DEFAULT_WINDOW_LEAD_TIME = 3 DEFAULT_WINDOW_HISTORY_SIZE = 13 DEFAULT_WINDOW_HISTORY_OFFSET = 0 + DEFAULT_WINDOW_HISTORY_END = 0 DEFAULT_TIME_DIM = "datetime" DEFAULT_TARGET_VAR = "o3" DEFAULT_TARGET_DIM = "variables" @@ -53,14 +60,14 @@ class DataHandlerSingleStation(AbstractDataHandler): _hash = ["station", "statistics_per_var", "data_origin", "station_type", "network", "sampling", "target_dim", "target_var", "time_dim", "iter_dim", "window_dim", "window_history_size", "window_history_offset", - "window_lead_time", "interpolation_limit", "interpolation_method", "variables"] + "window_lead_time", "interpolation_limit", "interpolation_method", "variables", "window_history_end"] def __init__(self, station, data_path, statistics_per_var=None, station_type=DEFAULT_STATION_TYPE, network=DEFAULT_NETWORK, sampling: Union[str, Tuple[str]] = DEFAULT_SAMPLING, target_dim=DEFAULT_TARGET_DIM, target_var=DEFAULT_TARGET_VAR, time_dim=DEFAULT_TIME_DIM, iter_dim=DEFAULT_ITER_DIM, window_dim=DEFAULT_WINDOW_DIM, window_history_size=DEFAULT_WINDOW_HISTORY_SIZE, window_history_offset=DEFAULT_WINDOW_HISTORY_OFFSET, - window_lead_time=DEFAULT_WINDOW_LEAD_TIME, + window_history_end=DEFAULT_WINDOW_HISTORY_END, window_lead_time=DEFAULT_WINDOW_LEAD_TIME, interpolation_limit: Union[int, Tuple[int]] = DEFAULT_INTERPOLATION_LIMIT, interpolation_method: Union[str, Tuple[str]] = DEFAULT_INTERPOLATION_METHOD, overwrite_local_data: bool = False, transformation=None, store_data_locally: bool = True, @@ -92,6 +99,7 @@ class DataHandlerSingleStation(AbstractDataHandler): self.window_dim = window_dim self.window_history_size = window_history_size self.window_history_offset = window_history_offset + self.window_history_end = window_history_end self.window_lead_time = window_lead_time self.interpolation_limit = interpolation_limit @@ -107,7 +115,7 @@ class DataHandlerSingleStation(AbstractDataHandler): # internal self._data: xr.DataArray = None # loaded raw data self.meta = None - self.variables = list(statistics_per_var.keys()) if variables is None else variables + self.variables = sorted(list(statistics_per_var.keys())) if variables is None else variables self.history = None self.label = None self.observation = None @@ -157,7 +165,7 @@ class DataHandlerSingleStation(AbstractDataHandler): return self.label.squeeze([self.iter_dim, self.target_dim]).transpose(self.time_dim, self.window_dim).copy() def get_X(self, **kwargs): - return self.get_transposed_history() + return self.get_transposed_history().sel({self.target_dim: self.variables}) def get_Y(self, **kwargs): return self.get_transposed_label() @@ -309,7 +317,7 @@ class DataHandlerSingleStation(AbstractDataHandler): self.target_data = targets def make_samples(self): - self.make_history_window(self.target_dim, self.window_history_size, self.time_dim) + self.make_history_window(self.target_dim, self.window_history_size, self.time_dim) #todo stopped here self.make_labels(self.target_dim, self.target_var, self.time_dim, self.window_lead_time) self.make_observation(self.target_dim, self.target_var, self.time_dim) self.remove_nan(self.time_dim) @@ -570,7 +578,8 @@ class DataHandlerSingleStation(AbstractDataHandler): if data is None: data = self.input_data window = -abs(window) - history = self.shift(data, dim_name_of_shift, window, 
offset=self.window_history_offset) + offset = self.window_history_offset + self.window_history_end + history = self.shift(data, dim_name_of_shift, window, offset=offset) return history def make_labels(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str, diff --git a/mlair/data_handler/data_handler_with_filter.py b/mlair/data_handler/data_handler_with_filter.py index 07fdc41fc4dae49bd44a071dd2228c4bff860b04..997ecbf51740cce159d2339b589728b5e708de53 100644 --- a/mlair/data_handler/data_handler_with_filter.py +++ b/mlair/data_handler/data_handler_with_filter.py @@ -124,7 +124,8 @@ class DataHandlerFirFilterSingleStation(DataHandlerFilterSingleStation): DEFAULT_WINDOW_TYPE = ("kaiser", 5) - def __init__(self, *args, filter_cutoff_period, filter_order, filter_window_type=DEFAULT_WINDOW_TYPE, plot_path=None, **kwargs): + def __init__(self, *args, filter_cutoff_period, filter_order, filter_window_type=DEFAULT_WINDOW_TYPE, + plot_path=None, filter_plot_dates=None, **kwargs): # self.original_data = None # ToDo: implement here something to store unfiltered data self.fs = self._get_fs(**kwargs) if filter_window_type == "kzf": @@ -136,6 +137,7 @@ class DataHandlerFirFilterSingleStation(DataHandlerFilterSingleStation): self.filter_window_type = filter_window_type self.unfiltered_name = "unfiltered" self.plot_path = plot_path # use this path to create insight plots + self.plot_dates = filter_plot_dates super().__init__(*args, **kwargs) @staticmethod @@ -190,11 +192,13 @@ class DataHandlerFirFilterSingleStation(DataHandlerFilterSingleStation): def apply_filter(self): """Apply FIR filter only on inputs.""" fir = FIRFilter(self.input_data.astype("float32"), self.fs, self.filter_order, self.filter_cutoff_freq, - self.filter_window_type, self.target_dim, self.time_dim, station_name=self.station[0], - minimum_length=self.window_history_size, offset=self.window_history_offset, plot_path=self.plot_path) + self.filter_window_type, self.target_dim, self.time_dim, display_name=self.station[0], + minimum_length=self.window_history_size, offset=self.window_history_offset, + plot_path=self.plot_path, plot_dates=self.plot_dates) self.fir_coeff = fir.filter_coefficients filter_data = fir.filtered_data - self.input_data = xr.concat(filter_data, pd.Index(self.create_filter_index(), name=self.filter_dim)) + input_data = xr.concat(filter_data, pd.Index(self.create_filter_index(), name=self.filter_dim)) + self.input_data = input_data.sel({self.target_dim: self.variables}) # this is just a code snippet to check the results of the kz filter # import matplotlib # matplotlib.use("TkAgg") @@ -326,21 +330,30 @@ class DataHandlerClimateFirFilterSingleStation(DataHandlerFirFilterSingleStation apriori_type is None or `zeros`, and a climatology of the residuum is used for `residuum_stats`. :param apriori_diurnal: use diurnal anomalies of each hour as addition to the apriori information type chosen by parameter apriori_type. This is only applicable for hourly resolution data. + :param apriori_sel_opts: specify some parameters to select a subset of data before calculating the apriori + information. Use this parameter for example, if apriori shall only calculated on a shorter time period than + available in given data. + :param extend_length_opts: use this parameter to use future data in the filter calculation. This parameter does not + affect the size of the history samples as this is handled by the window_history_size parameter. 
Example: set + extend_length_opts=7*24 to use the observation of the next 7 days to calculate the filtered components. Which + data are finally used for the input samples is not affected by these 7 days. In case the range of history sample + exceeds the horizon of extend_length_opts, the history sample will also include data from climatological + estimates. """ + DEFAULT_EXTEND_LENGTH_OPTS = 0 _hash = DataHandlerFirFilterSingleStation._hash + ["apriori_type", "apriori_sel_opts", "apriori_diurnal", "extend_length_opts"] _store_attributes = DataHandlerFirFilterSingleStation.store_attributes() + ["apriori"] def __init__(self, *args, apriori=None, apriori_type=None, apriori_diurnal=False, apriori_sel_opts=None, - name_affix=None, extend_length_opts=None, **kwargs): + extend_length_opts=DEFAULT_EXTEND_LENGTH_OPTS, **kwargs): self.apriori_type = apriori_type self.climate_filter_coeff = None # coefficents of the used FIR filter self.apriori = apriori # exogenous apriori information or None to calculate from data (endogenous) self.apriori_diurnal = apriori_diurnal self.all_apriori = None # collection of all apriori information self.apriori_sel_opts = apriori_sel_opts # ensure to separate exogenous and endogenous information - self.plot_name_affix = name_affix - self.extend_length_opts = extend_length_opts if extend_length_opts is not None else {} + self.extend_length_opts = extend_length_opts super().__init__(*args, **kwargs) @TimeTrackingWrapper @@ -355,7 +368,8 @@ class DataHandlerClimateFirFilterSingleStation(DataHandlerFirFilterSingleStation apriori_diurnal=self.apriori_diurnal, sel_opts=self.apriori_sel_opts, plot_path=self.plot_path, minimum_length=self.window_history_size, new_dim=self.window_dim, - station_name=self.station[0], extend_length_opts=self.extend_length_opts) + display_name=self.station[0], extend_length_opts=self.extend_length_opts, + offset=self.window_history_end, plot_dates=self.plot_dates) self.climate_filter_coeff = climate_filter.filter_coefficients # store apriori information: store all if residuum_stat method was used, otherwise just store initial apriori @@ -365,24 +379,15 @@ class DataHandlerClimateFirFilterSingleStation(DataHandlerFirFilterSingleStation self.apriori = climate_filter.initial_apriori_data self.all_apriori = climate_filter.apriori_data - if isinstance(self.extend_length_opts, int): - climate_filter_data = [c.sel({self.window_dim: slice(-self.window_history_size, self.extend_length_opts)}) - for c in climate_filter.filtered_data] - else: - climate_filter_data = [] - for c in climate_filter.filtered_data: - coll_tmp = [] - for v in c.coords[self.target_dim].values: - upper_lim = self.extend_length_opts.get(v, 0) - coll_tmp.append(c.sel({self.target_dim: v, - self.window_dim: slice(-self.window_history_size, upper_lim)})) - climate_filter_data.append(xr.concat(coll_tmp, self.target_dim)) + climate_filter_data = [c.sel({self.window_dim: slice(self.window_history_end-self.window_history_size, + self.window_history_end)}) + for c in climate_filter.filtered_data] # create input data with filter index input_data = xr.concat(climate_filter_data, pd.Index(self.create_filter_index(add_unfiltered_index=False), name=self.filter_dim)) - self.input_data = input_data + self.input_data = input_data.sel({self.target_dim: self.variables}) # this is just a code snippet to check the results of the filter # import matplotlib @@ -432,16 +437,12 @@ class DataHandlerClimateFirFilterSingleStation(DataHandlerFirFilterSingleStation def make_history_window(self, 
dim_name_of_inputs: str, window: int, dim_name_of_shift: str) -> None: """ - Create a xr.DataArray containing history data. - - Shift the data window+1 times and return a xarray which has a new dimension 'window' containing the shifted - data. This is used to represent history in the data. Results are stored in history attribute. + Create a xr.DataArray containing history data. As 'input_data' already consists of a dimension 'window', this + method only shifts the data along 'window' dimension x times where x is given by 'window_history_offset'. + Results are stored in history attribute. :param dim_name_of_inputs: Name of dimension which contains the input variables - :param window: number of time steps to look back in history - Note: window will be treated as negative value. This should be in agreement with looking back on - a time line. Nonetheless positive values are allowed but they are converted to its negative - expression + :param window: this parameter is not used in the inherited method :param dim_name_of_shift: Dimension along shift will be applied """ data = self.input_data @@ -450,6 +451,11 @@ class DataHandlerClimateFirFilterSingleStation(DataHandlerFirFilterSingleStation sampling) data.coords[self.window_dim] = data.coords[self.window_dim] + self.window_history_offset self.history = data + # from matplotlib import pyplot as plt + # d = self.load_and_interpolate(0) + # data.sel(datetime="2007-07-07 00:00").sum("filter").plot() + # plt.plot(data.sel(datetime="2007-07-07 00:00").sum("filter").window.values, d.sel(datetime=slice("2007-07-05 00:00", "2007-07-07 16:00")).values.flatten()) + # plt.plot(data.sel(datetime="2007-07-07 00:00").sum("filter").window.values, d.sel(datetime=slice("2007-07-05 00:00", "2007-07-11 16:00")).values.flatten()) def call_transform(self, inverse=False): opts_input = self._transformation[0] diff --git a/mlair/helpers/__init__.py b/mlair/helpers/__init__.py index bb30a594fca5b5b161571d2b3485b48467018900..3a5b8699a11ae39c0d3510a534db1dd144419d09 100644 --- a/mlair/helpers/__init__.py +++ b/mlair/helpers/__init__.py @@ -3,4 +3,4 @@ from .testing import PyTestRegex, PyTestAllEqual from .time_tracking import TimeTracking, TimeTrackingWrapper from .logger import Logger -from .helpers import remove_items, float_round, dict_to_xarray, to_list, extract_value, select_from_dict, make_keras_pickable +from .helpers import remove_items, float_round, dict_to_xarray, to_list, extract_value, select_from_dict, make_keras_pickable, sort_like diff --git a/mlair/helpers/filter.py b/mlair/helpers/filter.py index b25d6ee10f89bfa49c2147d1758a2d24b8e7687e..247c4fc9c7c6d57d721c1d0895cc8c719b1bd4a5 100644 --- a/mlair/helpers/filter.py +++ b/mlair/helpers/filter.py @@ -2,8 +2,6 @@ import gc import warnings from typing import Union, Callable, Tuple, Dict, Any import logging -import os -import time import datetime import numpy as np @@ -19,7 +17,8 @@ from mlair.helpers import to_list, TimeTrackingWrapper, TimeTracking class FIRFilter: from mlair.plotting.data_insight_plotting import PlotFirFilter - def __init__(self, data, fs, order, cutoff, window, var_dim, time_dim, station_name=None, minimum_length=None, offset=0, plot_path=None): + def __init__(self, data, fs, order, cutoff, window, var_dim, time_dim, display_name=None, minimum_length=None, + offset=0, plot_path=None, plot_dates=None): self._filtered = [] self._h = [] self.data = data @@ -29,33 +28,35 @@ class FIRFilter: self.window = window self.var_dim = var_dim self.time_dim = time_dim - self.station_name = station_name + 
self.display_name = display_name self.minimum_length = minimum_length self.offset = offset self.plot_path = plot_path + self.plot_dates = plot_dates self.run() def run(self): - logging.info(f"{self.station_name}: start {self.__class__.__name__}") + logging.info(f"{self.display_name}: start {self.__class__.__name__}") filtered = [] h = [] input_data = self.data.__deepcopy__() # collect some data for visualization - plot_pos = np.array([0.25, 1.5, 2.75, 4]) * 365 * self.fs - plot_dates = [input_data.isel({self.time_dim: int(pos)}).coords[self.time_dim].values for pos in plot_pos if - pos < len(input_data.coords[self.time_dim])] + if self.plot_dates is None: + plot_pos = np.array([0.25, 1.5, 2.75, 4]) * 365 * self.fs + self.plot_dates = [input_data.isel({self.time_dim: int(pos)}).coords[self.time_dim].values + for pos in plot_pos if pos < len(input_data.coords[self.time_dim])] plot_data = [] for i in range(len(self.order)): # apply filter fi, hi = self.fir_filter(input_data, self.fs, self.cutoff[i], self.order[i], time_dim=self.time_dim, - var_dim=self.var_dim, window=self.window, station_name=self.station_name) + var_dim=self.var_dim, window=self.window, display_name=self.display_name) filtered.append(fi) h.append(hi) # visualization - plot_data.append(self.create_visualization(fi, input_data, plot_dates, self.time_dim, self.fs, hi, + plot_data.append(self.create_visualization(fi, input_data, self.plot_dates, self.time_dim, self.fs, hi, self.minimum_length, self.order, i, self.offset, self.var_dim)) # calculate residuum input_data = input_data - fi @@ -69,7 +70,7 @@ class FIRFilter: # visualize if self.plot_path is not None: try: - self.PlotFirFilter(self.plot_path, plot_data, self.station_name) # not working when t0 != 0 + self.PlotFirFilter(self.plot_path, plot_data, self.display_name) # not working when t0 != 0 except Exception as e: logging.info(f"Could not plot climate fir filter due to following reason:\n{e}") @@ -106,7 +107,7 @@ class FIRFilter: @TimeTrackingWrapper def fir_filter(self, data, fs, cutoff_high, order, sampling="1d", time_dim="datetime", var_dim="variables", window: Union[str, Tuple] = "hamming", - minimum_length=None, new_dim="window", plot_dates=None, station_name=None): + minimum_length=None, new_dim="window", plot_dates=None, display_name=None): # calculate FIR filter coefficients h = self._calculate_filter_coefficients(window, order, cutoff_high, fs) @@ -121,7 +122,7 @@ class FIRFilter: filtered = xr.concat(coll, var_dim) # create result array with same shape like input data, gaps are filled by nans - filtered = self._create_full_filter_result_array(data, filtered, time_dim, station_name) + filtered = self._create_full_filter_result_array(data, filtered, time_dim, display_name) return filtered, h @staticmethod @@ -145,7 +146,7 @@ class FIRFilter: @staticmethod def _create_full_filter_result_array(template_array: xr.DataArray, result_array: xr.DataArray, new_dim: str, - station_name: str = None) -> xr.DataArray: + display_name: str = None) -> xr.DataArray: """ Create result filter array with same shape line given template data (should be the original input data before filtering the data). All gaps are filled by nans. 
@@ -153,9 +154,9 @@ class FIRFilter: :param template_array: this array is used as template for shape and ordering of dims :param result_array: array with data that are filled into template :param new_dim: new dimension which is shifted/appended to/at the end (if present or not) - :param station_name: string that is attached to logging (default None) + :param display_name: string that is attached to logging (default None) """ - logging.debug(f"{station_name}: create res_full") + logging.debug(f"{display_name}: create res_full") new_coords = {**{k: template_array.coords[k].values for k in template_array.coords if k != new_dim}, new_dim: result_array.coords[new_dim]} dims = [*template_array.dims, new_dim] if new_dim not in template_array.dims else template_array.dims @@ -168,7 +169,8 @@ class ClimateFIRFilter(FIRFilter): def __init__(self, data, fs, order, cutoff, window, time_dim, var_dim, apriori=None, apriori_type=None, apriori_diurnal=False, sel_opts=None, plot_path=None, - minimum_length=None, new_dim=None, station_name=None, extend_length_opts: Union[dict, int] = 0): + minimum_length=None, new_dim=None, display_name=None, extend_length_opts: int = 0, + offset: Union[dict, int] = 0, plot_dates=None): """ :param data: data to filter :param fs: sampling frequency in 1/days -> 1d: fs=1 -> 1H: fs=24 @@ -184,9 +186,17 @@ class ClimateFIRFilter(FIRFilter): residua is used ("residuum_stats"). :param apriori_diurnal: Use diurnal cycle as additional apriori information (only applicable for hourly resoluted data). The mean anomaly of each hour is added to the apriori_type information. + :param sel_opts: specify some parameters to select a subset of data before calculating the apriori information. + Use this parameter for example, if apriori shall only calculated on a shorter time period than available in + given data. :param extend_length_opts: shift information switch between historical data and apriori estimation by the given values (default None). Must either be a dictionary with keys available in var_dim or a single value that is - applied to all data. + applied to all data. This parameter has only influence on which information is available at t0 for the + filter calculcation but has no influence on the shape of the returned filtered data. + :param offset: This parameter indicates the number of time steps with ti>t0 to return of the filtered data. In + case the offset parameter is larger than the extend_lenght_opts parameter, this leads to the case that not + only observational data but also climatological estimations are returned. Must either be a dictionary with + keys available in var_dim or a single value that is applied to all data. Default is 0. 
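+            Example (illustrative): with offset=2 and extend_length_opts=0, the filtered data are returned up to
+            t0+2, and the two steps after t0 are based on climatological estimates only.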
""" self._apriori = apriori self.apriori_type = apriori_type @@ -196,8 +206,8 @@ class ClimateFIRFilter(FIRFilter): self.new_dim = new_dim self.plot_data = [] self.extend_length_opts = extend_length_opts - super().__init__(data, fs, order, cutoff, window, var_dim, time_dim, station_name=station_name, - minimum_length=minimum_length, plot_path=plot_path) + super().__init__(data, fs, order, cutoff, window, var_dim, time_dim, display_name=display_name, + minimum_length=minimum_length, plot_path=plot_path, offset=offset, plot_dates=plot_dates) def run(self): filtered = [] @@ -205,50 +215,51 @@ class ClimateFIRFilter(FIRFilter): if self.sel_opts is not None: self.sel_opts = self.sel_opts if isinstance(self.sel_opts, dict) else {self.time_dim: self.sel_opts} sampling = {1: "1d", 24: "1H"}.get(int(self.fs)) - logging.debug(f"{self.station_name}: create diurnal_anomalies") + logging.debug(f"{self.display_name}: create diurnal_anomalies") if self.apriori_diurnal is True and sampling == "1H": diurnal_anomalies = self.create_seasonal_hourly_mean(self.data, self.time_dim, sel_opts=self.sel_opts, sampling=sampling, as_anomaly=True) else: diurnal_anomalies = 0 - logging.debug(f"{self.station_name}: create monthly apriori") + logging.debug(f"{self.display_name}: create monthly apriori") if self._apriori is None: self._apriori = self.create_monthly_mean(self.data, self.time_dim, sel_opts=self.sel_opts, sampling=sampling) + diurnal_anomalies - logging.debug(f"{self.station_name}: apriori shape = {self._apriori.shape}") + logging.debug(f"{self.display_name}: apriori shape = {self._apriori.shape}") apriori_list = to_list(self._apriori) input_data = self.data.__deepcopy__() - # for viz - plot_dates = None + # for visualization + plot_dates = self.plot_dates # create tmp dimension to apply filter, search for unused name new_dim = self._create_tmp_dimension(input_data) if self.new_dim is None else self.new_dim for i in range(len(self.order)): - logging.info(f"{self.station_name}: start filter for order {self.order[i]}") + logging.info(f"{self.display_name}: start filter for order {self.order[i]}") # calculate climatological filter - _minimum_length = self._minimum_length(self.order, self.minimum_length, i, self.window) - fi, hi, apriori, plot_data = self.clim_filter(input_data, self.fs, self.cutoff[i], self.order[i], - apriori=apriori_list[i], sel_opts=self.sel_opts, - sampling=sampling, time_dim=self.time_dim, - window=self.window, var_dim=self.var_dim, - minimum_length=_minimum_length, new_dim=new_dim, - plot_dates=plot_dates, station_name=self.station_name, - extend_length_opts=self.extend_length_opts) - - logging.info(f"{self.station_name}: finished clim_filter calculation") + next_order = self._next_order(self.order, 0, i, self.window) + fi, input_data, hi, apriori, plot_data = self.clim_filter(input_data, self.fs, self.cutoff[i], self.order[i], + apriori=apriori_list[i], sel_opts=self.sel_opts, + sampling=sampling, time_dim=self.time_dim, + window=self.window, var_dim=self.var_dim, + minimum_length=self.minimum_length, new_dim=new_dim, + plot_dates=plot_dates, display_name=self.display_name, + extend_opts=self.extend_length_opts, + offset=self.offset, next_order=next_order) + + logging.info(f"{self.display_name}: finished clim_filter calculation") if self.minimum_length is None: - filtered.append(fi) + filtered.append(fi.sel({new_dim: slice(None, self.offset)})) else: - filtered.append(fi.sel({new_dim: slice(-self.minimum_length, None)})) + filtered.append(fi.sel({new_dim: slice(self.offset - 
self.minimum_length, self.offset)})) h.append(hi) gc.collect() self.plot_data.append(plot_data) plot_dates = {e["t0"] for e in plot_data} # calculate residuum - logging.info(f"{self.station_name}: calculate residuum") + logging.info(f"{self.display_name}: calculate residuum") coord_range = range(fi.coords[new_dim].values.min(), fi.coords[new_dim].values.max() + 1) if new_dim in input_data.coords: input_data = input_data.sel({new_dim: coord_range}) - fi @@ -257,14 +268,14 @@ class ClimateFIRFilter(FIRFilter): # create new apriori information for next iteration if no further apriori is provided if len(apriori_list) < len(self.order): - logging.info(f"{self.station_name}: create diurnal_anomalies") + logging.info(f"{self.display_name}: create diurnal_anomalies") if self.apriori_diurnal is True and sampling == "1H": diurnal_anomalies = self.create_seasonal_hourly_mean(input_data.sel({new_dim: 0}, drop=True), self.time_dim, sel_opts=self.sel_opts, sampling=sampling, as_anomaly=True) else: diurnal_anomalies = 0 - logging.info(f"{self.station_name}: create monthly apriori") + logging.info(f"{self.display_name}: create monthly apriori") if self.apriori_type is None or self.apriori_type == "zeros": # zero version apriori_list.append(xr.zeros_like(apriori_list[i]) + diurnal_anomalies) elif self.apriori_type == "residuum_stats": # calculate monthly statistic on residuum @@ -274,11 +285,12 @@ class ClimateFIRFilter(FIRFilter): else: raise ValueError(f"Cannot handle unkown apriori type: {self.apriori_type}. Please choose from None," f" `zeros` or `residuum_stats`.") + # add last residuum to filtered if self.minimum_length is None: - filtered.append(input_data) + filtered.append(input_data.sel({new_dim: slice(None, self.offset)})) else: - filtered.append(input_data.sel({new_dim: slice(-self.minimum_length, None)})) + filtered.append(input_data.sel({new_dim: slice(self.offset - self.minimum_length, self.offset)})) self._filtered = filtered self._h = h @@ -287,12 +299,12 @@ class ClimateFIRFilter(FIRFilter): # visualize if self.plot_path is not None: try: - self.PlotClimateFirFilter(self.plot_path, self.plot_data, sampling, self.station_name) + self.PlotClimateFirFilter(self.plot_path, self.plot_data, sampling, self.display_name) except Exception as e: logging.info(f"Could not plot climate fir filter due to following reason:\n{e}") @staticmethod - def _minimum_length(order: list, minimum_length: Union[int, None], pos: int, window: Union[str, tuple]) -> int: + def _next_order(order: list, minimum_length: Union[int, None], pos: int, window: Union[str, tuple]) -> int: next_order = 0 if pos + 1 < len(order): next_order = order[pos + 1] @@ -300,7 +312,7 @@ class ClimateFIRFilter(FIRFilter): next_order = filter_width_kzf(*next_order) if minimum_length is not None: next_order = next_order + minimum_length - return next_order if next_order > 0 else None + return next_order @staticmethod def create_monthly_unity_array(data: xr.DataArray, time_dim: str, extend_range: int = 366) -> xr.DataArray: @@ -452,7 +464,7 @@ class ClimateFIRFilter(FIRFilter): @staticmethod def extend_apriori(data: xr.DataArray, apriori: xr.DataArray, time_dim: str, sampling: str = "1d", - station_name: str = None) -> xr.DataArray: + display_name: str = None) -> xr.DataArray: """ Extend time range of apriori information to span a longer period as data (or at least of equal length). This method may not working properly if length of apriori contains data from less then one year. 
@@ -463,7 +475,7 @@ class ClimateFIRFilter(FIRFilter): apriori data. :param time_dim: name of temporal dimension :param sampling: sampling of data (e.g. "1m", "1d", default "1d") - :param station_name: name to use for logging message (default None) + :param display_name: name to use for logging message (default None) :returns: array which adjusted temporal coverage derived from apriori """ dates = data.coords[time_dim].values @@ -471,7 +483,7 @@ class ClimateFIRFilter(FIRFilter): # apriori starts after data if dates[0] < apriori.coords[time_dim].values[0]: - logging.debug(f"{station_name}: apriori starts after data") + logging.debug(f"{display_name}: apriori starts after data") # add difference in full years date_diff = abs(dates[0] - apriori.coords[time_dim].values[0]).astype("timedelta64[D]") @@ -492,7 +504,7 @@ class ClimateFIRFilter(FIRFilter): # apriori ends before data if dates[-1] + np.timedelta64(365, "D") > apriori.coords[time_dim].values[-1]: - logging.debug(f"{station_name}: apriori ends before data") + logging.debug(f"{display_name}: apriori ends before data") # add difference in full years + 1 year (because apriori is used as future estimate) date_diff = abs(dates[-1] - apriori.coords[time_dim].values[-1]).astype("timedelta64[D]") @@ -535,51 +547,54 @@ class ClimateFIRFilter(FIRFilter): interval and start of apriori interval are shifted by this value from t0 (positive or negative). :returns: combined data array """ - # check if shift indicated by extend_length_seperator is inside the outer interval limits - # assert (extend_length_separator > -extend_length_history) and (extend_length_separator < extend_length_future) - # prepare historical data / observation + ext_sep = min(extend_length_separator, extend_length_future) if new_dim not in data.coords: - history = self._shift_data(data, range(int(-extend_length_history), extend_length_separator + 1), + history = self._shift_data(data, range(int(-extend_length_history), ext_sep + 1), time_dim, new_dim) else: - history = data.sel({new_dim: slice(int(-extend_length_history), extend_length_separator)}) - # prepare climatological statistics - if new_dim not in apriori.coords: - future = self._shift_data(apriori, range(extend_length_separator + 1, - extend_length_separator + extend_length_future), - time_dim, new_dim) + history = data.sel({new_dim: slice(int(-extend_length_history), ext_sep)}) + + if extend_length_future > ext_sep + 1: + # prepare climatological statistics + if new_dim not in apriori.coords: + future = self._shift_data(apriori, range(ext_sep + 1, + extend_length_future + 1), + time_dim, new_dim) + else: + future = apriori.sel({new_dim: slice(ext_sep + 1, + extend_length_future)}) + # combine historical data [t0-length,t0+sep] and climatological statistics [t0+sep+1,t0+length] + filter_input_data = xr.concat([history.dropna(time_dim), future], dim=new_dim, join="left") + return filter_input_data else: - future = apriori.sel({new_dim: slice(extend_length_separator + 1, - extend_length_separator + extend_length_future)}) - - # combine historical data [t0-length,t0+sep] and climatological statistics [t0+sep+1,t0+length] - filter_input_data = xr.concat([history.dropna(time_dim), future], dim=new_dim, join="left") - return filter_input_data + return history def create_visualization(self, filtered, data, filter_input_data, plot_dates, time_dim, new_dim, sampling, extend_length_history, extend_length_future, minimum_length, h, - variable_name, extend_length_opts=None): # pragma: no cover + variable_name, 
extend_length_opts=None, offset=None): # pragma: no cover plot_data = [] + offset = 0 if offset is None else offset extend_length_opts = 0 if extend_length_opts is None else extend_length_opts - for viz_date in set(plot_dates).intersection(filtered.coords[time_dim].values): + for t0 in set(plot_dates).intersection(filtered.coords[time_dim].values): try: td_type = {"1d": "D", "1H": "h"}.get(sampling) - t_minus = viz_date + np.timedelta64(int(-extend_length_history), td_type) - t_plus = viz_date + np.timedelta64(int(extend_length_future + extend_length_opts), td_type) + t_minus = t0 + np.timedelta64(int(-extend_length_history), td_type) + t_plus = t0 + np.timedelta64(int(extend_length_future + 1), td_type) if new_dim not in data.coords: tmp_filter_data = self._shift_data(data.sel({time_dim: slice(t_minus, t_plus)}), range(int(-extend_length_history), - int(extend_length_future + extend_length_opts)), + int(extend_length_future + 1)), time_dim, - new_dim).sel({time_dim: viz_date}) + new_dim).sel({time_dim: t0}) else: tmp_filter_data = None - valid_range = range(int((len(h) + 1) / 2) if minimum_length is None else minimum_length, - extend_length_opts + 1) - plot_data.append({"t0": viz_date, + valid_start = int(filtered[new_dim].min()) + int((len(h) + 1) / 2) + valid_end = min(extend_length_opts + offset + 1, int(filtered[new_dim].max()) - int((len(h) + 1) / 2)) + valid_range = range(valid_start, valid_end) + plot_data.append({"t0": t0, "var": variable_name, - "filter_input": filter_input_data.sel({time_dim: viz_date}), + "filter_input": filter_input_data.sel({time_dim: t0}), "filter_input_nc": tmp_filter_data, "valid_range": valid_range, "time_range": data.sel( @@ -625,7 +640,7 @@ class ClimateFIRFilter(FIRFilter): @staticmethod def _trim_data_to_minimum_length(data: xr.DataArray, extend_length_history: int, dim: str, - minimum_length: int = None, extend_length_opts: int = 0) -> xr.DataArray: + extend_length_future: int = 0) -> xr.DataArray: """ Trim data along given axis between either -minimum_length (if given) or -extend_length_history and extend_length_opts (which is default set to 0). @@ -634,18 +649,14 @@ class ClimateFIRFilter(FIRFilter): :param extend_length_history: start number for trim range (transformed to negative), only used if parameter minimum_length is not provided :param dim: dim to apply trim on - :param minimum_length: start number for trim range (transformed to negative), preferably used (default None) - :param extend_length_opts: number to use in "future" + :param extend_length_future: number to use in "future" :returns: trimmed data """ - if minimum_length is None: - return data.sel({dim: slice(-extend_length_history, extend_length_opts)}, drop=True) - else: - return data.sel({dim: slice(-minimum_length, extend_length_opts)}, drop=True) + return data.sel({dim: slice(-extend_length_history, extend_length_future)}, drop=True) @staticmethod def _create_full_filter_result_array(template_array: xr.DataArray, result_array: xr.DataArray, new_dim: str, - station_name: str = None) -> xr.DataArray: + display_name: str = None) -> xr.DataArray: """ Create result filter array with same shape line given template data (should be the original input data before filtering the data). All gaps are filled by nans. 
@@ -653,9 +664,9 @@ class ClimateFIRFilter(FIRFilter): :param template_array: this array is used as template for shape and ordering of dims :param result_array: array with data that are filled into template :param new_dim: new dimension which is shifted/appended to/at the end (if present or not) - :param station_name: string that is attached to logging (default None) + :param display_name: string that is attached to logging (default None) """ - logging.debug(f"{station_name}: create res_full") + logging.debug(f"{display_name}: create res_full") new_coords = {**{k: template_array.coords[k].values for k in template_array.coords if k != new_dim}, new_dim: result_array.coords[new_dim]} dims = [*template_array.dims, new_dim] if new_dim not in template_array.dims else template_array.dims @@ -665,25 +676,24 @@ class ClimateFIRFilter(FIRFilter): @TimeTrackingWrapper def clim_filter(self, data, fs, cutoff_high, order, apriori=None, sel_opts=None, sampling="1d", time_dim="datetime", var_dim="variables", window: Union[str, Tuple] = "hamming", - minimum_length=None, new_dim="window", plot_dates=None, station_name=None, - extend_length_opts: Union[dict, int] = None): + minimum_length=0, next_order=0, new_dim="window", plot_dates=None, display_name=None, + extend_opts: int = 0, offset: int = 0): - logging.debug(f"{station_name}: extend apriori") - extend_opts = extend_length_opts if extend_length_opts is not None else {} + logging.debug(f"{display_name}: extend apriori") # calculate apriori information from data if not given and extend its range if not sufficient long enough if apriori is None: apriori = self.create_monthly_mean(data, time_dim, sel_opts=sel_opts, sampling=sampling) apriori = apriori.astype(data.dtype) - apriori = self.extend_apriori(data, apriori, time_dim, sampling, station_name=station_name) + apriori = self.extend_apriori(data, apriori, time_dim, sampling, display_name=display_name) # calculate FIR filter coefficients h = self._calculate_filter_coefficients(window, order, cutoff_high, fs) length = len(h) - # use filter length if no minimum is given, otherwise use minimum + half filter length for extension - extend_length_history = length if minimum_length is None else minimum_length + int((length + 1) / 2) - extend_length_future = int((length + 1) / 2) + 1 + # set data extend that is required for filtering + extend_length_history = minimum_length + int((next_order + 1) / 2) + int((length + 1) / 2) - offset + extend_length_future = offset + int((next_order + 1) / 2) + int((length + 1) / 2) # collect some data for visualization plot_pos = np.array([0.25, 1.5, 2.75, 4]) * 365 * fs @@ -693,19 +703,20 @@ class ClimateFIRFilter(FIRFilter): plot_data = [] coll = [] + coll_input = [] for var in reversed(data.coords[var_dim].values): - logging.info(f"{station_name} ({var}): sel data") + logging.info(f"{display_name} ({var}): sel data") _start, _end = self._get_year_interval(data, time_dim) - extend_opts_var = extend_opts.get(var, 0) if isinstance(extend_opts, dict) else extend_opts filt_coll = [] + filt_input_coll = [] for _year in range(_start, _end + 1): - logging.debug(f"{station_name} ({var}): year={_year}") + logging.debug(f"{display_name} ({var}): year={_year}") # select observations and apriori data time_slice = self._create_time_range_extend( - _year, sampling, max(extend_length_history, extend_length_future + extend_opts_var)) + _year, sampling, max(extend_length_history, extend_length_future)) d = data.sel({var_dim: [var], time_dim: time_slice}) a = apriori.sel({var_dim: [var], 
time_dim: time_slice}) if len(d.coords[time_dim]) == 0: # no data at all for this year @@ -713,8 +724,7 @@ class ClimateFIRFilter(FIRFilter): # combine historical data / observation [t0-length,t0] and climatological statistics [t0+1,t0+length] filter_input_data = self.combine_observation_and_apriori(d, a, time_dim, new_dim, extend_length_history, - extend_length_future, - extend_length_separator=extend_opts_var) + extend_length_future, extend_length_separator=extend_opts) # select only data for current year try: @@ -725,33 +735,41 @@ class ClimateFIRFilter(FIRFilter): continue # apply filter - logging.debug(f"{station_name} ({var}): start filter convolve") - with TimeTracking(name=f"{station_name} ({var}): filter convolve", logging_level=logging.DEBUG): + logging.debug(f"{display_name} ({var}): start filter convolve") + with TimeTracking(name=f"{display_name} ({var}): filter convolve", logging_level=logging.DEBUG): filt = xr.apply_ufunc(fir_filter_convolve, filter_input_data, input_core_dims=[[new_dim]], output_core_dims=[[new_dim]], vectorize=True, kwargs={"h": h}, output_dtypes=[d.dtype]) # trim data if required - trimmed = self._trim_data_to_minimum_length(filt, extend_length_history, new_dim, minimum_length, - extend_length_opts=extend_opts_var) + valid_range_end = int(filt.coords[new_dim].max() - (length + 1) / 2) + 1 + ext_len = min(extend_length_future, valid_range_end) + trimmed = self._trim_data_to_minimum_length(filt, extend_length_history, new_dim, + extend_length_future=ext_len) filt_coll.append(trimmed) + trimmed = self._trim_data_to_minimum_length(filter_input_data, extend_length_history, new_dim, + extend_length_future=ext_len) + filt_input_coll.append(trimmed) # visualization plot_data.extend(self.create_visualization(filt, d, filter_input_data, plot_dates, time_dim, new_dim, sampling, extend_length_history, extend_length_future, - minimum_length, h, var, extend_opts_var)) + minimum_length, h, var, extend_opts, offset)) # collect all filter results coll.append(xr.concat(filt_coll, time_dim)) + coll_input.append(xr.concat(filt_input_coll, time_dim)) gc.collect() # concat all variables - logging.debug(f"{station_name}: concat all variables") + logging.debug(f"{display_name}: concat all variables") res = xr.concat(coll, var_dim) + res_input = xr.concat(coll_input, var_dim) # create result array with same shape like input data, gaps are filled by nans - res_full = self._create_full_filter_result_array(data, res, new_dim, station_name) - return res_full, h, apriori, plot_data + res_full = self._create_full_filter_result_array(data, res, new_dim, display_name) + res_input_full = self._create_full_filter_result_array(data, res_input, new_dim, display_name) + return res_full, res_input_full, h, apriori, plot_data @staticmethod def _create_time_range_extend(year: int, sampling: str, extend_length: int) -> slice: diff --git a/mlair/helpers/helpers.py b/mlair/helpers/helpers.py index 8890ae3c9fe389dfa64bed4750df2944c4cacc61..d43e58b985ffa6ab9fb7ff111050919f6d5c7bdc 100644 --- a/mlair/helpers/helpers.py +++ b/mlair/helpers/helpers.py @@ -68,6 +68,23 @@ def to_list(obj: Any) -> List: return obj +def sort_like(list_obj: list, sorted_obj: list): + """ + Sort elements of list_obj as ordered in sorted_obj. Length of sorted_obj as allowed to be higher than length of + list_obj, but must contain at least all objects of list_obj. Will raise AssertionError, if not all elements of + list_obj are also in sorted_obj. Also it is required for list_obj and sorted_obj to have only unique elements. 
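+
+    Example (illustrative): sort_like(["b", "a"], ["a", "b", "c"]) returns ["a", "b"].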
+ + :param list_obj: list to sort + :param sorted_obj: list to use ordering from + + :return: sorted list + """ + assert set(list_obj).issubset(sorted_obj) + assert len(set(list_obj)) == len(list_obj) + assert len(set(sorted_obj)) == len(sorted_obj) + return [e for e in sorted_obj if e in list_obj] + + def dict_to_xarray(d: Dict, coordinate_name: str) -> xr.DataArray: """ Convert a dictionary of 2D-xarrays to a single 3D-xarray. The name of the new coordinate axis follows <coordinate_name>. diff --git a/mlair/helpers/testing.py b/mlair/helpers/testing.py index e727d9b50308d339af79f5c5b82b592af6e91921..9820b4956dac09e213df3b9addc317a00ee381b8 100644 --- a/mlair/helpers/testing.py +++ b/mlair/helpers/testing.py @@ -105,7 +105,7 @@ def get_all_args(*args, remove=None, add=None): return res -def test_nested_equality(obj1, obj2): +def check_nested_equality(obj1, obj2): try: print(f"check type {type(obj1)} and {type(obj2)}") @@ -116,13 +116,13 @@ assert len(obj1) == len(obj2) for pos in range(len(obj1)): print(f"check pos {obj1[pos]} and {obj2[pos]}") - assert test_nested_equality(obj1[pos], obj2[pos]) is True + assert check_nested_equality(obj1[pos], obj2[pos]) is True elif isinstance(obj1, dict): print(f"check keys {obj1.keys()} and {obj2.keys()}") assert sorted(obj1.keys()) == sorted(obj2.keys()) for k in obj1.keys(): print(f"check pos {obj1[k]} and {obj2[k]}") - assert test_nested_equality(obj1[k], obj2[k]) is True + assert check_nested_equality(obj1[k], obj2[k]) is True elif isinstance(obj1, xr.DataArray): print(f"check xr {obj1} and {obj2}") assert xr.testing.assert_equal(obj1, obj2) is None diff --git a/mlair/model_modules/convolutional_networks.py b/mlair/model_modules/convolutional_networks.py index be047eb7a1c92cbb8847328c157c874bfeca93ca..d8eb6eb3403db13932a51274fdc1f563dbfb6ef3 100644 --- a/mlair/model_modules/convolutional_networks.py +++ b/mlair/model_modules/convolutional_networks.py @@ -11,7 +11,7 @@ from mlair.model_modules.advanced_paddings import PadUtils, Padding2D, Symmetric import tensorflow.keras as keras -class CNN(AbstractModelClass): +class CNN(AbstractModelClass): # pragma: no cover _activation = {"relu": keras.layers.ReLU, "tanh": partial(keras.layers.Activation, "tanh"), "sigmoid": partial(keras.layers.Activation, "sigmoid"), diff --git a/mlair/model_modules/fully_connected_networks.py b/mlair/model_modules/fully_connected_networks.py index 8536516e66cc1dda15972fd2e91d0ef67c70dda7..372473ee22b5174a3beca91898509a3582391587 100644 --- a/mlair/model_modules/fully_connected_networks.py +++ b/mlair/model_modules/fully_connected_networks.py @@ -192,7 +192,7 @@ class FCN_64_32_16(FCN): super()._update_model_name() -class BranchedInputFCN(AbstractModelClass): +class BranchedInputFCN(AbstractModelClass): # pragma: no cover """ A customisable fully connected network (64, 32, 16, window_lead_time), where the last layer is the output layer depending on the window_lead_time parameter.
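For orientation, a minimal usage sketch of the sort_like helper introduced in mlair/helpers/helpers.py above (illustration only, not part of the patch); the first call mirrors the new tests in test/test_helpers/test_helpers.py, the second uses made-up variable names:

from mlair.helpers import sort_like

# elements of the first list come back in the order given by the second list
assert sort_like([1, 2, 3, 8, 4], [1, 2, 3, 4, 5, 6, 7, 8]) == [1, 2, 3, 4, 8]
# the reference list may be longer, but both lists must hold only unique elements
assert sort_like(["o3", "temp"], ["temp", "press", "o3"]) == ["temp", "o3"]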
diff --git a/mlair/model_modules/recurrent_networks.py b/mlair/model_modules/recurrent_networks.py index 59927e992d432207db5b5737289a6f4d671d92f3..6ec920c1cde08c0d2fc6064528eea800fbdde2a7 100644 --- a/mlair/model_modules/recurrent_networks.py +++ b/mlair/model_modules/recurrent_networks.py @@ -10,7 +10,7 @@ from mlair.model_modules.loss import var_loss, custom_loss import tensorflow.keras as keras -class RNN(AbstractModelClass): +class RNN(AbstractModelClass): # pragma: no cover """ """ diff --git a/mlair/plotting/abstract_plot_class.py b/mlair/plotting/abstract_plot_class.py index 7a91c2269ccd03608bcdbe67a634156f55fde91f..21e5d9413b490a4be5281c2a80308be558fe64c8 100644 --- a/mlair/plotting/abstract_plot_class.py +++ b/mlair/plotting/abstract_plot_class.py @@ -8,7 +8,7 @@ import os from matplotlib import pyplot as plt -class AbstractPlotClass: +class AbstractPlotClass: # pragma: no cover """ Abstract class for all plotting routines to unify plot workflow. diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index 1eee96623d4fed6fcfb23fd1438a954a4aca230f..db2b3340e06545f988c81503df2aa27b655095bb 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -959,6 +959,7 @@ class PlotClimateFirFilter(AbstractPlotClass): # pragma: no cover "t0": {"color": "lightgrey", "lw": 6, "label": "$t_0$"} } + self.variables_list = [] plot_folder = os.path.join(os.path.abspath(plot_folder), "climFIR") self.fir_filter_convolve = fir_filter_convolve super().__init__(plot_folder, plot_name=None, rc_params=rc_params) @@ -995,17 +996,18 @@ plot_dict_t0[i] = plot_dict_order plot_dict_var[t0] = plot_dict_t0 plot_dict[var] = plot_dict_var + self.variables_list = list(plot_dict.keys()) return plot_dict, new_dim def _plot(self, plot_dict, sampling, new_dim="window"): td_type = {"1d": "D", "1H": "h"}.get(sampling) - for var, viz_date_dict in plot_dict.items(): - for it0, t0 in enumerate(viz_date_dict.keys()): - viz_data = viz_date_dict[t0] + for var, vis_dict in plot_dict.items(): + for it0, t0 in enumerate(vis_dict.keys()): + vis_data = vis_dict[t0] residuum_true = None try: - for ifilter in sorted(viz_data.keys()): - data = viz_data[ifilter] + for ifilter in sorted(vis_data.keys()): + data = vis_data[ifilter] filter_input = data["filter_input"] filter_input_nc = data["filter_input_nc"] if residuum_true is None else residuum_true.sel( {new_dim: filter_input.coords[new_dim]}) @@ -1023,7 +1025,7 @@ self._plot_original_data(ax, time_axis, filter_input_nc) # clim apriori - self._plot_apriori(ax, time_axis, filter_input, new_dim, ifilter) + self._plot_apriori(ax, time_axis, filter_input, new_dim, ifilter, offset=1) # clim filter response residuum_estimated = self._plot_clim_filter(ax, time_axis, filter_input, new_dim, h, @@ -1068,8 +1070,8 @@ Use order and valid_range to find a good zoom-in that hides edges of filter values that are affected by the reduced filter order. Limits are returned to be usable for other plots.
""" - t_minus_delta = max(1.5 * valid_range.start, 0.3 * order) - t_plus_delta = max(0.5 * valid_range.start, 0.3 * order) + t_minus_delta = max(min(2 * (valid_range.stop - valid_range.start), 0.5 * order), (-valid_range.start + 0.3 * order)) + t_plus_delta = max(min(2 * (valid_range.stop - valid_range.start), 0.5 * order), valid_range.stop + 0.3 * order) t_minus = t0 + np.timedelta64(-int(t_minus_delta), td_type) t_plus = t0 + np.timedelta64(int(t_plus_delta), td_type) ax_start = max(t_minus, time_axis[0]) @@ -1078,7 +1080,7 @@ class PlotClimateFirFilter(AbstractPlotClass): # pragma: no cover return ax_start, ax_end def _plot_valid_area(self, ax, t0, valid_range, td_type): - ax.axvspan(t0 + np.timedelta64(-valid_range.start, td_type), + ax.axvspan(t0 + np.timedelta64(valid_range.start, td_type), t0 + np.timedelta64(valid_range.stop - 1, td_type), **self.style_dict["valid_area"]) def _plot_t0(self, ax, t0): @@ -1094,12 +1096,12 @@ class PlotClimateFirFilter(AbstractPlotClass): # pragma: no cover # self._plot_series(ax, time_axis, filter_input_nc.values.flatten(), color="darkgrey", linestyle="dashed", # label="original") - def _plot_apriori(self, ax, time_axis, data, new_dim, ifilter): + def _plot_apriori(self, ax, time_axis, data, new_dim, ifilter, offset): # clim apriori filter_input = data if ifilter == 0: d_tmp = filter_input.sel( - {new_dim: slice(0, filter_input.coords[new_dim].values.max())}).values.flatten() + {new_dim: slice(offset, filter_input.coords[new_dim].values.max())}).values.flatten() else: d_tmp = filter_input.values.flatten() self._plot_series(ax, time_axis[len(time_axis) - len(d_tmp):], d_tmp, style="apriori") @@ -1138,7 +1140,7 @@ class PlotClimateFirFilter(AbstractPlotClass): # pragma: no cover def _store_plot_data(self, data): """Store plot data. Could be loaded in a notebook to redraw.""" - file = os.path.join(self.plot_folder, "plot_data.pickle") + file = os.path.join(self.plot_folder, "_".join(self.variables_list) + "plot_data.pickle") with open(file, "wb") as f: dill.dump(data, f) diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index ac6ef0b8155271286a47f49f5adb5bc5bbaac610..8b239157557de032e648bae538f98383bfa37ffc 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -1357,14 +1357,14 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass): # pragma: no cover width=width, orient=orientation) if orientation == "v": if apply_u_test: - ax = self.set_sigificance_bars_vertical(asteriks, ax, data_table) + ax = self.set_significance_bars(asteriks, ax, data_table, orientation) ylims = list(ax.get_ylim()) ax.set_ylim([ylims[0], ylims[1]*1.025]) ax.set_ylabel(f"{self.error_measure} (in {self.error_unit})") ax.set_xticklabels(ax.get_xticklabels(), rotation=45) elif orientation == "h": if apply_u_test: - ax = self.set_sigificance_bars_horizontal(asteriks, ax, data_table) + ax = self.set_significance_bars(asteriks, ax, data_table, orientation) ax.set_xlabel(f"{self.error_measure} (in {self.error_unit})") xlims = list(ax.get_xlim()) ax.set_xlim([xlims[0], xlims[1] * 1.015]) @@ -1382,35 +1382,25 @@ class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass): # pragma: no cover self._save() plt.close("all") - def set_sigificance_bars_vertical(self, asteriks, ax, data_table): - x1 = list(asteriks.index).index(self.model_name) - y_prev = 0. 
- for i, v in enumerate(asteriks): + def set_significance_bars(self, asteriks, ax, data_table, orientation): + p1 = list(asteriks.index).index(self.model_name) + q_prev = 0. + factor = 0.025 + for i, ast in enumerate(asteriks): if not i == list(asteriks.index).index(self.model_name): - x2 = i - y = data_table[[self.model_name, data_table.columns[i]]].max().max() - y = max(y, y_prev) * 1.025 - if abs(y-y_prev) < y * 0.025: - y = y * 1.025 - h = .01 * data_table.max().max() - ax.plot([x1, x1, x2, x2], [y, y + h, y + h, y], c="k") - ax.text((x1 + x2) * .5, y + h, v, ha="center", va="bottom", color="k") - y_prev = y - return ax - - def set_sigificance_bars_horizontal(self, asteriks, ax, data_table): - y1 = list(asteriks.index).index(self.model_name) - x_prev = 0. - for i, v in enumerate(asteriks): - if not i == list(asteriks.index).index(self.model_name): - y2 = i - x = data_table[[self.model_name, data_table.columns[i]]].max().max() - x = max(x, x_prev) * 1.025 - if abs(x-x_prev) < x * 0.025: - x = x * 1.025 - h = .01 * data_table.max().max() - ax.plot([x, x+h, x+h, x], [y1, y1, y2, y2], c="k") - ax.text(x + h, (y1 + y2) * .5, v, ha="left", va="center", color="k", rotation=-90) + p2 = i + q = data_table[[self.model_name, data_table.columns[i]]].max().max() + q = max(q, q_prev) * (1 + factor) + if abs(q - q_prev) < q * factor: + q = q * (1 + factor) + h = 0.01 * data_table.max().max() + if orientation == "h": + ax.plot([q, q + h, q + h, q], [p1, p1, p2, p2], c="k") + ax.text(q + h, (p1 + p2) * 0.5, ast, ha="left", va="center", color="k", rotation=-90) + elif orientation == "v": + ax.plot([p1, p1, p2, p2], [q, q + h, q + h, q], c="k") + ax.text((p1 + p2) * 0.5, q + h, ast, ha="center", va="bottom", color="k") + q_prev = q return ax diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 91194da9dc8da8bdd6e71ef0c3e13844984b0331..56a7ab539a37ba89c2b9027c6dcdb83f714120d4 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -212,7 +212,7 @@ class PostProcessing(RunEnvironment): against the number of observations and diversity of stations.
""" path = self.data_store.get("forecast_path") - all_stations = self.data_store.get("stations") + all_stations = self.data_store.get("stations", "test") start = self.data_store.get("start", "test") end = self.data_store.get("end", "test") index_dim = self.index_dim diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index c094d1af42731348e120e0dcddebaaf9439435bb..4a5fd1683da1c8ab73f31458bc67db2c9643fe21 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -331,7 +331,8 @@ class PreProcessing(RunEnvironment): experiment_path = self.data_store.get("experiment_path") transformation_path = os.path.join(experiment_path, "data", "transformation") transformation_file = os.path.join(transformation_path, "transformation.pickle") - if not os.path.exists(transformation_file): + calculate_fresh_transformation = self.data_store.get_default("calculate_fresh_transformation", True) + if not os.path.exists(transformation_file) or calculate_fresh_transformation: path_config.check_path_and_create(transformation_path) with open(transformation_file, "wb") as f: dill.dump(transformation_opts, f, protocol=4) diff --git a/test/test_helpers/test_filter.py b/test/test_helpers/test_filter.py index 519a36b3438cafd041cc65c43572fc026eced4dd..e4bfb6890936d13137ebb6dda01a44eed0166ae5 100644 --- a/test/test_helpers/test_filter.py +++ b/test/test_helpers/test_filter.py @@ -121,22 +121,22 @@ class TestClimateFIRFilter: obj._create_tmp_dimension(xr_array) assert "Could not create new dimension." in e.value.args[0] - def test_minimum_length(self): + def test_next_order(self): obj = object.__new__(ClimateFIRFilter) - res = obj._minimum_length([43], 15, 0, "hamming") + res = obj._next_order([43], 15, 0, "hamming") assert res == 15 - res = obj._minimum_length([43, 13], 15, 0, ("kaiser", 10)) + res = obj._next_order([43, 13], 15, 0, ("kaiser", 10)) assert res == 28 - res = obj._minimum_length([43, 13], 15, 1, "hamming") + res = obj._next_order([43, 13], 15, 1, "hamming") assert res == 15 - res = obj._minimum_length([128, 64, 43], None, 0, "hamming") + res = obj._next_order([128, 64, 43], None, 0, "hamming") assert res == 64 - res = obj._minimum_length([43], None, 0, "hamming") - assert res is None + res = obj._next_order([43], None, 0, "hamming") + assert res == 0 - def test_minimum_length_with_kzf(self): + def test_next_order_with_kzf(self): obj = object.__new__(ClimateFIRFilter) - res = obj._minimum_length([(15, 5), (5, 3)], None, 0, "kzf") + res = obj._next_order([(15, 5), (5, 3)], None, 0, "kzf") assert res == 13 def test_calculate_filter_coefficients(self): @@ -297,10 +297,11 @@ class TestClimateFIRFilter: res = obj._trim_data_to_minimum_length(xr_array, 5, "window") assert xr_array.shape == (21, 100, 1) assert res.shape == (6, 100, 1) - res = obj._trim_data_to_minimum_length(xr_array, 5, "window", 10) - assert res.shape == (11, 100, 1) res = obj._trim_data_to_minimum_length(xr_array, 30, "window") assert res.shape == (21, 100, 1) + xr_array = obj._shift_data(xr_array.sel(window=0), range(-20, 5), time_dim, new_dim="window") + res = obj._trim_data_to_minimum_length(xr_array, 5, "window", extend_length_future=2) + assert res.shape == (8, 100, 1) def test_create_full_filter_result_array(self, xr_array, time_dim): obj = object.__new__(ClimateFIRFilter) @@ -315,28 +316,29 @@ class TestClimateFIRFilter: def test_clim_filter(self, xr_array_long_with_var, time_dim, var_dim): obj = object.__new__(ClimateFIRFilter) filter_order = 10*24+1 - res = 
obj.clim_filter(xr_array_long_with_var, 24, 0.05, 10*24+1, sampling="1H", time_dim=time_dim, var_dim=var_dim) - assert len(res) == 4 + res = obj.clim_filter(xr_array_long_with_var, 24, 0.05, filter_order, sampling="1H", time_dim=time_dim, + var_dim=var_dim, minimum_length=24) + assert len(res) == 5 # check filter data properties - assert res[0].shape == (*xr_array_long_with_var.shape, filter_order + 1) + assert res[0].shape == (*xr_array_long_with_var.shape, int((filter_order + 1) / 2) + 24 + 2) assert res[0].dims == (*xr_array_long_with_var.dims, "window") # check filter properties assert np.testing.assert_almost_equal( - res[1], obj._calculate_filter_coefficients("hamming", filter_order, 0.05, 24)) is None + res[2], obj._calculate_filter_coefficients("hamming", filter_order, 0.05, 24)) is None # check apriori apriori = obj.create_monthly_mean(xr_array_long_with_var, time_dim, sampling="1H") apriori = apriori.astype(xr_array_long_with_var.dtype) apriori = obj.extend_apriori(xr_array_long_with_var, apriori, time_dim, "1H") - assert xr.testing.assert_equal(res[2], apriori) is None + assert xr.testing.assert_equal(res[3], apriori) is None # check plot data format - assert isinstance(res[3], list) - assert isinstance(res[3][0], dict) + assert isinstance(res[4], list) + assert isinstance(res[4][0], dict) keys = {"t0", "var", "filter_input", "filter_input_nc", "valid_range", "time_range", "h", "new_dim"} - assert len(keys.symmetric_difference(res[3][0].keys())) == 0 + assert len(keys.symmetric_difference(res[4][0].keys())) == 0 def test_clim_filter_kwargs(self, xr_array_long_with_var, time_dim, var_dim): obj = object.__new__(ClimateFIRFilter) @@ -349,12 +351,12 @@ class TestClimateFIRFilter: var_dim=var_dim, new_dim="total_new_dim", window=("kaiser", 5), minimum_length=1000, apriori=apriori, plot_dates=plot_dates) - assert res[0].shape == (*xr_array_long_with_var.shape, 1000 + 1) + assert res[0].shape == (*xr_array_long_with_var.shape, int((10 * 24 + 1 + 1) / 2) + 1000 + 2) assert res[0].dims == (*xr_array_long_with_var.dims, "total_new_dim") assert np.testing.assert_almost_equal( - res[1], obj._calculate_filter_coefficients(("kaiser", 5), filter_order, 0.05, 24)) is None - assert xr.testing.assert_equal(res[2], apriori) is None - assert len(res[3]) == len(res[0].coords[var_dim]) + res[2], obj._calculate_filter_coefficients(("kaiser", 5), filter_order, 0.05, 24)) is None + assert xr.testing.assert_equal(res[3], apriori) is None + assert len(res[4]) == len(res[0].coords[var_dim]) class TestFirFilterConvolve: diff --git a/test/test_helpers/test_helpers.py b/test/test_helpers/test_helpers.py index 99a5d65de532e8b025f77d5bf8551cbff9ead901..b850b361b09a8d180c5c70c2257d2d7be27c6cc0 100644 --- a/test/test_helpers/test_helpers.py +++ b/test/test_helpers/test_helpers.py @@ -12,7 +12,7 @@ import mock import pytest import string -from mlair.helpers import to_list, dict_to_xarray, float_round, remove_items, extract_value, select_from_dict +from mlair.helpers import to_list, dict_to_xarray, float_round, remove_items, extract_value, select_from_dict, sort_like from mlair.helpers import PyTestRegex from mlair.helpers import Logger, TimeTracking from mlair.helpers.helpers import is_xarray, convert2xrda @@ -432,3 +432,25 @@ class TestConvert2xrDa: e.value.args[0] assert "`use_1d_default=True' is used with `arr' of type da.array. For da.arrays please pass" + \ " `use_1d_default=False' and specify keywords for xr.DataArray via kwargs."
in e.value.args[0] + + +class TestSortLike: + + def test_sort_like(self): + l_obj = [1, 2, 3, 8, 4] + res = sort_like(l_obj, [1, 2, 3, 4, 5, 6, 7, 8]) + assert res == [1, 2, 3, 4, 8] + assert l_obj == [1, 2, 3, 8, 4] + + def test_sort_like_not_unique(self): + l_obj = [1, 2, 3, 8, 4, 3] + with pytest.raises(AssertionError) as e: + sort_like(l_obj, [1, 2, 3, 4, 5, 6, 7, 8]) + l_obj = [1, 2, 3, 8, 4] + with pytest.raises(AssertionError) as e: + sort_like(l_obj, [1, 2, 3, 4, 5, 6, 7, 8, 5]) + + def test_sort_like_missing_element(self): + l_obj = [1, 2, 3, 8, 4] + with pytest.raises(AssertionError) as e: + sort_like(l_obj, [1, 2, 3, 5, 6, 7, 8]) diff --git a/test/test_helpers/test_statistics.py b/test/test_helpers/test_statistics.py index f5148cdc293939d5711afb57c2fa009c47b6c86d..a3f645937258604c2dbbda07b36a58d83e879065 100644 --- a/test/test_helpers/test_statistics.py +++ b/test/test_helpers/test_statistics.py @@ -5,7 +5,9 @@ import xarray as xr from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, \ centre_apply, apply_inverse_transformation, min_max, min_max_inverse, min_max_apply, log, log_inverse, log_apply, \ - create_single_bootstrap_realization, calculate_average, create_n_bootstrap_realizations + create_single_bootstrap_realization, calculate_average, create_n_bootstrap_realizations, mean_squared_error, \ + mean_absolute_error, calculate_error_metrics +from mlair.helpers.testing import check_nested_equality lazy = pytest.lazy_fixture @@ -255,3 +257,72 @@ class TestCreateBootstrapRealizations: dim_name_model='model', n_boots=1000, dim_name_boots='boots') assert isinstance(boot_data, xr.DataArray) assert boot_data.shape == (1000,) + + +class TestMeanSquaredError: + + def test_mean_squared_error(self): + assert mean_squared_error(10, 3) == 49 + assert np.testing.assert_almost_equal(mean_squared_error(np.array([10, 20, 15]), np.array([5, 25, 15])), 50./3) is None + + def test_mean_squared_error_xarray(self): + d1 = np.array([np.array([1, 2, 3, 4, 5]), np.array([1, 2, 3, 4, 5]), np.array([1, 2, 3, 4, 5])]) + d2 = np.array([np.array([2, 4, 3, 4, 6]), np.array([2, 3, 3, 4, 5]), np.array([0, 1, 3, 4, 5])]) + shape = d1.shape + coords = {'index': range(shape[0]), 'value': range(shape[1])} + x_array1 = xr.DataArray(d1, coords=coords, dims=coords.keys()) + x_array2 = xr.DataArray(d2, coords=coords, dims=coords.keys()) + expected = xr.DataArray(np.array([1, 2, 0, 0, 1./3]), coords={"value": [0, 1, 2, 3, 4]}, dims=["value"]) + assert xr.testing.assert_equal(mean_squared_error(x_array1, x_array2, "index"), expected) is None + expected = xr.DataArray(np.array([1.2, 0.4, 0.4]), coords={"index": [0, 1, 2]}, dims=["index"]) + assert xr.testing.assert_equal(mean_squared_error(x_array1, x_array2, "value"), expected) is None + + +class TestMeanAbsoluteError: + + def test_mean_absolute_error(self): + assert mean_absolute_error(10, 3) == 7 + assert np.testing.assert_almost_equal(mean_absolute_error(np.array([10, 20, 15]), np.array([5, 25, 15])), 10./3) is None + + def test_mean_absolute_error_xarray(self): + d1 = np.array([np.array([1, 2, 3, 4, 5]), np.array([1, 2, 3, 4, 5]), np.array([1, 2, 3, 4, 5])]) + d2 = np.array([np.array([2, 4, 3, 4, 6]), np.array([2, 3, 3, 4, 5]), np.array([0, 1, 3, 4, 5])]) + shape = d1.shape + coords = {'index': range(shape[0]), 'value': range(shape[1])} + x_array1 = xr.DataArray(d1, coords=coords, dims=coords.keys()) + x_array2 = xr.DataArray(d2, coords=coords, dims=coords.keys()) + expected = xr.DataArray(np.array([1, 
4./3, 0, 0, 1./3]), coords={"value": [0, 1, 2, 3, 4]}, dims=["value"]) + assert xr.testing.assert_equal(mean_absolute_error(x_array1, x_array2, "index"), expected) is None + expected = xr.DataArray(np.array([0.8, 0.4, 0.4]), coords={"index": [0, 1, 2]}, dims=["index"]) + assert xr.testing.assert_equal(mean_absolute_error(x_array1, x_array2, "value"), expected) is None + + +class TestCalculateErrorMetrics: + + def test_calculate_error_metrics(self): + d1 = np.array([np.array([1, 2, 3, 4, 5]), np.array([1, 2, 3, 4, 5]), np.array([1, 2, 3, 4, 5])]) + d2 = np.array([np.array([2, 4, 3, 4, 6]), np.array([2, 3, 3, 4, 5]), np.array([0, 1, 3, 4, 5])]) + shape = d1.shape + coords = {'index': range(shape[0]), 'value': range(shape[1])} + x_array1 = xr.DataArray(d1, coords=coords, dims=coords.keys()) + x_array2 = xr.DataArray(d2, coords=coords, dims=coords.keys()) + expected = {"mse": xr.DataArray(np.array([1, 2, 0, 0, 1./3]), coords={"value": [0, 1, 2, 3, 4]}, dims=["value"]), + "rmse": np.sqrt(xr.DataArray(np.array([1, 2, 0, 0, 1./3]), coords={"value": [0, 1, 2, 3, 4]}, dims=["value"])), + "mae": xr.DataArray(np.array([1, 4./3, 0, 0, 1./3]), coords={"value": [0, 1, 2, 3, 4]}, dims=["value"]), + "n": xr.DataArray(np.array([3, 3, 3, 3, 3]), coords={"value": [0, 1, 2, 3, 4]}, dims=["value"])} + assert check_nested_equality(expected, calculate_error_metrics(x_array1, x_array2, "index")) is True + + expected = {"mse": xr.DataArray(np.array([1.2, 0.4, 0.4]), coords={"index": [0, 1, 2]}, dims=["index"]), + "rmse": np.sqrt(xr.DataArray(np.array([1.2, 0.4, 0.4]), coords={"index": [0, 1, 2]}, dims=["index"])), + "mae": xr.DataArray(np.array([0.8, 0.4, 0.4]), coords={"index": [0, 1, 2]}, dims=["index"]), + "n": xr.DataArray(np.array([5, 5, 5]), coords={"index": [0, 1, 2]}, dims=["index"])} + assert check_nested_equality(expected, calculate_error_metrics(x_array1, x_array2, "value")) is True \ No newline at end of file diff --git a/test/test_helpers/test_testing_helpers.py b/test/test_helpers/test_testing_helpers.py index bceed646c345d3add4602e67b55da1553eabdbaa..9b888a91a7c88a31764bd272632b1aab8e6e170f 100644 --- a/test/test_helpers/test_testing_helpers.py +++ b/test/test_helpers/test_testing_helpers.py @@ -1,4 +1,4 @@ -from mlair.helpers.testing import PyTestRegex, PyTestAllEqual, test_nested_equality +from mlair.helpers.testing import PyTestRegex, PyTestAllEqual, check_nested_equality import re import xarray as xr @@ -52,30 +52,30 @@ class TestPyTestAllEqual: class TestNestedEquality: def test_nested_equality_single_entries(self): - assert test_nested_equality(3, 3) is True - assert test_nested_equality(3.9, 3.9) is True - assert test_nested_equality(3.91, 3.9) is False - assert test_nested_equality("3", 3) is False - assert test_nested_equality("3", "3") is True - assert test_nested_equality(None, None) is True + assert check_nested_equality(3, 3) is True + assert check_nested_equality(3.9, 3.9) is True + assert check_nested_equality(3.91, 3.9) is False + assert check_nested_equality("3", 3) is False + assert check_nested_equality("3", "3") is True + assert check_nested_equality(None, None) is
True def test_nested_equality_xarray(self): obj1 = xr.DataArray(np.random.randn(2, 3), dims=('x', 'y'), coords={'x': [10, 20], 'y': [0, 10, 20]}) obj2 = xr.ones_like(obj1) * obj1 - assert test_nested_equality(obj1, obj2) is True + assert check_nested_equality(obj1, obj2) is True def test_nested_equality_numpy(self): obj1 = np.random.randn(2, 3) obj2 = obj1 * 1 - assert test_nested_equality(obj1, obj2) is True + assert check_nested_equality(obj1, obj2) is True def test_nested_equality_list_tuple(self): - assert test_nested_equality([3, 3], [3, 3]) is True - assert test_nested_equality((2, 6), (2, 6)) is True - assert test_nested_equality([3, 3.5], [3.5, 3]) is False - assert test_nested_equality([3, 3.5, 10], [3, 3.5]) is False + assert check_nested_equality([3, 3], [3, 3]) is True + assert check_nested_equality((2, 6), (2, 6)) is True + assert check_nested_equality([3, 3.5], [3.5, 3]) is False + assert check_nested_equality([3, 3.5, 10], [3, 3.5]) is False def test_nested_equality_dict(self): - assert test_nested_equality({"a": 3, "b": 10}, {"b": 10, "a": 3}) is True - assert test_nested_equality({"a": 3, "b": [10, 100]}, {"b": [10, 100], "a": 3}) is True - assert test_nested_equality({"a": 3, "b": 10, "c": "c"}, {"b": 10, "a": 3}) is False + assert check_nested_equality({"a": 3, "b": 10}, {"b": 10, "a": 3}) is True + assert check_nested_equality({"a": 3, "b": [10, 100]}, {"b": [10, 100], "a": 3}) is True + assert check_nested_equality({"a": 3, "b": 10, "c": "c"}, {"b": 10, "a": 3}) is False
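To make the reworked shape expectations in test/test_helpers/test_filter.py above easier to follow, here is a small worked sketch of the new extension arithmetic in ClimateFIRFilter.clim_filter (illustration only, not part of the patch); the parameter values are taken from test_clim_filter, and it assumes len(h) equals the filter order, i.e. one coefficient per filter tap:

# extension lengths as computed by the revised clim_filter
order = 10 * 24 + 1                       # filter_order used in test_clim_filter
length = order                            # len(h), assuming one coefficient per tap
minimum_length, next_order, offset = 24, 0, 0
extend_length_history = minimum_length + int((next_order + 1) / 2) + int((length + 1) / 2) - offset
extend_length_future = offset + int((next_order + 1) / 2) + int((length + 1) / 2)
assert (extend_length_history, extend_length_future) == (145, 121)
# after trimming, the window dimension holds int((order + 1) / 2) + minimum_length + 2 = 147
# entries, which is what the updated shape assertion in test_clim_filter checks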