diff --git a/mlair/helpers/filter.py b/mlair/helpers/filter.py
index bdaba6dc3fc52fbefe86ef4d18c7d072f36bace8..36127063d21186873523a3f86b84e25038bfd99d 100644
--- a/mlair/helpers/filter.py
+++ b/mlair/helpers/filter.py
@@ -1,6 +1,6 @@
 import gc
 import warnings
-from typing import Union, Callable, Tuple
+from typing import Union, Callable, Tuple, Dict, Any
 import logging
 import os
 import time
@@ -71,15 +71,13 @@ class ClimateFIRFilter:
         sampling = {1: "1d", 24: "1H"}.get(int(fs))
         logging.debug(f"{plot_name}: create diurnal_anomalies")
         if apriori_diurnal is True and sampling == "1H":
-            diurnal_anomalies = self.create_seasonal_hourly_mean(data, sel_opts=sel_opts, sampling=sampling,
-                                                                 time_dim=time_dim,
+            diurnal_anomalies = self.create_seasonal_hourly_mean(data, time_dim, sel_opts=sel_opts, sampling=sampling,
                                                                  as_anomaly=True)
         else:
             diurnal_anomalies = 0
         logging.debug(f"{plot_name}: create monthly apriori")
         if apriori is None:
-            apriori = self.create_monthly_mean(data, sel_opts=sel_opts, sampling=sampling,
-                                               time_dim=time_dim) + diurnal_anomalies
+            apriori = self.create_monthly_mean(data, time_dim, sel_opts=sel_opts, sampling=sampling) + diurnal_anomalies
         logging.debug(f"{plot_name}: apriori shape = {apriori.shape}")
         apriori_list = to_list(apriori)
         input_data = data.__deepcopy__()
@@ -124,12 +122,9 @@ class ClimateFIRFilter:
             if len(apriori_list) <= i + 1:
                 logging.info(f"{plot_name}: create diurnal_anomalies")
                 if apriori_diurnal is True and sampling == "1H":
-                    # diurnal_anomalies = self.create_hourly_mean(input_data.sel({new_dim: 0}, drop=True),
-                    #                                             sel_opts=sel_opts, sampling=sampling,
-                    #                                             time_dim=time_dim, as_anomaly=True)
                     diurnal_anomalies = self.create_seasonal_hourly_mean(input_data.sel({new_dim: 0}, drop=True),
-                                                                         sel_opts=sel_opts, sampling=sampling,
-                                                                         time_dim=time_dim, as_anomaly=True)
+                                                                         time_dim, sel_opts=sel_opts, sampling=sampling,
+                                                                         as_anomaly=True)
                 else:
                     diurnal_anomalies = 0
                 logging.info(f"{plot_name}: create monthly apriori")
@@ -137,9 +132,8 @@ class ClimateFIRFilter:
                     apriori_list.append(xr.zeros_like(apriori_list[i]) + diurnal_anomalies)
                 elif apriori_type == "residuum_stats":  # calculate monthly statistic on residuum
                     apriori_list.append(
-                        -self.create_monthly_mean(input_data.sel({new_dim: 0}, drop=True), sel_opts=sel_opts,
-                                                  sampling=sampling,
-                                                  time_dim=time_dim) + diurnal_anomalies)
+                        -self.create_monthly_mean(input_data.sel({new_dim: 0}, drop=True), time_dim, sel_opts=sel_opts,
+                                                  sampling=sampling) + diurnal_anomalies)
                 else:
                     raise ValueError(f"Cannot handle unkown apriori type: {apriori_type}. Please choose from None, "
                                      f"`zeros` or `residuum_stats`.")
@@ -179,7 +173,7 @@ class ClimateFIRFilter:
 
         :param data: data to create monthly unity array from, must contain dimension time_dim
         :param time_dim: name of temporal dimension
-        :param extend_range: number of days to extend data
+        :param extend_range: number of days to extend data (default 366)
         :returns: xarray in monthly resolution (centered at 16th day of month) with all values equal to 1
         """
         coords = data.coords
@@ -196,8 +190,8 @@ class ClimateFIRFilter:
         # loffset is required because resampling uses last day in month as resampling timestamp
         return new_array.resample({time_dim: "1m"}, loffset=datetime.timedelta(days=-15)).max()
 
-    def create_monthly_mean(self, data: xr.DataArray, sel_opts: dict = None, sampling: str = "1d",
-                            time_dim: str = "datetime") -> xr.DataArray:
+    def create_monthly_mean(self, data: xr.DataArray, time_dim: str, sel_opts: dict = None, sampling: str = "1d") \
+            -> xr.DataArray:
         """
         Calculate monthly means (12 values) and return a data array with same resolution as given data containing
         these monthly mean values. Sampling points are the 16th of each month (this value is equal to the true monthly mean)
@@ -206,11 +200,11 @@ class ClimateFIRFilter:
         calculate the monthly statistic.
 
         :param data: data to apply statistical calculation on
+        :param time_dim: name of temporal axis
         :param sel_opts: selection options as dict to select a subset of data (default None). A given sel_opts with
             `sel_opts={<time_dim>: "2006"}` forces the method e.g. to derive the monthly means only from data of the
             year 2006.
         :param sampling: sampling of the returned data (default 1d)
-        :param time_dim: name of temporal axis
         :returns: array in desired resolution containing interpolated monthly values. Months with no valid data are
            returned as np.nan which also effects data in the neighbouring months (before / after sampling points
            which are the 16th of each month).
@@ -233,28 +227,67 @@ class ClimateFIRFilter:
         return monthly.resample({time_dim: sampling}).interpolate()
 
     @staticmethod
-    def create_hourly_mean(data, sel_opts=None, sampling="1H", time_dim="datetime", as_anomaly=True):
-        """Calculate hourly statistics. Either the absolute value or the anomaly (as_anomaly=True)."""
-        # can only be used for hourly sampling rate
-        assert sampling == "1H"
+    def _compute_hourly_mean_per_month(data: xr.DataArray, time_dim: str, as_anomaly: bool) -> Dict[int, xr.DataArray]:
+        """
+        Calculate for each hour in each month a separate mean value (12 x 24 values in total). Average is either the
+        anomaly of a monthly mean state or the raw mean value.
 
-        # create unity xarray in hourly resolution
-        hourly = xr.ones_like(data)
+        :param data: data to calculate averages on
+        :param time_dim: name of temporal dimension
+        :param as_anomaly: indicates whether to calculate means as anomaly of a monthly mean or as raw mean values.
+        :returns: dictionary containing 12 months each with a 24-valued array (1 entry for each hour)
+        """
+        seasonal_hourly_means = {}
+        for month in data.groupby(f"{time_dim}.month").groups.keys():
+            single_month_data = data.sel({time_dim: (data[f"{time_dim}.month"] == month)})
+            hourly_mean = single_month_data.groupby(f"{time_dim}.hour").mean()
+            if as_anomaly is True:
+                hourly_mean = hourly_mean - hourly_mean.mean("hour")
+            seasonal_hourly_means[month] = hourly_mean
+        return seasonal_hourly_means
 
-        # apply selection if given (only use subset for hourly means)
-        if sel_opts is not None:
-            data = data.sel(**sel_opts)
+    @staticmethod
+    def _create_seasonal_cycle_of_single_hour_mean(result_arr: xr.DataArray, means: Dict[int, xr.DataArray], hour: int,
+                                                   time_dim: str, sampling: str) -> xr.DataArray:
+        """
+        Use monthly means of a given hour to create an array with interpolated values at the indicated hour for each day
+        of the full time span indicated by given result_arr.
 
-        # create mean for each hour and replace entries in unity array, calculate anomaly if enabled
-        hourly_mean = data.groupby(f"{time_dim}.hour").mean()
-        if as_anomaly is True:
-            hourly_mean = hourly_mean - hourly_mean.mean("hour")
-        for hour in hourly_mean.hour.values:
-            loc = (hourly[f"{time_dim}.hour"] == hour)
-            hourly.loc[{f"{time_dim}": loc}] = hourly_mean.sel(hour=hour)
-        return hourly
+        :param result_arr: template array indicating the full time range and additional dimensions to keep
+        :param means: dictionary containing 24 hourly averages for each month (12 x 24 values in total)
+        :param hour: integer of hour of interest
+        :param time_dim: name of temporal dimension
+        :param sampling: sampling rate to interpolate
+        :returns: array with interpolated averages in sampling resolution containing only values for hour of interest
+        """
+        h_coll = xr.ones_like(result_arr) * np.nan
+        for month in means.keys():
+            hourly_mean_single_month = means[month].sel(hour=hour, drop=True)
+            h_coll = xr.where((h_coll[f"{time_dim}.month"] == month), hourly_mean_single_month, h_coll)
+        h_coll = h_coll.resample({time_dim: sampling}).interpolate()
+        h_coll = h_coll.sel({time_dim: (h_coll[f"{time_dim}.hour"] == hour)})
+        return h_coll
+
+    def create_seasonal_hourly_mean(self, data: xr.DataArray, time_dim: str, sel_opts: Dict[str, Any] = None,
+                                    sampling: str = "1H", as_anomaly: bool = True) -> xr.DataArray:
+        """
+        Compute climatological statistics on hourly base either as raw data or anomalies. For each month, an overall
+        mean value (only used if requiring anomalies) and the mean of each hour are calculated. The climatological
+        diurnal cycle is positioned on the 16th of each month and interpolated in between by using a distinct
+        interpolation for each hour of day. The returned array therefore contains data with a yearly cycle (if anomaly
+        is not calculated) or data without a yearly cycle (if using anomalies). In both cases, the data have an
+        amplitude that varies over the year.
+
+        :param data: data to apply this method to
+        :param time_dim: name of temporal axis
+        :param sel_opts: specific selection options that are applied before calculation of climatological statistics
+            (default None)
+        :param sampling: temporal resolution of data (default "1H")
+        :param as_anomaly: specify whether to use anomalies or raw data including a seasonal cycle of the mean value
+            (default: True)
+        :returns: climatological statistics for given data interpolated with given sampling rate
+        """
-    def create_seasonal_hourly_mean(self, data, sel_opts=None, sampling="1H", time_dim="datetime", as_anomaly=True):
         """Calculate hourly statistics. Either the absolute value or the anomaly (as_anomaly=True)."""
         # can only be used for hourly sampling rate
         assert sampling == "1H"
@@ -266,29 +299,18 @@ class ClimateFIRFilter:
 
         # create unity xarray in monthly resolution with sampling point in mid of each month
         monthly = self.create_monthly_unity_array(data, time_dim) * np.nan
 
-        seasonal_hourly_means = {}
-
-        for month in data.groupby(f"{time_dim}.month").groups.keys():
-            # select each month
-            single_month_data = data.sel({time_dim: (data[f"{time_dim}.month"] == month)})
-            hourly_mean = single_month_data.groupby(f"{time_dim}.hour").mean()
-            if as_anomaly is True:
-                hourly_mean = hourly_mean - hourly_mean.mean("hour")
-            seasonal_hourly_means[month] = hourly_mean
+        # calculate for each hour in each month a separate mean value
+        seasonal_hourly_means = self._compute_hourly_mean_per_month(data, time_dim, as_anomaly)
 
+        # create seasonal cycles of these hourly averages
         seasonal_coll = []
         for hour in data.groupby(f"{time_dim}.hour").groups.keys():
-            h_coll = monthly.__deepcopy__()
-            for month in seasonal_hourly_means.keys():
-                hourly_mean_single_month = seasonal_hourly_means[month].sel(hour=hour, drop=True)
-                h_coll = xr.where((h_coll[f"{time_dim}.month"] == month),
-                                  hourly_mean_single_month,
-                                  h_coll)
-            h_coll = h_coll.resample({time_dim: sampling}).interpolate()
-            h_coll = h_coll.sel({time_dim: (h_coll[f"{time_dim}.hour"] == hour)})
-            seasonal_coll.append(h_coll)
-        hourly = xr.concat(seasonal_coll, time_dim).sortby(time_dim).resample({time_dim: sampling}).interpolate()
+            mean_single_hour = self._create_seasonal_cycle_of_single_hour_mean(monthly, seasonal_hourly_means, hour,
+                                                                               time_dim, sampling)
+            seasonal_coll.append(mean_single_hour)
+        # combine all cycles in a common data array
+        hourly = xr.concat(seasonal_coll, time_dim).sortby(time_dim).resample({time_dim: sampling}).interpolate()
         return hourly
 
     @staticmethod
@@ -383,7 +405,7 @@ class ClimateFIRFilter:
         return filter_input_data
 
     def create_visualization(self, filtered, data, filter_input_data, plot_dates, time_dim, new_dim, sampling, extend_length_history,
-                             extend_length_future, minimum_length, h, variable_name):
+                             extend_length_future, minimum_length, h, variable_name):  # pragma: no cover
         plot_data = []
         for viz_date in set(plot_dates).intersection(filtered.coords[time_dim].values):
             try:
@@ -456,7 +478,7 @@ class ClimateFIRFilter:
 
         # calculate apriori information from data if not given and extend its range if not sufficient long enough
         if apriori is None:
-            apriori = self.create_monthly_mean(data, sel_opts=sel_opts, sampling=sampling, time_dim=time_dim)
+            apriori = self.create_monthly_mean(data, time_dim, sel_opts=sel_opts, sampling=sampling)
         apriori = apriori.astype(data.dtype)
         apriori = self.extend_apriori(data, apriori, time_dim, sampling, station_name=station_name)
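For orientation, a brief usage sketch of the refactored helpers (not part of the patch; variable names and data below are illustrative only): time_dim is now a required positional argument of create_monthly_mean and create_seasonal_hourly_mean, and both accept arrays that carry additional dimensions such as station. Creating the object via object.__new__ mirrors the test suite and skips the FIR computation in __init__.

    import numpy as np
    import pandas as pd
    import xarray as xr

    from mlair.helpers.filter import ClimateFIRFilter

    # hypothetical two years of hourly data with a single station dimension
    time_dim = "datetime"
    index = pd.date_range("2010-01-01", "2011-12-31 23:00", freq="1H")
    data = xr.DataArray(np.random.random((len(index), 1)), dims=[time_dim, "station"],
                        coords={time_dim: index, "station": ["DE266X"]})

    obj = object.__new__(ClimateFIRFilter)  # bypass __init__, as the tests do
    monthly_apriori = obj.create_monthly_mean(data, time_dim, sampling="1H")
    diurnal_anomalies = obj.create_seasonal_hourly_mean(data, time_dim, sel_opts={time_dim: "2010"}, as_anomaly=True)

Both calls return arrays interpolated to the requested sampling; as in __init__ above, the diurnal anomalies can be added on top of the monthly a priori estimate.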
diff --git a/test/test_helpers/test_filter.py b/test/test_helpers/test_filter.py
index 6f6d26dd63f199ce3cf83bba3059c5cefc7ff4b8..aecdd04dcd6a0542c7cf4989eec002e6920d8c99 100644
--- a/test/test_helpers/test_filter.py
+++ b/test/test_helpers/test_filter.py
@@ -28,14 +28,16 @@ class TestClimateFIRFilter:
     def xr_array(self, data, time_dim):
         start = np.datetime64("2010-01-01 00:00")
         time_index = [start + np.timedelta64(h, "h") for h in range(len(data))]
-        array = xr.DataArray(data, dims=time_dim, coords={time_dim: time_index})
+        array = xr.DataArray(data.reshape(len(data), 1), dims=[time_dim, "station"],
+                             coords={time_dim: time_index, "station": ["DE266X"]})
         return array
 
     @pytest.fixture
     def xr_array_long(self, data, time_dim):
         start = np.datetime64("2010-01-01 00:00")
         time_index = [start + np.timedelta64(175 * h, "h") for h in range(len(data))]
-        array = xr.DataArray(data, dims=time_dim, coords={time_dim: time_index})
+        array = xr.DataArray(data.reshape(len(data), 1), dims=[time_dim, "station"],
+                             coords={time_dim: time_index, "station": ["DE266X"]})
         return array
 
     def test_combine_observation_and_apriori_no_new_dim(self, xr_array, time_dim):
@@ -61,11 +63,12 @@ class TestClimateFIRFilter:
         assert xr.testing.assert_equal(first_entry.sel(window=range(1, 10)), apriori.sel({time_dim: date_pos})) is None
 
     def test_shift_data(self, xr_array, time_dim):
+        remaining_dims = set(xr_array.dims).difference([time_dim])
         obj = object.__new__(ClimateFIRFilter)
         index_values = range(-15, 1)
         res = obj._shift_data(xr_array, index_values, time_dim, new_dim="window")
-        assert len(res.dims) == 2
-        assert len(set(res.dims).difference([time_dim, "window"])) == 0
+        assert len(res.dims) == len(remaining_dims) + 2
+        assert len(set(res.dims).difference([time_dim, "window", *remaining_dims])) == 0
         assert np.testing.assert_array_equal(res.coords["window"].values, np.arange(-15, 1)) is None
         sel = res.sel({time_dim: res.coords[time_dim].values[15]})
         assert sel.sel(window=-15).values == xr_array.sel({time_dim: xr_array.coords[time_dim].values[0]}).values
@@ -135,8 +138,8 @@ class TestClimateFIRFilter:
 
     def test_create_monthly_mean(self, xr_array_long, time_dim):
         obj = object.__new__(ClimateFIRFilter)
-        res = obj.create_monthly_mean(xr_array_long, time_dim=time_dim)
-        assert res.shape == (1462,)
+        res = obj.create_monthly_mean(xr_array_long, time_dim)
+        assert res.shape == (1462, 1)
         assert np.datetime64("2008-12-16") == res.coords[time_dim][0].values
         assert np.datetime64("2012-12-16") == res.coords[time_dim][-1].values
         mean_jan = xr_array_long[xr_array_long[f"{time_dim}.month"] == 1].mean()
@@ -148,10 +151,10 @@ class TestClimateFIRFilter:
 
     def test_create_monthly_mean_sampling(self, xr_array_long, time_dim):
         obj = object.__new__(ClimateFIRFilter)
-        res = obj.create_monthly_mean(xr_array_long, time_dim=time_dim, sampling="1m")
-        assert res.shape == (49,)
-        res = obj.create_monthly_mean(xr_array_long, time_dim=time_dim, sampling="1H")
-        assert res.shape == (35065,)
+        res = obj.create_monthly_mean(xr_array_long, time_dim, sampling="1m")
+        assert res.shape == (49, 1)
+        res = obj.create_monthly_mean(xr_array_long, time_dim, sampling="1H")
+        assert res.shape == (35065, 1)
         mean_jun = xr_array_long[xr_array_long[f"{time_dim}.month"] == 6].mean()
         assert res.sel({time_dim: "2010-06-15T00:00:00"}) == mean_jun
         assert res.sel({time_dim: "2011-06-15T00:00:00"}) == mean_jun
@@ -159,15 +162,57 @@ class TestClimateFIRFilter:
     def test_create_monthly_mean_sel_opts(self, xr_array_long, time_dim):
         obj = object.__new__(ClimateFIRFilter)
         sel_opts = {time_dim: slice("2010-05", "2010-08")}
-        res = obj.create_monthly_mean(xr_array_long, time_dim=time_dim, sel_opts=sel_opts)
+        res = obj.create_monthly_mean(xr_array_long, time_dim, sel_opts=sel_opts)
         assert res.dropna(time_dim)[f"{time_dim}.month"].min() == 5
         assert res.dropna(time_dim)[f"{time_dim}.month"].max() == 8
         mean_jun_2010 = xr_array_long[xr_array_long[f"{time_dim}.month"] == 6].sel({time_dim: "2010"}).mean()
         assert res.sel({time_dim: "2010-06-15T00:00:00"}) == mean_jun_2010
 
-    def test_create_seasonal_hourly_mean(self):
-        #Todo: stopped here
-        pass
+    def test_compute_hourly_mean_per_month(self, xr_array_long, time_dim):
+        obj = object.__new__(ClimateFIRFilter)
+        xr_array_long = xr_array_long.resample({time_dim: "1H"}).interpolate()
+        res = obj._compute_hourly_mean_per_month(xr_array_long, time_dim, True)
+        assert len(res.keys()) == 12
+        assert 6 in res.keys()
+        assert np.testing.assert_almost_equal(res[12].mean(), 0) is None
+        assert np.testing.assert_almost_equal(res[3].mean(), 0) is None
+        assert res[8].shape == (24, 1)
+
+    def test_compute_hourly_mean_per_month_no_anomaly(self, xr_array_long, time_dim):
+        obj = object.__new__(ClimateFIRFilter)
+        xr_array_long = xr_array_long.resample({time_dim: "1H"}).interpolate()
+        res = obj._compute_hourly_mean_per_month(xr_array_long, time_dim, False)
+        assert len(res.keys()) == 12
+        assert 9 in res.keys()
+        assert np.testing.assert_array_less(res[2], res[1]) is None
+
+    def test_create_seasonal_cycle_of_hourly_mean(self, xr_array_long, time_dim):
+        obj = object.__new__(ClimateFIRFilter)
+        xr_array_long = xr_array_long.resample({time_dim: "1H"}).interpolate()
+        monthly = obj.create_monthly_unity_array(xr_array_long, time_dim) * np.nan
+        seasonal_hourly_means = obj._compute_hourly_mean_per_month(xr_array_long, time_dim, True)
+        res = obj._create_seasonal_cycle_of_single_hour_mean(monthly, seasonal_hourly_means, 0, time_dim, "1h")
+        assert res[f"{time_dim}.hour"].sum() == 0
+        assert np.testing.assert_almost_equal(res.sel({time_dim: "2010-12-01"}), res.sel({time_dim: "2011-12-01"})) is None
+        res = obj._create_seasonal_cycle_of_single_hour_mean(monthly, seasonal_hourly_means, 13, time_dim, "1h")
+        assert res[f"{time_dim}.hour"].mean() == 13
+        assert np.testing.assert_almost_equal(res.sel({time_dim: "2010-12-01"}), res.sel({time_dim: "2011-12-01"})) is None
+
+    def test_create_seasonal_hourly_mean(self, xr_array_long, time_dim):
+        obj = object.__new__(ClimateFIRFilter)
+        xr_array_long = xr_array_long.resample({time_dim: "1H"}).interpolate()
+        res = obj.create_seasonal_hourly_mean(xr_array_long, time_dim)
+        assert len(set(res.dims).difference(xr_array_long.dims)) == 0
+        assert res.coords[time_dim][0] < xr_array_long.coords[time_dim][0]
+        assert res.coords[time_dim][-1] > xr_array_long.coords[time_dim][-1]
+
+    def test_create_seasonal_hourly_mean_sel_opts(self, xr_array_long, time_dim):
+        obj = object.__new__(ClimateFIRFilter)
+        xr_array_long = xr_array_long.resample({time_dim: "1H"}).interpolate()
+        sel_opts = {time_dim: slice("2010-05", "2010-08")}
+        res = obj.create_seasonal_hourly_mean(xr_array_long, time_dim, sel_opts=sel_opts)
+        assert res.dropna(time_dim)[f"{time_dim}.month"].min() == 5
+        assert res.dropna(time_dim)[f"{time_dim}.month"].max() == 8
 
     def test_create_unity_array(self, xr_array, time_dim):
         obj = object.__new__(ClimateFIRFilter)
@@ -176,12 +221,12 @@ class TestClimateFIRFilter:
         assert np.datetime64("2011-01-16") == res.coords[time_dim][-1].values
         assert res.max() == res.min()
         assert res.max() == 1
-        assert res.shape == (26,)
+        assert res.shape == (26, 1)
         res = obj.create_monthly_unity_array(xr_array, time_dim, extend_range=0)
-        assert res.shape == (1,)
+        assert res.shape == (1, 1)
         assert np.datetime64("2010-01-16") == res.coords[time_dim][0].values
         res = obj.create_monthly_unity_array(xr_array, time_dim, extend_range=28)
-        assert res.shape == (3,)
+        assert res.shape == (3, 1)
 
     def test_extend_apriori_at_end(self, xr_array_long, time_dim):
         obj = object.__new__(ClimateFIRFilter)
@@ -221,6 +266,15 @@ class TestClimateFIRFilter:
         assert res.stop == np.datetime64("1993-01-01T01:00:00")
         assert res.step is None
 
+    def test_properties(self):
+        obj = object.__new__(ClimateFIRFilter)
+        obj._h = [1, 2, 3]
+        obj._filtered = [4, 5, 63]
+        obj._apriori = [10, 11, 12, 13]
+        assert obj.filter_coefficients == [1, 2, 3]
+        assert obj.filtered_data == [4, 5, 63]
+        assert obj.apriori_data == [10, 11, 12, 13]
+        assert obj.initial_apriori_data == 10
 
 
 class TestFirwinKzf:
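The added test_properties exercises read-only accessors of ClimateFIRFilter that are not themselves shown in this diff. A minimal sketch of what the test assumes about them (hypothetical class name, for illustration only; the real definitions live in mlair/helpers/filter.py):

    # hypothetical sketch of the accessors assumed by test_properties
    class FilterResultAccessors:
        @property
        def filter_coefficients(self):
            return self._h

        @property
        def filtered_data(self):
            return self._filtered

        @property
        def apriori_data(self):
            return self._apriori

        @property
        def initial_apriori_data(self):
            # first element of the a priori list, i.e. the initial estimate
            return self.apriori_data[0]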