diff --git a/docs/_source/customise.rst b/docs/_source/customise.rst
index 45971f5fa5ccdc934c3ea448c2f2d0ebbd8eb070..a30488b5e16dec4e5ff24aea7f35a0e286e32897 100644
--- a/docs/_source/customise.rst
+++ b/docs/_source/customise.rst
@@ -245,7 +245,7 @@ Later on during ModelSetup, Training and PostProcessing, MLAir requests data usi
 :py:`data_handler.get_Y()`.
 
 In PostProcessing, MLAir applies inverse transformation to some data by calling
-:py:`data_handler.apply_transformation(`data, inverse=True, **kwargs)'.
+:py:`data_handler.apply_transformation(data, inverse=True, **kwargs)`.
 
 Default Data Handler
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/mlair/data_handler/data_handler_single_station.py b/mlair/data_handler/data_handler_single_station.py
index 5c173eefa2577f535313c1b9180bfc132d1cc2e7..4002d4785ec36dfcf492d386e368255be528348e 100644
--- a/mlair/data_handler/data_handler_single_station.py
+++ b/mlair/data_handler/data_handler_single_station.py
@@ -171,22 +171,23 @@ class DataHandlerSingleStation(AbstractDataHandler):
             #. data: Standardised data
 
         """
-        def f(data, method, *args):
-            if method == 'standardise':
+        def f(data, method="standardise"):
+            if method == "standardise":
                 return statistics.standardise(data, dim)
-            elif method == 'centre':
+            elif method == "centre":
                 return statistics.centre(data, dim)
-            elif method == 'normalise':
-                # use min/max of data or given min/max
-                raise NotImplementedError
+            elif method == "min_max":
+                return statistics.min_max(data, dim)
             else:
                 raise NotImplementedError
 
-        def f_apply(data, method, mean, std):
+        def f_apply(data, method, mean=None, std=None, min=None, max=None):
             if method == "standardise":
-                return mean, std, statistics.standardise_apply(data, mean, std)
+                return statistics.standardise_apply(data, mean, std), {"mean": mean, "std": std, "method": method}
             elif method == "centre":
-                return mean, None, statistics.centre_apply(data, mean)
+                return statistics.centre_apply(data, mean), {"mean": mean, "method": method}
+            elif method == "min_max":
+                return statistics.min_max_apply(data, min, max), {"min": min, "max": max, "method": method}
             else:
                 raise NotImplementedError
 
@@ -197,11 +198,9 @@ class DataHandlerSingleStation(AbstractDataHandler):
             for var in data_in.variables.values:
                 data_var = data_in.sel(**{transformation_dim: [var]})
                 var_opts = opts.get(var, {})
-                _method = var_opts.get("method", "standardise")
-                _mean = var_opts.get("mean", None)
-                _std = var_opts.get("std", None)
-                mean, std, values = locals()["f" if _mean is None else "f_apply"](data_var, _method, _mean, _std)
-                opts_updated[var] = {"method": _method, "mean": mean, "std": std}
+                _apply = (var_opts.get("mean", None) is not None) or (var_opts.get("min") is not None)
+                values, new_var_opts = locals()["f_apply" if _apply else "f"](data_var, **var_opts)
+                opts_updated[var] = copy.deepcopy(new_var_opts)
                 transformed_values.append(values)
             return xr.concat(transformed_values, dim=transformation_dim), opts_updated
         else:
@@ -568,7 +567,7 @@ class DataHandlerSingleStation(AbstractDataHandler):
             raise NotImplementedError("Cannot handle this.")
 
     @staticmethod
-    def check_inverse_transform_params(method: str, mean: data_or_none, std: data_or_none) -> None:
+    def check_inverse_transform_params(method: str, mean=None, std=None, min=None, max=None) -> None:
         """
         Support inverse_transformation method.
 
@@ -584,6 +583,10 @@ class DataHandlerSingleStation(AbstractDataHandler):
             msg += "mean, "
         if method == 'standardise' and std is None:
             msg += "std, "
+        if method == "min_max" and min is None:
+            msg += "min, "
+        if method == "min_max" and max is None:
+            msg += "max, "
         if len(msg) > 0:
             raise AttributeError(f"Inverse transform {method} can not be executed because following is None: {msg}")
 
@@ -597,13 +600,13 @@ class DataHandlerSingleStation(AbstractDataHandler):
         current data is not transformed.
 
         """
-        def f_inverse(data, method_inverse, mean, std):
-            if method_inverse == 'standardise':
+        def f_inverse(data, method, mean=None, std=None, min=None, max=None):
+            if method == 'standardise':
                 return statistics.standardise_inverse(data, mean, std)
-            elif method_inverse == 'centre':
+            elif method == 'centre':
                 return statistics.centre_inverse(data, mean)
-            elif method_inverse == 'normalise':
-                raise NotImplementedError
+            elif method == 'min_max':
+                return statistics.min_max_inverse(data, min, max)
             else:
                 raise NotImplementedError
 
@@ -621,10 +624,8 @@ class DataHandlerSingleStation(AbstractDataHandler):
                 _method = var_opts.get("method", None)
                 if _method is None:
                     raise AssertionError(f"Inverse transformation method is not set for {var}.")
-                _mean = var_opts.get("mean", None)
-                _std = var_opts.get("std", None)
-                self.check_inverse_transform_params(_method, _mean, _std)
-                values = f_inverse(data_var, _method, _mean, _std)
+                self.check_inverse_transform_params(**var_opts)
+                values = f_inverse(data_var, **var_opts)
                 transformed_values.append(values)
             res = xr.concat(transformed_values, dim=transformation_dim)
             return res.squeeze(transformation_dim) if squeeze else res
diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py
index 8914969ac683f01f3d5f2e833bb870b5c710f188..5a62731de44cdfa24a72cdd0d200ddb561be29c0 100644
--- a/mlair/data_handler/default_data_handler.py
+++ b/mlair/data_handler/default_data_handler.py
@@ -263,7 +263,7 @@ class DefaultDataHandler(AbstractDataHandler):
                     opts = transformation[var]
                     assert transformation_dict[i][var].get("method", opts["method"]) == opts["method"]
                     transformation_dict[i][var]["method"] = opts["method"]
-                    for k in ["mean", "std"]:
+                    for k in ["mean", "std", "min", "max"]:
                         old = transformation_dict[i][var].get(k, None)
                         new = opts.get(k)
                         transformation_dict[i][var][k] = new if old is None else old.combine_first(new)
@@ -294,6 +294,10 @@ class DefaultDataHandler(AbstractDataHandler):
                         transformation_dict[i][k]["mean"] = transformation[k]["mean"].mean(iter_dim)
                     if transformation[k]["std"] is not None:
                         transformation_dict[i][k]["std"] = transformation[k]["std"].mean(iter_dim)
+                    if transformation[k]["min"] is not None:
+                        transformation_dict[i][k]["min"] = transformation[k]["min"].min(iter_dim)
+                    if transformation[k]["max"] is not None:
+                        transformation_dict[i][k]["max"] = transformation[k]["max"].max(iter_dim)
                 except KeyError:
                     pop_list.append((i, k))
         for (i, k) in pop_list:
diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py
index bfc1490d9826be008847502a6181c492060acda2..a79d201eb9a6b77e38f0cec0a269a0ca7f96478b 100644
--- a/mlair/helpers/statistics.py
+++ b/mlair/helpers/statistics.py
@@ -17,7 +17,8 @@ from mlair.helpers import to_list, remove_items
 Data = Union[xr.DataArray, pd.DataFrame]
 
 
-def apply_inverse_transformation(data: Data, method: str = "standardise", mean: Data = None, std: Data = None) -> Data:
+def apply_inverse_transformation(data: Data, method: str = "standardise", mean: Data = None, std: Data = None,
+                                 max: Data = None, min: Data = None) -> Data:
     """
     Apply inverse transformation for given statistics.
 
@@ -32,22 +33,22 @@ def apply_inverse_transformation(data: Data, method: str = "standardise", mean:
         return standardise_inverse(data, mean, std)
     elif method == 'centre':  # pragma: no branch
         return centre_inverse(data, mean)
-    elif method == 'normalise':  # pragma: no cover
-        # use min/max of data or given min/max
-        raise NotImplementedError
+    elif method == 'min_max':  # pragma: no branch
+        return min_max_inverse(data, min, max)
     else:
         raise NotImplementedError
 
 
-def standardise(data: Data, dim: Union[str, int]) -> Tuple[Data, Data, Data]:
+def standardise(data: Data, dim: Union[str, int]) -> Tuple[Data, Dict[str, Data]]:
     """
     Standardise a xarray.dataarray (along dim) or pandas.DataFrame (along axis) with mean=0 and std=1.
 
     :param data: data to standardise
     :param dim: name (xarray) or axis (pandas) of dimension which should be standardised
-    :return: mean, standard deviation and standardised data
+    :return: standardised data and dict with mean, standard deviation and method
     """
-    return data.mean(dim), data.std(dim), (data - data.mean(dim)) / data.std(dim)
+    return (data - data.mean(dim)) / data.std(dim), {"mean": data.mean(dim), "std": data.std(dim),
+                                                     "method": "standardise"}
 
 
 def standardise_inverse(data: Data, mean: Data, std: Data) -> Data:
@@ -76,16 +77,16 @@ def standardise_apply(data: Data, mean: Data, std: Data) -> Data:
     return (data - mean) / std
 
 
-def centre(data: Data, dim: Union[str, int]) -> Tuple[Data, None, Data]:
+def centre(data: Data, dim: Union[str, int]) -> Tuple[Data, Dict[str, Data]]:
     """
     Centre a xarray.dataarray (along dim) or pandas.DataFrame (along axis) to mean=0.
 
     :param data: data to centre
     :param dim: name (xarray) or axis (pandas) of dimension which should be centred
 
-    :return: mean, None placeholder and centred data
+    :return: centred data and dict with mean and method
     """
-    return data.mean(dim), None, data - data.mean(dim)
+    return data - data.mean(dim), {"mean": data.mean(dim), "method": "centre"}
 
 
 def centre_inverse(data: Data, mean: Data) -> Data:
@@ -112,6 +113,23 @@ def centre_apply(data: Data, mean: Data) -> Data:
     return data - mean
 
 
+def min_max(data: Data, dim: Union[str, int]) -> Tuple[Data, Dict[str, Data]]:
+    """Scale data (along dim) to the interval [0, 1] and return scaled data together with min, max and method."""
+    d_max = data.max(dim)
+    d_min = data.min(dim)
+    return (data - d_min) / (d_max - d_min), {"min": d_min, "max": d_max, "method": "min_max"}
+
+
+def min_max_inverse(data: Data, min: Data, max: Data) -> Data:
+    """Apply inverse transformation of `min_max` scaling."""
+    return data * (max - min) + min
+
+
+def min_max_apply(data: Data, min: Data, max: Data) -> Data:
+    """Apply `min_max` scaling with given min and max to data."""
+    return (data - min) / (max - min)
+
+
 def mean_squared_error(a, b):
     """Calculate mean squared error."""
     return np.square(a - b).mean()
diff --git a/test/test_helpers/test_statistics.py b/test/test_helpers/test_statistics.py
index 8e2923f8d4f234e6ca4b47da5ef927ab240df8d6..e0febe1e062f40f705ad2f4d1fb2280a6dc32ba5 100644
--- a/test/test_helpers/test_statistics.py
+++ b/test/test_helpers/test_statistics.py
@@ -4,7 +4,7 @@ import pytest
 import xarray as xr
 
 from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, \
-    centre_apply, apply_inverse_transformation
+    centre_apply, apply_inverse_transformation, min_max, min_max_inverse, min_max_apply
 
 lazy = pytest.lazy_fixture
 
@@ -31,6 +31,16 @@ def pd_std():
     return [3, 2, 3]
 
 
+@pytest.fixture(scope='module')
+def pd_min():
+    return pd.Series([2, -10, -3])
+
+
+@pytest.fixture(scope='module')
+def pd_max():
+    return pd.Series([3, 2, 3])
+
+
 @pytest.fixture(scope='module')
 def xarray(input_data):
     shape = input_data.shape
@@ -48,29 +58,39 @@ def xr_std(input_data):
     return xr.DataArray([3, 2, 3], coords={'value': range(3)}, dims=['value'])
 
 
+@pytest.fixture(scope='module')
+def xr_min(input_data):
+    return xr.DataArray([2, -10, -3], coords={'value': range(3)}, dims=['value'])
+
+
+@pytest.fixture(scope='module')
+def xr_max(input_data):
+    return xr.DataArray([3, 2, 3], coords={'value': range(3)}, dims=['value'])
+
+
 class TestStandardise:
 
     @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
                                                 (lazy('xarray'), 'index')])
     def test_standardise(self, data_orig, dim):
-        mean, std, data = standardise(data_orig, dim)
-        assert np.testing.assert_almost_equal(mean, [2, -5, 10], decimal=1) is None
-        assert np.testing.assert_almost_equal(std, [2, 3, 1], decimal=1) is None
+        data, opts = standardise(data_orig, dim)
+        assert np.testing.assert_almost_equal(opts["mean"], [2, -5, 10], decimal=1) is None
+        assert np.testing.assert_almost_equal(opts["std"], [2, 3, 1], decimal=1) is None
         assert np.testing.assert_almost_equal(data.mean(dim), [0, 0, 0]) is None
         assert np.testing.assert_almost_equal(data.std(dim), [1, 1, 1]) is None
 
     @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
                                                 (lazy('xarray'), 'index')])
     def test_standardise_inverse(self, data_orig, dim):
-        mean, std, data = standardise(data_orig, dim)
-        data_recovered = standardise_inverse(data, mean, std)
+        data, opts = standardise(data_orig, dim)
+        data_recovered = standardise_inverse(data, opts["mean"], opts["std"])
         assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
 
     @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
                                                 (lazy('xarray'), 'index')])
     def test_apply_standardise_inverse(self, data_orig, dim):
-        mean, std, data = standardise(data_orig, dim)
-        data_recovered = apply_inverse_transformation(data, "standardise", mean, std)
+        data, opts = standardise(data_orig, dim)
+        data_recovered = apply_inverse_transformation(data, **opts)
         assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
 
     @pytest.mark.parametrize('data_orig, mean, std, dim', [(lazy('pandas'), lazy('pd_mean'), lazy('pd_std'), 0),
@@ -88,23 +108,22 @@ class TestCentre:
     @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
                                                 (lazy('xarray'), 'index')])
     def test_centre(self, data_orig, dim):
-        mean, std, data = centre(data_orig, dim)
-        assert np.testing.assert_almost_equal(mean, [2, -5, 10], decimal=1) is None
-        assert std is None
+        data, opts = centre(data_orig, dim)
+        assert np.testing.assert_almost_equal(opts["mean"], [2, -5, 10], decimal=1) is None
         assert np.testing.assert_almost_equal(data.mean(dim), [0, 0, 0]) is None
 
     @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
                                                 (lazy('xarray'), 'index')])
     def test_centre_inverse(self, data_orig, dim):
-        mean, _, data = centre(data_orig, dim)
-        data_recovered = centre_inverse(data, mean)
+        data, opts = centre(data_orig, dim)
+        data_recovered = centre_inverse(data, opts["mean"])
         assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
 
     @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
                                                 (lazy('xarray'), 'index')])
     def test_apply_centre_inverse(self, data_orig, dim):
-        mean, _, data = centre(data_orig, dim)
-        data_recovered = apply_inverse_transformation(data, mean=mean, method="centre")
+        data, opts = centre(data_orig, dim)
+        data_recovered = apply_inverse_transformation(data, **opts)
         assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
 
     @pytest.mark.parametrize('data_orig, mean, dim', [(lazy('pandas'), lazy('pd_mean'), 0),
@@ -113,3 +132,40 @@ class TestCentre:
         data = centre_apply(data_orig, mean)
         mean_expected = np.array([2, -5, 10]) - np.array([2, 10, 3])
         assert np.testing.assert_almost_equal(data.mean(dim), mean_expected, decimal=1) is None
+
+
+class TestMinMax:
+
+    @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
+                                                (lazy('xarray'), 'index')])
+    def test_min_max(self, data_orig, dim):
+        data, opts = min_max(data_orig, dim)
+        max_expected = data_orig.max(dim)
+        min_expected = data_orig.min(dim)
+        assert np.testing.assert_array_almost_equal(opts["max"], max_expected, decimal=1) is None
+        assert np.testing.assert_array_almost_equal(opts["min"], min_expected, decimal=1) is None
+        assert np.testing.assert_almost_equal(data.max(dim), [1, 1, 1]) is None
+        assert np.testing.assert_almost_equal(data.min(dim), [0, 0, 0]) is None
+
+    @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
+                                                (lazy('xarray'), 'index')])
+    def test_min_max_inverse(self, data_orig, dim):
+        data, opts = min_max(data_orig, dim)
+        data_recovered = min_max_inverse(data, opts["min"], opts["max"])
+        assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
+
+    @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
+                                                (lazy('xarray'), 'index')])
+    def test_apply_min_max_inverse(self, data_orig, dim):
+        data, opts = min_max(data_orig, dim)
+        data_recovered = apply_inverse_transformation(data, **opts)
+        assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
+
+    @pytest.mark.parametrize('data_orig, dmin, dmax, dim', [(lazy('pandas'), lazy('pd_min'), lazy('pd_max'), 0),
+                                                            (lazy('xarray'), lazy('xr_min'), lazy('xr_max'), 'index')])
+    def test_min_max_apply(self, data_orig, dmin, dmax, dim):
+        data = min_max_apply(data_orig, dmin, dmax)
+        min_expected = (data_orig.min(dim) - dmin) / (dmax - dmin)
+        max_expected = (data_orig.max(dim) - dmin) / (dmax - dmin)
+        assert np.testing.assert_array_almost_equal(data.min(dim), min_expected, decimal=1) is None
+        assert np.testing.assert_array_almost_equal(data.max(dim), max_expected, decimal=1) is None
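
For reference, a minimal usage sketch of the min_max helpers introduced by this change (assuming the diff above is applied; the toy data and variable names are illustrative only):

    import numpy as np
    import xarray as xr

    from mlair.helpers import statistics

    # toy data: 10 time steps for 3 variables
    data = xr.DataArray(np.random.rand(10, 3) * 20 - 10,
                        coords={"index": range(10), "value": range(3)},
                        dims=["index", "value"])

    # scale to [0, 1] along "index"; opts carries min, max and the method name
    scaled, opts = statistics.min_max(data, "index")

    # reuse the fitted min/max on further data (here the same array again)
    scaled_again = statistics.min_max_apply(data, opts["min"], opts["max"])

    # invert via the generic entry point; recovers the original values
    recovered = statistics.apply_inverse_transformation(scaled, **opts)
    np.testing.assert_array_almost_equal(recovered, data)

Because every transformation now returns its parameters as a dict together with the method name, the opts dict can be passed back unchanged via **opts for both re-application and inversion, which is exactly what the updated tests exercise.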