diff --git a/mlair/data_handler/data_handler_single_station.py b/mlair/data_handler/data_handler_single_station.py index 4002d4785ec36dfcf492d386e368255be528348e..a894c635282b5879d79426168eb96d64ff5fa2a2 100644 --- a/mlair/data_handler/data_handler_single_station.py +++ b/mlair/data_handler/data_handler_single_station.py @@ -178,6 +178,8 @@ class DataHandlerSingleStation(AbstractDataHandler): return statistics.centre(data, dim) elif method == "min_max": return statistics.min_max(data, dim) + elif method == "log": + return statistics.log(data, dim) else: raise NotImplementedError @@ -188,6 +190,8 @@ class DataHandlerSingleStation(AbstractDataHandler): return statistics.centre_apply(data, mean), {"mean": mean, "method": method} elif method == "min_max": return statistics.min_max_apply(data, min, max), {"min": min, "max": max, "method": method} + elif method == "log": + return statistics.log_apply(data, mean, std), {"mean": mean, "std": std, "method": method} else: raise NotImplementedError @@ -601,12 +605,14 @@ class DataHandlerSingleStation(AbstractDataHandler): """ def f_inverse(data, method, mean=None, std=None, min=None, max=None): - if method == 'standardise': + if method == "standardise": return statistics.standardise_inverse(data, mean, std) - elif method == 'centre': + elif method == "centre": return statistics.centre_inverse(data, mean) - elif method == 'min_max': - raise statistics.min_max_inverse(data, min, max) + elif method == "min_max": + return statistics.min_max_inverse(data, min, max) + elif method == "log": + return statistics.log_inverse(data, mean, std) else: raise NotImplementedError diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index 65574f4c2641d811e31087a1c9bb61f5e9a2aa8f..ea5a9f05c8ff91a5cd6be678ad03d12b923a4bec 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -37,6 +37,8 @@ def apply_inverse_transformation(data: Data, method: str = "standardise", mean: return centre_inverse(data, mean) 
elif method == 'min_max': # pragma: no branch return min_max_inverse(data, min, max) + elif method == "log": + return log_inverse(data, mean, std) + else: raise NotImplementedError @@ -152,6 +154,48 @@ def min_max_apply(data: Data, min: Data, max: Data) -> Data: return (data - min) / (max - min) +def log(data: Data, dim: Union[str, int]) -> Tuple[Data, Dict[(str, Data)]]: + """ + Apply logarithmic transformation (and standardisation) to data. This method first uses the logarithm for + transformation and second applies the `standardise` method additionally. A logarithmic function, numpy's log1p, is + used (`res = log(1+x)`) instead of the pure logarithm to be applicable to values of 0 too. + + :param data: transform this data + :param dim: name (xarray) or axis (pandas) of dimension which should be transformed + :return: transformed data, and option dictionary with keys method, mean, and std + """ + transformed_standardized, opts = standardise(np.log1p(data), dim) + opts.update({"method": "log"}) + return transformed_standardized, opts + + +def log_inverse(data: Data, mean: Data, std: Data) -> Data: + """ + Apply inverse log transformation (therefore exponential transformation). Because `log` is using `np.log1p` this + method is based on the equivalent method `np.expm1`. Data are first rescaled using `standardise_inverse` and then + given to the exponential function. + + :param data: apply inverse log transformation on this data + :param mean: mean of the standardisation + :param std: std of the standardisation + :return: inverted data + """ + data_rescaled = standardise_inverse(data, mean, std) + return np.expm1(data_rescaled) + + +def log_apply(data: Data, mean: Data, std: Data) -> Data: + """ + Apply numpy's log1p on given data. Further information can be found in description of `log` method. 
+ + :param data: transform this data + :param mean: mean of the standardisation + :param std: std of the standardisation + :return: transformed data + """ + return standardise_apply(np.log1p(data), mean, std) + + def mean_squared_error(a, b): """Calculate mean squared error.""" return np.square(a - b).mean() diff --git a/test/test_helpers/test_statistics.py b/test/test_helpers/test_statistics.py index e0febe1e062f40f705ad2f4d1fb2280a6dc32ba5..2a77f0806b886bee1a3961ff0a972e8f0ee62873 100644 --- a/test/test_helpers/test_statistics.py +++ b/test/test_helpers/test_statistics.py @@ -4,7 +4,7 @@ import pytest import xarray as xr from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, \ - centre_apply, apply_inverse_transformation, min_max, min_max_inverse, min_max_apply + centre_apply, apply_inverse_transformation, min_max, min_max_inverse, min_max_apply, log, log_inverse, log_apply lazy = pytest.lazy_fixture @@ -16,11 +16,23 @@ def input_data(): np.random.normal(10, 1, 3000)]).T + +@pytest.fixture(scope='module') +def input_data_gamma(): + return np.array([np.random.gamma(2, 2, 3000), + np.random.gamma(3, 3, 3000), + np.random.gamma(1, 1, 3000)]).T + + @pytest.fixture(scope='module') def pandas(input_data): return pd.DataFrame(input_data) + +@pytest.fixture(scope='module') +def pandas_gamma(input_data_gamma): + return pd.DataFrame(input_data_gamma) + + @pytest.fixture(scope='module') def pd_mean(): return [2, 10, 3] @@ -48,6 +60,13 @@ def xarray(input_data): return xr.DataArray(input_data, coords=coords, dims=coords.keys()) + +@pytest.fixture(scope='module') +def xarray_gamma(input_data_gamma): + shape = input_data_gamma.shape + coords = {'index': range(shape[0]), 'value': range(shape[1])} + return xr.DataArray(input_data_gamma, coords=coords, dims=coords.keys()) + + @pytest.fixture(scope='module') def xr_mean(input_data): return xr.DataArray([2, 10, 3], coords={'value': range(3)}, dims=['value']) @@ -169,3 +188,36 @@ 
class TestMinMax: max_expected = (data_orig.max(dim) - dmin) / (dmax - dmin) assert np.testing.assert_array_almost_equal(data.min(dim), min_expected, decimal=1) is None assert np.testing.assert_array_almost_equal(data.max(dim), max_expected, decimal=1) is None + + +class TestLog: + + @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas_gamma'), 0), + (lazy('xarray_gamma'), 'index')]) + def test_standardise(self, data_orig, dim): + data, opts = log(data_orig, dim) + assert {"method", "mean", "std"} == opts.keys() + assert opts["method"] == "log" + assert np.testing.assert_almost_equal(data.mean(dim), [0, 0, 0]) is None + assert np.testing.assert_almost_equal(data.std(dim), [1, 1, 1]) is None + + @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas_gamma'), 0), + (lazy('xarray_gamma'), 'index')]) + def test_standardise_inverse(self, data_orig, dim): + data, opts = log(data_orig, dim) + data_recovered = log_inverse(data, opts["mean"], opts["std"]) + assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None + + @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas_gamma'), 0), + (lazy('xarray_gamma'), 'index')]) + def test_apply_standardise_inverse(self, data_orig, dim): + data, opts = log(data_orig, dim) + data_recovered = apply_inverse_transformation(data, **opts) + assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None + + @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0), + (lazy('xarray'), 'index')]) + def test_standardise_apply(self, data_orig, dim): + data_ref, opts = log(data_orig, dim) + data_test = log_apply(data_orig, opts["mean"], opts["std"]) + assert np.testing.assert_array_almost_equal(data_ref, data_test) is None