From 12aa46e6a8cdb03dd058630ebb16a8330903fa2b Mon Sep 17 00:00:00 2001 From: lukas leufen <l.leufen@fz-juelich.de> Date: Fri, 14 Feb 2020 13:53:37 +0100 Subject: [PATCH] prepare data preparation to use different transformation schemes --- src/data_handling/data_preparation.py | 15 ++- src/statistics.py | 28 +++++- .../test_data_preparation.py | 41 +++++++- test/test_statistics.py | 96 +++++++++++++++---- 4 files changed, 150 insertions(+), 30 deletions(-) diff --git a/src/data_handling/data_preparation.py b/src/data_handling/data_preparation.py index 5bca71f5..75a98ffb 100644 --- a/src/data_handling/data_preparation.py +++ b/src/data_handling/data_preparation.py @@ -216,7 +216,7 @@ class DataPrep(object): self.data, self.mean, self.std = f_inverse(self.data, self.mean, self.std, self._transform_method) self._transform_method = None - def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False) -> None: + def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False, mean = None, std=None) -> None: """ This function transforms a xarray.dataarray (along dim) or pandas.DataFrame (along axis) either with mean=0 and std=1 (`method=standardise`) or centers the data with mean=0 and no change in data scale @@ -247,11 +247,19 @@ class DataPrep(object): else: raise NotImplementedError + def f_apply(data): + if method == "standardise": + return mean, std, statistics.standardise_apply(data, mean, std) + elif method == "centre": + return mean, None, statistics.centre_apply(data, mean) + else: + raise NotImplementedError + if not inverse: if self._transform_method is not None: raise AssertionError(f"Transform method is already set. Therefore, data was already transformed with " f"{self._transform_method}. Please perform inverse transformation of data first.") - self.mean, self.std, self.data = f(self.data) + self.mean, self.std, self.data = locals()["f" if mean is None else "f_apply"](self.data) self._transform_method = method else: self.inverse_transform() @@ -387,8 +395,7 @@ class DataPrep(object): return data def get_transposed_history(self): - if self.history is not None: - return self.history.transpose("datetime", "window", "Stations", "variables") + return self.history.transpose("datetime", "window", "Stations", "variables") if __name__ == "__main__": diff --git a/src/statistics.py b/src/statistics.py index e3481d0e..26b2be88 100644 --- a/src/statistics.py +++ b/src/statistics.py @@ -15,11 +15,11 @@ Data = Union[xr.DataArray, pd.DataFrame] def apply_inverse_transformation(data, mean, std=None, method="standardise"): - if method == 'standardise': + if method == 'standardise': # pragma: no branch return standardise_inverse(data, mean, std) - elif method == 'centre': + elif method == 'centre': # pragma: no branch return centre_inverse(data, mean) - elif method == 'normalise': + elif method == 'normalise': # pragma: no cover # use min/max of data or given min/max raise NotImplementedError else: @@ -52,6 +52,17 @@ def standardise_inverse(data: Data, mean: Data, std: Data) -> Data: return data * std + mean +def standardise_apply(data: Data, mean: Data, std: Data) -> Data: + """ + This applies `standardise` on data using given mean and std. + :param data: + :param mean: + :param std: + :return: + """ + return (data - mean) / std + + def centre(data: Data, dim: Union[str, int]) -> Tuple[Data, None, Data]: """ This function centres a xarray.dataarray (along dim) or pandas.DataFrame (along axis) to mean=0 @@ -77,6 +88,17 @@ def centre_inverse(data: Data, mean: Data) -> Data: return data + mean +def centre_apply(data: Data, mean: Data) -> Data: + """ + This applies `centre` on data using given mean and std. + :param data: + :param mean: + :param std: + :return: + """ + return data - mean + + def mean_squared_error(a, b): return np.square(a - b).mean() diff --git a/test/test_data_handling/test_data_preparation.py b/test/test_data_handling/test_data_preparation.py index 72bacaf9..ac449c4d 100644 --- a/test/test_data_handling/test_data_preparation.py +++ b/test/test_data_handling/test_data_preparation.py @@ -152,6 +152,26 @@ class TestDataPrep: assert isinstance(data.mean, xr.DataArray) assert isinstance(data.std, xr.DataArray) + def test_transform_standardise_apply(self, data): + assert data._transform_method is None + assert data.mean is None + assert data.std is None + data_mean_orig = data.data.mean('datetime').variable.values + data_std_orig = data.data.std('datetime').variable.values + mean_external = np.array([20, 12]) + std_external = np.array([15, 5]) + mean = xr.DataArray(mean_external, coords={"variables": ['o3', 'temp']}, dims=["variables"]) + std = xr.DataArray(std_external, coords={"variables": ['o3', 'temp']}, dims=["variables"]) + data.transform('datetime', mean=mean, std=std) + assert all(data.mean.values == mean_external) + assert all(data.std.values == std_external) + data_mean_transformed = data.data.mean('datetime').variable.values + data_std_transformed = data.data.std('datetime').variable.values + data_mean_expected = (data_mean_orig - mean_external) / std_external # mean scales as any other data + data_std_expected = data_std_orig / std_external # std scales by given std + assert np.testing.assert_almost_equal(data_mean_transformed, data_mean_expected) is None + assert np.testing.assert_almost_equal(data_std_transformed, data_std_expected) is None + @pytest.mark.parametrize('mean, std, method, msg', [(10, 3, 'standardise', ''), (6, None, 'standardise', 'std, '), (None, 3, 'standardise', 'mean, '), (19, None, 'centre', ''), (None, 2, 'centre', 'mean, '), (8, 2, 'centre', ''), @@ -168,12 +188,29 @@ class TestDataPrep: assert data._transform_method is None assert data.mean is None assert data.std is None - data_std_org = data.data.std('datetime'). variable.values + data_std_orig = data.data.std('datetime'). variable.values data.transform('datetime', 'centre') assert data._transform_method == 'centre' assert np.testing.assert_almost_equal(data.data.mean('datetime').variable.values, np.array([[0, 0]])) is None - assert np.testing.assert_almost_equal(data.data.std('datetime').variable.values, data_std_org) is None + assert np.testing.assert_almost_equal(data.data.std('datetime').variable.values, data_std_orig) is None + assert data.std is None + + def test_transform_centre_apply(self, data): + assert data._transform_method is None + assert data.mean is None + assert data.std is None + data_mean_orig = data.data.mean('datetime').variable.values + data_std_orig = data.data.std('datetime').variable.values + mean_external = np.array([20, 12]) + mean = xr.DataArray(mean_external, coords={"variables": ['o3', 'temp']}, dims=["variables"]) + data.transform('datetime', 'centre', mean=mean) + assert all(data.mean.values == mean_external) assert data.std is None + data_mean_transformed = data.data.mean('datetime').variable.values + data_std_transformed = data.data.std('datetime').variable.values + data_mean_expected = (data_mean_orig - mean_external) # mean scales as any other data + assert np.testing.assert_almost_equal(data_mean_transformed, data_mean_expected) is None + assert np.testing.assert_almost_equal(data_std_transformed, data_std_orig) is None @pytest.mark.parametrize('method', ['standardise', 'centre']) def test_transform_inverse(self, data, method): diff --git a/test/test_statistics.py b/test/test_statistics.py index 308ac655..cad91556 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -3,7 +3,10 @@ import pandas as pd import pytest import xarray as xr -from src.statistics import standardise, standardise_inverse, centre, centre_inverse +from src.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, centre_apply,\ + apply_inverse_transformation + +lazy = pytest.lazy_fixture @pytest.fixture(scope='module') @@ -18,44 +21,95 @@ def pandas(input_data): return pd.DataFrame(input_data) +@pytest.fixture(scope='module') +def pd_mean(): + return [2, 10, 3] + + +@pytest.fixture(scope='module') +def pd_std(): + return [3, 2, 3] + + @pytest.fixture(scope='module') def xarray(input_data): - return xr.DataArray(input_data, dims=['index', 'value']) + shape = input_data.shape + coords = {'index': range(shape[0]), 'value': range(shape[1])} + return xr.DataArray(input_data, coords=coords, dims=coords.keys()) + + +@pytest.fixture(scope='module') +def xr_mean(input_data): + return xr.DataArray([2, 10, 3], coords={'value': range(3)}, dims=['value']) + + +@pytest.fixture(scope='module') +def xr_std(input_data): + return xr.DataArray([3, 2, 3], coords={'value': range(3)}, dims=['value']) class TestStandardise: - @pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0), - (pytest.lazy_fixture('xarray'), 'index')]) - def test_standardise(self, data_org, dim): - mean, std, data = standardise(data_org, dim) + @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0), + (lazy('xarray'), 'index')]) + def test_standardise(self, data_orig, dim): + mean, std, data = standardise(data_orig, dim) assert np.testing.assert_almost_equal(mean, [2, -5, 10], decimal=1) is None assert np.testing.assert_almost_equal(std, [2, 3, 1], decimal=1) is None assert np.testing.assert_almost_equal(data.mean(dim), [0, 0, 0]) is None assert np.testing.assert_almost_equal(data.std(dim), [1, 1, 1]) is None - @pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0), - (pytest.lazy_fixture('xarray'), 'index')]) - def test_standardise_inverse(self, data_org, dim): - mean, std, data = standardise(data_org, dim) + @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0), + (lazy('xarray'), 'index')]) + def test_standardise_inverse(self, data_orig, dim): + mean, std, data = standardise(data_orig, dim) data_recovered = standardise_inverse(data, mean, std) - assert np.testing.assert_array_almost_equal(data_org, data_recovered) is None + assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None + + @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0), + (lazy('xarray'), 'index')]) + def test_apply_standardise_inverse(self, data_orig, dim): + mean, std, data = standardise(data_orig, dim) + data_recovered = apply_inverse_transformation(data, mean, std) + assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None + + @pytest.mark.parametrize('data_orig, mean, std, dim', [(lazy('pandas'), lazy('pd_mean'), lazy('pd_std'), 0), + (lazy('xarray'), lazy('xr_mean'), lazy('xr_std'), 'index')]) + def test_standardise_apply(self, data_orig, mean, std, dim): + data = standardise_apply(data_orig, mean, std) + mean_expected = (np.array([2, -5, 10]) - np.array([2, 10, 3])) / np.array([3, 2, 3]) + std_expected = np.array([2, 3, 1]) / np.array([3, 2, 3]) + assert np.testing.assert_almost_equal(data.mean(dim), mean_expected, decimal=1) is None + assert np.testing.assert_almost_equal(data.std(dim), std_expected, decimal=1) is None class TestCentre: - @pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0), - (pytest.lazy_fixture('xarray'), 'index')]) - def test_centre(self, data_org, dim): - mean, std, data = centre(data_org, dim) + @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0), + (lazy('xarray'), 'index')]) + def test_centre(self, data_orig, dim): + mean, std, data = centre(data_orig, dim) assert np.testing.assert_almost_equal(mean, [2, -5, 10], decimal=1) is None assert std is None assert np.testing.assert_almost_equal(data.mean(dim), [0, 0, 0]) is None - @pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0), - (pytest.lazy_fixture('xarray'), 'index')]) - def test_centre_inverse(self, data_org, dim): - mean, _, data = centre(data_org, dim) + @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0), + (lazy('xarray'), 'index')]) + def test_centre_inverse(self, data_orig, dim): + mean, _, data = centre(data_orig, dim) data_recovered = centre_inverse(data, mean) - assert np.testing.assert_array_almost_equal(data_org, data_recovered) is None - + assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None + + @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0), + (lazy('xarray'), 'index')]) + def test_apply_centre_inverse(self, data_orig, dim): + mean, _, data = centre(data_orig, dim) + data_recovered = apply_inverse_transformation(data, mean, method="centre") + assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None + + @pytest.mark.parametrize('data_orig, mean, dim', [(lazy('pandas'), lazy('pd_mean'), 0), + (lazy('xarray'), lazy('xr_mean'), 'index')]) + def test_centre_apply(self, data_orig, mean, dim): + data = centre_apply(data_orig, mean) + mean_expected = np.array([2, -5, 10]) - np.array([2, 10, 3]) + assert np.testing.assert_almost_equal(data.mean(dim), mean_expected, decimal=1) is None -- GitLab