From 12aa46e6a8cdb03dd058630ebb16a8330903fa2b Mon Sep 17 00:00:00 2001
From: lukas leufen <l.leufen@fz-juelich.de>
Date: Fri, 14 Feb 2020 13:53:37 +0100
Subject: [PATCH] prepare data preparation to use different transformation
 schemes

---
 src/data_handling/data_preparation.py         | 15 ++-
 src/statistics.py                             | 28 +++++-
 .../test_data_preparation.py                  | 41 +++++++-
 test/test_statistics.py                       | 96 +++++++++++++++----
 4 files changed, 150 insertions(+), 30 deletions(-)

diff --git a/src/data_handling/data_preparation.py b/src/data_handling/data_preparation.py
index 5bca71f5..75a98ffb 100644
--- a/src/data_handling/data_preparation.py
+++ b/src/data_handling/data_preparation.py
@@ -216,7 +216,7 @@ class DataPrep(object):
         self.data, self.mean, self.std = f_inverse(self.data, self.mean, self.std, self._transform_method)
         self._transform_method = None
 
-    def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False) -> None:
+    def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False, mean = None, std=None) -> None:
         """
         This function transforms a xarray.dataarray (along dim) or pandas.DataFrame (along axis) either with mean=0
         and std=1 (`method=standardise`) or centers the data with mean=0 and no change in data scale
@@ -247,11 +247,19 @@ class DataPrep(object):
             else:
                 raise NotImplementedError
 
+        def f_apply(data):
+            if method == "standardise":
+                return mean, std, statistics.standardise_apply(data, mean, std)
+            elif method == "centre":
+                return mean, None, statistics.centre_apply(data, mean)
+            else:
+                raise NotImplementedError
+
         if not inverse:
             if self._transform_method is not None:
                 raise AssertionError(f"Transform method is already set. Therefore, data was already transformed with "
                                      f"{self._transform_method}. Please perform inverse transformation of data first.")
-            self.mean, self.std, self.data = f(self.data)
+            self.mean, self.std, self.data = locals()["f" if mean is None else "f_apply"](self.data)
             self._transform_method = method
         else:
             self.inverse_transform()
@@ -387,8 +395,7 @@ class DataPrep(object):
         return data
 
     def get_transposed_history(self):
-        if self.history is not None:
-            return self.history.transpose("datetime", "window", "Stations", "variables")
+        return self.history.transpose("datetime", "window", "Stations", "variables")
 
 
 if __name__ == "__main__":
diff --git a/src/statistics.py b/src/statistics.py
index e3481d0e..26b2be88 100644
--- a/src/statistics.py
+++ b/src/statistics.py
@@ -15,11 +15,11 @@ Data = Union[xr.DataArray, pd.DataFrame]
 
 
 def apply_inverse_transformation(data, mean, std=None, method="standardise"):
-    if method == 'standardise':
+    if method == 'standardise':  # pragma: no branch
         return standardise_inverse(data, mean, std)
-    elif method == 'centre':
+    elif method == 'centre':  # pragma: no branch
         return centre_inverse(data, mean)
-    elif method == 'normalise':
+    elif method == 'normalise':  # pragma: no cover
         # use min/max of data or given min/max
         raise NotImplementedError
     else:
@@ -52,6 +52,17 @@ def standardise_inverse(data: Data, mean: Data, std: Data) -> Data:
     return data * std + mean
 
 
+def standardise_apply(data: Data, mean: Data, std: Data) -> Data:
+    """
+    This applies `standardise` on data using given mean and std.
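+    :param data: data to standardise
+    :param mean: mean that is subtracted from data
+    :param std: standard deviation the centred data is divided by
+    :return: standardised data, i.e. (data - mean) / std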
+    """
+    return (data - mean) / std
+
+
 def centre(data: Data, dim: Union[str, int]) -> Tuple[Data, None, Data]:
     """
     This function centres a xarray.dataarray (along dim) or pandas.DataFrame (along axis) to mean=0
@@ -77,6 +88,17 @@ def centre_inverse(data: Data, mean: Data) -> Data:
     return data + mean
 
 
+def centre_apply(data: Data, mean: Data) -> Data:
+    """
+    This applies `centre` on data using given mean.
+
+    :param data: data to centre
+    :param mean: mean that is subtracted from data
+    :return: centred data, i.e. data - mean
+    """
+    return data - mean
+
+
 def mean_squared_error(a, b):
     return np.square(a - b).mean()
 
diff --git a/test/test_data_handling/test_data_preparation.py b/test/test_data_handling/test_data_preparation.py
index 72bacaf9..ac449c4d 100644
--- a/test/test_data_handling/test_data_preparation.py
+++ b/test/test_data_handling/test_data_preparation.py
@@ -152,6 +152,26 @@ class TestDataPrep:
         assert isinstance(data.mean, xr.DataArray)
         assert isinstance(data.std, xr.DataArray)
 
+    def test_transform_standardise_apply(self, data):
+        assert data._transform_method is None
+        assert data.mean is None
+        assert data.std is None
+        data_mean_orig = data.data.mean('datetime').variable.values
+        data_std_orig = data.data.std('datetime').variable.values
+        mean_external = np.array([20, 12])
+        std_external = np.array([15, 5])
+        mean = xr.DataArray(mean_external, coords={"variables": ['o3', 'temp']}, dims=["variables"])
+        std = xr.DataArray(std_external, coords={"variables": ['o3', 'temp']}, dims=["variables"])
+        data.transform('datetime', mean=mean, std=std)
+        assert all(data.mean.values == mean_external)
+        assert all(data.std.values == std_external)
+        data_mean_transformed = data.data.mean('datetime').variable.values
+        data_std_transformed = data.data.std('datetime').variable.values
+        data_mean_expected = (data_mean_orig - mean_external) / std_external  # mean scales as any other data
+        data_std_expected = data_std_orig / std_external  # std scales by given std
+        assert np.testing.assert_almost_equal(data_mean_transformed, data_mean_expected) is None
+        assert np.testing.assert_almost_equal(data_std_transformed, data_std_expected) is None
+
     @pytest.mark.parametrize('mean, std, method, msg', [(10, 3, 'standardise', ''), (6, None, 'standardise', 'std, '),
                                                         (None, 3, 'standardise', 'mean, '), (19, None, 'centre', ''),
                                                         (None, 2, 'centre', 'mean, '), (8, 2, 'centre', ''),
@@ -168,12 +188,29 @@ class TestDataPrep:
         assert data._transform_method is None
         assert data.mean is None
         assert data.std is None
-        data_std_org = data.data.std('datetime'). variable.values
+        data_std_orig = data.data.std('datetime'). variable.values
         data.transform('datetime', 'centre')
         assert data._transform_method == 'centre'
         assert np.testing.assert_almost_equal(data.data.mean('datetime').variable.values, np.array([[0, 0]])) is None
-        assert np.testing.assert_almost_equal(data.data.std('datetime').variable.values, data_std_org) is None
+        assert np.testing.assert_almost_equal(data.data.std('datetime').variable.values, data_std_orig) is None
+        assert data.std is None
+
+    def test_transform_centre_apply(self, data):
+        assert data._transform_method is None
+        assert data.mean is None
+        assert data.std is None
+        data_mean_orig = data.data.mean('datetime').variable.values
+        data_std_orig = data.data.std('datetime').variable.values
+        mean_external = np.array([20, 12])
+        mean = xr.DataArray(mean_external, coords={"variables": ['o3', 'temp']}, dims=["variables"])
+        data.transform('datetime', 'centre', mean=mean)
+        assert all(data.mean.values == mean_external)
         assert data.std is None
+        data_mean_transformed = data.data.mean('datetime').variable.values
+        data_std_transformed = data.data.std('datetime').variable.values
+        data_mean_expected = (data_mean_orig - mean_external)  # mean is shifted like any other data
+        assert np.testing.assert_almost_equal(data_mean_transformed, data_mean_expected) is None
+        assert np.testing.assert_almost_equal(data_std_transformed, data_std_orig) is None
 
     @pytest.mark.parametrize('method', ['standardise', 'centre'])
     def test_transform_inverse(self, data, method):
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 308ac655..cad91556 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -3,7 +3,10 @@ import pandas as pd
 import pytest
 import xarray as xr
 
-from src.statistics import standardise, standardise_inverse, centre, centre_inverse
+from src.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, centre_apply,\
+    apply_inverse_transformation
+
+lazy = pytest.lazy_fixture
 
 
 @pytest.fixture(scope='module')
@@ -18,44 +21,95 @@ def pandas(input_data):
     return pd.DataFrame(input_data)
 
 
+@pytest.fixture(scope='module')
+def pd_mean():
+    return [2, 10, 3]
+
+
+@pytest.fixture(scope='module')
+def pd_std():
+    return [3, 2, 3]
+
+
 @pytest.fixture(scope='module')
 def xarray(input_data):
-    return xr.DataArray(input_data, dims=['index', 'value'])
+    shape = input_data.shape
+    coords = {'index': range(shape[0]), 'value': range(shape[1])}
+    return xr.DataArray(input_data, coords=coords, dims=coords.keys())
+
+
+@pytest.fixture(scope='module')
+def xr_mean(input_data):
+    return xr.DataArray([2, 10, 3], coords={'value': range(3)}, dims=['value'])
+
+
+@pytest.fixture(scope='module')
+def xr_std(input_data):
+    return xr.DataArray([3, 2, 3], coords={'value': range(3)}, dims=['value'])
 
 
 class TestStandardise:
 
-    @pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0),
-                                               (pytest.lazy_fixture('xarray'), 'index')])
-    def test_standardise(self, data_org, dim):
-        mean, std, data = standardise(data_org, dim)
+    @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
+                                                (lazy('xarray'), 'index')])
+    def test_standardise(self, data_orig, dim):
+        mean, std, data = standardise(data_orig, dim)
         assert np.testing.assert_almost_equal(mean, [2, -5, 10], decimal=1) is None
         assert np.testing.assert_almost_equal(std, [2, 3, 1], decimal=1) is None
         assert np.testing.assert_almost_equal(data.mean(dim), [0, 0, 0]) is None
         assert np.testing.assert_almost_equal(data.std(dim), [1, 1, 1]) is None
 
-    @pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0),
-                                               (pytest.lazy_fixture('xarray'), 'index')])
-    def test_standardise_inverse(self, data_org, dim):
-        mean, std, data = standardise(data_org, dim)
+    @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
+                                                (lazy('xarray'), 'index')])
+    def test_standardise_inverse(self, data_orig, dim):
+        mean, std, data = standardise(data_orig, dim)
         data_recovered = standardise_inverse(data, mean, std)
-        assert np.testing.assert_array_almost_equal(data_org, data_recovered) is None
+        assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
+
+    @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
+                                                (lazy('xarray'), 'index')])
+    def test_apply_standardise_inverse(self, data_orig, dim):
+        mean, std, data = standardise(data_orig, dim)
+        data_recovered = apply_inverse_transformation(data, mean, std)
+        assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
+
+    @pytest.mark.parametrize('data_orig, mean, std, dim', [(lazy('pandas'), lazy('pd_mean'), lazy('pd_std'), 0),
+                                                           (lazy('xarray'), lazy('xr_mean'), lazy('xr_std'), 'index')])
+    def test_standardise_apply(self, data_orig, mean, std, dim):
+        data = standardise_apply(data_orig, mean, std)
+        mean_expected = (np.array([2, -5, 10]) - np.array([2, 10, 3])) / np.array([3, 2, 3])
+        std_expected = np.array([2, 3, 1]) / np.array([3, 2, 3])
+        assert np.testing.assert_almost_equal(data.mean(dim), mean_expected, decimal=1) is None
+        assert np.testing.assert_almost_equal(data.std(dim), std_expected, decimal=1) is None
 
 
 class TestCentre:
 
-    @pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0),
-                                               (pytest.lazy_fixture('xarray'), 'index')])
-    def test_centre(self, data_org, dim):
-        mean, std, data = centre(data_org, dim)
+    @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
+                                                (lazy('xarray'), 'index')])
+    def test_centre(self, data_orig, dim):
+        mean, std, data = centre(data_orig, dim)
         assert np.testing.assert_almost_equal(mean, [2, -5, 10], decimal=1) is None
         assert std is None
         assert np.testing.assert_almost_equal(data.mean(dim), [0, 0, 0]) is None
 
-    @pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0),
-                                               (pytest.lazy_fixture('xarray'), 'index')])
-    def test_centre_inverse(self, data_org, dim):
-        mean, _, data = centre(data_org, dim)
+    @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
+                                                (lazy('xarray'), 'index')])
+    def test_centre_inverse(self, data_orig, dim):
+        mean, _, data = centre(data_orig, dim)
         data_recovered = centre_inverse(data, mean)
-        assert np.testing.assert_array_almost_equal(data_org, data_recovered) is None
-
+        assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
+
+    @pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
+                                                (lazy('xarray'), 'index')])
+    def test_apply_centre_inverse(self, data_orig, dim):
+        mean, _, data = centre(data_orig, dim)
+        data_recovered = apply_inverse_transformation(data, mean, method="centre")
+        assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
+
+    @pytest.mark.parametrize('data_orig, mean, dim', [(lazy('pandas'), lazy('pd_mean'), 0),
+                                                      (lazy('xarray'), lazy('xr_mean'), 'index')])
+    def test_centre_apply(self, data_orig, mean, dim):
+        data = centre_apply(data_orig, mean)
+        mean_expected = np.array([2, -5, 10]) - np.array([2, 10, 3])
+        assert np.testing.assert_almost_equal(data.mean(dim), mean_expected, decimal=1) is None
-- 
GitLab