Skip to content
Snippets Groups Projects
Commit 12aa46e6 authored by lukas leufen's avatar lukas leufen
Browse files

prepare data preparation to use different transformation schemes

parent be21b1e0
No related branches found
No related tags found
2 merge requests: !50 "release for v0.7.0", !49 "Lukas issue054 feat transformation on entire dataset"
Pipeline #29578 passed
......@@ -216,7 +216,7 @@ class DataPrep(object):
self.data, self.mean, self.std = f_inverse(self.data, self.mean, self.std, self._transform_method)
self._transform_method = None
def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False) -> None:
def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False, mean = None, std=None) -> None:
"""
This function transforms a xarray.dataarray (along dim) or pandas.DataFrame (along axis) either with mean=0
and std=1 (`method=standardise`) or centers the data with mean=0 and no change in data scale
......@@ -247,11 +247,19 @@ class DataPrep(object):
else:
raise NotImplementedError
def f_apply(data):
    """
    Transform data with externally provided statistics instead of estimating them from data.

    The transformation `method` as well as `mean` and `std` are taken from the arguments of the enclosing
    `transform` call.

    :param data: data to transform
    :return: tuple of (mean, std, transformed data); std is None for method `centre`
    :raises NotImplementedError: if `method` is neither `standardise` nor `centre`
    """
    if method == "standardise":
        return mean, std, statistics.standardise_apply(data, mean, std)
    elif method == "centre":
        # centring does not scale the data, therefore no std is reported
        return mean, None, statistics.centre_apply(data, mean)
    else:
        raise NotImplementedError
if not inverse:
if self._transform_method is not None:
raise AssertionError(f"Transform method is already set. Therefore, data was already transformed with "
f"{self._transform_method}. Please perform inverse transformation of data first.")
self.mean, self.std, self.data = f(self.data)
self.mean, self.std, self.data = locals()["f" if mean is None else "f_apply"](self.data)
self._transform_method = method
else:
self.inverse_transform()
......@@ -387,7 +395,6 @@ class DataPrep(object):
return data
def get_transposed_history(self):
if self.history is not None:
return self.history.transpose("datetime", "window", "Stations", "variables")
......
......@@ -15,11 +15,11 @@ Data = Union[xr.DataArray, pd.DataFrame]
def apply_inverse_transformation(data, mean, std=None, method="standardise"):
if method == 'standardise':
if method == 'standardise': # pragma: no branch
return standardise_inverse(data, mean, std)
elif method == 'centre':
elif method == 'centre': # pragma: no branch
return centre_inverse(data, mean)
elif method == 'normalise':
elif method == 'normalise': # pragma: no cover
# use min/max of data or given min/max
raise NotImplementedError
else:
......@@ -52,6 +52,17 @@ def standardise_inverse(data: Data, mean: Data, std: Data) -> Data:
return data * std + mean
def standardise_apply(data: Data, mean: Data, std: Data) -> Data:
    """
    Apply `standardise` on data using an externally given mean and std.

    The statistics are used as provided and are not calculated from data.

    :param data: data to transform
    :param mean: mean that is subtracted from data
    :param std: standard deviation the centred data is divided by
    :return: transformed data, calculated as (data - mean) / std
    """
    return (data - mean) / std
def centre(data: Data, dim: Union[str, int]) -> Tuple[Data, None, Data]:
"""
This function centres a xarray.dataarray (along dim) or pandas.DataFrame (along axis) to mean=0
......@@ -77,6 +88,17 @@ def centre_inverse(data: Data, mean: Data) -> Data:
return data + mean
def centre_apply(data: Data, mean: Data) -> Data:
    """
    Apply `centre` on data using an externally given mean.

    The mean is used as provided and is not calculated from data. The scale of data is not changed.

    :param data: data to transform
    :param mean: mean that is subtracted from data
    :return: transformed data, calculated as data - mean
    """
    return data - mean
def mean_squared_error(a, b):
    """
    Calculate the mean of the squared difference between a and b.

    :param a: first operand, must support subtraction of b
    :param b: second operand
    :return: result of `.mean()` called on the element-wise squared difference of a and b
    """
    return np.square(a - b).mean()
......
......@@ -152,6 +152,26 @@ class TestDataPrep:
assert isinstance(data.mean, xr.DataArray)
assert isinstance(data.std, xr.DataArray)
def test_transform_standardise_apply(self, data):
    """Check that transform standardises with an externally given mean and std instead of own statistics."""
    assert data._transform_method is None
    assert data.mean is None
    assert data.std is None
    data_mean_orig = data.data.mean('datetime').variable.values
    data_std_orig = data.data.std('datetime').variable.values
    mean_external = np.array([20, 12])
    std_external = np.array([15, 5])
    mean = xr.DataArray(mean_external, coords={"variables": ['o3', 'temp']}, dims=["variables"])
    std = xr.DataArray(std_external, coords={"variables": ['o3', 'temp']}, dims=["variables"])
    data.transform('datetime', mean=mean, std=std)
    # the default method must be recorded, otherwise an inverse transformation is not possible afterwards
    assert data._transform_method == 'standardise'
    assert all(data.mean.values == mean_external)
    assert all(data.std.values == std_external)
    data_mean_transformed = data.data.mean('datetime').variable.values
    data_std_transformed = data.data.std('datetime').variable.values
    data_mean_expected = (data_mean_orig - mean_external) / std_external  # mean scales as any other data
    data_std_expected = data_std_orig / std_external  # std scales by given std
    assert np.testing.assert_almost_equal(data_mean_transformed, data_mean_expected) is None
    assert np.testing.assert_almost_equal(data_std_transformed, data_std_expected) is None
@pytest.mark.parametrize('mean, std, method, msg', [(10, 3, 'standardise', ''), (6, None, 'standardise', 'std, '),
(None, 3, 'standardise', 'mean, '), (19, None, 'centre', ''),
(None, 2, 'centre', 'mean, '), (8, 2, 'centre', ''),
......@@ -168,12 +188,29 @@ class TestDataPrep:
assert data._transform_method is None
assert data.mean is None
assert data.std is None
data_std_org = data.data.std('datetime'). variable.values
data_std_orig = data.data.std('datetime'). variable.values
data.transform('datetime', 'centre')
assert data._transform_method == 'centre'
assert np.testing.assert_almost_equal(data.data.mean('datetime').variable.values, np.array([[0, 0]])) is None
assert np.testing.assert_almost_equal(data.data.std('datetime').variable.values, data_std_org) is None
assert np.testing.assert_almost_equal(data.data.std('datetime').variable.values, data_std_orig) is None
assert data.std is None
def test_transform_centre_apply(self, data):
    """Check that transform centres with an externally given mean and leaves the data scale unchanged."""
    assert data._transform_method is None
    assert data.mean is None
    assert data.std is None
    data_mean_orig = data.data.mean('datetime').variable.values
    data_std_orig = data.data.std('datetime').variable.values
    mean_external = np.array([20, 12])
    mean = xr.DataArray(mean_external, coords={"variables": ['o3', 'temp']}, dims=["variables"])
    data.transform('datetime', 'centre', mean=mean)
    # the applied method must be recorded, otherwise an inverse transformation is not possible afterwards
    assert data._transform_method == 'centre'
    assert all(data.mean.values == mean_external)
    assert data.std is None
    data_mean_transformed = data.data.mean('datetime').variable.values
    data_std_transformed = data.data.std('datetime').variable.values
    data_mean_expected = data_mean_orig - mean_external  # mean is shifted by the given mean
    assert np.testing.assert_almost_equal(data_mean_transformed, data_mean_expected) is None
    assert np.testing.assert_almost_equal(data_std_transformed, data_std_orig) is None  # std is not affected
@pytest.mark.parametrize('method', ['standardise', 'centre'])
def test_transform_inverse(self, data, method):
......
......@@ -3,7 +3,10 @@ import pandas as pd
import pytest
import xarray as xr
from src.statistics import standardise, standardise_inverse, centre, centre_inverse
from src.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, centre_apply,\
apply_inverse_transformation
lazy = pytest.lazy_fixture
@pytest.fixture(scope='module')
......@@ -18,44 +21,95 @@ def pandas(input_data):
return pd.DataFrame(input_data)
@pytest.fixture(scope='module')
def pd_mean():
    """Provide an externally given mean per column, used together with the `pandas` data fixture."""
    return [2, 10, 3]
@pytest.fixture(scope='module')
def pd_std():
    """Provide an externally given std per column, used together with the `pandas` data fixture."""
    return [3, 2, 3]
@pytest.fixture(scope='module')
def xarray(input_data):
return xr.DataArray(input_data, dims=['index', 'value'])
shape = input_data.shape
coords = {'index': range(shape[0]), 'value': range(shape[1])}
return xr.DataArray(input_data, coords=coords, dims=coords.keys())
@pytest.fixture(scope='module')
def xr_mean():
    """Provide an externally given mean as xarray.DataArray along the `value` dimension."""
    # the values are hard-coded, so no other fixture is requested
    return xr.DataArray([2, 10, 3], coords={'value': range(3)}, dims=['value'])
@pytest.fixture(scope='module')
def xr_std():
    """Provide an externally given std as xarray.DataArray along the `value` dimension."""
    # the values are hard-coded, so no other fixture is requested
    return xr.DataArray([3, 2, 3], coords={'value': range(3)}, dims=['value'])
class TestStandardise:
@pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0),
(pytest.lazy_fixture('xarray'), 'index')])
def test_standardise(self, data_org, dim):
mean, std, data = standardise(data_org, dim)
@pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
(lazy('xarray'), 'index')])
def test_standardise(self, data_orig, dim):
mean, std, data = standardise(data_orig, dim)
assert np.testing.assert_almost_equal(mean, [2, -5, 10], decimal=1) is None
assert np.testing.assert_almost_equal(std, [2, 3, 1], decimal=1) is None
assert np.testing.assert_almost_equal(data.mean(dim), [0, 0, 0]) is None
assert np.testing.assert_almost_equal(data.std(dim), [1, 1, 1]) is None
@pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0),
(pytest.lazy_fixture('xarray'), 'index')])
def test_standardise_inverse(self, data_org, dim):
mean, std, data = standardise(data_org, dim)
@pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
(lazy('xarray'), 'index')])
def test_standardise_inverse(self, data_orig, dim):
mean, std, data = standardise(data_orig, dim)
data_recovered = standardise_inverse(data, mean, std)
assert np.testing.assert_array_almost_equal(data_org, data_recovered) is None
assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
@pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
(lazy('xarray'), 'index')])
def test_apply_standardise_inverse(self, data_orig, dim):
mean, std, data = standardise(data_orig, dim)
data_recovered = apply_inverse_transformation(data, mean, std)
assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
@pytest.mark.parametrize('data_orig, mean, std, dim', [(lazy('pandas'), lazy('pd_mean'), lazy('pd_std'), 0),
(lazy('xarray'), lazy('xr_mean'), lazy('xr_std'), 'index')])
def test_standardise_apply(self, data_orig, mean, std, dim):
data = standardise_apply(data_orig, mean, std)
mean_expected = (np.array([2, -5, 10]) - np.array([2, 10, 3])) / np.array([3, 2, 3])
std_expected = np.array([2, 3, 1]) / np.array([3, 2, 3])
assert np.testing.assert_almost_equal(data.mean(dim), mean_expected, decimal=1) is None
assert np.testing.assert_almost_equal(data.std(dim), std_expected, decimal=1) is None
class TestCentre:
@pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0),
(pytest.lazy_fixture('xarray'), 'index')])
def test_centre(self, data_org, dim):
mean, std, data = centre(data_org, dim)
@pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
(lazy('xarray'), 'index')])
def test_centre(self, data_orig, dim):
mean, std, data = centre(data_orig, dim)
assert np.testing.assert_almost_equal(mean, [2, -5, 10], decimal=1) is None
assert std is None
assert np.testing.assert_almost_equal(data.mean(dim), [0, 0, 0]) is None
@pytest.mark.parametrize('data_org, dim', [(pytest.lazy_fixture('pandas'), 0),
(pytest.lazy_fixture('xarray'), 'index')])
def test_centre_inverse(self, data_org, dim):
mean, _, data = centre(data_org, dim)
@pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
(lazy('xarray'), 'index')])
def test_centre_inverse(self, data_orig, dim):
mean, _, data = centre(data_orig, dim)
data_recovered = centre_inverse(data, mean)
assert np.testing.assert_array_almost_equal(data_org, data_recovered) is None
assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
@pytest.mark.parametrize('data_orig, dim', [(lazy('pandas'), 0),
(lazy('xarray'), 'index')])
def test_apply_centre_inverse(self, data_orig, dim):
mean, _, data = centre(data_orig, dim)
data_recovered = apply_inverse_transformation(data, mean, method="centre")
assert np.testing.assert_array_almost_equal(data_orig, data_recovered) is None
@pytest.mark.parametrize('data_orig, mean, dim', [(lazy('pandas'), lazy('pd_mean'), 0),
(lazy('xarray'), lazy('xr_mean'), 'index')])
def test_centre_apply(self, data_orig, mean, dim):
data = centre_apply(data_orig, mean)
mean_expected = np.array([2, -5, 10]) - np.array([2, 10, 3])
assert np.testing.assert_almost_equal(data.mean(dim), mean_expected, decimal=1) is None
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment