diff --git a/.gitignore b/.gitignore index d115e562c57ec8f353f32944bdcb38e68e89edc7..9884d3949d33343ae668c24f304d8d4e57956d88 100644 --- a/.gitignore +++ b/.gitignore @@ -39,7 +39,6 @@ ehthumbs.db Thumbs.db .idea/ /venv/ -.coverage # don't check data and plot folder # #################################### @@ -49,3 +48,9 @@ Thumbs.db # tmp folder # ############## /tmp/ + +# test related data # +##################### +.coverage +htmlcov/ +.pytest_cache diff --git a/src/data_preparation.py b/src/data_preparation.py index 74dc13636948a2a0891defdc5dd74dd470f8f8e5..bbd306d87ae9ef9530d6691d56a6d9318c239de8 100644 --- a/src/data_preparation.py +++ b/src/data_preparation.py @@ -10,6 +10,11 @@ import os from src import join, helpers from src import statistics from typing import Union, List, Dict, Iterable +import datetime as dt + + +# define a more general date type for type hinting +date = Union[dt.date, dt.datetime] class DataPrep(object): @@ -40,7 +45,8 @@ class DataPrep(object): file_name = self._set_file_name() meta_file = self._set_meta_file_name() try: - self.data = xr.open_dataarray(file_name) + data = self._slice_prep(xr.open_dataarray(file_name)) + self.data = self.check_for_negative_concentrations(data) self.meta = pd.read_csv(meta_file, index_col=0) except FileNotFoundError as e: logging.warning(e) @@ -50,8 +56,9 @@ class DataPrep(object): # convert df_all to xarray xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()} xarr = xr.Dataset(xarr).to_array(dim='Stations') - self.data = xarr - # save locally as nc file + data = self._slice_prep(xarr) + self.data = self.check_for_negative_concentrations(data) + # save locally as nc/csv file xarr.to_netcdf(path=file_name) self.meta.to_csv(meta_file) @@ -267,12 +274,42 @@ class DataPrep(object): res.name = index_name return res - def _slice_prep(self, data, coord='datetime'): - raise NotImplementedError + def _slice_prep(self, data: xr.DataArray, coord: str = 'datetime') -> xr.DataArray: + 
""" + This function prepares all settings for slicing and executes _slice + :param data: data array to slice + :param coord: name of axis to slice + :return: data sliced from kwargs 'start' to kwargs 'end' (defaults: first and last value of coord) + """ + start = self.kwargs.get('start', data.coords[coord][0].values) + end = self.kwargs.get('end', data.coords[coord][-1].values) + return self._slice(data, start, end, coord) @staticmethod - def _slice(data, start, end, coord): - raise NotImplementedError + def _slice(data: xr.DataArray, start: Union[date, str], end: Union[date, str], coord: str) -> xr.DataArray: + """ + This function slices through a given data_item (for example select only values of 2011) + :param data: data array to slice + :param start: first label of the range to select along coord + :param end: last label of the range to select along coord + :param coord: name of axis to slice + :return: data restricted to the labels between start and end along coord + """ + return data.loc[{coord: slice(start, end)}] # type: ignore + + def check_for_negative_concentrations(self, data: xr.DataArray, minimum: int = 0) -> xr.DataArray: + """ + Clip all concentrations below minimum (default: 0). Names of all concentrations are extracted from + https://join.fz-juelich.de/services/rest/surfacedata/ #2.1 Parameters + :param data: data array to check, modified in place + :param minimum: lower bound applied to all chemical variables that are listed in self.variables + :return: the same data array with clipped concentrations + """ + chem_vars = ["benzene", "ch4", "co", "ethane", "no", "no2", "nox", "o3", "ox", "pm1", "pm10", "pm2p5", + "propane", "so2", "toluene"] + used_chem_vars = list(set(chem_vars) & set(self.variables)) + data.loc[..., used_chem_vars] = data.loc[..., used_chem_vars].clip(min=minimum) + return data if __name__ == "__main__": diff --git a/test/test_data_preparation.py b/test/test_data_preparation.py index f8a18aea433b6b141eced21b41da81825a2b683c..0e0984f096fd444fb76f29184df8b1d85a046756 100644 --- a/test/test_data_preparation.py +++ b/test/test_data_preparation.py @@ -6,6 +6,7 @@ import numpy as np import xarray as xr import datetime as dt import pandas as pd +from operator import itemgetter class TestDataPrep: @@ -21,7 +22,7 @@ class TestDataPrep: assert data.station == ['DEBW107'] assert data.variables == ['o3', 'temp'] assert data.statistics_per_var == {'o3': 'dma8eu', 'temp': 'maximum'} - 
assert not all([data.mean, data.std, data.df, data.history, data.label]) + assert not all([data.mean, data.std, data.history, data.label]) assert {'test': 'testKWARGS'}.items() <= data.kwargs.items() def test_init_no_stats(self): @@ -141,15 +142,29 @@ class TestDataPrep: with pytest.raises(NotImplementedError): data.inverse_transform() - def test_nan_remove_no_history(self, data): + def test_nan_remove_no_hist_or_label(self, data): assert data.history is None assert data.label is None data.history_label_nan_remove('datetime') assert data.history is None assert data.label is None + data.make_history_window('datetime', 6) + assert data.history is not None + data.history_label_nan_remove('datetime') + assert data.history is None + data.make_labels('variables', 'o3', 'datetime', 2) + assert data.label is not None + data.history_label_nan_remove('datetime') + assert data.label is None def test_nan_remove(self, data): - pass + data.make_history_window('datetime', -12) + data.make_labels('variables', 'o3', 'datetime', 3) + shape = data.history.shape + data.history_label_nan_remove('datetime') + assert data.history.isnull().sum() == 0 + assert itemgetter(0, 1, 3)(shape) == itemgetter(0, 1, 3)(data.history.shape) + assert shape[2] >= data.history.shape[2] def test_create_index_array(self, data): index_array = data.create_index_array('window', range(1, 4)) @@ -206,3 +221,23 @@ class TestDataPrep: save_label = data.label data.make_labels('variables', 'o3', 'datetime', -3) assert np.testing.assert_array_equal(data.label, save_label) is None + + def test_slice(self, data): + res = data._slice(data.data, dt.date(1997, 1, 1), dt.date(1997, 1, 10), 'datetime') + assert itemgetter(0, 2)(res.shape) == itemgetter(0, 2)(data.data.shape) + assert res.shape[1] == 10 + + def test_slice_prep(self, data): + res = data._slice_prep(data.data) + assert res.shape == data.data.shape + data.kwargs['start'] = res.coords['datetime'][0].values + data.kwargs['end'] = res.coords['datetime'][9].values 
+ res = data._slice_prep(data.data) + assert itemgetter(0, 2)(res.shape) == itemgetter(0, 2)(data.data.shape) + assert res.shape[1] == 10 + + def test_check_for_neg_concentrations(self, data): + res = data.check_for_negative_concentrations(data.data) + assert res.sel({'variables': 'o3'}).min() >= 0 + res = data.check_for_negative_concentrations(data.data, minimum=2) + assert res.sel({'variables': 'o3'}).min() >= 2