diff --git a/.gitignore b/.gitignore index e0fbe0994d8a7c695caffb5fe3135d0cc54abc0b..3ecdfc9dd61c77ead268c28b9787b65ad31078eb 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,4 @@ Thumbs.db htmlcov/ .pytest_cache /test/data/ +report.html diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1ef531ba09fb84a6a70c1fc8f3c6a79e43f8b6e6..c34d9a24bdfacd3795dd2f64bdbd8017ea8ba71e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -47,6 +47,7 @@ tests: when: always paths: - badges/ + - test/ coverage: tags: @@ -88,6 +89,9 @@ pages: - mkdir -p public/coverage - cp -af coverage/. public/coverage - ls public/coverage + - mkdir -p public/test + - cp -af test/. public/test + - ls public/test - ls public when: always artifacts: @@ -97,8 +101,10 @@ pages: - public - badges/ - coverage/ + - test/ cache: key: old-pages paths: - public/badges/ - public/coverage/ + - public/test/ diff --git a/CI/run_pytest.sh b/CI/run_pytest.sh index 4ba24b11a0aa4de7a9f5aca5fe93a4101986d98d..d8755448cfaaf9c1477add0929ecc65cc1115ba4 100644 --- a/CI/run_pytest.sh +++ b/CI/run_pytest.sh @@ -1,10 +1,21 @@ #!/bin/bash # run pytest for all modules -python3 -m pytest test/ | tee test_results.out +python3 -m pytest --html=report.html --self-contained-html test/ | tee test_results.out IS_FAILED=$? +# move html test report +mkdir -p test/ +BRANCH_NAME=$( echo -e "${CI_COMMIT_REF_NAME////_}") +mkdir -p test/${BRANCH_NAME} +mkdir -p test/recent +cp report.html test/${BRANCH_NAME}/. +cp report.html test/recent/. +if [[ "${CI_COMMIT_REF_NAME}" = "master" ]]; then + cp report.html test/.
+fi + # exit 0 if no tests implemented RUN_NO_TESTS="$(grep -c 'no tests ran' test_results.out)" if [[ ${RUN_NO_TESTS} > 0 ]]; then diff --git a/requirements.txt b/requirements.txt index 4f4c9fd2d3311335f43fe9efa367205a2e0c196f..cdf035784475dac51d17173e7863dbf483e20101 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ requests==2.22.0 pytest==5.2.1 pytest-lazy-fixture==0.6.1 pytest-cov +pytest-html pydot diff --git a/src/data_generator.py b/src/data_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..d067e1e9e0d3225e869ca6a2944b7c749198834b --- /dev/null +++ b/src/data_generator.py @@ -0,0 +1,134 @@ +__author__ = 'Felix Kleinert, Lukas Leufen' +__date__ = '2019-11-07' + +import keras +from src import helpers +from src.data_preparation import DataPrep +import os +from typing import Union, List, Tuple +import decimal +import numpy as np +import xarray as xr + + +class DataGenerator(keras.utils.Sequence): + """ + This class is a generator to handle large arrays for machine learning. This class can be used with keras' + fit_generator and predict_generator. Individual stations are the iterables. This class uses class Dataprep and + returns X, y when an item is called. + Item can be called manually by position (integer) or station id (string). 
Methods also accept lists with exactly + one entry of integer or string + """ + + def __init__(self, path: str, network: str, stations: Union[str, List[str]], variables: List[str], + interpolate_dim: str, target_dim: str, target_var: str, interpolate_method: str = "linear", + limit_nan_fill: int = 1, window_history: int = 7, window_lead_time: int = 4, + transform_method: str = "standardise", **kwargs): + self.path = os.path.abspath(path) + self.network = network + self.stations = helpers.to_list(stations) + self.variables = variables + self.interpolate_dim = interpolate_dim + self.target_dim = target_dim + self.target_var = target_var + self.interpolate_method = interpolate_method + self.limit_nan_fill = limit_nan_fill + self.window_history = window_history + self.window_lead_time = window_lead_time + self.transform_method = transform_method + self.kwargs = kwargs + + def __repr__(self): + """ + display all class attributes + """ + return f"DataGenerator(path='{self.path}', network='{self.network}', stations={self.stations}, " \ + f"variables={self.variables}, interpolate_dim='{self.interpolate_dim}', target_dim='{self.target_dim}'" \ + f", target_var='{self.target_var}', **{self.kwargs})" + + def __len__(self): + """ + display the number of stations + """ + return len(self.stations) + + def __iter__(self) -> "DataGenerator": + """ + Define the __iter__ part of the iterator protocol to iterate through this generator. Sets the private attribute + `_iterator` to 0. + :return: + """ + self._iterator = 0 + return self + + def __next__(self) -> Tuple[xr.DataArray, xr.DataArray]: + """ + This is the implementation of the __next__ method of the iterator protocol. Get the data generator, and return + the history and label data of this generator. 
+ :return: + """ + if self._iterator < self.__len__(): + data = self.get_data_generator() + self._iterator += 1 + if data.history is not None and data.label is not None: + return data.history.transpose("datetime", "window", "Stations", "variables"), \ + data.label.squeeze("Stations").transpose("datetime", "window") + else: + return self.__next__() + else: + raise StopIteration + + def __getitem__(self, item: Union[str, int]) -> Tuple[xr.DataArray, xr.DataArray]: + """ + Defines the get item method for this generator. Retrieve data from generator and return history and labels. + :param item: station key to choose the data generator. + :return: The generator's time series of history data and its labels + """ + data = self.get_data_generator(key=item) + return data.history.transpose("datetime", "window", "Stations", "variables"), \ + data.label.squeeze("Stations").transpose("datetime", "window") + + def get_data_generator(self, key: Union[str, int] = None) -> DataPrep: + """ + Select data for given key, create a DataPrep object and interpolate, transform, make history and labels and + remove nans. + :param key: station key to choose the data generator. + :return: preprocessed data as a DataPrep instance + """ + station = self.get_station_key(key) + data = DataPrep(self.path, self.network, station, self.variables, **self.kwargs) + data.interpolate(self.interpolate_dim, method=self.interpolate_method, limit=self.limit_nan_fill) + data.transform("datetime", method=self.transform_method) + data.make_history_window(self.interpolate_dim, self.window_history) + data.make_labels(self.target_dim, self.target_var, self.interpolate_dim, self.window_lead_time) + data.history_label_nan_remove(self.interpolate_dim) + return data + + def get_station_key(self, key: Union[None, str, int, List[Union[None, str, int]]]) -> str: + """ + Return a valid station key or raise KeyError if this wasn't possible + :param key: station key to choose the data generator.
+ :return: station key (id from database) + """ + # extract value if given as list + if isinstance(key, list): + if len(key) == 1: + key = key[0] + else: + raise KeyError(f"More than one key was given: {key}") + # return station name either from key or the recent element from iterator + if key is None: + return self.stations[self._iterator] + else: + if isinstance(key, int): + if key < self.__len__(): + return self.stations[key] + else: + raise KeyError(f"{key} is not in range(0, {self.__len__()})") + elif isinstance(key, str): + if key in self.stations: + return key + else: + raise KeyError(f"{key} is not in stations") + else: + raise KeyError(f"Key has to be from Union[str, int]. Given was {key} ({type(key)})") diff --git a/src/data_preparation.py b/src/data_preparation.py index c23c9a9f962848f63a3ff2aa8b6e9f012dd562e4..873433f499f51c003988d8b33da7a525d14544fa 100644 --- a/src/data_preparation.py +++ b/src/data_preparation.py @@ -71,7 +71,7 @@ class DataPrep(object): data is available. The latter case, store downloaded data locally if wished (default yes). 
""" - self.check_path_and_create() + helpers.check_path_and_create(self.path) file_name = self._set_file_name() meta_file = self._set_meta_file_name() try: @@ -113,14 +113,6 @@ class DataPrep(object): return f"Dataprep(path='{self.path}', network='{self.network}', station={self.station}, " \ f"variables={self.variables}, **{self.kwargs})" - def check_path_and_create(self): - try: - os.makedirs(self.path) - logging.info(f"Created path: {self.path}") - except FileExistsError: - logging.info(f"Path already exists: {self.path}") - pass - def interpolate(self, dim: str, method: str = 'linear', limit: int = None, use_coordinate: Union[bool, str] = True, **kwargs): """ diff --git a/src/helpers.py b/src/helpers.py index 342a0b5ef77d8b286291792aa007f59c8c7b09b2..3c422f1baec528fa0f62d8f290978990bdc471bd 100644 --- a/src/helpers.py +++ b/src/helpers.py @@ -8,6 +8,7 @@ import keras.backend as K import math from typing import Union import numpy as np +import os def to_list(arg): @@ -16,6 +17,14 @@ def to_list(arg): return arg +def check_path_and_create(path): + try: + os.makedirs(path) + logging.info(f"Created path: {path}") + except FileExistsError: + logging.info(f"Path already exists: {path}") + + def l_p_loss(power: int): """ Calculate the L<p> loss for given power p. 
L1 (p=1) is equal to mean absolute error (MAE), L2 (p=2) is to mean diff --git a/src/join.py b/src/join.py index 4909ec267d06e6739fd61d9ee187cf9f7aec922a..a8b8edc7d25a610db3dbbd623cf2cde162587eaa 100644 --- a/src/join.py +++ b/src/join.py @@ -8,6 +8,7 @@ import logging import pandas as pd import datetime as dt from typing import Iterator, Union, List +from src import helpers join_url_base = 'https://join.fz-juelich.de/services/rest/surfacedata/' logging.basicConfig(level=logging.INFO) @@ -24,8 +25,7 @@ def download_join(station_name: Union[str, List[str]], statvar: dict) -> [pd.Dat - meta - pandas df with all meta information """ # make sure station_name parameter is a list - if not isinstance(station_name, list): - station_name = [station_name] + station_name = helpers.to_list(station_name) # load series information opts = {'base': join_url_base, 'service': 'series', 'station_id': station_name[0]} diff --git a/test/test_data_generator.py b/test/test_data_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..ab6233f34533fa7ad35f05306f3990209b83bb82 --- /dev/null +++ b/test/test_data_generator.py @@ -0,0 +1,85 @@ +import pytest +import os +from src.data_generator import DataGenerator +import logging +import numpy as np +import xarray as xr +import datetime as dt +import pandas as pd +from operator import itemgetter + + +class TestDataGenerator: + + @pytest.fixture + def gen(self): + return DataGenerator(os.path.join(os.path.dirname(__file__), 'data'), 'UBA', 'DEBW107', ['o3', 'temp'], + 'datetime', 'variables', 'o3') + + def test_init(self, gen): + assert gen.path == os.path.join(os.path.dirname(__file__), 'data') + assert gen.network == 'UBA' + assert gen.stations == ['DEBW107'] + assert gen.variables == ['o3', 'temp'] + assert gen.interpolate_dim == 'datetime' + assert gen.target_dim == 'variables' + assert gen.target_var == 'o3' + assert gen.interpolate_method == "linear" + assert gen.limit_nan_fill == 1 + assert gen.window_history == 
7 + assert gen.window_lead_time == 4 + assert gen.transform_method == "standardise" + assert gen.kwargs == {} + + def test_repr(self, gen): + path = os.path.join(os.path.dirname(__file__), 'data') + assert gen.__repr__().rstrip() == f"DataGenerator(path='{path}', network='UBA', stations=['DEBW107'], "\ + f"variables=['o3', 'temp'], interpolate_dim='datetime', " \ + f"target_dim='variables', target_var='o3', **{{}})".rstrip() + + def test_len(self, gen): + assert len(gen) == 1 + gen.stations = ['station1', 'station2', 'station3'] + assert len(gen) == 3 + + def test_iter(self, gen): + assert hasattr(gen, '_iterator') is False + iter(gen) + assert hasattr(gen, '_iterator') + assert gen._iterator == 0 + + def test_next(self, gen): + gen.kwargs = {'statistics_per_var': {'o3': 'dma8eu', 'temp': 'maximum'}} + for i, d in enumerate(gen, start=1): + assert i == gen._iterator + + def test_getitem(self, gen): + gen.kwargs = {'statistics_per_var': {'o3': 'dma8eu', 'temp': 'maximum'}} + station = gen["DEBW107"] + assert len(station) == 2 + assert station[0].Stations.data == "DEBW107" + assert station[0].data.shape[1:] == (8, 1, 2) + assert station[1].data.shape[-1] == gen.window_lead_time + assert station[0].data.shape[1] == gen.window_history + 1 + + def test_get_station_key(self, gen): + gen.stations.append("DEBW108") + f = gen.get_station_key + iter(gen) + assert f(None) == "DEBW107" + with pytest.raises(KeyError) as e: + f([None, None]) + assert "More than one key was given: [None, None]" in e.value.args[0] + assert f(1) == "DEBW108" + assert f([1]) == "DEBW108" + with pytest.raises(KeyError) as e: + f(3) + assert "3 is not in range(0, 2)" in e.value.args[0] + assert f("DEBW107") == "DEBW107" + assert f(["DEBW108"]) == "DEBW108" + with pytest.raises(KeyError) as e: + f("DEBW999") + assert "DEBW999 is not in stations" in e.value.args[0] + with pytest.raises(KeyError) as e: + f(6.5) + assert "Key has to be from Union[str, int].
Given was 6.5 (<class 'float'>)" in e.value.args[0] diff --git a/test/test_data_preparation.py b/test/test_data_preparation.py index 0e0984f096fd444fb76f29184df8b1d85a046756..5d45c041b6e669cced56172d41fc2f9653dd30e7 100644 --- a/test/test_data_preparation.py +++ b/test/test_data_preparation.py @@ -13,8 +13,8 @@ class TestDataPrep: @pytest.fixture def data(self): - return DataPrep('test/data/', 'dummy', 'DEBW107', ['o3', 'temp'], test='testKWARGS', - statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}) + return DataPrep(os.path.join(os.path.dirname(__file__), 'data'), 'dummy', 'DEBW107', ['o3', 'temp'], + test='testKWARGS', statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}) def test_init(self, data): assert data.path == os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data') @@ -29,18 +29,6 @@ class TestDataPrep: with pytest.raises(NotImplementedError): DataPrep('data/', 'dummy', 'DEBW107', ['o3', 'temp']) - def test_check_path_and_create(self, caplog): - caplog.set_level(logging.INFO) - d = object.__new__(DataPrep) - d.path = 'data/test' - assert not os.path.exists('data/test') - d.check_path_and_create() - assert os.path.exists('data/test') - assert caplog.messages[0] == "Created path: data/test" - d.check_path_and_create() - assert caplog.messages[1] == "Path already exists: data/test" - os.rmdir('data/test') - def test_repr(self): d = object.__new__(DataPrep) d.path = 'data/test' @@ -53,13 +41,13 @@ class TestDataPrep: def test_set_file_name_and_meta(self): d = object.__new__(DataPrep) - d.path = os.path.abspath('test/data/test') + d.path = os.path.abspath('test/data/') d.station = 'TESTSTATION' d.variables = ['a', 'bc'] assert d._set_file_name() == os.path.join(os.path.abspath(os.path.dirname(__file__)), - "data/test/TESTSTATION_a_bc.nc") + "data/TESTSTATION_a_bc.nc") assert d._set_meta_file_name() == os.path.join(os.path.abspath(os.path.dirname(__file__)), - "data/test/TESTSTATION_a_bc_meta.csv") + "data/TESTSTATION_a_bc_meta.csv") @pytest.mark.parametrize('opts',
[{'dim': 'datetime', 'method': 'nearest', 'limit': 10, 'use_coordinate': True}, {'dim': 'datetime', 'limit': 5}, {'dim': 'datetime'}]) diff --git a/test/test_helpers.py b/test/test_helpers.py index e4a23d15b6ef497849af28ffd783b6f51c6c5b5d..ffda6ac47e21b212d3a818d050783cffba96eb03 100644 --- a/test/test_helpers.py +++ b/test/test_helpers.py @@ -1,11 +1,34 @@ import pytest -from src.helpers import l_p_loss, LearningRateDecay +from src.helpers import to_list, check_path_and_create, l_p_loss, LearningRateDecay import logging import os import keras import numpy as np +class TestToList: + + def test_to_list(self): + assert to_list('a') == ['a'] + assert to_list('abcd') == ['abcd'] + assert to_list([1, 2, 3]) == [1, 2, 3] + assert to_list([45]) == [45] + + +class TestCheckPath: + + def test_check_path_and_create(self, caplog): + caplog.set_level(logging.INFO) + path = 'data/test' + assert not os.path.exists('data/test') + check_path_and_create(path) + assert os.path.exists('data/test') + assert caplog.messages[0] == "Created path: data/test" + check_path_and_create(path) + assert caplog.messages[1] == "Path already exists: data/test" + os.rmdir('data/test') + + class TestLoss: def test_l_p_loss(self):