diff --git a/src/data_preparation.py b/src/data_preparation.py
index 3c50ba893563780dfd8ac92f36fffabc38ed16a9..badd75aa709e108c2516ccf24a3de915c77ca258 100644
--- a/src/data_preparation.py
+++ b/src/data_preparation.py
@@ -116,7 +116,7 @@ class DataPrep(object):
         :return:
         """
         df_all = {}
-        df, meta = join.download_join(station_name=self.station, statvar=self.statistics_per_var,
+        df, meta = join.download_join(station_name=self.station, stat_var=self.statistics_per_var,
                                       station_type=self.station_type, network_name=self.network)
         df_all[self.station[0]] = df
         # convert df_all to xarray
diff --git a/src/join.py b/src/join.py
index 4f9f36f960bc5a757a70b39222fb183ccec7aa8f..43271a7b0525b5d829ea761019176197c78c5468 100644
--- a/src/join.py
+++ b/src/join.py
@@ -6,10 +6,11 @@ import requests
 import logging
 import pandas as pd
 import datetime as dt
-from typing import Iterator, Union, List
+from typing import Iterator, Union, List, Dict
 from src import helpers
 
 join_url_base = 'https://join.fz-juelich.de/services/rest/surfacedata/'
+str_or_none = Union[str, None]
 
 
 class EmptyQueryResult(Exception):
@@ -19,54 +20,46 @@ class EmptyQueryResult(Exception):
     pass
 
 
-def download_join(station_name: Union[str, List[str]], statvar: dict, station_type: str = None, network_name: str = None) -> [pd.DataFrame, pd.DataFrame]:
+def download_join(station_name: Union[str, List[str]], stat_var: dict, station_type: str = None,
+                  network_name: str = None) -> [pd.DataFrame, pd.DataFrame]:
     """
     read data from JOIN/TOAR
     :param station_name: Station name e.g. DEBY122
-    :param statvar: key as variable like 'O3', values as statistics on keys like 'mean'
-    :param station_type:
-    :param network_name:
+    :param stat_var: keys as variables like 'O3', values as statistics on these keys like 'mean'
+    :param station_type: set the station type like "traffic" or "background", can be None
+    :param network_name: set the measurement network like "UBA" or "AIRBASE", can be None
     :returns:
-        - df - pandas df with all variables and statistics
-        - meta - pandas df with all meta information
+        - df - data frame with all variables and statistics
+        - meta - data frame with all meta information
     """
     # make sure station_name parameter is a list
     station_name = helpers.to_list(station_name)
     # load series information
-    opts = {"base": join_url_base, "service": "series", "station_id": station_name[0], "station_type": station_type,
-            "network_name": network_name}
-    url = create_url(**opts)
-    response = requests.get(url)
-    station_vars = response.json()
-    vars_dict = {item[3].lower(): item[0] for item in station_vars}
+    vars_dict = load_series_information(station_name, station_type, network_name)
     # download all variables with given statistic
     data = None
     df = None
     for var in _lower_list(sorted(vars_dict.keys())):
-        if var in statvar.keys():
+        if var in stat_var.keys():
+            logging.info('load: {}'.format(var))
             # create data link
-            opts = {'base': join_url_base, 'service': 'stats', 'id': vars_dict[var], 'statistics': statvar[var],
+            opts = {'base': join_url_base, 'service': 'stats', 'id': vars_dict[var], 'statistics': stat_var[var],
                     'sampling': 'daily', 'capture': 0, 'min_data_length': 1460}
-            url = create_url(**opts)
             # load data
-            response = requests.get(url)
-            data = response.json()
+            data = get_data(opts)
             # correct namespace of statistics
-            stat = _correct_stat_name(statvar[var])
+            stat = _correct_stat_name(stat_var[var])
             # store data in pandas dataframe
-            index = map(lambda s: dt.datetime.strptime(s, "%Y-%m-%d %H:%M"), data['datetime'])
-            if df is None:
-                df = pd.DataFrame(data[stat], index=index, columns=[var])
-            else:
-                df = pd.concat([df, pd.DataFrame(data[stat], index=index, columns=[var])], axis=1)
+            df = _save_to_pandas(df, data, stat, var)
+            logging.debug('finished: {}'.format(var))
 
     if data:
@@ -77,6 +70,51 @@
 
 
+def get_data(opts: Dict) -> Union[Dict, List]:
+    """
+    Download JOIN data using the requests framework. Data is returned as a json-like structure that, depending on
+    the response, is either a list or a dictionary.
+    :param opts: options to create the request url
+    :return: requested data (either as list or dictionary)
+    """
+    url = create_url(**opts)
+    response = requests.get(url)
+    return response.json()
+
+
+def load_series_information(station_name: List[str], station_type: str_or_none, network_name: str_or_none) -> Dict:
+    """
+    List all series ids that are available for the given station id and network name.
+    :param station_name: Station name e.g. DEBW107
+    :param station_type: station type like "traffic" or "background"
+    :param network_name: measurement network of the station like "UBA" or "AIRBASE"
+    :return: all available series for the requested station, stored in a dictionary with the parameter name
+             (variable) as key and the series id as value.
+    """
+    opts = {"base": join_url_base, "service": "series", "station_id": station_name[0], "station_type": station_type,
+            "network_name": network_name}
+    station_vars = get_data(opts)
+    vars_dict = {item[3].lower(): item[0] for item in station_vars}
+    return vars_dict
+
+
+def _save_to_pandas(df: Union[pd.DataFrame, None], data: dict, stat: str, var: str) -> pd.DataFrame:
+    """
+    Save given data in a data frame. If the given data frame is not empty, the data is appended as a new column.
+    :param df: data frame to append the new data to, can be None
+    :param data: new data to append or to format as a data frame, containing the keys 'datetime' and '<stat>'
+    :param stat: extracted statistic to get values from data (e.g. 'mean', 'dma8eu')
+    :param var: variable the data is from (e.g. 'o3')
+    :return: newly created or concatenated data frame
+    """
+    index = map(lambda s: dt.datetime.strptime(s, "%Y-%m-%d %H:%M"), data['datetime'])
+    if df is None:
+        df = pd.DataFrame(data[stat], index=index, columns=[var])
+    else:
+        df = pd.concat([df, pd.DataFrame(data[stat], index=index, columns=[var])], axis=1)
+    return df
+
+
 def _correct_stat_name(stat: str) -> str:
     """
     Map given statistic name to new namespace defined by mapping dict. Return given name stat if not element of mapping
@@ -98,7 +136,7 @@ def _lower_list(args: List[str]) -> Iterator[str]:
         yield string.lower()
 
 
-def create_url(base: str, service: str, **kwargs: Union[str, int, float]) -> str:
+def create_url(base: str, service: str, **kwargs: Union[str, int, float, None]) -> str:
     """
     create a request url with given base url, service type and arbitrarily many additional keyword arguments
     :param base: basic url of the rest service
@@ -106,7 +144,9 @@ def create_url(base: str, service: str, **kwargs: Union[str, int, float]) -> str
     :param kwargs: keyword pairs for optional request specifications, e.g. 'statistics=maximum'
     :return: combined url as string
     """
-    url = '{}{}/?'.format(base, service) + '&'.join('{}={}'.format(k, v) for k, v in kwargs.items() if v is not None)
+    if not base.endswith("/"):
+        base += "/"
+    url = f"{base}{service}/?{'&'.join(f'{k}={v}' for k, v in kwargs.items() if v is not None)}"
     return url
diff --git a/test/test_join.py b/test/test_join.py
new file mode 100644
index 0000000000000000000000000000000000000000..865ae80dfaaa0244eb7592e65ef134a23b36634c
--- /dev/null
+++ b/test/test_join.py
@@ -0,0 +1,116 @@
+from typing import Iterable
+import datetime as dt
+import pytest
+
+from src.join import *
+from src.join import _save_to_pandas, _correct_stat_name, _lower_list
+
+
+class TestJoinUrlBase:
+
+    def test_url(self):
+        assert join_url_base == 'https://join.fz-juelich.de/services/rest/surfacedata/'
+
+
+class TestDownloadJoin:
+
+    def test_download_single_var(self):
+        data, meta = download_join("DEBW107", {"o3": "dma8eu"})
+        assert data.columns == "o3"
+        assert meta.columns == "DEBW107"
+
+    def test_download_empty(self):
+        with pytest.raises(EmptyQueryResult) as e:
+            download_join("DEBW107", {"o3": "dma8eu"}, "traffic")
+        assert e.value.args[-1] == "No data found in JOIN."
+
+
+class TestGetData:
+
+    def test(self):
+        opts = {"base": join_url_base, "service": "series", "station_id": 'DEBW107', "network_name": "UBA",
+                "parameter_name": "o3,no2"}
+        assert get_data(opts) == [[17057, 'UBA', 'DEBW107', 'O3'], [17058, 'UBA', 'DEBW107', 'NO2']]
+
+
+class TestLoadSeriesInformation:
+
+    def test_standard_query(self):
+        expected_subset = {'o3': 23031, 'no2': 39002, 'temp--lubw': 17059, 'wspeed': 17060}
+        assert expected_subset.items() <= load_series_information(['DEBW107'], None, None).items()
+
+    def test_empty_result(self):
+        assert load_series_information(['DEBW107'], "traffic", None) == {}
+
+
+class TestSaveToPandas:
+
+    @staticmethod
+    def convert_date(date):
+        return map(lambda s: dt.datetime.strptime(s, "%Y-%m-%d %H:%M"), date)
+
+    @pytest.fixture
+    def date(self):
+        return ['1997-01-01 00:00', '1997-01-02 00:00', '1997-01-03 00:00', '1997-01-04 00:00']
+
+    @pytest.fixture
+    def values(self):
+        return [86.21, 94.76, 76.96, 99.89]
+
+    @pytest.fixture
+    def alternative_values(self):
+        return [20.0, 25.2, 25.1, 23.6]
+
+    @pytest.fixture
+    def create_df(self, date, values):
+        return pd.DataFrame(values, index=self.convert_date(date), columns=['cloudcover'])
+
+    def test_empty_df(self, date, values, create_df):
+        data = {'datetime': date, 'mean': values, 'metadata': None}
+        assert pd.testing.assert_frame_equal(create_df, _save_to_pandas(None, data, 'mean', 'cloudcover')) is None
+
+    def test_not_empty_df(self, date, alternative_values, create_df):
+        data = {'datetime': date, 'max': alternative_values, 'metadata': None}
+        next_df = pd.DataFrame(data["max"], index=self.convert_date(date), columns=['temperature'])
+        df_concat = pd.concat([create_df, next_df], axis=1)
+        assert pd.testing.assert_frame_equal(df_concat, _save_to_pandas(create_df, data, 'max', 'temperature')) is None
+
+
+class TestCorrectStatName:
+
+    def test_nothing_to_do(self):
+        assert _correct_stat_name("dma8eu") == "dma8eu"
+        assert _correct_stat_name("max") == "max"
+
+    def test_correct_string(self):
+        assert _correct_stat_name("maximum") == "max"
+        assert _correct_stat_name("minimum") == "min"
+        assert _correct_stat_name("average_values") == "mean"
+
+
+class TestLowerList:
+
+    def test_string_lowering(self):
+        list_iterator = _lower_list(["Capitalised", "already_small", "UPPERCASE", "veRyStRaNGe"])
+        assert isinstance(list_iterator, Iterable)
+        assert list(list_iterator) == ["capitalised", "already_small", "uppercase", "verystrange"]
+
+
+class TestCreateUrl:
+
+    def test_minimal_args_given(self):
+        url = create_url("www.base.edu", "testingservice")
+        assert url == "www.base.edu/testingservice/?"
+
+    def test_given_kwargs(self):
+        url = create_url("www.base2.edu/", "testingservice", mood="happy", confidence=0.98)
+        assert url == "www.base2.edu/testingservice/?mood=happy&confidence=0.98"
+
+    def test_single_kwargs(self):
+        url = create_url("www.base2.edu/", "testingservice", mood="undefined")
+        assert url == "www.base2.edu/testingservice/?mood=undefined"
+
+    def test_none_kwargs(self):
+        url = create_url("www.base2.edu/", "testingservice", mood="sad", happiness=None, stress_factor=100)
+        assert url == "www.base2.edu/testingservice/?mood=sad&stress_factor=100"
+
diff --git a/test/test_modules/test_pre_processing.py b/test/test_modules/test_pre_processing.py
index 13abe62a2b9199ad8d92528ff5363bd54f1be221..41e8729db8841a257b86740edecbbdfd3e0dc910 100644
--- a/test/test_modules/test_pre_processing.py
+++ b/test/test_modules/test_pre_processing.py
@@ -42,11 +42,11 @@ class TestPreProcessing:
         ExperimentSetup(parser_args={}, stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'],
                         var_all_dict={'o3': 'dma8eu', 'temp': 'maximum'})
         caplog.set_level(logging.INFO)
-        PreProcessing()
-        assert caplog.record_tuples[0] == ('root', 20, 'PreProcessing started')
-        assert caplog.record_tuples[1] == ('root', 20, 'check valid stations started')
-        assert caplog.record_tuples[-1] == ('root', 20, PyTestRegex(r'run for \d+\.\d+s to check 5 station\(s\). Found '
-                                                                    r'5/5 valid stations.'))
+        with PreProcessing():
+            assert caplog.record_tuples[0] == ('root', 20, 'PreProcessing started')
+            assert caplog.record_tuples[1] == ('root', 20, 'check valid stations started')
+            assert caplog.record_tuples[-1] == ('root', 20, PyTestRegex(r'run for \d+\.\d+s to check 5 station\(s\). '
+                                                                        r'Found 5/5 valid stations.'))
         RunEnvironment().__del__()
 
     def test_run(self, obj_with_exp_setup):
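The patch above splits download_join into the small helpers load_series_information, get_data and _save_to_pandas, and teaches create_url to drop None-valued keyword arguments. A minimal usage sketch follows; it is not part of the diff and assumes network access to the JOIN REST service and that station DEBW107 serves daily o3 statistics, as the new tests suggest.

    import logging

    from src import join

    logging.basicConfig(level=logging.INFO)

    # create_url skips None-valued kwargs and appends a trailing slash to the base url if it is missing
    url = join.create_url(join.join_url_base, "series", station_id="DEBW107", network_name=None)
    # -> https://join.fz-juelich.de/services/rest/surfacedata/series/?station_id=DEBW107

    # download_join resolves the series ids via load_series_information, fetches each requested
    # statistic via get_data and collects the results column by column through _save_to_pandas
    df, meta = join.download_join("DEBW107", {"o3": "dma8eu"})
    print(df.head())
    print(meta)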