def _select_distinct_series(vars: List[Dict], data_origin: Dict = None, network_name: Union[str, List[str]] = None) -> \
        [Dict, Dict]:
    """
    Select a single series id for each variable.

    The selection is done in two steps: first, all series are filtered and grouped by their data origin (e.g. REA or
    not), second, a single series per variable is chosen according to the requested network names.

    :param vars: list of all series entries, each a dictionary with at least the keys `id`, `network_name`,
        `parameter_name` and `parameter_attribute`
    :param data_origin: requested data origin per variable, defaults are used for variables that are not listed. The
        given dictionary is completed in place.
    :param network_name: single network name or list of network names ordered from high to low priority. If None, the
        first candidate of each variable is used.
    :return: dictionary with variable name as key and series id as value, and the data origin that was applied
    :raises ValueError: if network names are given but a variable has no series from any of these networks
    """
    data_origin = {} if data_origin is None else data_origin
    candidates, data_origin = _select_distinct_data_origin(vars, data_origin)

    network_name = [] if network_name is None else helpers.to_list(network_name)
    chosen = _select_distinct_network(candidates, network_name)

    # callers only need the series id, not the full series description
    return {name: series["id"] for name, series in chosen.items()}, data_origin


def _select_distinct_network(vars: dict, network_name: list) -> dict:
    """
    Select distinct series regarding network name.

    The order the network names are provided in parameter `network_name` indicates priority (from high to low). If no
    network name is provided, the first entry is used and a logging info is issued. In case network names are given but
    no match can be found, this method raises a ValueError. Network names are compared case-insensitively.

    :param vars: dictionary with all series candidates already grouped by variable name as key. Value should be a list
        of possible candidates to select from. Each candidate must be a dictionary with at least keys `id` and
        `network_name`.
    :param network_name: list of networks to use with decreasing priority (1st element has highest priority). Can be
        an empty list indicating to use always the first candidate for each variable.
    :return: dictionary with single series reference for each variable
    :raises ValueError: if network names are given but none matches the candidates of a variable, or if a variable has
        no candidates at all
    """
    # normalise the requested names once, the candidates are upper-cased on comparison
    requested = [network.upper() for network in network_name]
    selected = {}
    for var, series in vars.items():
        # networks are the outer loop: the first hit therefore belongs to the highest-priority network
        match = next((candidate for network in requested for candidate in series
                      if candidate["network_name"].upper() == network), None)
        if match is not None:
            selected[var] = match
        elif len(network_name) > 0:  # raise error if network name is provided but no match could be found
            raise ValueError(f"Cannot find a valid match for requested networks {network_name} and "
                             f"variable {var} as only following networks are available in JOIN: "
                             f"{[candidate['network_name'] for candidate in series]}")
        elif len(series) == 0:  # nothing to fall back on, fail with a clear message instead of an IndexError
            raise ValueError(f"Cannot select a series for variable {var} as no candidates are available.")
        else:  # just print message which network is used if none is provided
            selected[var] = series[0]
            logging.info("Could not find a valid match for variable %s and networks %s! "
                         "Therefore, use first answer from JOIN: %s", var, network_name, series[0])
    return selected


def _select_distinct_data_origin(vars: List[Dict], data_origin: Dict) -> (Dict[str, List], Dict):
    """
    Select distinct series regarding their data origin.

    Series are grouped as list according to their variable's name. As series can be reported with different network
    attribution, results might contain multiple entries for a variable. This method assumes the default data origin
    for chemical variables as `` (empty source) and for meteorological variables as `REA`. Variable names and data
    origins are compared in lower case.

    :param vars: list of all entries to check data origin for, each a dictionary with at least the keys
        `parameter_name` and `parameter_attribute`
    :param data_origin: data origin to match series with, if empty default values are used. This dictionary is
        completed in place with the origin used for each variable found in `vars`.
    :return: dictionary with unique variable names as keys and list of respective series as values, and the completed
        data origin
    """
    data_origin_default = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA",
                           "relhum": "REA", "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA",
                           "no": "", "no2": "", "o3": "", "pm10": "", "so2": ""}
    selected = {}
    for var in vars:
        name = var["parameter_name"].lower()
        var_attr = var["parameter_attribute"].lower()
        # keep an explicitly requested origin, otherwise store the default (empty origin for unknown variables)
        attr = data_origin.setdefault(name, data_origin_default.get(name, "")).lower()
        if var_attr == attr:
            selected.setdefault(name, []).append(var)
    return selected, data_origin
# Series used by the selection tests, keyed by series id:
# (network_name, parameter_name, parameter_label, parameter_attribute)
_SERIES_TABLE = {
    16686: ('UBA', 'no2', 'NO2', ''),
    16687: ('UBA', 'o3', 'O3', ''),
    16692: ('UBA', 'press', 'PRESS--LANUV', ''),
    16693: ('UBA', 'temp', 'TEMP--LANUV', ''),
    54036: ('UBA', 'cloudcover', 'CLOUDCOVER', 'REA'),
    88491: ('UBA', 'temp', 'TEMP-REA-MIUB', 'REA'),
    102660: ('UBA', 'press', 'PRESS-REA-MIUB', 'REA'),
    26692: ('AIRBASE', 'press', 'PRESS', 'REA'),
}


def _series(series_id):
    """Build a fresh series dictionary of station DENW053 for the given series id."""
    network, parameter, label, attribute = _SERIES_TABLE[series_id]
    return {'id': series_id, 'network_name': network, 'station_id': 'DENW053', 'parameter_name': parameter,
            'parameter_label': label, 'parameter_attribute': attribute}


def _single_selection(press_id=102660):
    """Return the expected one-series-per-variable result, only the pressure series varies between tests."""
    return {"no2": _series(16686), "o3": _series(16687), "cloudcover": _series(54036), "temp": _series(88491),
            "press": _series(press_id)}


class TestSelectDistinctDataOrigin:
    """Check grouping of series by variable name and filtering by data origin."""

    @pytest.fixture
    def vars(self):
        return [_series(series_id) for series_id in (16686, 16687, 16692, 16693, 54036, 88491, 102660, 26692)]

    def test_no_origin_given(self, vars):
        res, orig = _select_distinct_data_origin(vars, {})
        expected = {"no2": [_series(16686)],
                    "o3": [_series(16687)],
                    "cloudcover": [_series(54036)],
                    "temp": [_series(88491)],
                    "press": [_series(102660), _series(26692)]}

        assert check_nested_equality(res, expected) is True
        assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"}

    def test_different_origins(self, vars):
        origin = {"no2": "test", "temp": "", "cloudcover": "REA"}
        res, orig = _select_distinct_data_origin(vars, data_origin=origin)
        expected = {"o3": [_series(16687)],
                    "cloudcover": [_series(54036)],
                    "temp": [_series(16693)],
                    "press": [_series(102660), _series(26692)]}
        assert check_nested_equality(res, expected) is True
        assert orig == {"no2": "test", "o3": "", "cloudcover": "REA", "temp": "", "press": "REA"}


class TestSelectDistinctNetwork:
    """Check selection of a single series per variable according to network priority."""

    @pytest.fixture
    def vars(self):
        return {"no2": [_series(16686)],
                "o3": [_series(16687)],
                "cloudcover": [_series(54036)],
                "temp": [_series(88491)],
                "press": [_series(102660), _series(26692)]}

    def test_no_network_given(self, caplog, vars):
        caplog.set_level(logging.INFO)
        res = _select_distinct_network(vars, [])
        assert check_nested_equality(res, _single_selection()) is True

        # one info message per variable is expected, in the order of the fixture
        message = "Could not find a valid match for variable %s and networks []! Therefore, use first answer from JOIN:"
        for position, name in enumerate(("no2", "o3", "cloudcover", "temp", "press")):
            assert message % name in caplog.messages[position]

    def test_single_network_given(self, vars):
        res = _select_distinct_network(vars, ["UBA"])
        assert check_nested_equality(res, _single_selection()) is True

    def test_single_network_given_no_match(self, vars):
        with pytest.raises(ValueError) as e:  # AIRBASE not avail for all variables
            _select_distinct_network(vars, ["AIRBASE"])
        assert e.value.args[-1] == "Cannot find a valid match for requested networks ['AIRBASE'] and variable " \
                                   "no2 as only following networks are available in JOIN: ['UBA']"

        with pytest.raises(ValueError) as e:  # both requested networks are not available for all variables
            _select_distinct_network(vars, ["LUBW", "EMEP"])
        assert e.value.args[-1] == "Cannot find a valid match for requested networks ['LUBW', 'EMEP'] and variable " \
                                   "no2 as only following networks are available in JOIN: ['UBA']"

    def test_multiple_networks_given(self, vars):
        # UBA has priority: pressure is taken from UBA although AIRBASE is available, too
        res = _select_distinct_network(vars, ["UBA", "AIRBASE"])
        assert check_nested_equality(res, _single_selection()) is True

        # AIRBASE has priority: only pressure is reported by AIRBASE, all others fall back to UBA
        res = _select_distinct_network(vars, ["AIRBASE", "UBA"])
        assert check_nested_equality(res, _single_selection(press_id=26692)) is True
_select_distinct_series(vars, network_name="UBA") + assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 102660} + assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"} + + res, orig = _select_distinct_series(vars, network_name=["UBA", "EMEP", "AIRBASE"]) + assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 102660} + assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"} + + res, orig = _select_distinct_series(vars, network_name=["EMEP", "AIRBASE", "UBA"]) + assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 26692} + assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"} + + def test_network_not_available(self, vars): + with pytest.raises(ValueError) as e: + _select_distinct_series(vars, network_name="AIRBASE") + assert e.value.args[-1] == "Cannot find a valid match for requested networks ['AIRBASE'] and variable " \ + "no2 as only following networks are available in JOIN: ['UBA']" + + def test_different_network_and_origin(self, vars): + origin = {"no2": "test", "temp": "", "cloudcover": "REA"} + res, orig = _select_distinct_series(vars, data_origin=origin, network_name=["EMEP", "AIRBASE", "UBA"]) + assert res == {"o3": 16687, "press": 26692, "temp": 16693, "cloudcover": 54036} + assert orig == {"no2": "test", "o3": "", "cloudcover": "REA", "temp": "", "press": "REA"} + class TestSaveToPandas: