Commit 8d3f92f1 authored by lukas leufen's avatar lukas leufen 👻
Browse files

update join to be able to process different networks

parent 8414d618
Pipeline #104367 passed with stages
in 16 minutes and 41 seconds
......@@ -189,24 +189,76 @@ def load_series_information(station_name: List[str], station_type: str_or_none,
:return: all available series for requested station stored in a dictionary with parameter name (variable) as key
and the series id as value.
"""
network_name_opts = network_name if network_name is None else ",".join(helpers.to_list(network_name))
opts = {"base": join_url_base, "service": "search", "station_id": station_name[0], "station_type": station_type,
"network_name": network_name, "as_dict": "true",
"network_name": network_name_opts, "as_dict": "true",
"columns": "id,network_name,station_id,parameter_name,parameter_label,parameter_attribute"}
station_vars = get_data(opts, headers)
logging.debug(f"{station_name}: {station_vars}")
return _select_distinct_series(station_vars, data_origin)
return _select_distinct_series(station_vars, data_origin, network_name)
def _select_distinct_series(vars: List[Dict], data_origin: Dict = None) -> [Dict, Dict]:
def _select_distinct_series(vars: List[Dict], data_origin: Dict = None, network_name: Union[str, List[str]] = None) -> \
        [Dict, Dict]:
    """
    Reduce all series candidates of a station to a single series id per variable.

    Candidates are first grouped by variable and filtered by data origin, afterwards one candidate per variable is
    chosen according to the network priority, and finally only the series id of each chosen candidate is kept.

    :param vars: list of all series entries to select from
    :param data_origin: data origin to match series with, an empty dictionary is used if not provided
    :param network_name: single network name or list of network names in order of priority, no network preference
        is applied if not provided
    :return: dictionary with variable name as key and series id as value, and the data origin as returned by the
        data origin selection
    """
    if data_origin is None:
        data_origin = {}
    candidates, data_origin = _select_distinct_data_origin(vars, data_origin)

    if network_name is None:
        networks = []
    else:
        networks = helpers.to_list(network_name)
    chosen = _select_distinct_network(candidates, networks)

    # only the series id is required by the caller, drop all other series attributes
    series_ids = {}
    for name, series in chosen.items():
        series_ids[name] = series["id"]
    return series_ids, data_origin
def _select_distinct_network(vars: dict, network_name: list) -> dict:
    """
    Select distinct series regarding network name.

    The order the network names are provided in parameter `network_name` indicates priority (from high to low).
    Network names are compared case-insensitively. If no network name is provided, the first candidate of each
    variable is used and a logging info is issued.

    :param vars: dictionary with all series candidates already grouped by variable name as key. Value should be a list
        of possible candidates to select from. Each candidate must be a dictionary with at least keys `id` and
        `network_name`.
    :param network_name: list of networks to use with decreasing priority (1st element has highest priority). Can be
        an empty list (or None) indicating to always use the first candidate for each variable. A single network
        name given as string is treated as a list with one element.
    :return: dictionary with single series reference for each variable
    :raises ValueError: if network names are given but none of them matches any candidate of a variable
    """
    # normalise the input: a bare string would otherwise be iterated character by character
    if network_name is None:
        networks = []
    elif isinstance(network_name, str):
        networks = [network_name]
    else:
        networks = network_name

    selected = {}
    for var, series in vars.items():
        if not networks:  # no preference given: take first candidate and report which one is used
            selected[var] = series[0]
            logging.info(f"Could not find a valid match for variable {var} and networks {networks}! "
                         f"Therefore, use first answer from JOIN: {series[0]}")
            continue

        # walk through the networks from highest to lowest priority and stop at the very first matching candidate
        for network in networks:
            wanted = network.upper()
            match = next((candidate for candidate in series if candidate["network_name"].upper() == wanted), None)
            if match is not None:
                selected[var] = match
                break
        else:  # network names are provided but not a single candidate belongs to one of them
            raise ValueError(f"Cannot find a valid match for requested networks {networks} and "
                             f"variable {var} as only following networks are available in JOIN: "
                             f"{[candidate['network_name'] for candidate in series]}")
    return selected
def _select_distinct_data_origin(vars: List[Dict], data_origin: Dict) -> (Dict[str, List], Dict):
"""
Select distinct series regarding their data origin. Series are grouped as list according to their variable's name.
As series can be reported with different network attribution, results might contain multiple entries for a variable.
This method assumes the default data origin for chemical variables as `` (empty source) and for meteorological
variables as `REA`.
:param vars: list of all entries to check data origin for
:param data_origin: data origin to match series with, if empty default values are used
:return: dictionary with unique variable names as keys and list of respective series as values
"""
data_origin_default = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA",
"temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA",
"no": "", "no2": "", "o3": "", "pm10": "", "so2": ""}
if data_origin is None:
data_origin = {}
# ToDo: maybe press, wdir, wspeed from obs? or also temp, ... ?
selected = {}
for var in vars:
name = var["parameter_name"].lower()
......@@ -215,7 +267,7 @@ def _select_distinct_series(vars: List[Dict], data_origin: Dict = None) -> [Dict
data_origin.update({name: data_origin_default.get(name, "")})
attr = data_origin.get(name, "").lower()
if var_attr == attr:
selected[name] = var["id"]
selected[name] = selected.get(name, []) + helpers.to_list(var)
return selected, data_origin
......
......@@ -3,8 +3,10 @@ from typing import Iterable
import pytest
from mlair.helpers.join import *
from mlair.helpers.join import _save_to_pandas, _correct_stat_name, _lower_list, _select_distinct_series
from mlair.helpers.join import _save_to_pandas, _correct_stat_name, _lower_list, _select_distinct_series, \
_select_distinct_data_origin, _select_distinct_network
from mlair.configuration.join_settings import join_settings
from mlair.helpers.testing import check_nested_equality
class TestDownloadJoin:
......@@ -55,7 +57,7 @@ class TestGetData:
class TestLoadSeriesInformation:
def test_standard_query(self):
expected_subset = {'o3': 23031, 'no2': 39002, 'temp': 85584, 'wspeed': 17060}
expected_subset = {'o3': 17057, 'no2': 17058, 'temp': 85587, 'wspeed': 17060}
res, orig = load_series_information(['DEBW107'], None, None, join_settings()[0], {})
assert expected_subset.items() <= res.items()
......@@ -64,6 +66,163 @@ class TestLoadSeriesInformation:
assert res == {}
class TestSelectDistinctDataOrigin:
    """Tests for grouping series by variable name while filtering on the data origin."""

    @staticmethod
    def _series(series_id, network, parameter, label, attribute):
        """Create a single series record of station DENW053 with the keys used throughout these tests."""
        return {'id': series_id, 'network_name': network, 'station_id': 'DENW053', 'parameter_name': parameter,
                'parameter_label': label, 'parameter_attribute': attribute}

    @pytest.fixture
    def vars(self):
        """Series candidates with empty and `REA` parameter attribute, press is available from two networks."""
        records = [(16686, 'UBA', 'no2', 'NO2', ''),
                   (16687, 'UBA', 'o3', 'O3', ''),
                   (16692, 'UBA', 'press', 'PRESS--LANUV', ''),
                   (16693, 'UBA', 'temp', 'TEMP--LANUV', ''),
                   (54036, 'UBA', 'cloudcover', 'CLOUDCOVER', 'REA'),
                   (88491, 'UBA', 'temp', 'TEMP-REA-MIUB', 'REA'),
                   (102660, 'UBA', 'press', 'PRESS-REA-MIUB', 'REA'),
                   (26692, 'AIRBASE', 'press', 'PRESS', 'REA')]
        return [self._series(*record) for record in records]

    def test_no_origin_given(self, vars):
        """Without any given origin, the default origin of each variable decides which series are kept."""
        selected, used_origin = _select_distinct_data_origin(vars, {})
        reference = {
            "no2": [self._series(16686, 'UBA', 'no2', 'NO2', '')],
            "o3": [self._series(16687, 'UBA', 'o3', 'O3', '')],
            "cloudcover": [self._series(54036, 'UBA', 'cloudcover', 'CLOUDCOVER', 'REA')],
            "temp": [self._series(88491, 'UBA', 'temp', 'TEMP-REA-MIUB', 'REA')],
            "press": [self._series(102660, 'UBA', 'press', 'PRESS-REA-MIUB', 'REA'),
                      self._series(26692, 'AIRBASE', 'press', 'PRESS', 'REA')]}
        assert check_nested_equality(selected, reference) is True
        assert used_origin == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"}

    def test_different_origins(self, vars):
        """Explicitly given origins overrule the defaults, variables without matching series are dropped."""
        origin = {"no2": "test", "temp": "", "cloudcover": "REA"}
        selected, used_origin = _select_distinct_data_origin(vars, data_origin=origin)
        reference = {
            "o3": [self._series(16687, 'UBA', 'o3', 'O3', '')],
            "cloudcover": [self._series(54036, 'UBA', 'cloudcover', 'CLOUDCOVER', 'REA')],
            "temp": [self._series(16693, 'UBA', 'temp', 'TEMP--LANUV', '')],
            "press": [self._series(102660, 'UBA', 'press', 'PRESS-REA-MIUB', 'REA'),
                      self._series(26692, 'AIRBASE', 'press', 'PRESS', 'REA')]}
        assert check_nested_equality(selected, reference) is True
        assert used_origin == {"no2": "test", "o3": "", "cloudcover": "REA", "temp": "", "press": "REA"}
class TestSelectDistinctNetwork:
    """Tests for picking a single series per variable according to the requested network priority."""

    @staticmethod
    def _series(series_id, network, parameter, label, attribute):
        """Create a single series record of station DENW053 with the keys used throughout these tests."""
        return {'id': series_id, 'network_name': network, 'station_id': 'DENW053', 'parameter_name': parameter,
                'parameter_label': label, 'parameter_attribute': attribute}

    @classmethod
    def _candidates(cls):
        """Build fresh candidates grouped by variable, only press offers a second network (AIRBASE)."""
        return {
            "no2": [cls._series(16686, 'UBA', 'no2', 'NO2', '')],
            "o3": [cls._series(16687, 'UBA', 'o3', 'O3', '')],
            "cloudcover": [cls._series(54036, 'UBA', 'cloudcover', 'CLOUDCOVER', 'REA')],
            "temp": [cls._series(88491, 'UBA', 'temp', 'TEMP-REA-MIUB', 'REA')],
            "press": [cls._series(102660, 'UBA', 'press', 'PRESS-REA-MIUB', 'REA'),
                      cls._series(26692, 'AIRBASE', 'press', 'PRESS', 'REA')]}

    @classmethod
    def _expected(cls, press_position=0):
        """Build the expected selection: first candidate of each variable, press taken from `press_position`."""
        picked = {name: series[0] for name, series in cls._candidates().items()}
        picked["press"] = cls._candidates()["press"][press_position]
        return picked

    @pytest.fixture
    def vars(self):
        """Series candidates grouped by variable name."""
        return self._candidates()

    def test_no_network_given(self, caplog, vars):
        """Without network preference the first candidate is used and a message is logged per variable."""
        caplog.set_level(logging.INFO)
        res = _select_distinct_network(vars, [])
        assert check_nested_equality(res, self._expected()) is True
        message = "Could not find a valid match for variable %s and networks []! Therefore, use first answer from JOIN:"
        for position, name in enumerate(["no2", "o3", "cloudcover", "temp", "press"]):
            assert message % name in caplog.messages[position]

    def test_single_network_given(self, vars):
        """A single network that is available for all variables selects its series everywhere."""
        res = _select_distinct_network(vars, ["UBA"])
        assert check_nested_equality(res, self._expected()) is True

    def test_single_network_given_no_match(self, vars):
        """Requested networks without a candidate for a variable must end in a ValueError."""
        with pytest.raises(ValueError) as e:  # AIRBASE not avail for all variables
            _select_distinct_network(vars, ["AIRBASE"])
        assert e.value.args[-1] == ("Cannot find a valid match for requested networks ['AIRBASE'] and variable "
                                    "no2 as only following networks are available in JOIN: ['UBA']")

        with pytest.raises(ValueError) as e:  # both requested networks are not available for all variables
            _select_distinct_network(vars, ["LUBW", "EMEP"])
        assert e.value.args[-1] == ("Cannot find a valid match for requested networks ['LUBW', 'EMEP'] and variable "
                                    "no2 as only following networks are available in JOIN: ['UBA']")

    def test_multiple_networks_given(self, vars):
        """The order of the requested networks decides which press series is selected."""
        res = _select_distinct_network(vars, ["UBA", "AIRBASE"])
        assert check_nested_equality(res, self._expected(press_position=0)) is True

        res = _select_distinct_network(vars, ["AIRBASE", "UBA"])
        assert check_nested_equality(res, self._expected(press_position=1)) is True
class TestSelectDistinctSeries:
@pytest.fixture
......@@ -71,8 +230,7 @@ class TestSelectDistinctSeries:
return [{'id': 16686, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'no2',
'parameter_label': 'NO2', 'parameter_attribute': ''},
{'id': 16687, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'o3',
'parameter_label': 'O3',
'parameter_attribute': ''},
'parameter_label': 'O3', 'parameter_attribute': ''},
{'id': 16692, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press',
'parameter_label': 'PRESS--LANUV', 'parameter_attribute': ''},
{'id': 16693, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp',
......@@ -82,7 +240,9 @@ class TestSelectDistinctSeries:
{'id': 88491, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'temp',
'parameter_label': 'TEMP-REA-MIUB', 'parameter_attribute': 'REA'},
{'id': 102660, 'network_name': 'UBA', 'station_id': 'DENW053', 'parameter_name': 'press',
'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'}]
'parameter_label': 'PRESS-REA-MIUB', 'parameter_attribute': 'REA'},
{'id': 26692, 'network_name': 'AIRBASE', 'station_id': 'DENW053', 'parameter_name': 'press',
'parameter_label': 'PRESS', 'parameter_attribute': 'REA'}]
def test_no_origin_given(self, vars):
res, orig = _select_distinct_series(vars)
......@@ -98,6 +258,31 @@ class TestSelectDistinctSeries:
assert res == {"cloudcover": 54036, "no2": 16686, "o3": 16687, "press": 102660, "temp": 88491}
assert orig == {"no2": "", "o3": "", "temp": "REA", "press": "REA", "cloudcover": "REA"}
def test_different_networks(self, vars):
res, orig = _select_distinct_series(vars, network_name="UBA")
assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 102660}
assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"}
res, orig = _select_distinct_series(vars, network_name=["UBA", "EMEP", "AIRBASE"])
assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 102660}
assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"}
res, orig = _select_distinct_series(vars, network_name=["EMEP", "AIRBASE", "UBA"])
assert res == {"no2": 16686, "o3": 16687, "cloudcover": 54036, "temp": 88491, "press": 26692}
assert orig == {"no2": "", "o3": "", "cloudcover": "REA", "temp": "REA", "press": "REA"}
def test_network_not_available(self, vars):
with pytest.raises(ValueError) as e:
_select_distinct_series(vars, network_name="AIRBASE")
assert e.value.args[-1] == "Cannot find a valid match for requested networks ['AIRBASE'] and variable " \
"no2 as only following networks are available in JOIN: ['UBA']"
def test_different_network_and_origin(self, vars):
origin = {"no2": "test", "temp": "", "cloudcover": "REA"}
res, orig = _select_distinct_series(vars, data_origin=origin, network_name=["EMEP", "AIRBASE", "UBA"])
assert res == {"o3": 16687, "press": 26692, "temp": 16693, "cloudcover": 54036}
assert orig == {"no2": "test", "o3": "", "cloudcover": "REA", "temp": "", "press": "REA"}
class TestSaveToPandas:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment