diff --git a/src/helpers.py b/src/helpers.py index 5646eb94dbd43941b5673e64f6b70a7ed0e51c26..4312eac2134a6a7f73d90f8e429203a791bfba83 100644 --- a/src/helpers.py +++ b/src/helpers.py @@ -96,24 +96,24 @@ class TimeTracking(object): logging.info(f"undefined job finished after {self}") -def prepare_host(create_new=True): +def prepare_host(create_new=True, sampling="daily"): hostname = socket.gethostname() try: user = os.getlogin() except OSError: user = "default" - if hostname == 'ZAM144': - path = f'/home/{user}/Data/toar_daily/' - elif hostname == 'zam347': - path = f'/home/{user}/Data/toar_daily/' - elif hostname == 'linux-aa9b': - path = f'/home/{user}/machinelearningtools/data/toar_daily/' - elif (len(hostname) > 2) and (hostname[:2] == 'jr'): - path = f'/p/project/cjjsc42/{user}/DATA/toar_daily/' - elif (len(hostname) > 2) and (hostname[:2] == 'jw'): - path = f'/p/home/jusers/{user}/juwels/intelliaq/DATA/toar_daily/' + if hostname == "ZAM144": + path = f"/home/{user}/Data/toar_{sampling}/" + elif hostname == "zam347": + path = f"/home/{user}/Data/toar_{sampling}/" + elif hostname == "linux-aa9b": + path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" + elif (len(hostname) > 2) and (hostname[:2] == "jr"): + path = f"/p/project/cjjsc42/{user}/DATA/toar_{sampling}/" + elif (len(hostname) > 2) and (hostname[:2] == "jw"): + path = f"/p/home/jusers/{user}/juwels/intelliaq/DATA/toar_{sampling}/" elif "runner-6HmDp9Qd-project-2411-concurrent" in hostname: - path = f'/home/{user}/machinelearningtools/data/toar_daily/' + path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" else: logging.error(f"unknown host '{hostname}'") raise OSError(f"unknown host '{hostname}'") diff --git a/src/join.py b/src/join.py index f20623a08d91e8a2f8fdd95c97183c4c1d6359eb..bb1db30e69209bb59b2fae1e823821dce5961faf 100644 --- a/src/join.py +++ b/src/join.py @@ -44,6 +44,9 @@ def download_join(station_name: Union[str, List[str]], stat_var: dict, station_t # load series 
information vars_dict = load_series_information(station_name, station_type, network_name, join_url_base, headers) + # correct stat_var values if data is not aggregated (hourly) + [stat_var.update({k: "values"}) for k in stat_var.keys()] + # download all variables with given statistic data = None df = None @@ -53,12 +56,18 @@ logging.info('load: {}'.format(var)) # create data link - opts = {'base': join_url_base, 'service': 'stats', 'id': vars_dict[var], #'statistics': stat_var[var], + opts = {'base': join_url_base, 'service': 'stats', 'id': vars_dict[var], 'statistics': stat_var[var], 'sampling': sampling, 'capture': 0, 'min_data_length': 1460, 'format': 'json'} # load data data = get_data(opts, headers) + # adjust data format if given as list of lists + # no branch coverage because this only happens when downloading hourly data using a secret token, not available + # for CI testing. + if isinstance(data, list): # pragma: no branch + data = correct_data_format(data) + # correct namespace of statistics stat = _correct_stat_name(stat_var[var]) @@ -75,6 +84,23 @@ raise EmptyQueryResult("No data found in JOIN.") +def correct_data_format(data): + """ + Transform to the standard data format. For some cases (e.g. hourly data), the data is returned as list instead of + a dictionary with keys datetime, values and metadata. This function addresses this issue and transforms the data + into the dictionary version. + :param data: data in hourly format + :return: the same data but formatted to fit with aggregated format + """ + formatted = {"datetime": [], + "values": [], + "metadata": data[-1]} + for d in data[:-1]: + for k, v in zip(["datetime", "values"], d): + formatted[k].append(v) + return formatted + + def get_data(opts: Dict, headers: Dict) -> Union[Dict, List]: """ Download join data using requests framework.
Data is returned as json like structure. Depending on the response @@ -116,8 +142,8 @@ def _save_to_pandas(df: Union[pd.DataFrame, None], data: dict, stat: str, var: s :param var: variable the data is from (e.g. 'o3') :return: new created or concatenated data frame """ - if len(data[0][0]) == 19: - str_format = "%Y-%m-%d %H:%M:%s" + if len(data["datetime"][0]) == 19: + str_format = "%Y-%m-%d %H:%M:%S" else: str_format = "%Y-%m-%d %H:%M" index = map(lambda s: dt.datetime.strptime(s, str_format), data['datetime']) @@ -164,6 +190,7 @@ def create_url(base: str, service: str, **kwargs: Union[str, int, float, None]) if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) var_all_dic = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', 'cloudcover': 'average_values', 'pblheight': 'maximum'}