diff --git a/mlair/configuration/toar_data_v2_settings.py b/mlair/configuration/toar_data_v2_settings.py index a8bb9f42cf5a1967f150aa18019c2dbdc89f43a2..da17b90d32088431566f93ae951545ff5a168079 100644 --- a/mlair/configuration/toar_data_v2_settings.py +++ b/mlair/configuration/toar_data_v2_settings.py @@ -10,7 +10,7 @@ def toar_data_v2_settings(sampling="daily") -> Tuple[str, Dict]: :return: Service url and optional headers """ if sampling == "daily": # pragma: no branch - TOAR_SERVICE_URL = "https://toar-data.fz-juelich.de/statistics/api/v1/" + TOAR_SERVICE_URL = "https://toar-data.fz-juelich.de/api/v2/analysis/statistics/" headers = {} elif sampling == "hourly" or sampling == "meta": TOAR_SERVICE_URL = "https://toar-data.fz-juelich.de/api/v2/" diff --git a/mlair/data_handler/data_handler_single_station.py b/mlair/data_handler/data_handler_single_station.py index a1d3c4aa9ac084ee02828e05b1f068323df80505..f0a29c3a27dbc21a88601f2a096df09ae6d1069e 100644 --- a/mlair/data_handler/data_handler_single_station.py +++ b/mlair/data_handler/data_handler_single_station.py @@ -335,7 +335,7 @@ class DataHandlerSingleStation(AbstractDataHandler): file_name = self._set_file_name(path, station, statistics_per_var) meta_file = self._set_meta_file_name(path, station, statistics_per_var) if self.overwrite_local_data is True: - logging.debug(f"overwrite_local_data is true, therefore reload {file_name}") + logging.debug(f"{self.station[0]}: overwrite_local_data is true, therefore reload {file_name}") if os.path.exists(file_name): os.remove(file_name) if os.path.exists(meta_file): @@ -344,22 +344,22 @@ class DataHandlerSingleStation(AbstractDataHandler): store_data_locally=store_data_locally, data_origin=data_origin, time_dim=self.time_dim, target_dim=self.target_dim, iter_dim=self.iter_dim, window_dim=self.window_dim) - logging.debug(f"loaded new data") + logging.debug(f"{self.station[0]}: loaded new data") else: try: - logging.debug(f"try to load local data from: {file_name}") + logging.debug(f"{self.station[0]}: try to load local data from: {file_name}") data = xr.open_dataarray(file_name) meta = pd.read_csv(meta_file, index_col=0) self.check_station_meta(meta, station, data_origin, statistics_per_var) - logging.debug("loading finished") + logging.debug(f"{self.station[0]}: loading finished") except FileNotFoundError as e: - logging.debug(e) - logging.debug(f"load new data") + logging.debug(f"{self.station[0]}: {e}") + logging.debug(f"{self.station[0]}: load new data") data, meta = data_sources.download_data(file_name, meta_file, station, statistics_per_var, sampling, store_data_locally=store_data_locally, data_origin=data_origin, time_dim=self.time_dim, target_dim=self.target_dim, iter_dim=self.iter_dim) - logging.debug("loading finished") + logging.debug(f"{self.station[0]}: loading finished") # create slices and check for negative concentration. data = self._slice_prep(data, start=start, end=end) data = self.check_for_negative_concentrations(data) @@ -378,7 +378,7 @@ class DataHandlerSingleStation(AbstractDataHandler): continue m = ast.literal_eval(meta.at[k, station[0]]) if not check_nested_equality(select_from_dict(m, v.keys()), v): - logging.debug(f"meta data does not agree with given request for {k}: {v} (requested) != " + logging.debug(f"{station[0]}: meta data does not agree with given request for {k}: {v} (requested) != " f"{m} (local). Raise FileNotFoundError to trigger new grapping from web.") raise FileNotFoundError diff --git a/mlair/data_handler/data_handler_with_filter.py b/mlair/data_handler/data_handler_with_filter.py index 21c45991dfd92d2fbfd370a5dc07989ffbc0cfb6..ad6ccaaf8e3bd49e1337bbd6cafbe44f9ebb1019 100644 --- a/mlair/data_handler/data_handler_with_filter.py +++ b/mlair/data_handler/data_handler_with_filter.py @@ -374,7 +374,7 @@ class DataHandlerClimateFirFilterSingleStation(DataHandlerFirFilterSingleStation def apply_filter(self): """Apply FIR filter only on inputs.""" self.apriori = self.apriori.get(str(self)) if isinstance(self.apriori, dict) else self.apriori - logging.info(f"{self.station}: call ClimateFIRFilter") + logging.info(f"{self.station[0]}: call ClimateFIRFilter") climate_filter = ClimateFIRFilter(self.input_data.astype("float32"), self.fs, self.filter_order, self.filter_cutoff_freq, self.filter_window_type, time_dim=self.time_dim, var_dim=self.target_dim, diff --git a/mlair/helpers/data_sources/data_loader.py b/mlair/helpers/data_sources/data_loader.py index 094ce20d6c6c607be4f6d4044dad6978f14974f8..a3b50746560f80038ab92f8b23d500359c8af5d0 100644 --- a/mlair/helpers/data_sources/data_loader.py +++ b/mlair/helpers/data_sources/data_loader.py @@ -97,7 +97,37 @@ class EmptyQueryResult(Exception): pass -def get_data(opts: Dict, headers: Dict, as_json: bool = True, max_retries=5) -> Union[Dict, List, str]: +def get_data_with_query(opts: Dict, headers: Dict, as_json: bool = True, max_retries=5, timeout_base=60) -> bytes: + """ + Download data from statistics rest api. This API is based on three steps: (1) post query and retrieve job id, (2) + read status of id until finished, (3) download data with job id. + """ + url = create_url(**opts) + response_error = None + for retry in range(max_retries + 1): + time.sleep(random.random()) + try: + timeout = timeout_base * (2 ** retry) + logging.info(f"connect (retry={retry}, timeout={timeout}) {url}") + start_time = time.time() + with TimeTracking(name=url): + session = retries_session(max_retries=0) + response = session.get(url, headers=headers, timeout=(5, 5)) # timeout=(open, read) + while (time.time() - start_time) < timeout: + response = requests.get(response.json()["status"], timeout=(5, 5)) + if response.history: + break + time.sleep(2) + return response.content + except Exception as e: + time.sleep(retry) + logging.debug(f"There was an error for request {url}: {e}") + response_error = e + if retry + 1 >= max_retries: + raise EmptyQueryResult(f"There was an RetryError for request {url}: {response_error}") + + +def get_data(opts: Dict, headers: Dict, as_json: bool = True, max_retries=5, timeout_base=60) -> Union[Dict, List, str]: """ Download join data using requests framework. @@ -110,10 +140,11 @@ def get_data(opts: Dict, headers: Dict, as_json: bool = True, max_retries=5) -> :return: requested data (either as list or dictionary) """ url = create_url(**opts) - for retry in range(max_retries): + response_error = None + for retry in range(max_retries + 1): time.sleep(random.random()) try: - timeout = 60 * (2 ** retry) + timeout = timeout_base * (2 ** retry) logging.info(f"connect (retry={retry}, timeout={timeout}) {url}") with TimeTracking(name=url): session = retries_session(max_retries=0) @@ -122,11 +153,13 @@ def get_data(opts: Dict, headers: Dict, as_json: bool = True, max_retries=5) -> return response.json() if as_json is True else response.text else: logging.debug(f"There was an error (STATUS {response.status_code}) for request {url}") + response_error = f"STATUS {response.status_code}" except Exception as e: time.sleep(2 * (2 ** retry)) logging.debug(f"There was an error for request {url}: {e}") - if retry + 1 >= max_retries: - raise EmptyQueryResult(f"There was an RetryError for request {url}: {e}") + response_error = e + if retry + 1 >= max_retries: + raise EmptyQueryResult(f"There was an RetryError for request {url}: {response_error}") def correct_stat_name(stat: str) -> str: diff --git a/mlair/helpers/data_sources/toar_data_v2.py b/mlair/helpers/data_sources/toar_data_v2.py index 5d1cacc604f4288e48d12a72f8a24ba0d8b21fd1..3f2bc79d2bf3143452b30305692dd00f550ed930 100644 --- a/mlair/helpers/data_sources/toar_data_v2.py +++ b/mlair/helpers/data_sources/toar_data_v2.py @@ -10,10 +10,12 @@ from io import StringIO import pandas as pd import pytz from timezonefinder import TimezoneFinder +from io import BytesIO +import zipfile from mlair.configuration.toar_data_v2_settings import toar_data_v2_settings from mlair.helpers import to_list -from mlair.helpers.data_sources.data_loader import EmptyQueryResult, get_data, correct_stat_name +from mlair.helpers.data_sources.data_loader import EmptyQueryResult, get_data, correct_stat_name, get_data_with_query str_or_none = Union[str, None] @@ -120,9 +122,9 @@ def prepare_meta(meta, sampling, stat_var, var): for m in meta: opts = {} if sampling == "daily": - opts["timeseries_id"] = m.pop("id") + opts["id"] = m.pop("id") m["id"] = None - opts["names"] = stat_var[var] + opts["statistics"] = stat_var[var] opts["sampling"] = sampling out.append(([m], opts)) return out @@ -167,17 +169,32 @@ def load_timeseries_data(timeseries_meta, url_base, opts, headers, sampling): series_id = meta["id"] # opts = {"base": url_base, "service": f"data/timeseries/{series_id}"} opts = {"base": url_base, "service": f"data/timeseries", "param_id": series_id, "format": "csv", **opts} - if sampling != "hourly": + if sampling == "hourly": + res = get_data(opts, headers, as_json=False) + data = extract_timeseries_data(res, "string") + else: opts["service"] = None - res = get_data(opts, headers, as_json=False) - data = pd.read_csv(StringIO(res), comment="#", index_col="datetime", parse_dates=True, - infer_datetime_format=True) + opts["format"] = None + res = get_data_with_query(opts, headers, as_json=False) + data = extract_timeseries_data(res, "bytes") if len(data.index) > 0: - data = data[correct_stat_name(opts.get("names", "value"))].rename(meta["variable"]["name"]) + data = data[correct_stat_name(opts.get("statistics", "value"))].rename(meta["variable"]["name"]) coll.append(data) return coll +def extract_timeseries_data(result, result_format): + if result_format == "string": + return pd.read_csv(StringIO(result), comment="#", index_col="datetime", parse_dates=True, + infer_datetime_format=True) + elif result_format == "bytes": + with zipfile.ZipFile(BytesIO(result)) as file: + return pd.read_csv(BytesIO(file.read(file.filelist[0].filename)), comment="#", index_col="datetime", + parse_dates=True) + else: + raise ValueError(f"Unknown result format given: {result_format}") + + def load_station_information(station_name: List[str], url_base: str, headers: Dict): # opts = {"base": url_base, "service": f"stationmeta/{station_name[0]}"} opts = {"base": url_base, "service": f"stationmeta", "param_id": station_name[0]}