From 27aa2010b203a550bb4aebadacece9af734e2e93 Mon Sep 17 00:00:00 2001
From: Carsten Hinz <c.hinz@fz-juelich.de>
Date: Tue, 7 May 2024 15:23:32 +0200
Subject: [PATCH] added option to pass further arguments to the REST API
 query. Also added handling in case the request to the TOAR DB fails.

---
 tests/get_sample_data.ipynb      | 49 ++++++++++++++++++++++++++++----
 toargridding/gridding.py         |  3 +-
 toargridding/metadata.py         | 13 +++++++--
 toargridding/toar_rest_client.py | 48 +++++++++++++++++++++++++------
 4 files changed, 96 insertions(+), 17 deletions(-)

diff --git a/tests/get_sample_data.ipynb b/tests/get_sample_data.ipynb
index 790b76f..df7f45b 100644
--- a/tests/get_sample_data.ipynb
+++ b/tests/get_sample_data.ipynb
@@ -2,9 +2,17 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2024-05-07 14:38:11.226359\n"
+     ]
+    }
+   ],
    "source": [
     "from datetime import datetime, timedelta\n",
     "\n",
@@ -18,7 +26,8 @@
     "statistic = \"mean\"\n",
     "\n",
     "time = TimeSample(start, end, sampling=sampling)\n",
-    "metadata = Metadata.construct(\"mole_fraction_of_ozone_in_air\", statistic, time)\n",
+    "# { \"station_type_of_area\" : \"urban\" } is not a known category\n",
+    "metadata = Metadata.construct(\"mole_fraction_of_ozone_in_air\", time, statistic, { \"toar1_category\" : \"Urban\"})\n",
     "\n",
     "start_time = datetime.now()\n",
     "print(start_time)"
@@ -26,9 +35,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('daterange', '2010-01-01T00:00:00,2011-01-02T00:00:00'), ('variable_id', '5'), ('statistics', 'mean'), ('sampling', 'daily'), ('min_data_capture', '0'), ('metadata_scheme', 'basic'), ('limit', 'None'), ('format', 'by_statistic'), ('moreOptions', {'station_type_of_area': 'urban'})]\n",
+      "query not in cache\n",
+      "[('daterange', '2010-01-01T00:00:00,2011-01-02T00:00:00'), ('variable_id', '5'), ('statistics', 'mean'), ('sampling', 'daily'), ('min_data_capture', '0'), ('metadata_scheme', 'basic'), ('limit', 'None'), ('format', 'by_statistic'), ('moreOptions', {'station_type_of_area': 'urban'})]\n"
+     ]
+    },
+    {
+     "ename": "RuntimeError",
+     "evalue": "Request was not successful. Response by TOAR database: {\"detail\":\"'An unknown argument was received: station_type_of_area.'\"}",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+      "File \u001b[0;32m~/Eigene Daten/FZJ/JSC/workingDirectories/TOAR/toargridding/toargridding/toar_rest_client.py:236\u001b[0m, in \u001b[0;36mConnection.query_for_status_endpoint\u001b[0;34m(self, query_options)\u001b[0m\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 236\u001b[0m status_endpoint \u001b[38;5;241m=\u001b[39m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstatus\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 237\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n",
+      "\u001b[0;31mKeyError\u001b[0m: 'status'",
+      "\nDuring handling of the above exception, another exception occurred:\n",
+      "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[2], line 14\u001b[0m\n\u001b[1;32m 10\u001b[0m download_dir\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 12\u001b[0m analysis_service \u001b[38;5;241m=\u001b[39m AnalysisServiceDownload(statistics_endpoint, cache_dir, download_dir)\n\u001b[0;32m---> 14\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43manalysis_service\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmetadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 16\u001b[0m end_time \u001b[38;5;241m=\u001b[39m datetime\u001b[38;5;241m.\u001b[39mnow()\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28mprint\u001b[39m(end_time\u001b[38;5;241m-\u001b[39mstart_time)\n",
+      "File \u001b[0;32m~/Eigene Daten/FZJ/JSC/workingDirectories/TOAR/toargridding/toargridding/toar_rest_client.py:294\u001b[0m, in \u001b[0;36mAnalysisService.get_data\u001b[0;34m(self, metadata)\u001b[0m\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_data\u001b[39m(\u001b[38;5;28mself\u001b[39m, metadata: Metadata) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m AnalysisRequestResult:\n\u001b[1;32m 281\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"main function to obtain data from the TOAR DB\u001b[39;00m\n\u001b[1;32m 282\u001b[0m \n\u001b[1;32m 283\u001b[0m \u001b[38;5;124;03m Handles requesting and loading of data into memory as soon as they are available.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;124;03m Requested data and statistics, station coordinates and metadata of the request\u001b[39;00m\n\u001b[1;32m 292\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 294\u001b[0m timeseries, timeseries_metadata \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_timeseries_and_metadata\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmetadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 295\u001b[0m coords \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_clean_coords(timeseries_metadata)\n\u001b[1;32m 296\u001b[0m timeseries \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_clean_timeseries(timeseries, metadata)\n",
+      "File \u001b[0;32m~/Eigene Daten/FZJ/JSC/workingDirectories/TOAR/toargridding/toargridding/toar_rest_client.py:420\u001b[0m, in \u001b[0;36mAnalysisServiceDownload.get_timeseries_and_metadata\u001b[0;34m(self, metadata)\u001b[0m\n\u001b[1;32m 417\u001b[0m needs_fresh_download \u001b[38;5;241m=\u001b[39m (\u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_downloaded) \u001b[38;5;129;01mor\u001b[39;00m (\u001b[38;5;129;01mnot\u001b[39;00m filename\u001b[38;5;241m.\u001b[39mis_file())\n\u001b[1;32m 419\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m needs_fresh_download:\n\u001b[0;32m--> 420\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconnection\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 421\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(filename, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mw+b\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m downloaded_file:\n\u001b[1;32m 422\u001b[0m downloaded_file\u001b[38;5;241m.\u001b[39mwrite(response\u001b[38;5;241m.\u001b[39mcontent)\n",
+      "File \u001b[0;32m~/Eigene Daten/FZJ/JSC/workingDirectories/TOAR/toargridding/toargridding/toar_rest_client.py:184\u001b[0m, in \u001b[0;36mConnection.get\u001b[0;34m(self, query_options)\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget\u001b[39m(\u001b[38;5;28mself\u001b[39m, query_options : QueryOptions):\n\u001b[1;32m 178\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"get results for a request.\u001b[39;00m\n\u001b[1;32m 179\u001b[0m \n\u001b[1;32m 180\u001b[0m \u001b[38;5;124;03m This is the main function to obtained data from the TOAR DB. It will start requests or lookup if an already started requests is finished.\u001b[39;00m\n\u001b[1;32m 181\u001b[0m \n\u001b[1;32m 182\u001b[0m \u001b[38;5;124;03m Throws an exception, if the results are not available after the waiting time. A restart of the function continues the regular lookup for results.\u001b[39;00m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 184\u001b[0m status_endpoint \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_status_endpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, wait_time \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwait_seconds):\n\u001b[1;32m 187\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtry: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, wait_time: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mwait_time\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
+      "File \u001b[0;32m~/Eigene Daten/FZJ/JSC/workingDirectories/TOAR/toargridding/toargridding/toar_rest_client.py:221\u001b[0m, in \u001b[0;36mConnection.get_status_endpoint\u001b[0;34m(self, query_options)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 219\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquery not in cache\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 221\u001b[0m status_endpoint \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mquery_for_status_endpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery_options\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m status_endpoint\n",
+      "File \u001b[0;32m~/Eigene Daten/FZJ/JSC/workingDirectories/TOAR/toargridding/toargridding/toar_rest_client.py:238\u001b[0m, in \u001b[0;36mConnection.query_for_status_endpoint\u001b[0;34m(self, query_options)\u001b[0m\n\u001b[1;32m 236\u001b[0m status_endpoint \u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mjson()[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 237\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[0;32m--> 238\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRequest was not successful. Response by TOAR database: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mtext\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 239\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache\u001b[38;5;241m.\u001b[39mput(query_options\u001b[38;5;241m.\u001b[39mcache_key, status_endpoint)\n\u001b[1;32m 241\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m status_endpoint\n",
+      "\u001b[0;31mRuntimeError\u001b[0m: Request was not successful. Response by TOAR database: {\"detail\":\"'An unknown argument was received: station_type_of_area.'\"}"
+     ]
+    }
+   ],
    "source": [
     "from pathlib import Path\n",
     "from toargridding.toar_rest_client import AnalysisServiceDownload\n",
diff --git a/toargridding/gridding.py b/toargridding/gridding.py
index be6b552..5523c9e 100644
--- a/toargridding/gridding.py
+++ b/toargridding/gridding.py
@@ -16,6 +16,7 @@ def get_gridded_toar_data(
     time: TimeSample,
     variables: list[str],
     stats: list[str],
+    **kwargs
 ) -> tuple[list[xr.Dataset], list[Metadata]]:
     """ API to download data as xarrays
 
@@ -40,7 +41,7 @@ def get_gridded_toar_data(
     """
 
     metadatas = [
-        Metadata.construct(var, time, stat) for var, stat in product(variables, stats)
+        Metadata.construct(standart_name=var, time=time, stat=stat, moreOptions=kwargs) for var, stat in product(variables, stats)
     ]
 
     datasets = []
diff --git a/toargridding/metadata.py b/toargridding/metadata.py
index fd75551..6ace341 100644
--- a/toargridding/metadata.py
+++ b/toargridding/metadata.py
@@ -1,6 +1,6 @@
 from datetime import datetime, timedelta
 from enum import Enum
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 import numpy as np
 import pandas as pd
@@ -61,6 +61,8 @@ class TimeSample:
     def as_datetime_index(self) -> pd.DatetimeIndex:
         """Conversion to array with all sampled time points
         """
+        print(self.start)
+        print(self.end)
         return pd.period_range(self.start, self.end, freq=self.frequency).to_timestamp()
 
     @property
@@ -100,13 +102,16 @@ class Metadata:
         requested time points
     statistics:
         statistical processing applied by the TOAR database for this request
+    moreOptions:
+        collection of additional query options for the REST API.
     """
     variable: TOARVariable
     time: TimeSample
     statistic: str
+    moreOptions: Dict = field(default_factory=dict)
 
     @staticmethod
-    def construct(standart_name: str, time: TimeSample, stat: str):
+    def construct(standart_name: str, time: TimeSample, stat: str, moreOptions: Dict = {}):
         """constructor
 
         Parameters:
@@ -117,10 +122,12 @@ class Metadata:
             statistical analysis to be done by the TOAR database
         time:
             temporal sampling of this request
+        moreOptions:
+            collection of additional query options for the REST API.
         """
 
         variable = TOARVariable.get(standart_name)
-        return Metadata(variable, time, stat)
+        return Metadata(variable, time, stat, moreOptions)
 
     @property
     def statistic(self) -> str:  # TODO make better
diff --git a/toargridding/toar_rest_client.py b/toargridding/toar_rest_client.py
index e01b3f7..554960f 100644
--- a/toargridding/toar_rest_client.py
+++ b/toargridding/toar_rest_client.py
@@ -1,7 +1,7 @@
 import time
 import io
 from zipfile import ZipFile
-from dataclasses import dataclass, asdict
+from dataclasses import dataclass, asdict, field
 from contextlib import contextmanager
 import json
 from pathlib import Path
@@ -9,6 +9,8 @@ from pathlib import Path
 import requests
 import pandas as pd
 
+from typing import Dict
+
 from toargridding.metadata import Metadata, AnalysisRequestResult, Coordinates
 
 
@@ -39,6 +41,8 @@ class QueryOptions:
         limit to amount of extracted data; see Quick Start for TOAR Analysis Service
     format:
         output format; see Quick Start for TOAR Analysis Service
+    moreOptions:
+        dict with additional query options for the request to the TOAR database.
""" daterange: str @@ -49,6 +53,7 @@ class QueryOptions: metadata_scheme: str = "basic" limit: str = "None" format: str = "by_statistic" + moreOptions : Dict = field(default_factory=lambda: {}) #needs to be last element for to dict factory @staticmethod def from_metadata(metadata: Metadata): @@ -62,14 +67,26 @@ class QueryOptions: variable_id=str(metadata.variable.toar_id), statistics=metadata.statistic, sampling=metadata.time.sampling, + moreOptions=metadata.moreOptions ) @property def cache_key(self): """creation to identify the request in the cache of known request. """ - return "".join(asdict(self).values()) + return "".join(asdict(self, dict_factory=quarryToDict).values()) + +def quarryToDict(data : QueryOptions): + print(data) + out = { field : value for field, value in data[:-1] } + extraVals = data[-1][1] + for field, value in extraVals.items(): + if not field in data: + out[field] = value + else: + raise ValueError(f"Providing invalid value for TAOR database: {field} is controlled by Metadata class") + return out class Cache: """cache to store download links for requests to the TOAD database @@ -157,7 +174,7 @@ class Connection: # max wait time is 30min self.wait_seconds = [minutes * 60 for minutes in (5, 5, 5, 5, 5, 5)] - def get(self, query_options): + def get(self, query_options : QueryOptions): """get results for a request. This is the main function to obtained data from the TOAR DB. It will start requests or lookup if an already started requests is finished. @@ -214,14 +231,17 @@ class Connection: query_options: request to the TOAR database. """ - response = self.wait_and_get(self.endpoint, asdict(query_options)) - status_endpoint = response.json()["status"] + response = self.wait_and_get(self.endpoint, asdict(query_options, dict_factory=quarryToDict)) + try: + status_endpoint = response.json()["status"] + except: + raise RuntimeError(f"Request was not successful. Response by TOAR database: {response.text}") self.cache.put(query_options.cache_key, status_endpoint) return status_endpoint def wait_and_get( - self, endpoint, query_options=None, wait_secs=None, timeout=(3.05, 20) + self, endpoint : str, query_options : Dict =None, wait_secs=None, timeout=(3.05, 20) ): """accesses given endpoint @@ -312,11 +332,23 @@ class AnalysisService: return: timeseries without invalid numbers (none, NaN, etc) """ + # TODO: Why are there different numbers of columns to be dropped?? # TODO maybe use cf-index here already ? first, last = timeseries.columns[0], timeseries.columns[-1] + ##here we observe some differences in the number of timestamps. # remove data where utc -> sun/local ? time conversion leads to dateshift - timeseries.drop(columns=[first, last], inplace=True) - timeseries.columns = metadata.time.as_datetime_index() + newDates = metadata.time.as_datetime_index() + if len(timeseries.columns) == len(newDates)+2: + timeseries.drop(columns=[first, last], inplace=True) + print("Info: removed first and last column from retrieved timeseries") + elif len(timeseries.columns) == len(newDates)+1: + timeseries.drop(columns=[last], inplace=True) + print("Info: removed last column from retrieved timeseries") + else: + raise RuntimeError(f"There is a mismatch in the timestamps...\nDownloaded:{timeseries.columns}\nFrom Metadata: {newDates}") + print(timeseries.columns) + print(newDates) + timeseries.columns = newDates all_na = timeseries.isna().all(axis=1) timeseries = timeseries[~all_na] -- GitLab