diff --git a/README.md b/README.md
index 946f6e1296cfa1838803bc85becc23435173d131..85b954503d2c36c1918ec0e85a62bd2efab27de2 100644
--- a/README.md
+++ b/README.md
@@ -42,8 +42,17 @@ tests/produce_data.ipynb
 ```
 Provides an example of how to download data, apply gridding, and save the results as netCDF files.
 ## Retrieving data
+```
 get_sample_data.ipynb
+```
 Downloads data from the TOAR database.
+
+## Retrieving data with a manual request
+```
+get_sample_data_manual.ipynb
+```
+Downloads data from the TOAR database by manually creating the request to the TOAR database.
+
 ## Retrieving data and visualization
 ```
 quality_controll.ipynb
@@ -53,7 +62,7 @@ The data are downloaded and reused for subsequent executions of this notebook.
 
 # Supported Grids
 
-The first supported grid is the Cartesian grid.
+The first supported grid is a regular grid with longitude and latitude.
 
 # Supported Variables
diff --git a/tests/get_sample_data.ipynb b/tests/get_sample_data.ipynb
index d5913c651ad63f4a584b43bf03e3f8881a5fca9b..a5a16cf9139b1e955eadccb1790c771dcb191f5d 100644
--- a/tests/get_sample_data.ipynb
+++ b/tests/get_sample_data.ipynb
@@ -11,8 +11,8 @@
     "from toargridding.metadata import TimeSample, Metadata\n",
     "\n",
     "sampling = \"daily\" # FIXME check monthly !!!\n",
-    "start = datetime(2016, 3, 1)\n",
-    "end = datetime(2016, 3, 3)\n",
+    "start = datetime(2010, 1, 1)\n",
+    "end = datetime(2011, 1, 1)\n",
     "\n",
     "statistics_endpoint = \"https://toar-data.fz-juelich.de/api/v2/analysis/statistics/\"\n",
     "statistic = \"mean\"\n",
@@ -31,12 +31,15 @@
    "outputs": [],
    "source": [
     "from pathlib import Path\n",
-    "\n",
     "from toargridding.toar_rest_client import AnalysisServiceDownload\n",
     "\n",
-    "toargridding_base_path = Path(\"/home/simon/Projects/toar/toargridding/\")\n",
-    "cache_dir = toargridding_base_path / \"tests\" / \"results\"\n",
-    "download_dir = toargridding_base_path / \"tests\" / \"data\"\n",
+    "\n",
+    "# creation of output directories\n",
+    "toargridding_base_path = Path(\".\")\n",
+    "cache_dir = toargridding_base_path / \"results\"\n",
+    "download_dir = toargridding_base_path / \"data\"\n",
+    "cache_dir.mkdir(parents=True, exist_ok=True)\n",
+    "download_dir.mkdir(parents=True, exist_ok=True)\n",
     "\n",
     "analysis_service = AnalysisServiceDownload(statistics_endpoint, cache_dir, download_dir)\n",
     "\n",
@@ -53,6 +56,9 @@
    "outputs": [],
    "source": [
     "# %%script false --no-error\n",
+    "\n",
+    "# manual access to data\n",
+    "\n",
     "import requests\n",
     "\n",
     "end_with_padding = end + timedelta(1)\n",
@@ -82,7 +88,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "response.headers[\"Content-Type\"] == \"application/json\"\n"
+    "print(response.headers[\"Content-Type\"] == \"application/json\")\n"
    ]
   },
   {
@@ -93,11 +99,12 @@
    "source": [
     "import requests\n",
     "\n",
-    "status_endpoint = \"https://toar-data.fz-juelich.de/api/v2/analysis/status/5ec3a54c-322c-4bce-a3f5-fdf485a56514\"\n",
+    "# this hard-coded link does not work.\n",
+    "#status_endpoint = \"https://toar-data.fz-juelich.de/api/v2/analysis/status/5ec3a54c-322c-4bce-a3f5-fdf485a56514\"\n",
     "\n",
-    "response = requests.get(status_endpoint)\n",
-    "print(response.headers)\n",
-    "print(response.json())"
+    "#response = requests.get(status_endpoint)\n",
+    "#print(response.headers)\n",
+    "#print(response.json())"
@@ -124,7 +131,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.8"
+   "version": "3.11.5"
   }
  },
 "nbformat": 4,
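For orientation, the changed notebook cells above amount to the following setup (a condensed sketch; the keyword names of `TimeSample` and the CF standard name `mole_fraction_of_ozone_in_air` are illustrative assumptions, not shown in this diff):

```python
from datetime import datetime
from pathlib import Path

from toargridding.metadata import TimeSample, Metadata
from toargridding.toar_rest_client import AnalysisServiceDownload

statistics_endpoint = "https://toar-data.fz-juelich.de/api/v2/analysis/statistics/"

# output directories relative to the notebook, created on demand
toargridding_base_path = Path(".")
cache_dir = toargridding_base_path / "results"
download_dir = toargridding_base_path / "data"
cache_dir.mkdir(parents=True, exist_ok=True)
download_dir.mkdir(parents=True, exist_ok=True)

analysis_service = AnalysisServiceDownload(statistics_endpoint, cache_dir, download_dir)

# one year of daily means, matching the new start/end dates above;
# the argument names of TimeSample are assumed here
time = TimeSample(start=datetime(2010, 1, 1), end=datetime(2011, 1, 1), sampling="daily")
metadata = Metadata.construct("mole_fraction_of_ozone_in_air", "mean", time)
```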
diff --git a/tests/get_sample_data_old.ipynb b/tests/get_sample_data_manual.ipynb
similarity index 100%
rename from tests/get_sample_data_old.ipynb
rename to tests/get_sample_data_manual.ipynb
diff --git a/toargridding/gridding.py b/toargridding/gridding.py
index 3268dc267c8ca3fd1c00e0c33c59de529ad6c9e4..3b6ab3677e86bfe069884076d30a82aef8044c08 100644
--- a/toargridding/gridding.py
+++ b/toargridding/gridding.py
@@ -17,6 +17,28 @@ def get_gridded_toar_data(
     variables: list[str],
     stats: list[str],
 ) -> tuple[list[xr.Dataset], list[Metadata]]:
+    """API to download data as xarrays
+
+    The function creates all combinations of the variables and stats lists.
+
+    Parameters:
+    ----------
+    analysis_service:
+        access to the REST API; manages the download of data
+    grid:
+        grid for the output data
+    time:
+        sampled values in time
+    variables:
+        list of variables to be extracted from the TOAR database
+    stats:
+        list of statistical properties to be extracted from the TOAR database
+
+    return:
+    -------
+    Gridded datasets for each combination of variables and stats, and the matching metadata for each dataset.
+    """
+
     metadatas = [
         Metadata.construct(var, stat, time) for var, stat in product(variables, stats)
     ]
@@ -27,4 +49,5 @@ def get_gridded_toar_data(
         ds = grid.as_xarray(data)
         datasets.append(ds)
 
+    # TODO: return this as a list of tuples to keep data and metadata together?
     return datasets, metadatas
diff --git a/toargridding/grids.py b/toargridding/grids.py
index d8f894938d5aa89dd7ef9aeb422f14f0a03959a4..cb0f946a72b124c965872ee798086bc3072ea2e4 100644
--- a/toargridding/grids.py
+++ b/toargridding/grids.py
@@ -31,6 +31,8 @@ class GridDefinition(ABC):
 
     @staticmethod
     def construct(grid_type: GridType, **kwargs):
+        """creation of the requested grid type
+        """
         match (grid_type):
             case GridType.regular:
                 return RegularGrid(**kwargs)
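The docstring added to `get_gridded_toar_data` above describes the main high-level entry point. Combined with the `GridDefinition.construct` factory it suggests a call along these lines (a sketch continuing the objects from the previous example; the resolution keywords of `RegularGrid` are an assumption, since its signature is not part of this diff):

```python
from toargridding.gridding import get_gridded_toar_data
from toargridding.grids import GridDefinition, GridType

# a regular longitude/latitude grid via the factory shown above;
# the keyword names are assumed, not confirmed by this diff
grid = GridDefinition.construct(GridType.regular, lat_resolution=1.9, lon_resolution=2.5)

# one gridded dataset per (variable, statistic) combination
datasets, metadatas = get_gridded_toar_data(
    analysis_service=analysis_service,  # e.g. the AnalysisServiceDownload from the notebook
    grid=grid,
    time=time,
    variables=["mole_fraction_of_ozone_in_air"],
    stats=["mean"],
)
```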
diff --git a/toargridding/metadata.py b/toargridding/metadata.py
index abf93492ec67156b4ad17e45f49e47a253bc0c70..d1d83d214aec1fc97cf029bb8c20fba0794c2107 100644
--- a/toargridding/metadata.py
+++ b/toargridding/metadata.py
@@ -74,14 +74,18 @@ class TimeSample:
 
         Calculates the duration in days relative to the start time point.
         """
+
+        # TODO: could this be an issue for monthly sampling?
         n_days = (self.end - self.start).days
         return np.arange(n_days + 1)
 
 
 @dataclass
 class Metadata:
-    """MEtadata of a request.
-
+    """Metadata of a request.
+
+    Attributes:
+    ----------
     variable:
         support variable of the TOAR data base
     statistics:
@@ -89,18 +93,44 @@ class Metadata:
     time:
         requested time points
     """
+
     variable: TOARVariable
     statistic: str
     time: TimeSample
 
     @staticmethod
     def construct(standart_name: str, stat: str, time: TimeSample):
+        """constructor
+
+        Parameters:
+        ----------
+        standart_name:
+            standard name according to CF
+        stat:
+            statistical analysis to be done by the TOAR database
+        time:
+            temporal sampling of this request
+        """
+
         variable = TOARVariable.get(standart_name)
 
-        if stat not in STATISTICS_LIST:
-            raise ValueError(f"invalid statistic: {stat}")
 
         return Metadata(variable, stat, time)
+
+    @property
+    def statistic(self) -> str:  # TODO make better
+        """statistical property to be extracted from the TOAR database
+
+        This can for example be the mean, max, min, or median.
+        For a full list see toargridding.toarstats_constants.STATISTICS_LIST.
+        """
+        return self._statistic
+
+    @statistic.setter
+    def statistic(self, stat: str):
+        if stat not in STATISTICS_LIST:
+            raise ValueError(f"invalid statistic: {stat}")
+        self._statistic = stat
 
     def get_id(self) -> str:
         """creation of a request-specific ID
diff --git a/toargridding/static_metadata.py b/toargridding/static_metadata.py
index 66e3f28062be193d682a54dd8b3577c80280f1ae..51a34d155f44de69855668252e3119680fc6ee1e 100644
--- a/toargridding/static_metadata.py
+++ b/toargridding/static_metadata.py
@@ -25,7 +25,7 @@ class TOARVariable:
     """
 
     vars = None
-    """available TOAR variables."""
+    """available variables from the TOAR database."""
 
     name: str
     longname: str
diff --git a/toargridding/toar_rest_client.py b/toargridding/toar_rest_client.py
index a1f75ae5592ed3de1578409581170635edd3e5c4..5a956c18813dc1d887b9264d4d4c8e042f2ca3ef 100644
--- a/toargridding/toar_rest_client.py
+++ b/toargridding/toar_rest_client.py
@@ -20,7 +20,27 @@ COORDS = [STATION_LAT, STATION_LON]
 @dataclass(frozen=True)
 class QueryOptions:
     """Creation of a request to the TOAR database.
+
+    Attributes:
+    ----------
+    daterange:
+        extraction of timestamps with start<=valid<end. Format: [start],[end] with start and end in ISO format
+    variable_id:
+        id of the variable within the TOAR database
+    statistics:
+        statistical quantity requested from the TOAR database; see toargridding.toarstats_constants.STATISTICS_LIST
+    sampling:
+        frequency of sampling within the daterange, e.g. daily or monthly
+    min_data_capture:
+        presumably the minimum data capture required for a sample to be included in the request
+    metadata_schema:
+        amount of metadata being provided; see the Quick Start of the TOAR Analysis Service
+    limit:
+        limit on the amount of extracted data; see the Quick Start of the TOAR Analysis Service
+    format:
+        output format; see the Quick Start of the TOAR Analysis Service
     """
+
     daterange: str
     variable_id: str
     statistics: str
@@ -34,6 +54,8 @@ class QueryOptions:
     def from_metadata(metadata: Metadata):
         """Creation from a Metadata object
 
+        Copies daterange, variable_id, statistics and sampling from the metadata object.
+        For the other parameters the default values are used.
         """
         return QueryOptions(
             daterange=metadata.time.daterange_option,
@@ -50,7 +72,26 @@ class QueryOptions:
 
 
 class Cache:
+    """cache to store download links for requests to the TOAR database
+
+    The extraction of data and its statistical processing by the TOAR database can take hours or even days, depending on the amount of requested data.
+    The cache is therefore persistent, so that results remain accessible after a restart of the computer.
+
+    It behaves like a dict and supports operations like "in".
+    A text file called status_endpoints.json is created in cache_dir. CAVE: the first entry is required for loading the empty file.
+    """
+
     def __init__(self, cache_dir: Path):
+        """constructor
+
+        Raises an exception if the cache directory does not exist.
+
+        Parameters
+        ----------
+        cache_dir:
+            directory for storing the cache file
+        """
+
         if not cache_dir.exists():
             raise RuntimeError(f"Given directory for saving cache file does not exist. Path: {cache_dir}")
         self.cache_file = cache_dir / "status_endpoints.json"
@@ -60,18 +101,26 @@ class Cache:
             json.dump({"foo": "bar"}, cache)
 
     def __contains__(self, item: str):
+        """allows usage of "in"
+        """
         with self.storage_dict() as storage:
             return item in storage.keys()
 
     def get(self, key: str):
+        """get an endpoint from the cache
+        """
         with self.storage_dict() as storage:
             return storage[key]
 
     def put(self, key: str, content: str):
+        """add a key and its content as a key-value pair to the cache
+        """
         with self.storage_dict() as storage:
             storage[key] = content
 
     def remove(self, key: str):
+        """remove a key-value pair from the cache
+        """
         with self.storage_dict() as storage:
             del storage[key]
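In short, the dict-like behaviour documented for `Cache` amounts to the following usage (a sketch; the key and the status URL are placeholders):

```python
from pathlib import Path

from toargridding.toar_rest_client import Cache

cache = Cache(Path("results"))  # the directory must already exist, see the constructor above

# store, test, read and drop a status endpoint
cache.put("request-key", "https://toar-data.fz-juelich.de/api/v2/analysis/status/<id>")
if "request-key" in cache:
    status_endpoint = cache.get("request-key")
cache.remove("request-key")
```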
+ """ with self.storage_dict() as storage: return storage[key] def put(self, key: str, content: str): + """get add key and content as key-value-pair to cache + """ with self.storage_dict() as storage: storage[key] = content def remove(self, key: str): + """remove a key and content as key-value-pair to cache + """ with self.storage_dict() as storage: del storage[key] @@ -88,12 +137,33 @@ class Cache: class Connection: def __init__(self, endpoint, cache_dir): + """connection to the rest API of the TOAR database + + This class handles the creation of requests and the interaction with the TOAR database. + It relies on the "TOARDB Analysis FastAPI REST interface" + https://toar-data.fz-juelich.de/api/v2/analysis/ + with the default endpoint: https://toar-data.fz-juelich.de/api/v2/analysis/statistics + + Parameters: + ---------- + endpoint: + link to the TOAR database analysis service + cache_dir: + directory to store cache file + """ + self.endpoint = endpoint self.cache = Cache(cache_dir) # max wait time is 30min self.wait_seconds = [minutes * 60 for minutes in (5, 5, 5, 5, 5, 5)] def get(self, query_options): + """get results for a request. + + This is the main function to obtaind data from the TOAR DB. It will start requests or lookup if an already started requests is finished. + + Throws an exception, if the results are not available after the waiting time. A restart of the function continues the regular lookup for results. + """ status_endpoint = self.get_status_endpoint(query_options) for i, wait_time in enumerate(self.wait_seconds): @@ -107,6 +177,17 @@ class Connection: ) def get_status_endpoint(self, query_options: QueryOptions): + """get endpoint to results of a request + + This function checks if the request is already known and has been submitted to the TOAD DB. + If yes, the know endpoint is returned. + If the cache knows the endpoint, but the DB has deleted it, the endpoint is removed from the cache and a new request is started. + Otherwise a new new request is started. + + Parameters: + ---------- + Options for the request. + """ if query_options.cache_key in self.cache: status_endpoint = self.cache.get(query_options.cache_key) @@ -124,6 +205,15 @@ class Connection: return status_endpoint def query_for_status_endpoint(self, query_options: QueryOptions): + """create and new request to the TOAR DB. + + Adds the status endpoint of the request to the cache. + + Parameters: + ---------- + query_options: + request to the TOAR database. + """ response = self.wait_and_get(self.endpoint, asdict(query_options)) status_endpoint = response.json()["status"] self.cache.put(query_options.cache_key, status_endpoint) @@ -133,6 +223,19 @@ class Connection: def wait_and_get( self, endpoint, query_options=None, wait_secs=None, timeout=(3.05, 20) ): + """accesses given endpoint + + Parameters: + ---------- + endpoint: + either full endpoint of a request of base endpoint. + query_options: + used with the base endpoint to create a request. If None, endpoint is expected to be a full endpoint + wait_secs: + sleep in seconds before starting request to TAOR DB + timeout: + timeout for the request. 
+ """ if wait_secs: time.sleep(wait_secs) @@ -143,9 +246,29 @@ class AnalysisService: METADATA = "metadata" def __init__(self, stats_endpoint, cache_dir): + """constructor for setting up cache and connection + + Parameters: + --------- + stats_endpoint: + link to statistics service of TOAR DB + cache_dir: + directory to store cache file for requests, needs to exist + """ self.connection = Connection(stats_endpoint, cache_dir) def get_data(self, metadata: Metadata) -> AnalysisRequestResult: + """main function to obtain data from the TOAR DB + + Handles requesting and loading of data into memory as soon as they are available. + In addition the data and coordinates undergo a cleanup. + + Parameters: + ---------- + metadata: + meta data for the request. + """ + timeseries, timeseries_metadata = self.get_timeseries_and_metadata(metadata) coords = self.get_clean_coords(timeseries_metadata) timeseries = self.get_clean_timeseries(timeseries, metadata) @@ -154,18 +277,39 @@ class AnalysisService: def get_timeseries_and_metadata( self, metadata: Metadata ) -> tuple[pd.DataFrame, pd.DataFrame]: + """obtain data and metadata from TOAR database + + return: + tuple[timeseries, station coordinates] + """ + query_options = QueryOptions.from_metadata(metadata) result = self.connection.get(query_options) timeseries, timeseries_metadata = self.load_data(result.content, metadata) return timeseries, timeseries_metadata - def get_clean_coords(self, timeseries_metadata): + def get_clean_coords(self, timeseries_metadata : pd.DataFrame): + """remove all stations with invalid coordinates + invalid coordinates are NaN, none etc. + return: + stations with valid coordinates + """ coords = timeseries_metadata[COORDS] coords.columns = [Coordinates.latitude.name, Coordinates.longitude.name] valid_coords = coords.notna().all(axis=1) return coords[valid_coords] - def get_clean_timeseries(self, timeseries, metadata: Metadata): + def get_clean_timeseries(self, timeseries : pd.DataFrame, metadata: Metadata): + """replaces all nan in the data with 0 for plotting + + timeseries: + extracted time series + metadata: + metadate belonging ot the timeseries. + + return: + timeseries without invalid numbers (none, NaN, etc) + """ # TODO maybe use cf-index here already ? first, last = timeseries.columns[0], timeseries.columns[-1] # remove data where utc -> sun/local ? time conversion leads to dateshift @@ -181,6 +325,15 @@ class AnalysisService: def load_data( self, content: bytes, metadata: Metadata ) -> tuple[pd.DataFrame, pd.DataFrame]: + """convert downloaded byte stream into pandas dataframes + + Parameters: + ---------- + content: + downloaded data as zip file + metadata: + information on the request. + """ zip_stream = io.BytesIO(content) with ZipFile(zip_stream) as myzip: timeseries = self.extract_data(myzip, metadata.statistic) @@ -188,7 +341,16 @@ class AnalysisService: return timeseries, timeseries_metadata - def extract_data(self, zip_file, data_file) -> pd.DataFrame: + def extract_data(self, zip_file : ZipFile, data_file : str) -> pd.DataFrame: + """extract a specific csv file from the zip file + + Parameters: + ---------- + zip_file: + opened zip file + data_file: + base file name of the requested file. Extension .csv is added by this function. 
+ """ with zip_file.open(f"{data_file}.csv") as f: s_stream = io.StringIO(f.read().decode("utf-8")) return pd.read_csv(s_stream, comment="#", index_col=0) diff --git a/toargridding/variables.py b/toargridding/variables.py index 3e29d8722d85bfae4ec1ebb28933a41fd64bb59a..eb798ee660b01d4c3a0730a7dad484e866e404b7 100644 --- a/toargridding/variables.py +++ b/toargridding/variables.py @@ -24,6 +24,7 @@ class Variable: encoding: encoding of data type """ + var: Variables data: np.ndarray attributes: dict[str, str] @@ -86,6 +87,18 @@ class Coordinate(Variable): cls, variable: Variables, resolution: float, min: float, max: float ): """construction from a data range and resolution + + Creates a coordinate axis between min and amx with a step size close to resolution. + + Parameters: + ---------- + resolution: + width of a bin; actual size will be selected to obtain equidistant steps between steps + min: + lowest value on coordinate axis + max: + highest value on coordinate axis + """ span = max - min