Commit 472d0c02 authored by Carsten Hinz

changed statistics for many stations notebook

added class to extract a list of valid metadata from the TOAR database
added test script for this class

added notebook to use this new class to extract data for all stations between 2000 and 2018.
parent 1ab97564
1 merge request: !11 Creation of first beta release version
"""test script for obtaining valid country codes.
"""
from toargridding.metadata_utilities import countryCodes
import requests
test = countryCodes()
validCodes = test.getValidVocabular(controlName="Country Code", varName="country")
print(validCodes)
print(len(validCodes))
print("stations per country code")
for country in validCodes:
getNumber = requests.get(test.stationMetaEndpoint, params={"country" : country, "limit" : "None"} )
getNumber.raise_for_status()
print(f"\t{country}: {len(getNumber.json())}")
%% Cell type:code id: tags:
``` python
from datetime import datetime as dt
from collections import namedtuple
from pathlib import Path

from toargridding.toar_rest_client import AnalysisServiceDownload, Connection
from toargridding.grids import RegularGrid
from toargridding.gridding import get_gridded_toar_data
from toargridding.metadata import TimeSample
```
%% Cell type:code id: tags:
``` python
# creation of request
Config = namedtuple("Config", ["grid", "time", "variables", "stats", "moreOptions"])

grid = RegularGrid( lat_resolution=1.9, lon_resolution=2.5, )

configs = dict()
for year in range(0, 19):
    valid_data = Config(
        grid,
        TimeSample( start=dt(2000+year,1,1), end=dt(2000+year,12,31), sampling="daily"),# possibly adapt range :-)
        ["mole_fraction_of_ozone_in_air"],# variable name
        [ "dma8epa_strict" ],# strict variant of the daily maximum 8-hour average
        {}# no additional request options for this notebook
    )
    configs[f"test_ta{year}"] = valid_data
```
%% Cell type:code id: tags:
``` python
# CAVE: the request takes over 30 min per requested year. Therefore this cell needs to be executed at different times to check if the results are ready for download.
# The processing is done on the server of the TOAR database.
# A restart of the cell continues the request to the REST API; if the requested data are ready, they are downloaded.
# The download can also take a few minutes.
stats_endpoint = "https://toar-data.fz-juelich.de/api/v2/analysis/statistics/"

cache_basepath = Path("cache")
result_basepath = Path("results")
cache_basepath.mkdir(exist_ok=True)
result_basepath.mkdir(exist_ok=True)

analysis_service = AnalysisServiceDownload(stats_endpoint=stats_endpoint, cache_dir=cache_basepath, sample_dir=result_basepath, use_downloaded=True)
Connection.DEBUG = True

# here we adapt the duration before a request is stopped.
# The default value is 30 minutes; here we wait up to 12 h for one request, polling every 45 min.
analysis_service.connection.setRequestTimes(interval_min=45, maxWait_min=12*60)

for person, config in configs.items():
    print(f"\nProcessing {person}:")
    print("--------------------")
    datasets, metadatas = get_gridded_toar_data(
        analysis_service=analysis_service,
        grid=config.grid,
        time=config.time,
        variables=config.variables,
        stats=config.stats,
    )

    for dataset, metadata in zip(datasets, metadatas):
        dataset.to_netcdf(result_basepath / f"{metadata.get_id()}_{config.grid.get_id()}.nc")
        print(metadata.get_id())
```
...
%% Cell type:code id: tags:
``` python
from datetime import datetime as dt
from collections import namedtuple
from pathlib import Path
from toargridding.toar_rest_client import AnalysisServiceDownload, Connection
from toargridding.grids import RegularGrid
from toargridding.gridding import get_gridded_toar_data
from toargridding.metadata import TimeSample
from toargridding.metadata_utilities import countryCodes
```
%% Cell type:code id: tags:
``` python
# creation of request
Config = namedtuple("Config", ["grid", "time", "variables", "stats", "moreOptions"])

# for the available country codes, see page 18 in https://toar-data.fz-juelich.de/sphinx/TOAR_UG_Vol03_Database/build/latex/toardatabase--userguide.pdf
varName = "country"

stationCountries = countryCodes()
validCountries = stationCountries.getValidVocabular(controlName="Country Code", varName=varName)

grid = RegularGrid( lat_resolution=1.9, lon_resolution=2.5, )

configs = dict()
for country in validCountries:
    valid_data = Config(
        grid,
        TimeSample( start=dt(2000,1,1), end=dt(2018,12,31), sampling="daily"),# possibly adapt range :-)
        ["mole_fraction_of_ozone_in_air"],# variable name
        [ "dma8epa_strict" ],
        {varName : country}# restrict each request to the stations of one country
    )
    configs[f"test_ta{country}"] = valid_data
```
%% Cell type:code id: tags:
``` python
# CAVE: this cell runs about 45 min per requested year. Therefore we increase the waiting duration to 1 h per request.
# The processing is done on the server of the TOAR database.
# A restart of the cell continues the request to the REST API; if the requested data are ready, they are downloaded.
# The download can also take a few minutes.
stats_endpoint = "https://toar-data.fz-juelich.de/api/v2/analysis/statistics/"

cache_basepath = Path("cache")
result_basepath = Path("results")
cache_basepath.mkdir(exist_ok=True)
result_basepath.mkdir(exist_ok=True)

analysis_service = AnalysisServiceDownload(stats_endpoint=stats_endpoint, cache_dir=cache_basepath, sample_dir=result_basepath, use_downloaded=True)
Connection.DEBUG = True

# maybe adapt the interval for requesting the results and the total duration before the client pauses the requests.
# As the requests take about 45 min, it is more suitable to wait 60 min before timing out than the original 30 min.
analysis_service.connection.setRequestTimes(interval_min=5, maxWait_min=60)

createdFiles = []

for person, config in configs.items():
    print(f"\nProcessing {person}:")
    print("--------------------")
    datasets, metadatas = get_gridded_toar_data(
        analysis_service=analysis_service,
        grid=config.grid,
        time=config.time,
        variables=config.variables,
        stats=config.stats,
        **config.moreOptions
    )

    for dataset, metadata in zip(datasets, metadatas):
        outName = result_basepath / f"{metadata.get_id()}_{config.grid.get_id()}.nc"
        dataset.to_netcdf(outName)
        createdFiles.append(outName)
        print(metadata.get_id())
```
%% Cell type:code id: tags:
``` python
## TODO: now we only need to combine all the obtained results...
```
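A possible starting point for this TODO (a sketch, not part of the commit): stack the per-country files along a new "country" dimension with xarray. This assumes that every config produced exactly one output file, so that createdFiles is aligned with validCountries, and that all files share the same grid and time axis.
%% Cell type:code id: tags:
``` python
# sketch only: merge the per-country results into one dataset.
# Assumptions: one file per country, identical grid and time axis in all files.
import xarray as xr

datasets = [xr.open_dataset(fname) for fname in createdFiles]
combined = xr.concat(datasets, dim="country")              # new "country" dimension
combined = combined.assign_coords(country=validCountries)  # label it with the country codes
combined.to_netcdf(result_basepath / "combined_all_countries.nc")
```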
import requests

from collections import namedtuple

ControlVoc = namedtuple("ControlVoc", ["ID", "short", "long"])

class countryCodes:
    """! This is a quick-and-dirty utility class to obtain all valid values of a specific metadata field of the stations.

    It was created with the example of country codes in mind, in order to split the requests to the TOAR database into one analysis request per metadata value.

    Arguments:
    ---------
    controlVocEndpoint:
        endpoint to access all valid values of a variable used within the controlled vocabulary
    stationMetaEndpoint:
        endpoint to access all stations matching a specific search pattern. It is used to validate that a metadata value actually provides access to stations.
    """

    def __init__(self, controlVocEndpoint="https://toar-data.fz-juelich.de/api/v2/controlled_vocabulary/", stationMetaEndpoint="https://toar-data.fz-juelich.de/api/v2/stationmeta/"):
        self.controlVocEndpoint = controlVocEndpoint
        if self.controlVocEndpoint[-1] != "/":
            self.controlVocEndpoint += "/"

        self.stationMetaEndpoint = stationMetaEndpoint
        if self.stationMetaEndpoint[-1] != "/":
            self.stationMetaEndpoint += "/"

    def getValidVocabular(self, controlName, varName):
        """get all valid values for a variable

        Arguments:
        --------
        controlName:
            name of the parameter in the controlled vocabulary
        varName:
            name of the parameter in a request to another endpoint

        return:
            list of all valid values for the requested varName to be used within requests to the TOAR database
        """

        # get all possible values for the given variable:
        response = requests.get( f"{self.controlVocEndpoint}{controlName}")
        response.raise_for_status()
        controlVoc = self.convertLists(response)

        # check which values are valid, i.e. there is at least one station with this metadata value
        validCodes = []
        for voc in controlVoc:
            params = {varName : voc.short, "limit" : 1}
            getNumber = requests.get(self.stationMetaEndpoint, params=params )
            getNumber.raise_for_status()
            if len(getNumber.json()) > 0:
                validCodes.append(voc.short)
        return validCodes

    def convertLists(self, response) -> list[ControlVoc]:
        """convert the response into an easier-to-access data type"""
        return [ ControlVoc(id, short, long) for id, short, long in response.json()]
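For reference, convertLists relies on the controlled-vocabulary endpoint returning a list of [id, short name, long name] triples, as the unpacking in the list comprehension shows. A minimal illustration with made-up values (not real API output):

``` python
# illustrative only: data in the shape expected by convertLists
exampleJson = [[0, "DE", "Germany"], [1, "FR", "France"]]
vocs = [ControlVoc(ID, short, long) for ID, short, long in exampleJson]
print(vocs[0].short)  # -> "DE"
print(vocs[1].long)   # -> "France"
```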