Skip to content
Snippets Groups Projects
Commit 4da83bc5 authored by Carsten Hinz's avatar Carsten Hinz
Browse files

quality_controll.ipynb

-changed sampling from daily to monthly
-fixed Metadata.construct parameter order

toar_rest_client: DownloadAnalysisService:
-added CF name of variable to sample file name

fixed typos and added some documentation
parent 8af6b3e8
No related branches found
No related tags found
1 merge request!11Creation of first beta release version
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Get Dataset from request ### Get Dataset from request
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from datetime import datetime as dt from datetime import datetime as dt
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from toargridding.grids import RegularGrid from toargridding.grids import RegularGrid
from toargridding.toar_rest_client import ( from toargridding.toar_rest_client import (
AnalysisServiceDownload, AnalysisServiceDownload,
STATION_LAT, STATION_LAT,
STATION_LON, STATION_LON,
) )
from toargridding.metadata import Metadata, TimeSample, AnalysisRequestResult, Coordinates from toargridding.metadata import Metadata, TimeSample, AnalysisRequestResult, Coordinates
from toargridding.variables import Coordinate from toargridding.variables import Coordinate
endpoint = "https://toar-data.fz-juelich.de/api/v2/analysis/statistics/" endpoint = "https://toar-data.fz-juelich.de/api/v2/analysis/statistics/"
#starts in directory [path/to/toargridding]/tests #starts in directory [path/to/toargridding]/tests
#maybe adopt the toargridding_base_path for your machine. #maybe adopt the toargridding_base_path for your machine.
toargridding_base_path = Path(".") toargridding_base_path = Path(".")
cache_dir = toargridding_base_path / "cache" cache_dir = toargridding_base_path / "cache"
data_download_dir = toargridding_base_path / "data" data_download_dir = toargridding_base_path / "data"
analysis_service = AnalysisServiceDownload(endpoint, cache_dir, data_download_dir) analysis_service = AnalysisServiceDownload(endpoint, cache_dir, data_download_dir)
my_grid = RegularGrid(1.9, 2.5) my_grid = RegularGrid(1.9, 2.5)
time = TimeSample(dt(2016,1,1), dt(2016,12,31), "daily") time = TimeSample(dt(2016,1,1), dt(2016,12,31), "monthly")
metadata = Metadata.construct("mole_fraction_of_ozone_in_air", "mean", time) metadata = Metadata.construct("mole_fraction_of_ozone_in_air", time, "mean")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# this cell can run longer than 30 minutes # this cell can run longer than 30 minutes
data = analysis_service.get_data(metadata) data = analysis_service.get_data(metadata)
ds = my_grid.as_xarray(data) ds = my_grid.as_xarray(data)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Visual inspection ### Visual inspection
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#calculation of coordinates for plotting #calculation of coordinates for plotting
#especially separation of coordinates with results and without results. #especially separation of coordinates with results and without results.
import cartopy.crs as ccrs import cartopy.crs as ccrs
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.ticker as mticker import matplotlib.ticker as mticker
mean_data = ds["mean"] mean_data = ds["mean"]
clean_coords = data.stations_coords clean_coords = data.stations_coords
all_na = data.stations_data.isna().all(axis=1) all_na = data.stations_data.isna().all(axis=1)
clean_coords = all_na.to_frame().join(clean_coords)[["latitude", "longitude"]] clean_coords = all_na.to_frame().join(clean_coords)[["latitude", "longitude"]]
all_na_coords = clean_coords[all_na] all_na_coords = clean_coords[all_na]
not_na_coords = clean_coords[~all_na] not_na_coords = clean_coords[~all_na]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import matplotlib as mpl import matplotlib as mpl
#definition of plotting function #definition of plotting function
def plot_cells(data, stations, na_stations, discrete=True, plot_stations=False): def plot_cells(data, stations, na_stations, discrete=True, plot_stations=False):
fig = plt.figure(figsize=(9, 18)) fig = plt.figure(figsize=(9, 18))
ax = plt.axes(projection=ccrs.PlateCarree()) ax = plt.axes(projection=ccrs.PlateCarree())
ax.coastlines() ax.coastlines()
gl = ax.gridlines(draw_labels=True) gl = ax.gridlines(draw_labels=True)
gl.top_labels = False gl.top_labels = False
gl.left_labels = False gl.left_labels = False
gl.xlocator = mticker.FixedLocator(data.longitude.values) gl.xlocator = mticker.FixedLocator(data.longitude.values)
gl.ylocator = mticker.FixedLocator(data.latitude.values) gl.ylocator = mticker.FixedLocator(data.latitude.values)
cmap = mpl.cm.viridis cmap = mpl.cm.viridis
if discrete: if discrete:
print(np.unique(data.values)) print(np.unique(data.values))
bounds = np.arange(8) bounds = np.arange(8)
norm = mpl.colors.BoundaryNorm(bounds, cmap.N, extend="both") norm = mpl.colors.BoundaryNorm(bounds, cmap.N, extend="both")
ticks = np.arange(bounds.size + 1)[:-1] + 0.5 ticks = np.arange(bounds.size + 1)[:-1] + 0.5
ticklables = bounds ticklables = bounds
im = plt.pcolormesh( im = plt.pcolormesh(
data.longitude, data.longitude,
data.latitude, data.latitude,
data, data,
transform=ccrs.PlateCarree(), transform=ccrs.PlateCarree(),
cmap=cmap, cmap=cmap,
shading="nearest", shading="nearest",
norm=norm, norm=norm,
) )
cb = fig.colorbar(im, ax=ax, shrink=0.2, aspect=25) cb = fig.colorbar(im, ax=ax, shrink=0.2, aspect=25)
cb.set_ticks(ticks) cb.set_ticks(ticks)
cb.set_ticklabels(ticklables) cb.set_ticklabels(ticklables)
im = plt.pcolormesh( im = plt.pcolormesh(
data.longitude, data.longitude,
data.latitude, data.latitude,
data, data,
transform=ccrs.PlateCarree(), transform=ccrs.PlateCarree(),
cmap=cmap, cmap=cmap,
shading="nearest", shading="nearest",
norm=norm, norm=norm,
) )
else: else:
im = plt.pcolormesh( im = plt.pcolormesh(
data.longitude, data.longitude,
data.latitude, data.latitude,
data, data,
transform=ccrs.PlateCarree(), transform=ccrs.PlateCarree(),
cmap=cmap, cmap=cmap,
shading="nearest", shading="nearest",
) )
cb = fig.colorbar(im, ax=ax, shrink=0.2, aspect=25) cb = fig.colorbar(im, ax=ax, shrink=0.2, aspect=25)
if plot_stations: if plot_stations:
plt.scatter(na_stations["longitude"], na_stations["latitude"], s=1, c="k") plt.scatter(na_stations["longitude"], na_stations["latitude"], s=1, c="k")
plt.scatter(stations["longitude"], stations["latitude"], s=1, c="r") plt.scatter(stations["longitude"], stations["latitude"], s=1, c="r")
plt.tight_layout() plt.tight_layout()
plt.title(f"global ozon at {data.time.values} {data.time.units}") plt.title(f"global ozon at {data.time.values} {data.time.units}")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#example visualization for two time points #example visualization for two time points
print(not_na_coords) print(not_na_coords)
timestep = 2 timestep = 2
time = ds.time[timestep] time = ds.time[timestep]
data = ds.sel(time=time) data = ds.sel(time=time)
plot_cells(data["mean"], not_na_coords, all_na_coords, discrete=False, plot_stations=True) plot_cells(data["mean"], not_na_coords, all_na_coords, discrete=False, plot_stations=True)
plt.show() plt.show()
plot_cells(data["n"], not_na_coords, all_na_coords, discrete=True) plot_cells(data["n"], not_na_coords, all_na_coords, discrete=True)
plt.show() plt.show()
n_observations = ds["n"].sum(["latitude", "longitude"]) n_observations = ds["n"].sum(["latitude", "longitude"])
plt.plot(ds.time, n_observations) plt.plot(ds.time, n_observations)
print(np.unique(ds["n"])) print(np.unique(ds["n"]))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
print(data) print(data)
``` ```
......
...@@ -11,6 +11,7 @@ from toargridding.static_metadata import global_cf_attributes, TOARVariable ...@@ -11,6 +11,7 @@ from toargridding.static_metadata import global_cf_attributes, TOARVariable
from typing import Dict from typing import Dict
date_created = datetime.utcnow().strftime("%Y-%m-dT%H:%M:%SZ") date_created = datetime.utcnow().strftime("%Y-%m-dT%H:%M:%SZ")
#date_created = datetime.now(datetime.UTC).strftime("%Y-%m-dT%H:%M:%SZ") # fix as utcnow will be removed in the future
COORDINATE_VARIABLES = ["latitude", "longitude", "time"] COORDINATE_VARIABLES = ["latitude", "longitude", "time"]
DATA_VARIABLES = ["mean", "std", "n"] DATA_VARIABLES = ["mean", "std", "n"]
...@@ -28,6 +29,14 @@ class TimeSample: ...@@ -28,6 +29,14 @@ class TimeSample:
"""Sampling in time """Sampling in time
provides conversion into different formats provides conversion into different formats
Attributes:
start:
start time point
end:
end time point
sampling:
temporal aggregation of values, e.g. daily, monthly
""" """
start: datetime start: datetime
...@@ -35,7 +44,7 @@ class TimeSample: ...@@ -35,7 +44,7 @@ class TimeSample:
sampling: str sampling: str
@property @property
def sampling(self) -> str: # TODO make better def sampling(self) -> str:
"""sampling for data request """sampling for data request
Sampling, i.e. the period used for the calculation of a parameters within the TOAD DB Sampling, i.e. the period used for the calculation of a parameters within the TOAD DB
...@@ -46,7 +55,7 @@ class TimeSample: ...@@ -46,7 +55,7 @@ class TimeSample:
@sampling.setter @sampling.setter
def sampling(self, sampling : str): def sampling(self, sampling : str):
if sampling not in ALLOWED_SAMPLING_VALUES: if sampling not in ALLOWED_SAMPLING_VALUES:
raise ValueError(f"sampling: {sampling} is not in the list of supported samplings for toargridding.") raise ValueError(f"sampling: {sampling} is not in the list of supported samplings for toargridding: {ALLOWED_SAMPLING_VALUES}")
self._sampling = sampling self._sampling = sampling
def as_datetime_index(self) -> pd.DatetimeIndex: def as_datetime_index(self) -> pd.DatetimeIndex:
......
...@@ -30,7 +30,7 @@ class QueryOptions: ...@@ -30,7 +30,7 @@ class QueryOptions:
statistics: statistics:
statistical quantity requested from the TOAR database. see toargridding.toarstats_constants.STATISTICS_LIST. statistical quantity requested from the TOAR database. see toargridding.toarstats_constants.STATISTICS_LIST.
sampling: sampling:
frequency of sampling within the datarange, e.g. daily, monthly temporal aggregation, e.g. daily, monthly
min_data_capture: min_data_capture:
most probably the minimum data to include in the request most probably the minimum data to include in the request
metadata_schema: metadata_schema:
...@@ -113,13 +113,13 @@ class Cache: ...@@ -113,13 +113,13 @@ class Cache:
return storage[key] return storage[key]
def put(self, key: str, content: str): def put(self, key: str, content: str):
"""get add key and content as key-value-pair to cache """get add key and content as key-value-pair to cache
""" """
with self.storage_dict() as storage: with self.storage_dict() as storage:
storage[key] = content storage[key] = content
def remove(self, key: str): def remove(self, key: str):
"""remove a key and content as key-value-pair to cache """remove a key and content as key-value-pair to cache
""" """
with self.storage_dict() as storage: with self.storage_dict() as storage:
del storage[key] del storage[key]
...@@ -160,7 +160,7 @@ class Connection: ...@@ -160,7 +160,7 @@ class Connection:
def get(self, query_options): def get(self, query_options):
"""get results for a request. """get results for a request.
This is the main function to obtaind data from the TOAR DB. It will start requests or lookup if an already started requests is finished. This is the main function to obtain data from the TOAR DB. It will start requests or look up if an already started request is finished.
Throws an exception, if the results are not available after the waiting time. A restart of the function continues the regular lookup for results. Throws an exception, if the results are not available after the waiting time. A restart of the function continues the regular lookup for results.
""" """
...@@ -232,7 +232,7 @@ class Connection: ...@@ -232,7 +232,7 @@ class Connection:
query_options: query_options:
used with the base endpoint to create a request. If None, endpoint is expected to be a full endpoint used with the base endpoint to create a request. If None, endpoint is expected to be a full endpoint
wait_secs: wait_secs:
sleep in seconds before starting request to TAOR DB sleep in seconds before starting request to TOAR DB
timeout: timeout:
timeout for the request. timeout for the request.
""" """
...@@ -305,7 +305,7 @@ class AnalysisService: ...@@ -305,7 +305,7 @@ class AnalysisService:
timeseries: timeseries:
extracted time series extracted time series
metadata: metadata:
metadate belonging ot the timeseries. metadata belonging to the timeseries.
return: return:
timeseries without invalid numbers (none, NaN, etc) timeseries without invalid numbers (none, NaN, etc)
...@@ -357,6 +357,22 @@ class AnalysisService: ...@@ -357,6 +357,22 @@ class AnalysisService:
class AnalysisServiceDownload(AnalysisService): class AnalysisServiceDownload(AnalysisService):
"""download service with caching of requests to the TOARDB
This service performs the request to the TOAR database and downloads the results of the request to disk before returning it for further processing.
When retrieving data, a check is done whether this request has already been cached on disk.
Attributes:
----------
stats_endpoint:
link to statistics service of TOAR DB
cache_dir:
directory to store cache file for requests, needs to exist
sample_dir:
directory for caching results of request to the TOARDB
use_downloaded:
flag to control if the cache of downloaded requests is checked before extracting data from the TOARDB
"""
def __init__( def __init__(
self, stats_endpoint, cache_dir, sample_dir: Path, use_downloaded=True self, stats_endpoint, cache_dir, sample_dir: Path, use_downloaded=True
): ):
...@@ -382,4 +398,12 @@ class AnalysisServiceDownload(AnalysisService): ...@@ -382,4 +398,12 @@ class AnalysisServiceDownload(AnalysisService):
@staticmethod @staticmethod
def get_sample_file_name(metadata: Metadata): def get_sample_file_name(metadata: Metadata):
return f"{metadata.statistic}_{metadata.time.sampling}_{metadata.time.start.date()}_{metadata.time.end.date()}.zip" """creates a filename from the metadata
At the moment it considers the statistical method, the sampling (temporal aggregation) as well as the start and end date.
Parameters:
----------
metadata:
metadata for the request.
"""
return f"{metadata.statistic}_{metadata.time.sampling}_{metadata.variable.cf_standardname}_{metadata.time.start.date()}_{metadata.time.end.date()}.zip"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment