Skip to content
Snippets Groups Projects
Commit 4da83bc5 authored by Carsten Hinz's avatar Carsten Hinz
Browse files

quality_controll.ipynb

-changed sampling from daily to monthly
-fixed Metadata.construct parameter order

toar_rest_client: DownloadAnalysisService:
-added CF name of variable to sample file name

fixed typos and added some documentation
parent 8af6b3e8
No related branches found
No related tags found
1 merge request!11Creation of first beta release version
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Get Dataset from request ### Get Dataset from request
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from datetime import datetime as dt from datetime import datetime as dt
from pathlib import Path from pathlib import Path
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from toargridding.grids import RegularGrid from toargridding.grids import RegularGrid
from toargridding.toar_rest_client import ( from toargridding.toar_rest_client import (
AnalysisServiceDownload, AnalysisServiceDownload,
STATION_LAT, STATION_LAT,
STATION_LON, STATION_LON,
) )
from toargridding.metadata import Metadata, TimeSample, AnalysisRequestResult, Coordinates from toargridding.metadata import Metadata, TimeSample, AnalysisRequestResult, Coordinates
from toargridding.variables import Coordinate from toargridding.variables import Coordinate
endpoint = "https://toar-data.fz-juelich.de/api/v2/analysis/statistics/" endpoint = "https://toar-data.fz-juelich.de/api/v2/analysis/statistics/"
#starts in directory [path/to/toargridding]/tests #starts in directory [path/to/toargridding]/tests
#maybe adopt the toargridding_base_path for your machine. #maybe adopt the toargridding_base_path for your machine.
toargridding_base_path = Path(".") toargridding_base_path = Path(".")
cache_dir = toargridding_base_path / "cache" cache_dir = toargridding_base_path / "cache"
data_download_dir = toargridding_base_path / "data" data_download_dir = toargridding_base_path / "data"
analysis_service = AnalysisServiceDownload(endpoint, cache_dir, data_download_dir) analysis_service = AnalysisServiceDownload(endpoint, cache_dir, data_download_dir)
my_grid = RegularGrid(1.9, 2.5) my_grid = RegularGrid(1.9, 2.5)
time = TimeSample(dt(2016,1,1), dt(2016,12,31), "daily") time = TimeSample(dt(2016,1,1), dt(2016,12,31), "monthly")
metadata = Metadata.construct("mole_fraction_of_ozone_in_air", "mean", time) metadata = Metadata.construct("mole_fraction_of_ozone_in_air", time, "mean")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# this cell can run longer than 30 minutes # this cell can run longer than 30 minutes
data = analysis_service.get_data(metadata) data = analysis_service.get_data(metadata)
ds = my_grid.as_xarray(data) ds = my_grid.as_xarray(data)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Visual inspection ### Visual inspection
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#calculation of coordinates for plotting #calculation of coordinates for plotting
#especially separation of coordinates with results and without results. #especially separation of coordinates with results and without results.
import cartopy.crs as ccrs import cartopy.crs as ccrs
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.ticker as mticker import matplotlib.ticker as mticker
mean_data = ds["mean"] mean_data = ds["mean"]
clean_coords = data.stations_coords clean_coords = data.stations_coords
all_na = data.stations_data.isna().all(axis=1) all_na = data.stations_data.isna().all(axis=1)
clean_coords = all_na.to_frame().join(clean_coords)[["latitude", "longitude"]] clean_coords = all_na.to_frame().join(clean_coords)[["latitude", "longitude"]]
all_na_coords = clean_coords[all_na] all_na_coords = clean_coords[all_na]
not_na_coords = clean_coords[~all_na] not_na_coords = clean_coords[~all_na]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import matplotlib as mpl import matplotlib as mpl
#definition of plotting function #definition of plotting function
def plot_cells(data, stations, na_stations, discrete=True, plot_stations=False): def plot_cells(data, stations, na_stations, discrete=True, plot_stations=False):
fig = plt.figure(figsize=(9, 18)) fig = plt.figure(figsize=(9, 18))
ax = plt.axes(projection=ccrs.PlateCarree()) ax = plt.axes(projection=ccrs.PlateCarree())
ax.coastlines() ax.coastlines()
gl = ax.gridlines(draw_labels=True) gl = ax.gridlines(draw_labels=True)
gl.top_labels = False gl.top_labels = False
gl.left_labels = False gl.left_labels = False
gl.xlocator = mticker.FixedLocator(data.longitude.values) gl.xlocator = mticker.FixedLocator(data.longitude.values)
gl.ylocator = mticker.FixedLocator(data.latitude.values) gl.ylocator = mticker.FixedLocator(data.latitude.values)
cmap = mpl.cm.viridis cmap = mpl.cm.viridis
if discrete: if discrete:
print(np.unique(data.values)) print(np.unique(data.values))
bounds = np.arange(8) bounds = np.arange(8)
norm = mpl.colors.BoundaryNorm(bounds, cmap.N, extend="both") norm = mpl.colors.BoundaryNorm(bounds, cmap.N, extend="both")
ticks = np.arange(bounds.size + 1)[:-1] + 0.5 ticks = np.arange(bounds.size + 1)[:-1] + 0.5
ticklables = bounds ticklables = bounds
im = plt.pcolormesh( im = plt.pcolormesh(
data.longitude, data.longitude,
data.latitude, data.latitude,
data, data,
transform=ccrs.PlateCarree(), transform=ccrs.PlateCarree(),
cmap=cmap, cmap=cmap,
shading="nearest", shading="nearest",
norm=norm, norm=norm,
) )
cb = fig.colorbar(im, ax=ax, shrink=0.2, aspect=25) cb = fig.colorbar(im, ax=ax, shrink=0.2, aspect=25)
cb.set_ticks(ticks) cb.set_ticks(ticks)
cb.set_ticklabels(ticklables) cb.set_ticklabels(ticklables)
im = plt.pcolormesh( im = plt.pcolormesh(
data.longitude, data.longitude,
data.latitude, data.latitude,
data, data,
transform=ccrs.PlateCarree(), transform=ccrs.PlateCarree(),
cmap=cmap, cmap=cmap,
shading="nearest", shading="nearest",
norm=norm, norm=norm,
) )
else: else:
im = plt.pcolormesh( im = plt.pcolormesh(
data.longitude, data.longitude,
data.latitude, data.latitude,
data, data,
transform=ccrs.PlateCarree(), transform=ccrs.PlateCarree(),
cmap=cmap, cmap=cmap,
shading="nearest", shading="nearest",
) )
cb = fig.colorbar(im, ax=ax, shrink=0.2, aspect=25) cb = fig.colorbar(im, ax=ax, shrink=0.2, aspect=25)
if plot_stations: if plot_stations:
plt.scatter(na_stations["longitude"], na_stations["latitude"], s=1, c="k") plt.scatter(na_stations["longitude"], na_stations["latitude"], s=1, c="k")
plt.scatter(stations["longitude"], stations["latitude"], s=1, c="r") plt.scatter(stations["longitude"], stations["latitude"], s=1, c="r")
plt.tight_layout() plt.tight_layout()
plt.title(f"global ozon at {data.time.values} {data.time.units}") plt.title(f"global ozon at {data.time.values} {data.time.units}")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#example visualization for two time points #example visualization for two time points
print(not_na_coords) print(not_na_coords)
timestep = 2 timestep = 2
time = ds.time[timestep] time = ds.time[timestep]
data = ds.sel(time=time) data = ds.sel(time=time)
plot_cells(data["mean"], not_na_coords, all_na_coords, discrete=False, plot_stations=True) plot_cells(data["mean"], not_na_coords, all_na_coords, discrete=False, plot_stations=True)
plt.show() plt.show()
plot_cells(data["n"], not_na_coords, all_na_coords, discrete=True) plot_cells(data["n"], not_na_coords, all_na_coords, discrete=True)
plt.show() plt.show()
n_observations = ds["n"].sum(["latitude", "longitude"]) n_observations = ds["n"].sum(["latitude", "longitude"])
plt.plot(ds.time, n_observations) plt.plot(ds.time, n_observations)
print(np.unique(ds["n"])) print(np.unique(ds["n"]))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
print(data) print(data)
``` ```
......
...@@ -11,6 +11,7 @@ from toargridding.static_metadata import global_cf_attributes, TOARVariable ...@@ -11,6 +11,7 @@ from toargridding.static_metadata import global_cf_attributes, TOARVariable
from typing import Dict from typing import Dict
date_created = datetime.utcnow().strftime("%Y-%m-dT%H:%M:%SZ") date_created = datetime.utcnow().strftime("%Y-%m-dT%H:%M:%SZ")
#date_created = datetime.now(datetime.UTC).strftime("%Y-%m-dT%H:%M:%SZ") # fix as utcnow will be removed in the future
COORDINATE_VARIABLES = ["latitude", "longitude", "time"] COORDINATE_VARIABLES = ["latitude", "longitude", "time"]
DATA_VARIABLES = ["mean", "std", "n"] DATA_VARIABLES = ["mean", "std", "n"]
...@@ -28,6 +29,14 @@ class TimeSample: ...@@ -28,6 +29,14 @@ class TimeSample:
"""Sampling in time """Sampling in time
provides conversion into different formats provides conversion into different formats
Attributes:
start:
start time point
end:
end time point
sampling:
temporal aggregation of values, e.g. daily, monthly
""" """
start: datetime start: datetime
...@@ -35,7 +44,7 @@ class TimeSample: ...@@ -35,7 +44,7 @@ class TimeSample:
sampling: str sampling: str
@property @property
def sampling(self) -> str: # TODO make better def sampling(self) -> str:
"""sampling for data request """sampling for data request
Sampling, i.e. the period used for the calculation of a parameters within the TOAD DB Sampling, i.e. the period used for the calculation of a parameters within the TOAD DB
...@@ -46,7 +55,7 @@ class TimeSample: ...@@ -46,7 +55,7 @@ class TimeSample:
@sampling.setter @sampling.setter
def sampling(self, sampling : str): def sampling(self, sampling : str):
if sampling not in ALLOWED_SAMPLING_VALUES: if sampling not in ALLOWED_SAMPLING_VALUES:
raise ValueError(f"sampling: {sampling} is not in the list of supported samplings for toargridding.") raise ValueError(f"sampling: {sampling} is not in the list of supported samplings for toargridding: {ALLOWED_SAMPLING_VALUES}")
self._sampling = sampling self._sampling = sampling
def as_datetime_index(self) -> pd.DatetimeIndex: def as_datetime_index(self) -> pd.DatetimeIndex:
......
...@@ -30,7 +30,7 @@ class QueryOptions: ...@@ -30,7 +30,7 @@ class QueryOptions:
statistics: statistics:
statistical quantity requested from the TOAR database. see toargridding.toarstats_constants.STATISTICS_LIST. statistical quantity requested from the TOAR database. see toargridding.toarstats_constants.STATISTICS_LIST.
sampling: sampling:
frequency of sampling within the datarange, e.g. daily, monthly temporal aggregation, e.g. daily, monthly
min_data_capture: min_data_capture:
most probably the minimum data to include in the request most probably the minimum data to include in the request
metadata_schema: metadata_schema:
...@@ -113,13 +113,13 @@ class Cache: ...@@ -113,13 +113,13 @@ class Cache:
return storage[key] return storage[key]
def put(self, key: str, content: str): def put(self, key: str, content: str):
"""get add key and content as key-value-pair to cache """get add key and content as key-value-pair to cache
""" """
with self.storage_dict() as storage: with self.storage_dict() as storage:
storage[key] = content storage[key] = content
def remove(self, key: str): def remove(self, key: str):
"""remove a key and content as key-value-pair to cache """remove a key and content as key-value-pair to cache
""" """
with self.storage_dict() as storage: with self.storage_dict() as storage:
del storage[key] del storage[key]
...@@ -160,7 +160,7 @@ class Connection: ...@@ -160,7 +160,7 @@ class Connection:
def get(self, query_options): def get(self, query_options):
"""get results for a request. """get results for a request.
This is the main function to obtaind data from the TOAR DB. It will start requests or lookup if an already started requests is finished. This is the main function to obtain data from the TOAR DB. It will start requests or look up if an already started request is finished.
Throws an exception, if the results are not available after the waiting time. A restart of the function continues the regular lookup for results. Throws an exception, if the results are not available after the waiting time. A restart of the function continues the regular lookup for results.
""" """
...@@ -232,7 +232,7 @@ class Connection: ...@@ -232,7 +232,7 @@ class Connection:
query_options: query_options:
used with the base endpoint to create a request. If None, endpoint is expected to be a full endpoint used with the base endpoint to create a request. If None, endpoint is expected to be a full endpoint
wait_secs: wait_secs:
sleep in seconds before starting request to TAOR DB sleep in seconds before starting request to TOAR DB
timeout: timeout:
timeout for the request. timeout for the request.
""" """
...@@ -305,7 +305,7 @@ class AnalysisService: ...@@ -305,7 +305,7 @@ class AnalysisService:
timeseries: timeseries:
extracted time series extracted time series
metadata: metadata:
metadate belonging ot the timeseries. metadata belonging to the timeseries.
return: return:
timeseries without invalid numbers (none, NaN, etc) timeseries without invalid numbers (none, NaN, etc)
...@@ -357,6 +357,22 @@ class AnalysisService: ...@@ -357,6 +357,22 @@ class AnalysisService:
class AnalysisServiceDownload(AnalysisService): class AnalysisServiceDownload(AnalysisService):
"""download service with caching of requests to the TOARDB
This service performs the request to the TOAR database and downloads the results of the request to disk before returning it for further processing.
When retrieving data, a check is done whether this request has already been cached on disk.
Attributes:
----------
stats_endpoint:
link to statistics service of TOAR DB
cache_dir:
directory to store cache file for requests, needs to exist
sample_dir:
directory for caching results of request to the TOARDB
use_downloaded:
flag to control if the cache of downloaded requests is checked before extracting data from the TOARDB
"""
def __init__( def __init__(
self, stats_endpoint, cache_dir, sample_dir: Path, use_downloaded=True self, stats_endpoint, cache_dir, sample_dir: Path, use_downloaded=True
): ):
...@@ -382,4 +398,12 @@ class AnalysisServiceDownload(AnalysisService): ...@@ -382,4 +398,12 @@ class AnalysisServiceDownload(AnalysisService):
@staticmethod @staticmethod
def get_sample_file_name(metadata: Metadata): def get_sample_file_name(metadata: Metadata):
return f"{metadata.statistic}_{metadata.time.sampling}_{metadata.time.start.date()}_{metadata.time.end.date()}.zip" """creates a filename from the metadata
At the moment it considers the statistical method, the sampling (temporal aggregation) as well as the start and end date.
Parameters:
----------
metadata:
metadata for the request.
"""
return f"{metadata.statistic}_{metadata.time.sampling}_{metadata.variable.cf_standardname}_{metadata.time.start.date()}_{metadata.time.end.date()}.zip"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment