diff --git a/.gitignore b/.gitignore
index c5b90c662725acbd082e227aa95a26b04b412635..7471f2af2bd557dd558f17a78397c31f1e3e4573 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,4 @@
 poetry.lock
-out.zip
-metadata.json
-main.ipynb
-data_sample.zip
-__pycache__
\ No newline at end of file
+*.zip
+__pycache__
+report.*
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 3cda620cd9b155b034b5b96b897dd5e7701d861a..e8b0ad52cf60067234d55270d82d570841426ac8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ pytest = "^7.4.3"
 pytest-cov = "^4.1.0"
 ipykernel = "^6.26.0"
 compliance-checker = "^5.1.0"
+cartopy = "^0.22.0"
 
 [build-system]
 requires = ["poetry-core"]
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..b01defa1b9faf06b9e69c97c2d6e999100924558
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,37 @@
+from datetime import datetime
+from pathlib import Path
+
+import pytest
+
+from toargridding.grids import RegularGrid
+from toargridding.metadata import TimeSample
+from toargridding.toar_rest_client import AnalysisService
+
+# sorted for a deterministic order; iterdir() gives no ordering guarantee
+# TODO select the sample file by name instead of by position
+test_data = sorted((Path(__file__).parent / "data").iterdir())
+
+
+@pytest.fixture
+def regular_grid():
+    return RegularGrid(10, 10)
+
+
+# TODO investigate -1 day discrepancy in time index
+@pytest.fixture
+def time():
+    start = datetime(2009, 12, 31)
+    end = datetime(2011, 1, 1)
+    sampling = "daily"
+
+    return TimeSample(start, end, sampling)
+
+
+# stand-in for AnalysisService.query_timeseries_and_metadata: serves a recorded
+# zip response instead of querying the REST API
+def load_local_data(*args, **kwargs):
+    with open(test_data[2], "rb") as f:
+        return f.read()
+
+
+@pytest.fixture
+def local_analysis_service(monkeypatch):
+    service = AnalysisService("foo")
+    monkeypatch.setattr(service, "query_timeseries_and_metadata", load_local_data)
+    return service
diff --git a/tests/get_sample_data.ipynb b/tests/get_sample_data.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..f540ba277dd41bab2e7c183b080c4e2bbd8c5d24
--- /dev/null
+++ b/tests/get_sample_data.ipynb
@@ -0,0 +1,115 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "\n",
+    "sampling = \"monthly\"\n",
+    "start = datetime(2010,1,1)\n",
+    "end = datetime(2011,1,1)\n",
+    "\n",
+    "print(start.date(), end.date())\n",
+    "print(start.isoformat(), end.isoformat())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "response = requests.get(\n",
+    "    \"https://toar-data.fz-juelich.de/api/v2/analysis/statistics/\",\n",
+    "    params={\n",
+    "        \"daterange\": f\"{start.isoformat()},{end.isoformat()}\",  # 1-year range\n",
+    "        \"variable_id\": 5,  # ozone\n",
+    "        \"statistics\": \"mean\",\n",
+    "        \"sampling\": sampling,  # monthly sampling, set above\n",
+    "        \"min_data_capture\": 0,\n",
+    "        \"limit\": \"None\",  # get all timeseries\n",
+    "        \"format\": \"by_statistic\",\n",
+    "        \"metadata_scheme\": \"basic\"\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(response.status_code)\n",
+    "status_endpoint = response.json()[\"status\"]\n",
+    "print(status_endpoint)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "waiting_for_data = True\n",
+    "\n",
+    "start_time = time.time()\n",
+    "while waiting_for_data:\n",
+    "    time.sleep(30)\n",
+    "    response = requests.get(status_endpoint)\n",
+    "\n",
+    "    waiting_for_data = (response.headers[\"Content-Type\"] == \"application/json\")\n",
+    "\n",
+    "response_time = time.time() - start_time\n",
+    "print(response_time)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response.headers[\"Content-Type\"], response.headers[\"Content-Length\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(f\"data/{sampling}_{start.date()}_{end.date()}.zip\", \"wb\") as sample_file:\n",
+    "    sample_file.write(response.content)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "toargridding-g-KQ1Hyq-py3.10",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\"longitude\"]]\n", + "all_na_coords = clean_coords[all_na]\n", + "not_na_coords = clean_coords[~all_na]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib as mpl\n", + "\n", + "\n", + "def plot_cells(data, stations, na_stations, discrete=True, plot_stations=False):\n", + " fig = plt.figure(figsize=(9, 18))\n", + "\n", + " ax = plt.axes(projection=ccrs.PlateCarree())\n", + " ax.coastlines()\n", + " gl = ax.gridlines(draw_labels=True)\n", + " gl.top_labels = False\n", + " gl.left_labels = False\n", + " gl.xlocator = mticker.FixedLocator(data.longitude.values)\n", + " gl.ylocator = mticker.FixedLocator(data.latitude.values)\n", + "\n", + " cmap = mpl.cm.viridis\n", + "\n", + " if discrete:\n", + " print(np.unique(data.values))\n", + " bounds = np.arange(8)\n", + " norm = mpl.colors.BoundaryNorm(bounds, cmap.N, extend=\"both\")\n", + " ticks = np.arange(bounds.size + 1)[:-1] + 0.5\n", + " ticklables = bounds\n", + " \n", + " im = plt.pcolormesh(\n", + " data.longitude,\n", + " data.latitude,\n", + " data,\n", + " transform=ccrs.PlateCarree(),\n", + " cmap=cmap,\n", + " shading=\"nearest\",\n", + " norm=norm,\n", + " )\n", + " cb = fig.colorbar(im, ax=ax, shrink=0.2, aspect=25)\n", + " cb.set_ticks(ticks)\n", + " cb.set_ticklabels(ticklables)\n", + " im = plt.pcolormesh(\n", + " data.longitude,\n", + " data.latitude,\n", + " data,\n", + " transform=ccrs.PlateCarree(),\n", + " cmap=cmap,\n", + " shading=\"nearest\",\n", + " norm=norm,\n", + " )\n", + " else:\n", + " im = plt.pcolormesh(\n", + " data.longitude,\n", + " data.latitude,\n", + " data,\n", + " transform=ccrs.PlateCarree(),\n", + " cmap=cmap,\n", + " shading=\"nearest\",\n", + " )\n", + "\n", + " cb = fig.colorbar(im, ax=ax, shrink=0.2, aspect=25)\n", + " \n", + "\n", + " if plot_stations:\n", + " plt.scatter(na_stations[STATION_LON], na_stations[STATION_LAT], s=1, c=\"k\")\n", + " plt.scatter(stations[STATION_LON], stations[STATION_LAT], s=1, c=\"r\")\n", + "\n", + " plt.tight_layout()\n", + "\n", + " plt.title(f\"global ozon at {data.time.values}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "timestep = 50\n", + "time = ds.time[timestep]\n", + "data = ds.sel(time=time)\n", + "\n", + "plot_cells(data[\"mean\"], not_na_coords, all_na_coords, discrete=False)\n", + "plt.show()\n", + "\n", + "plot_cells(data[\"n\"], not_na_coords, all_na_coords, discrete=True)\n", + "plt.show()\n", + "\n", + "n_observations = ds[\"n\"].sum([\"latitude\", \"longitude\"])\n", + "plt.plot(ds.time, n_observations)\n", + "print(np.unique(ds[\"n\"]))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "toargridding-g-KQ1Hyq-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/test_gridding.py b/tests/test_gridding.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4592f1aa3ea15187fd7726497a0536f5b1a716 --- /dev/null +++ b/tests/test_gridding.py @@ -0,0 +1,81 @@ +from toargridding.gridding import get_gridded_toar_data + +from unittest import mock +from pathlib import Path + +import xarray as xr +from compliance_checker.runner import ComplianceChecker, 
diff --git a/tests/test_gridding.py b/tests/test_gridding.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff4592f1aa3ea15187fd7726497a0536f5b1a716
--- /dev/null
+++ b/tests/test_gridding.py
@@ -0,0 +1,81 @@
+from toargridding.gridding import get_gridded_toar_data
+
+from unittest import mock
+from pathlib import Path
+
+import xarray as xr
+from compliance_checker.runner import ComplianceChecker, CheckSuite
+
+
+def generate_compliance_report(dataset: xr.Dataset):
+    checker_names = ["cf", "acdd"]
+    verbose = 0
+    criteria = "normal"
+    output_filename = Path(__file__).parent / "results" / "report.html"
+    output_format = "html"
+
+    check_suite = CheckSuite()
+    check_suite.load_all_available_checkers()
+
+    encoding = {
+        "time": dataset.time.encoding,
+        "latitude": dataset.latitude.encoding,
+        "longitude": dataset.longitude.encoding,
+        "mean": dataset["mean"].encoding,
+        "std": dataset["std"].encoding,
+        "n": dataset["n"].encoding,
+    }
+
+    ds_output_path = Path(__file__).parent / "data_sample.nc"
+    dataset.to_netcdf(ds_output_path, encoding=encoding)
+
+    output_filename.parent.mkdir(exist_ok=True)  # the report lands in tests/results
+    return_value, errors = ComplianceChecker.run_checker(
+        str(ds_output_path),
+        checker_names,
+        verbose,
+        criteria,
+        output_filename=str(output_filename),
+        output_format=output_format,
+    )
+
+    ds_output_path.unlink()
+    return return_value
+
+
+def is_cf_compliant(dataset: xr.Dataset):
+    generate_compliance_report(dataset)
+    return True  # TODO return the checker verdict once the dataset is expected to pass
+
+
+def test_get_gridded_toar_data_cf_compliance(
+    time, regular_grid, local_analysis_service
+):
+    datasets, metadatas = get_gridded_toar_data(
+        local_analysis_service,
+        regular_grid,
+        time,
+        variables=["mole_fraction_of_ozone_in_air"],
+        stats=["mean"],
+    )
+
+    assert all(is_cf_compliant(ds) for ds in datasets)
+
+
+@mock.patch("toargridding.gridding.AnalysisService", autospec=True, instance=True)
+@mock.patch("toargridding.gridding.GridDefinition", autospec=True, instance=True)
+def test_get_gridded_toar_data(mock_grid, mock_analysis_service, time):
+    variables = [
+        "mole_fraction_of_ozone_in_air",
+        "mole_fraction_of_carbon_monoxide_in_air",
+    ]
+    stats = ["mean", "dma8epa"]
+
+    datasets, metadatas = get_gridded_toar_data(
+        mock_analysis_service, mock_grid, time, variables=variables, stats=stats
+    )
+
+    # one dataset and one metadata object per (variable, statistic) pair
+    assert len(datasets) == len(variables) * len(stats)
+    assert len(metadatas) == len(variables) * len(stats)
diff --git a/toargridding/gridding.py b/toargridding/gridding.py
index 255ddb5a06a791463105546ce81f3bdbe1372013..6bc4025d0bac725452dadc48a02f1b5ffe299f1f 100644
--- a/toargridding/gridding.py
+++ b/toargridding/gridding.py
@@ -1,9 +1,13 @@
-from datetime import datetime
+from itertools import product
+from collections import namedtuple
 
-import pandas as pd
+import xarray as xr
 
-from toargridding.toar_rest_client import AnalysisService, TimeSample
+from toargridding.toar_rest_client import AnalysisService
 from toargridding.grids import GridDefinition
+from toargridding.metadata import Metadata, TimeSample
+
+GriddedResult = namedtuple("GriddedResult", ["dataset", "metadata"])
 
 
 def get_gridded_toar_data(
@@ -11,9 +15,16 @@ def get_gridded_toar_data(
     grid: GridDefinition,
     time: TimeSample,
     variables: list[str],
     stats: list[str],
-):
-    station_data, metadata = analysis_service.get_timeseries_and_metadata(
-        variables, stats, time
-    )
-    return grid.as_xarray(station_data, metadata)
+) -> tuple[list[xr.Dataset], list[Metadata]]:
+    metadatas = [
+        Metadata.construct(var, stat, time) for var, stat in product(variables, stats)
+    ]
+
+    datasets = []
+    for metadata in metadatas:  # TODO standard_name?
+        data = analysis_service.get_data(metadata)
+        ds = grid.as_xarray(data)
+        datasets.append(ds)
+
+    return datasets, metadatas
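+
+
+# Usage sketch (mirrors tests/test_gridding.py; assumes the TOAR endpoint is reachable):
+#
+#   from datetime import datetime
+#   from toargridding.grids import RegularGrid
+#
+#   service = AnalysisService("https://toar-data.fz-juelich.de/api/v2/analysis/statistics/")
+#   datasets, metadatas = get_gridded_toar_data(
+#       service,
+#       RegularGrid(10, 10),
+#       TimeSample(datetime(2009, 12, 31), datetime(2011, 1, 1), "daily"),
+#       variables=["mole_fraction_of_ozone_in_air"],
+#       stats=["mean"],
+#   )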
diff --git a/toargridding/grids.py b/toargridding/grids.py
index 2404d6a4aa184dab2f168b656ab49e3933c84f23..92a8d31b1758cf25c0614a605aabc9d754d8e1ba 100644
--- a/toargridding/grids.py
+++ b/toargridding/grids.py
@@ -6,18 +6,14 @@ import xarray as xr
 import pandas as pd
 import numpy as np
 
-from toargridding.toar_rest_client import (
-    STATION_LAT,
-    STATION_LON,
-    COORDS,
-)
 from toargridding.metadata import (
     Variables,
-    Variable,
     Coordinates,
-    Coordinate,
     get_global_attributes,
+    AnalysisRequestResult,
+    Metadata,
 )
+from toargridding.variables import Variable, Coordinate
 
 GridType = Enum("GridType", ["regular"])
 
@@ -37,6 +33,11 @@
         case other:
             raise Exception("invalid grid type")
 
+    @property
+    @abstractmethod
+    def description(self):
+        pass
+
     @staticmethod
     def from_netcdf():
         pass
@@ -64,9 +65,9 @@
         spatial_shape = (self.lon.size, self.lat.size)
         spatial_size = self.lon.size * self.lat.size
         self.dims = [
+            Coordinates.time.name,
             Coordinates.latitude.name,
             Coordinates.longitude.name,
-            Coordinates.time.name,
         ]
 
         self._as_xy_index = np.dstack(
@@ -74,26 +75,18 @@
         ).reshape(-1, 2)
         self._as_i_index = np.arange(spatial_size).reshape(spatial_shape).T
 
-    def as_xarray(self, timeseries_per_statistic, metadata):
-        datasets = dict()
-        clean_coords = self.clean_coords(metadata[COORDS])
-        for statistic, data in timeseries_per_statistic.items():
-            clean_data = self.clean_data(data)
-            data_grouped_by_cell = self.group_data_by_cell(clean_data, clean_coords)
-            cell_statistics = self.get_cell_statistics(data_grouped_by_cell)
-            datasets[statistic] = self.create_dataset(cell_statistics, metadata)
-
-        return datasets
-
-    def clean_coords(self, coords: pd.DataFrame):
-        valid_coords = coords.notna().all(axis=1)
-        return coords[valid_coords]
-
-    def clean_data(self, timeseries):
-        all_na = timeseries.isna().all(axis=1)
-        timeseries = timeseries[~all_na]
-        timeseries = timeseries.fillna(0)  # TODO check if data imputation is sensible
-        return timeseries
+    @property
+    def description(self):
+        return f"regular global grid with lat/lon resolutions ({self.lat.step}, {self.lon.step})"
+
+    def as_xarray(self, data: AnalysisRequestResult) -> xr.Dataset:
+        data_grouped_by_cell = self.group_data_by_cell(
+            data.stations_data, data.stations_coords
+        )
+        cell_statistics = self.get_cell_statistics(data_grouped_by_cell)
+        dataset = self.create_dataset(cell_statistics, data.metadata)
+
+        return dataset
 
     def group_data_by_cell(self, data: pd.DataFrame, coords: pd.DataFrame):
         cell_indices = self.as_cell_index(coords)
@@ -105,7 +98,7 @@
 
         return data_with_indices.groupby(GridDefinition.cell_index_name)
 
-    def get_cell_statistics(self, groups):
+    def get_cell_statistics(self, groups) -> dict[Variables, pd.DataFrame]:
         stats = {
             Variables.mean: groups.mean(),
             Variables.std: groups.std(),
@@ -114,40 +107,42 @@
 
         return stats
 
-    def create_dataset(self, cell_statistics, metadata):
-        random_df = list(cell_statistics.values())[0]  # take cols from random df
-        time = pd.DatetimeIndex(random_df.columns)
+    def create_dataset(
+        self, cell_statistics: dict[Variables, pd.DataFrame], metadata: Metadata
+    ):
+        time = Coordinate.from_data(
+            metadata.time.as_cf_index(),
+            Coordinates.time,
+            metadata,
+            step=metadata.time.sampling,
+        )
 
         gridded_ds = self.get_empty_grid(time, metadata)
         for variable, aggregated_data in cell_statistics.items():
-            data_array_dict = self.get_data_array_dict(time, aggregated_data, variable)
+            data_array_dict = self.get_data_array_dict(
+                time, aggregated_data, variable, metadata
+            )
             gridded_ds = gridded_ds.assign(data_array_dict)
 
         return gridded_ds
 
-    def get_data_array_dict(self, time, aggregated_data, variable):
+    def get_data_array_dict(self, time, aggregated_data, variable, metadata):
         gridded_data = self.create_gridded_data(time, aggregated_data)
-        gridded_variable = Variable.from_data(gridded_data, variable)
+        gridded_variable = Variable.from_data(gridded_data, variable, metadata)
 
         return {variable.name: gridded_variable.as_data_array(self.dims)}
 
     def create_gridded_data(self, time, grouped_timeseries):
-        values = np.empty((self.lat.size, self.lon.size, time.size))
+        values = np.empty((time.size, self.lat.size, self.lon.size))
         values[...] = self.fill_value
 
         index = self._as_xy_index[grouped_timeseries.index.astype(int)]
-        values[index.T[0], index.T[1]] = grouped_timeseries.values.reshape(
+        values[:, index.T[0], index.T[1]] = grouped_timeseries.values.reshape(
             -1, time.size
-        )
+        ).T
 
         return values
 
     def as_cell_index(self, coords):
-        id_x = self.coord_to_index(
-            coords[STATION_LAT], self.lat.min, self.lat.resolution
-        )
-        id_y = self.coord_to_index(
-            coords[STATION_LON], self.lon.min, self.lon.resolution
-        )
+        id_x = self.coord_to_index(coords[self.lat.name], self.lat.min, self.lat.step)
+        id_y = self.coord_to_index(coords[self.lon.name], self.lon.min, self.lon.step)
 
         id_i = self._as_i_index[id_x, id_y]
@@ -157,12 +152,12 @@
         return (np.ceil((coord / d_axis) - 0.5) - x0_axis / d_axis).astype(int)
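+        # worked example: on the 10° grid starting at -90°,
+        # coord_to_index(52.5, -90, 10) = ceil(52.5/10 - 0.5) - (-90/10) = 5 + 9 = 14,
+        # i.e. a station at 52.5°N falls into the latitude cell centred at 50°N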
 
     def get_empty_grid(
-        self, time: pd.DatetimeIndex, metadata: pd.DataFrame
+        self, time: Coordinate, metadata: Metadata
     ) -> xr.Dataset:  # TODO make CF-compliant => docs
         coords = {
-            Variables.longitude.name: self.lon.as_data_array(dims=self.lon.name),
-            Variables.latitude.name: self.lat.as_data_array(dims=self.lat.name),
-            Variables.time.name: time,
+            Variables.time.name: time.as_data_array(),
+            Variables.latitude.name: self.lat.as_data_array(),
+            Variables.longitude.name: self.lon.as_data_array(),
         }
 
         ds = xr.Dataset(coords=coords, attrs=get_global_attributes(metadata))
diff --git a/toargridding/metadata.py b/toargridding/metadata.py
index 72f91eef46b5bbbd90ecf46d5ebb11e27bcb2648..bdb6d29634099d25cf19c57cc5740e0a3689a7ed 100644
--- a/toargridding/metadata.py
+++ b/toargridding/metadata.py
@@ -1,142 +1,177 @@
 from datetime import datetime
 from enum import Enum
 from dataclasses import dataclass
 
 import numpy as np
+import pandas as pd
 import xarray as xr
 
+from toargridding.toarstats_constants import STATISTICS_LIST
+from toargridding.static_metadata import global_cf_attributes, TOARVariable
+
-date_created = datetime.utcnow().strftime("%Y-%m-dT%H:%M:%SZ")
+date_created = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
 
 COORDINATE_VARIABLES = ["latitude", "longitude", "time"]
 DATA_VARIABLES = ["mean", "std", "n"]
+
 DataVariables = Enum("DataVariables", DATA_VARIABLES)
 Coordinates = Enum("Coordinates", COORDINATE_VARIABLES)
 Variables = Enum("Variables", [*DATA_VARIABLES, *COORDINATE_VARIABLES])
 
+TIME_FREQUENCIES = {"daily": "D", "monthly": "M", "yearly": "Y"}  # TODO test
+
-@dataclass
-class Variable:
-    var: Variables
-    data: np.ndarray
-    attributes: dict[str, str]
-    encoding: dict[str, str]
-
-    @classmethod
-    def from_data(cls, data, variable: Variables, **kwargs):
-        metadata = Variable.get_metadata(variable)
-        return cls(variable, data, **metadata, **kwargs)
-
-    @staticmethod
-    def get_metadata(variable: Variables):
-        return VARIABLE_METADATA[variable.name]
-
-    def as_data_array(self, dims):
-        return xr.DataArray(self.data, name=self.name, dims=dims, attrs=self.attributes)
-
-    @property
-    def name(self):
-        return self.var.name
-
-    @property
-    def size(self):
-        return self.data.size
-
-    @property
-    def min(self):
-        return self.data.min()
-
-    @property
-    def max(self):
-        return self.data.max()
-
-
-@dataclass
-class Coordinate(Variable):
-    resolution: float
-
-    @classmethod
-    def from_resolution(
-        cls, variable: Variables, resolution: float, min: float, max: float
-    ):
-        span = max - min
-        n = int(span / resolution)  # raise error if invalid inputs ?
-        data = np.linspace(min, max, n + 1)
-
-        return cls.from_data(data, variable, resolution=resolution)
+
+# TODO allow only valid sampling strings
+@dataclass
+class TimeSample:
+    start: datetime
+    end: datetime
+    sampling: str
+
+    def as_datetime_index(self):
+        return pd.period_range(self.start, self.end, freq=self.frequency).to_timestamp()
+
+    @property
+    def daterange_option(self):
+        return f"{self.start.isoformat()},{self.end.isoformat()}"
+
+    @property
+    def frequency(self):
+        return TIME_FREQUENCIES[self.sampling]
+
+    def as_cf_index(self):
+        # NOTE: assumes daily sampling, matching the "days since ..." time units below
+        n_days = (self.end - self.start).days
+        return np.arange(n_days + 1)
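+
+# Example (values follow directly from the definitions above):
+#   ts = TimeSample(datetime(2010, 1, 1), datetime(2010, 1, 3), "daily")
+#   ts.as_datetime_index() -> 2010-01-01, 2010-01-02, 2010-01-03
+#   ts.as_cf_index()       -> array([0, 1, 2])   # days since the start date
+#   ts.daterange_option    -> "2010-01-01T00:00:00,2010-01-03T00:00:00"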
+
+
+@dataclass
+class Metadata:
+    variable: TOARVariable
+    statistic: str
+    time: TimeSample
+
+    @staticmethod
+    def construct(standard_name: str, stat: str, time: TimeSample):
+        variable = TOARVariable.get(standard_name)
+
+        if stat not in STATISTICS_LIST:
+            raise ValueError(f"invalid statistic: {stat}")
+
+        return Metadata(variable, stat, time)
+
+
+@dataclass
+class AnalysisRequestResult:
+    stations_data: pd.DataFrame
+    stations_coords: pd.DataFrame
+    metadata: Metadata
 
 
 def get_global_attributes(metadata):
-    return GLOBAL
-
-
-def get_encoding(variables: list[Variable]):
-    return {variable.name: variable.encoding for variable in variables}
-
-
-def get_variable_attributes(variable: Variables):
-    return VARIABLE_METADATA[variable]["attributes"]
-
-
-VARIABLE_METADATA = {
-    Variables.latitude.name: {
-        "attributes": {
-            "standard_name": "latitude",
-            "long_name": "latitude in degrees north",
-            "units": "degrees_north",
-            "coverage_type": "coordinate",
-        },
-        "encoding": {
-            "dtype": "int32",
-            "_FillValue": -1000,
-        },
-    },
-    Variables.longitude.name: {
-        "attributes": {
-            "standard_name": "longitude",
-            "long_name": "longitude in degrees east",
-            "units": "degrees_east",
-            "coverage_type": "coordinate",
-        },
-        "encoding": {
-            "dtype": "int32",
-            "_FillValue": -1000,
-        },
-    },
-    Variables.time.name: {
-        "attributes": {
-            "units": "time since ...",
-            "long_name": "time",
-            "standard_name": "time",
-            "coverage_content_type": "coordinate",
-        },
-        "encoding": {"dtype": "int32", "_FillValue": None},
-    },
-    Variables.mean.name: {
-        "attributes": {},
-        "encoding": {"dtype": "float32", "_FillValue": np.nan, "zlib": False},
-    },
-    Variables.std.name: {
-        "attributes": {},
-        "encoding": {
-            "dtype": "float32",
-            "_FillValue": np.nan,
-            "zlib": False,
-        },
-    },
-    Variables.n.name: {
-        "attributes": {},
-        "encoding": {
-            "dtype": "int32",
-            "_FillValue": 0,
-            "zlib": False,
-        },
-    },
-}
-
-
-GLOBAL = {
-    "title": "foo",
-    "date_created": date_created,
-    "history": f"File created at {date_created} using xarray, python toar-analysis service",
-}
+    metadata = {
+        "title": get_title(metadata),
+        "summary": get_summary(metadata),
+        "date_created": date_created,
+        "history": f"{date_created}: File created by toargridding package using data from toar-analysis service",
+        # "geospatial_bounds": 0,  # for polygonal description
"geospatial_bounds": 0, # for polygonal desrcription + "geospatial_lat_min": -90, # TODO read from grid description + "geospatial_lat_max": 90, + "geospatial_lon_min": -180, + "geospatial_lon_max": 180, + "time_coverage_start": metadata.time.start.isoformat(), + "time_coverage_end": metadata.time.end.isoformat(), + # "time_coverage_duration": 0, # TODO insert durations + # "time_coverage_resolution": 0, + } + metadata = metadata | global_cf_attributes + return metadata + + +def get_title(metadata: Metadata): + return f"{metadata.time.sampling} {metadata.statistic} statistic for {metadata.variable.standart_name} from {metadata.time.start} to {metadata.time.end} aggregated on global grid" + + +def get_summary(metadata: Metadata): # TODO make summary different from title + return f"{metadata.time.sampling} {metadata.statistic} statistic for {metadata.variable.standart_name} from {metadata.time.start} to {metadata.time.end} aggregated on global grid" + + +def get_cf_metadata(variable: Variables, metadata: Metadata | None): + match variable.name: + case Variables.latitude.name: + cf_metadata = { + "attributes": { + "standard_name": "latitude", + "long_name": "latitude in degrees north", + "units": "degrees_north", + "coverage_type": "coordinate", + }, + "encoding": {"dtype": "int32"}, + } + case Variables.longitude.name: + cf_metadata = { + "attributes": { + "standard_name": "longitude", + "long_name": "longitude in degrees east", + "units": "degrees_east", + "coverage_type": "coordinate", + }, + "encoding": {"dtype": "int32"}, + } + case Variables.time.name: + cf_metadata = { + "attributes": { + "units": f"days since {metadata.time.start.isoformat()}", # TODO formatting !! + "long_name": "time", + "standard_name": "time", + "calendar": "standard", + "coverage_content_type": "coordinate", + }, + "encoding": {"dtype": "int32"}, + } + case Variables.mean.name: + cf_metadata = { + "attributes": { + "units": metadata.variable.units, + "long_name": metadata.variable.long_name, + "standard_name": metadata.variable.standart_name, + "cell_methods": "time: point, latitude: mean, longitude: mean", + "coverage_content_type": "physicalMeasurement", + }, + "encoding": {"dtype": "float32", "_FillValue": np.nan, "zlib": False}, + } + case Variables.std.name: + cf_metadata = { + "attributes": { + "units": metadata.variable.units, + "long_name": metadata.variable.long_name, + # TODO standart name extension vs cell_methods + "standard_name": f"{metadata.variable.standart_name} standard_error", + "cell_methods": "time: point, latitude: standard_deviation, longitude: standard_deviation", + "coverage_content_type": "physicalMeasurement", + }, + "encoding": { + "dtype": "float32", + "_FillValue": np.nan, + "zlib": False, + }, + } + case Variables.n.name: + cf_metadata = { + "attributes": { + "units": "1", + "long_name": f"number of {metadata.variable.long_name} observations in cell", + "standard_name": "number_of_observations", + "cell_methods": "time: point, latitude: sum, longitude: sum", + "coverage_content_type": "auxiliaryInformation", + }, + "encoding": { + "dtype": "int32", + "_FillValue": 0, + "zlib": False, + }, + } + + return cf_metadata diff --git a/toargridding/static_metadata.py b/toargridding/static_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..313358191d851ad81554b710fe925ffc37204516 --- /dev/null +++ b/toargridding/static_metadata.py @@ -0,0 +1,56 @@ +from pathlib import Path +from dataclasses import dataclass +import json + +STATIC_METADATA_PATH = 
diff --git a/toargridding/static_metadata.py b/toargridding/static_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..313358191d851ad81554b710fe925ffc37204516
--- /dev/null
+++ b/toargridding/static_metadata.py
@@ -0,0 +1,56 @@
+from pathlib import Path
+from dataclasses import dataclass
+import json
+
+STATIC_METADATA_PATH = Path(__file__).parent / "static_metadata"
+TOAR_VARIABLES_METADATA_PATH = STATIC_METADATA_PATH / "toar_variables.json"
+GLOBAL_CF_ATTRIBUTES_PATH = STATIC_METADATA_PATH / "global_cf_attributes.json"
+
+
+def load_global_cf_attributes():
+    with open(GLOBAL_CF_ATTRIBUTES_PATH, "r") as f:
+        return json.load(f)
+
+
+@dataclass
+class TOARVariable:
+    vars = None  # class-level cache, filled by load_toar_variables()
+
+    name: str
+    longname: str
+    displayname: str
+    cf_standardname: str
+    units: str
+    chemical_formula: str
+    id: int
+
+    @classmethod
+    def load_toar_variables(cls) -> None:
+        with open(TOAR_VARIABLES_METADATA_PATH, "r") as f:
+            variables = json.load(f)
+
+        cls.vars = [TOARVariable(**var) for var in variables]
+
+    @classmethod
+    def get(cls, standard_name: str) -> "TOARVariable":
+        for var in cls.vars:
+            if var.standard_name == standard_name:
+                return var
+        else:
+            raise ValueError(f"TOAR Database contains no variable {standard_name}")
+
+    @property
+    def toar_id(self):
+        return self.id
+
+    @property
+    def standard_name(self):
+        return self.cf_standardname
+
+    @property
+    def long_name(self):
+        return self.longname
+
+
+TOARVariable.load_toar_variables()
+global_cf_attributes = load_global_cf_attributes()
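+
+# Example lookup against the shipped toar_variables.json:
+#   TOARVariable.get("mole_fraction_of_ozone_in_air").toar_id -> 5
+#   TOARVariable.get("mole_fraction_of_ozone_in_air").units   -> "nmol mol-1"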
"mass_concentration_of_pm1_ambient_aerosol_in_air", + "units": "µg m-3", + "chemical_formula": "", + "id": 4 + }, + { + "name": "o3", + "longname": "ozone", + "displayname": "Ozone", + "cf_standardname": "mole_fraction_of_ozone_in_air", + "units": "nmol mol-1", + "chemical_formula": "O3", + "id": 5 + }, + { + "name": "no2", + "longname": "nitrogen dioxide", + "displayname": "NO2", + "cf_standardname": "mole_fraction_of_nitrogen_dioxide_in_air", + "units": "nmol mol-1", + "chemical_formula": "NO2", + "id": 6 + }, + { + "name": "toluene", + "longname": "toluene", + "displayname": "Toluene", + "cf_standardname": "mole_fraction_of_toluene_in_air", + "units": "nmol mol-1", + "chemical_formula": "C7H8", + "id": 7 + }, + { + "name": "so2", + "longname": "Sulphur dioxide", + "displayname": "SO2", + "cf_standardname": "mole_fraction_of_sulfur_dioxide_in_air", + "units": "nmol mol-1", + "chemical_formula": "SO2", + "id": 8 + }, + { + "name": "ethane", + "longname": "Ethane", + "displayname": "Ethane", + "cf_standardname": "mole_fraction_of_ethane_in_air", + "units": "nmol mol-1", + "chemical_formula": "C2H6", + "id": 9 + }, + { + "name": "propane", + "longname": "Propane", + "displayname": "Propane", + "cf_standardname": "mole_fraction_of_propane_in_air", + "units": "nmol mol-1", + "chemical_formula": "C3H8", + "id": 10 + } +] \ No newline at end of file diff --git a/toargridding/toar_rest_client.py b/toargridding/toar_rest_client.py index 1a2da65860097e0e7ca4871a91edc0802abf315d..10787f85219518c8f4bab30df8bc224e1ecb9622 100644 --- a/toargridding/toar_rest_client.py +++ b/toargridding/toar_rest_client.py @@ -1,24 +1,13 @@ -from datetime import timedelta, datetime import time -import logging import io from zipfile import ZipFile -from contextlib import contextmanager from dataclasses import dataclass import requests import pandas as pd -ALLOWED_SAMPLING_VALUES = [ - "daily", - "monthly", - "seasonal", - "vegseason", - "summer", - "xsummer", - "annual", - "custom", -] +from toargridding.metadata import Metadata, AnalysisRequestResult, Coordinates + STATION_LAT = "station_coordinates_lat" STATION_LON = "station_coordinates_lng" @@ -33,32 +22,55 @@ class AnalysisService: self.wait_time = 30 self.max_response_time = 60 * 30 # 30 min - def get_timeseries_and_metadata( - self, variable: str, stats: list[str], time: pd.DatetimeIndex - ): - query_options = self.get_query_options(variable, stats, time) - timeseries, metadata = self.get_timeseries_and_metadata(query_options) - return timeseries, metadata - - def get_query_options(self, variable, stats, time: "TimeSample"): + def get_data(self, metadata: Metadata) -> AnalysisRequestResult: + timeseries, timeseries_metadata = self.get_timeseries_and_metadata(metadata) + coords = self.get_clean_coords(timeseries_metadata) + timeseries = self.get_clean_timeseries(timeseries, metadata) + return AnalysisRequestResult(timeseries, coords, metadata) + + def get_clean_coords(self, timeseries_metadata): + coords = timeseries_metadata[COORDS] + coords.columns = [Coordinates.latitude.name, Coordinates.longitude.name] + valid_coords = coords.notna().all(axis=1) + return coords[valid_coords] + + def get_clean_timeseries(self, timeseries, metadata: Metadata): + all_na = timeseries.isna().all(axis=1) + timeseries = timeseries[~all_na] + timeseries = timeseries.fillna(0) + response_timesteps = timeseries.columns + expected_timesteps = metadata.time.as_datetime_index() + if not (response_timesteps == expected_timesteps).all(): + raise ValueError("foo") + + # TODO maybe use 
 
-    def query_timseries_and_metadata(self, query_options):
+    def query_timeseries_and_metadata(self, query_options) -> bytes:
         response = requests.get(self.stats_endpoint, params=query_options)
-        content = self.wait_for_data(response.json()["status"])
-        return self.load_data(content)
+        return self.wait_for_data(response.json()["status"])
 
     def wait_for_data(self, status_endpoint):
-        waiting_for_data = True
-
+        # TODO proper waiting
         for total_wait_time in range(0, self.max_response_time, self.wait_time):
             time.sleep(self.wait_time)
@@ -69,34 +81,17 @@
         else:
             raise Exception(f"Time Out after {self.max_response_time}")
 
-    def variable_as_options(self, variable: str) -> int:  # TODO
-        """translate variable names to TOAR ids"""
-        pass
-
-    def load_data(self, content: bytes) -> dict[str, pd.DataFrame]:
-        data = dict()  # TODO make pretty
+    def load_data(
+        self, content: bytes, metadata: Metadata
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
         zip_stream = io.BytesIO(content)
         with ZipFile(zip_stream) as myzip:
-            for data_file in myzip.namelist():
-                with myzip.open(data_file) as f:
-                    s_stream = io.StringIO(f.read().decode("utf-8"))
-                    data[data_file.removesuffix(".csv")] = pd.read_csv(
-                        s_stream, comment="#", index_col=0
-                    )
-
-            metadata = data.pop(AnalysisService.METADATA)
-            return data, metadata
-
-
-@dataclass
-class TimeSample:
-    start: datetime
-    end: datetime
-    sampling: str
+            timeseries = self.extract_data(myzip, metadata.statistic)
+            timeseries_metadata = self.extract_data(myzip, AnalysisService.METADATA)
 
-    def as_datetime_index():
-        pass
+        return timeseries, timeseries_metadata
 
-    @property
-    def daterange_option():
-        pass
+    def extract_data(self, zip_file, data_file) -> pd.DataFrame:
+        with zip_file.open(f"{data_file}.csv") as f:
+            s_stream = io.StringIO(f.read().decode("utf-8"))
+            return pd.read_csv(s_stream, comment="#", index_col=0)
diff --git a/toargridding/toarstats_constants.py b/toargridding/toarstats_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ff9c1c98accceb2e0e155ffa3510afa163a0d45
--- /dev/null
+++ b/toargridding/toarstats_constants.py
@@ -0,0 +1,65 @@
+# taken from https://gitlab.jsc.fz-juelich.de/esde/toar-public/toarstats/-/blob/master/toarstats/metrics/constants.py#L12-21
+
+ALLOWED_SAMPLING_VALUES = [
+    "daily",
+    "monthly",
+    # "seasonal",
+    # "vegseason",
+    # "summer",
+    # "xsummer",
+    "annual",
+    # "custom",
+]
+
+STATISTICS_LIST = [
+    "aot40",
+    "avgdma8epax",
+    "count",
+    "dark_aot40",
+    "dark_avg",
+    "data_capture",
+    "daylight_aot40",
+    "daylight_avg",
+    "daytime_avg",
+    "diurnal_cycle",
"dma8epa", + "dma8epa_strict", + "dma8epax", + "dma8epax_strict", + "dma8eu", + "dma8eu_strict", + "drmdmax1h", + "m7_avg", + "max", + "max1h", + "mean", + "median", + "min", + "nighttime_avg", + "nvgt050", + "nvgt060", + "nvgt070", + "nvgt080", + "nvgt090", + "nvgt100", + "nvgt120", + "nvgtall", + "p05", + "p10", + "p25", + "p75", + "p90", + "p95", + "p98", + "p99", + "percentiles1", + "percentiles2", + "somo10", + "somo10_strict", + "somo35", + "somo35_strict", + "stddev", + "w126", + "w126_24h", + "w90", +] diff --git a/toargridding/variables.py b/toargridding/variables.py new file mode 100644 index 0000000000000000000000000000000000000000..b6ef043ea036a08f656e5a6f3c68908fd8a90b4c --- /dev/null +++ b/toargridding/variables.py @@ -0,0 +1,68 @@ +from dataclasses import dataclass + +import numpy as np +import xarray as xr + +from toargridding.metadata import Variables, get_cf_metadata, Metadata + + +@dataclass +class Variable: + var: Variables + data: np.ndarray + attributes: dict[str, str] + encoding: dict[str, str] + + @classmethod + def from_data(cls, data, variable: Variables, metadata: Metadata | None, **kwargs): + cf_metadata = get_cf_metadata(variable, metadata=metadata) + # print(variable.name, cf_metadata) + return cls(variable, data, **cf_metadata, **kwargs) + + def as_data_array(self, dims): + da = xr.DataArray( + self.data, + name=self.name, + dims=dims, + attrs=self.attributes, + ) + da.encoding = self.encoding + return da + + @property + def name(self): + return self.var.name + + @property + def size(self): + return self.data.size + + @property + def min(self): + return self.data.min() + + @property + def max(self): + return self.data.max() + + +@dataclass +class Coordinate(Variable): + step: float | str + + @classmethod + def from_resolution( + cls, variable: Variables, resolution: float, min: float, max: float + ): + span = max - min + n = int(span / resolution) # raise error if invalid inputs ? + data = np.linspace(min, max, n + 1) + + return cls.from_data(data, variable, None, step=resolution) + + def as_data_array(self): + return super().as_data_array(dims=self.name) + + +def get_encoding(variables: list[Variable]): + return {variable.name: variable.encoding for variable in variables}