From e07becb37a389d0ce09abf14332fb00619be2ed2 Mon Sep 17 00:00:00 2001
From: Carsten Hinz <c.hinz@fz-juelich.de>
Date: Fri, 28 Jun 2024 16:25:18 +0200
Subject: [PATCH] added missing netCDF4 dependency

added info on how to install a different environment to the README
did some refactoring, mainly camelCase to snake_case
worked on implementation of contributors endpoint
---
 README.md                            |  7 +++--
 examples/quality_controll.ipynb      | 14 ++++++++--
 pyproject.toml                       |  5 ++--
 src/toargridding/contributors.py     |  7 +++--
 src/toargridding/grids.py            |  3 +--
 src/toargridding/metadata.py         |  3 ++-
 src/toargridding/toar_rest_client.py | 38 +++++++++++++++++----------
 tests/conversionOfTimestamps.py      |  2 +-
 tests/test_cache.py                  |  4 +--
 9 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 15f12d0..a4505e8 100644
--- a/README.md
+++ b/README.md
@@ -50,9 +50,12 @@ For the installation of all required dependencies call
 ```bash
 pip install -e .
 ```
+To execute the examples, which are provided as Jupyter notebooks, also install the optional dependencies:
+```bash
+pip install -e ".[interactive]"
+```
 
-
-To run scripts or notebooks use:
+To run the example notebooks:
 ```bash
 #for selecting a notebook over the file browser in your webbrowser:
 jupyter notebook
diff --git a/examples/quality_controll.ipynb b/examples/quality_controll.ipynb
index 35e29fa..abd8725 100644
--- a/examples/quality_controll.ipynb
+++ b/examples/quality_controll.ipynb
@@ -28,6 +28,8 @@
     "from toargridding.metadata import Metadata, TimeSample, AnalysisRequestResult, Coordinates\n",
     "from toargridding.variables import Coordinate\n",
     "\n",
+    "from toargridding.contributors import contributionsManager\n",
+    "\n",
     "import logging\n",
     "from toargridding.defaultLogging import toargridding_defaultLogging\n",
     "#setup of logging\n",
@@ -42,7 +44,7 @@
     "cache_dir = toargridding_base_path / \"cache\"\n",
     "data_download_dir = toargridding_base_path / \"results\"\n",
     "\n",
-    "analysis_service = AnalysisServiceDownload(endpoint, cache_dir, data_download_dir, use_downloaded=False)\n",
+    "analysis_service = AnalysisServiceDownload(endpoint, cache_dir, data_download_dir, use_downloaded=True)\n",
     "my_grid = RegularGrid(1.9, 2.5)\n",
     "\n",
     "time = TimeSample(dt(2016,1,1), dt(2016,2,28), \"daily\")\n",
@@ -57,7 +59,15 @@
    "source": [
     "# this cell can run for longer than 30 minutes\n",
     "data = analysis_service.get_data(metadata)\n",
-    "ds = my_grid.as_xarray(data)"
+    "\n",
+    "# create contributors endpoint and write result to metadata\n",
+    "contrib = contributionsManager(metadata.get_id(), data_download_dir)\n",
+    "contrib.extract_contributors_from_data_frame(data.stations_data)\n",
+    "metadata.contributors_metadata_field = contrib.setup_contributors_endpoint_for_metadata()\n",
+    "ds = my_grid.as_xarray(data)\n",
+    "print(ds)\n",
+    "#store dataset\n",
+    "ds.to_netcdf(data_download_dir / f\"{metadata.get_id()}_{my_grid.get_id()}.nc\")"
   ]
  },
 {
diff --git a/pyproject.toml b/pyproject.toml
index 51d4d7c..527d939 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "numpy",
     "xarray",
     "pandas",
+    "netCDF4",
 ]
 
 [project.urls]
@@ -48,7 +49,7 @@ installer = "uv"
 extra-dependencies = [
     "jupyter",
     "ipykernel",
-    "cartopy"
+    "cartopy",
 ]
 
 [tool.hatch.envs.hatch-static-analysis]
@@ -79,4 +80,4 @@ exclude_lines = [
     "no cov",
     "if __name__ == .__main__.:",
     "if TYPE_CHECKING:",
-]
\ No newline at end of file
+]
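For readers who skip the notebook: the changed cell amounts to the sketch below. It assumes `analysis_service`, `my_grid`, `metadata`, and `data_download_dir` are set up as in the earlier notebook cells; only the contributor bookkeeping and the netCDF export are new in this patch.

```python
from toargridding.contributors import contributionsManager

# run the analysis request; this can take more than 30 minutes
data = analysis_service.get_data(metadata)

# collect the contributing time series and prepare the contributors endpoint
contrib = contributionsManager(metadata.get_id(), data_download_dir)
contrib.extract_contributors_from_data_frame(data.stations_data)
metadata.contributors_metadata_field = contrib.setup_contributors_endpoint_for_metadata()

# grid the station data and store it; writing the file is what requires the new netCDF4 dependency
ds = my_grid.as_xarray(data)
ds.to_netcdf(data_download_dir / f"{metadata.get_id()}_{my_grid.get_id()}.nc")
```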
diff --git a/src/toargridding/contributors.py b/src/toargridding/contributors.py
index 4bb85a9..07ce3e7 100644
--- a/src/toargridding/contributors.py
+++ b/src/toargridding/contributors.py
@@ -1,11 +1,12 @@
+from pandas import DataFrame
 from pandas.core.groupby import DataFrameGroupBy
 from typing import Iterable
 
 from pathlib import Path
 
 #TODO: maybe create an abstract base class and separate the offline and service implementations
 class contributionsManager:
-    def __init__(self, requestID, contributors_path : Path = None, endpoint=""):
+    def __init__(self, requestID, contributors_path : Path = None, endpoint="https://toar-data.fz-juelich.de/api/v2/request_contributors"):
         self.requestID = requestID
         self.timeseriesIDs = set()
         self.endpoint = endpoint
@@ -32,7 +33,9 @@ class contributionsManager:
     def add_timeseries_ids(self, ids : Iterable[int]) -> None:
         for id in ids:
             self.timeseriesIDs.add(id)
 
-    def extract_contributors(self, data_grouped_by_cell : DataFrameGroupBy):
+    def extract_contributors_from_data_frame(self, data_frame : DataFrame):
+        self.add_timeseries_ids( data_frame.index.to_list() )
+    def extract_contributors_from_grouped_dataframe(self, data_grouped_by_cell : DataFrameGroupBy):
         for _, table in data_grouped_by_cell:
             self.add_timeseries_ids( table.index.to_list() )
diff --git a/src/toargridding/grids.py b/src/toargridding/grids.py
index 1e7a003..f5e8180 100644
--- a/src/toargridding/grids.py
+++ b/src/toargridding/grids.py
@@ -137,7 +137,6 @@
             results of the request, including data, station coordinates and metadata of request
         """
         data_grouped_by_cell = self.group_data_by_cell(data.stations_data, data.stations_coords)
-        data.contributions.extract_contributors(data_grouped_by_cell)
         cell_statistics = self.get_cell_statistics(data_grouped_by_cell)
         dataset = self.create_dataset(cell_statistics, data.metadata)
 
@@ -182,7 +181,7 @@
 
         return stats
 
-    def create_dataset(self, cell_statistics: dict[str, pd.DataFrame], metadata: Metadata, contributions : contributionsManager) -> xr.Dataset:
+    def create_dataset(self, cell_statistics: dict[str, pd.DataFrame], metadata: Metadata) -> xr.Dataset:
         """creation of the data set and filling it with the results of the gridding
 
         Parameters:
diff --git a/src/toargridding/metadata.py b/src/toargridding/metadata.py
index d6ad9af..63ca02d 100644
--- a/src/toargridding/metadata.py
+++ b/src/toargridding/metadata.py
@@ -96,6 +96,8 @@ class TimeSample:
 @dataclass
 class Metadata:
     """Metadata of a request.
+    This class is responsible for the bookkeeping of all processing steps applied to the data.
+    It starts with the metadata used for the request and includes further aspects such as applied filters.
 
     Attributes:
     ----------
@@ -113,7 +115,6 @@ class Metadata:
     time: TimeSample
     statistic: str
     moreOptions: dict = field(default_factory=dict)
-    requestID : str = None
     contributors_metadata_field : str = None
 
     @staticmethod
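The split of the former `extract_contributors` into two methods keeps both call patterns available: `RegularGrid` no longer collects contributors during gridding, so the caller chooses the variant that matches the data at hand. A minimal sketch with a made-up stations DataFrame (indexed by time series id, like `data.stations_data`; the request id and the grouping key are purely illustrative):

```python
from pathlib import Path

import pandas as pd

from toargridding.contributors import contributionsManager

# toy stand-in for data.stations_data: one row per time series, indexed by time series id
stations_data = pd.DataFrame({"mean": [0.3, 0.5, 0.4]}, index=[101, 102, 103])

contrib = contributionsManager("request-0001", Path("results"))

# flat DataFrame: the ids are read straight off the index
contrib.extract_contributors_from_data_frame(stations_data)

# grouped variant: the same ids, collected per grid cell
data_grouped_by_cell = stations_data.groupby(stations_data.index // 100)
contrib.extract_contributors_from_grouped_dataframe(data_grouped_by_cell)
```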
diff --git a/src/toargridding/toar_rest_client.py b/src/toargridding/toar_rest_client.py
index 3561b8a..57c7981 100644
--- a/src/toargridding/toar_rest_client.py
+++ b/src/toargridding/toar_rest_client.py
@@ -168,7 +168,7 @@ class Cache:
         """! get now as string for saving in cache"""
         return datetime.now().strftime(Cache.__timeFormat)
 
-    def clearCache(self):
+    def clear_cache(self):
         """!Delete all values from the cache that are older than the maxDaysInCache value"""
         with self.storage_dict() as storage:
             now = datetime.now()
@@ -184,7 +184,7 @@
             del storage[key]
 
     @staticmethod
-    def setMaxDaysInCache(maxDays: float):
+    def set_max_days_in_cache(maxDays: float):
         """! set the maximum age of entries in the cache. Also accepts fractions of a day down to seconds.
 
         Parameters:
@@ -218,13 +218,13 @@ class Connection:
         self.endpoint = endpoint
         self.cache = Cache(cache_dir)
-        self.cache.clearCache()
+        self.cache.clear_cache()
         self.cache_backup = Cache(cache_dir, "status_endpoints.old")
         # max wait time is 30min
         self.wait_seconds = []
-        self.setRequestTimes(5, 30)
+        self.set_request_times(5, 30)
 
-    def setRequestTimes(self, interval_min, maxWait_min):
+    def set_request_times(self, interval_min, max_wait_minutes):
         """set the intervals and the maximum duration to wait before requests to the analysis service are stopped
 
         The waiting intervals determine how long and how often the status endpoint is checked for available results.
 
@@ -237,13 +237,13 @@
-        maxWait_min:
+        max_wait_minutes:
             maximum duration to wait in minutes.
         """
-        if maxWait_min <= 0:
+        if max_wait_minutes <= 0:
             msg = "The maximum waiting time needs to be larger than 0min."
             raise RuntimeError(msg)
-        elif interval_min < 0 or interval_min > maxWait_min:
-            self.wait_seconds[0] = maxWait_min
+        elif interval_min < 0 or interval_min > max_wait_minutes:
+            self.wait_seconds = [max_wait_minutes * 60]
         else:
-            self.wait_seconds = [interval_min * 60 for _ in range(interval_min, maxWait_min + 1, interval_min)]
+            self.wait_seconds = [interval_min * 60 for _ in range(interval_min, max_wait_minutes + 1, interval_min)]
 
     def get(self, query_options: QueryOptions) -> requests.models.Response:
         """get results for a request.
@@ -419,18 +419,19 @@ class AnalysisService:
 
         Handles requesting and loading of data into memory as soon as they are available.
         In addition the data and coordinates undergo a cleanup.
+        This results in a dataset which only contains time series and station coordinates if both are present and valid, i.e. not NaN.
+        A time series needs at least one non-NaN value to be considered valid.
 
         Parameters:
         ----------
         metadata:
             meta data for the request.
 
         return:
             Requested data and statistics, station coordinates and metadata of the request.
         """
         timeseries, timeseries_metadata = self.get_timeseries_and_metadata(metadata)
-        coords = self.get_clean_coords(timeseries_metadata)
-        timeseries = self.get_clean_timeseries(timeseries, metadata)
+        coords, timeseries = self.get_clean_timeseries_and_data(timeseries, timeseries_metadata, metadata)
         return AnalysisRequestResult(timeseries, coords, metadata)
 
     def get_timeseries_and_metadata(self, metadata: Metadata) -> tuple[pd.DataFrame, pd.DataFrame]:
@@ -442,10 +443,16 @@ class AnalysisService:
         query_options = QueryOptions.from_metadata(metadata)
         result = self.connection.get(query_options)
-        print(result.request)
-        print(result.request.url)
         timeseries, timeseries_metadata = self.load_data(result.content, metadata)
         return timeseries, timeseries_metadata
+
+    def get_clean_timeseries_and_data(self, timeseries : pd.DataFrame, timeseries_metadata : pd.DataFrame, metadata : Metadata) -> tuple[pd.DataFrame, pd.DataFrame]:
+        coords = self.get_clean_coords(timeseries_metadata)
+        timeseries = self.get_clean_timeseries(timeseries, metadata)
+
+        timeseries, coords = timeseries.align(coords, join="inner", axis=0)
+        return coords, timeseries
+
 
     def get_clean_coords(self, timeseries_metadata: pd.DataFrame):
         """remove all stations with invalid coordinates and drop unused metadata
 
@@ -579,8 +586,9 @@
 
         if needs_fresh_download:
             logger.info("Performing request to TOAR DB")
+            if filename.is_file():  # delete the old status endpoint in case a fresh download is requested for a successful request
+                self.connection.cache.remove(query_options.cache_key)
             response = self.connection.get(query_options)
-            print(response.request.url)
             with open(filename, "w+b") as downloaded_file:
                 downloaded_file.write(response.content)
         else:
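The `align(..., join="inner", axis=0)` step in the new `get_clean_timeseries_and_data` is what guarantees that only stations present in both cleaned frames survive. Its effect in isolation, with made-up frames:

```python
import pandas as pd

timeseries = pd.DataFrame({"mean": [0.3, 0.5, 0.4]}, index=[101, 102, 103])
# station 102 was dropped during the coordinate cleanup
coords = pd.DataFrame({"lat": [50.9, 48.1], "lon": [6.4, 11.6]}, index=[101, 103])

# keep only the rows whose index appears in both frames
timeseries, coords = timeseries.align(coords, join="inner", axis=0)
print(timeseries.index.to_list())  # [101, 103]
```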
""" timeseries, timeseries_metadata = self.get_timeseries_and_metadata(metadata) - coords = self.get_clean_coords(timeseries_metadata) - timeseries = self.get_clean_timeseries(timeseries, metadata) + coords, timeseries = self.get_clean_timeseries_and_data(timeseries, timeseries_metadata, metadata ) return AnalysisRequestResult(timeseries, coords, metadata) def get_timeseries_and_metadata(self, metadata: Metadata) -> tuple[pd.DataFrame, pd.DataFrame]: @@ -442,10 +443,16 @@ class AnalysisService: query_options = QueryOptions.from_metadata(metadata) result = self.connection.get(query_options) - print(result.request) - print(result.request.url) timeseries, timeseries_metadata = self.load_data(result.content, metadata) return timeseries, timeseries_metadata + + def get_clean_timeseries_and_data(self, timeseries : pd.DataFrame, timeseries_metadata : pd.DataFrame, metadata : Metadata) -> tuple[pd.DataFrame, pd.DataFrame]: + coords = self.get_clean_coords(timeseries_metadata) + timeseries = self.get_clean_timeseries(timeseries, metadata) + + timeseries, coords = timeseries.align( coords, join="inner", axis=0 ) + return coords, timeseries + def get_clean_coords(self, timeseries_metadata: pd.DataFrame): """remove all stations with invalid coordinates and drop unused metadata @@ -579,8 +586,9 @@ class AnalysisServiceDownload(AnalysisService): if needs_fresh_download: logger.info("Performing request to TOAR DB") + if filename.is_file():#delete old status endpoint, in case a fresh download is requested for an successful request. + self.connection.cache.remove(query_options.cache_key) response = self.connection.get(query_options) - print(response.request.url) with open(filename, "w+b") as downloaded_file: downloaded_file.write(response.content) else: diff --git a/tests/conversionOfTimestamps.py b/tests/conversionOfTimestamps.py index a14c4ba..16e3a57 100644 --- a/tests/conversionOfTimestamps.py +++ b/tests/conversionOfTimestamps.py @@ -56,7 +56,7 @@ analysis_service = AnalysisServiceDownload( # maybe adopt the interval for requesting the results and the total duration, before the client pauses the requests. # as the requests take about 45min, it is more suitable to wait 60min before timing out the requests than the original 30min. -analysis_service.connection.setRequestTimes(interval_min=5, maxWait_min=60) +analysis_service.connection.set_request_times(interval_min=5, max_wait_minutes=60) createdFiles = [] diff --git a/tests/test_cache.py b/tests/test_cache.py index 11e256c..e856cd2 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -15,7 +15,7 @@ def test_cache(): assert "test key 1" in myTestCache assert "test key 2" in myTestCache assert myTestCache.get("test key 1") == "test content 1" - myTestCache.clearCache() + myTestCache.clear_cache() assert "test key 2" in myTestCache # edit creation date of key 2: @@ -24,5 +24,5 @@ def test_cache(): assert "test key 2" in myTestCache assert myTestCache.get("test key 2") == "other test content 2" - myTestCache.clearCache() + myTestCache.clear_cache() assert "test key 2" not in myTestCache -- GitLab