From e07becb37a389d0ce09abf14332fb00619be2ed2 Mon Sep 17 00:00:00 2001
From: Carsten Hinz <c.hinz@fz-juelich.de>
Date: Fri, 28 Jun 2024 16:25:18 +0200
Subject: [PATCH] added missing netCDF4 dependency

added info on how to install a different environment to the README

did some refactoring, mainly camelCase to snake_case

worked on implementation of contributors endpoint
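
The contributors handling is now driven from the calling code instead of the gridding step. A minimal usage sketch, mirroring the updated quality_controll notebook (`data`, `metadata` and `data_download_dir` as prepared there):

```python
from toargridding.contributors import contributionsManager

# collect the timeseries IDs that contributed to the analysis result
contrib = contributionsManager(metadata.get_id(), data_download_dir)
contrib.extract_contributors_from_data_frame(data.stations_data)
# store the resulting contributors endpoint in the request metadata
metadata.contributors_metadata_field = contrib.setup_contributors_endpoint_for_metadata()
```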
---
 README.md                            |  7 +++--
 examples/quality_controll.ipynb      | 14 ++++++++--
 pyproject.toml                       |  5 ++--
 src/toargridding/contributors.py     |  7 +++--
 src/toargridding/grids.py            |  3 +--
 src/toargridding/metadata.py         |  3 ++-
 src/toargridding/toar_rest_client.py | 38 +++++++++++++++++-----------
 tests/conversionOfTimestamps.py      |  2 +-
 tests/test_cache.py                  |  4 +--
 9 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 15f12d0..a4505e8 100644
--- a/README.md
+++ b/README.md
@@ -50,9 +50,12 @@ For the installation of all required dependencies call
 ```bash
 pip install -e .
 ```
+To execute the examples, which are provided as Jupyter notebooks, install the additional dependencies of the `interactive` preset by calling
+```bash
pip install -e ".[interactive]"
+```
 
-
-To run scripts or notebooks use:
+To run the example notebooks:
 ```bash
 #for selecting a notebook over the file browser in your webbrowser:
 jupyter notebook
diff --git a/examples/quality_controll.ipynb b/examples/quality_controll.ipynb
index 35e29fa..abd8725 100644
--- a/examples/quality_controll.ipynb
+++ b/examples/quality_controll.ipynb
@@ -28,6 +28,8 @@
     "from toargridding.metadata import Metadata, TimeSample, AnalysisRequestResult, Coordinates\n",
     "from toargridding.variables import Coordinate\n",
     "\n",
+    "from toargridding.contributors import contributionsManager\n",
+    "\n",
     "import logging\n",
     "from toargridding.defaultLogging import toargridding_defaultLogging\n",
     "#setup of logging\n",
@@ -42,7 +44,7 @@
     "cache_dir = toargridding_base_path / \"cache\"\n",
     "data_download_dir = toargridding_base_path / \"results\"\n",
     "\n",
-    "analysis_service = AnalysisServiceDownload(endpoint, cache_dir, data_download_dir, use_downloaded=False)\n",
+    "analysis_service = AnalysisServiceDownload(endpoint, cache_dir, data_download_dir, use_downloaded=True)\n",
     "my_grid = RegularGrid(1.9, 2.5)\n",
     "\n",
     "time = TimeSample(dt(2016,1,1), dt(2016,2,28), \"daily\")\n",
@@ -57,7 +59,15 @@
    "source": [
     "# this cell can runs longer than 30minutes\n",
     "data = analysis_service.get_data(metadata)\n",
-    "ds = my_grid.as_xarray(data)"
+    "\n",
+    "# create contributors endpoint and write result to metadata\n",
+    "contrib = contributionsManager(metadata.get_id(), data_download_dir)\n",
+    "contrib.extract_contributors_from_data_frame(data.stations_data)\n",
+    "metadata.contributors_metadata_field = contrib.setup_contributors_endpoint_for_metadata()\n",
+    "ds = my_grid.as_xarray(data)\n",
+    "print(ds)\n",
+    "#store dataset\n",
+    "ds.to_netcdf(data_download_dir / f\"{metadata.get_id()}_{my_grid.get_id()}.nc\")"
    ]
   },
   {
diff --git a/pyproject.toml b/pyproject.toml
index 51d4d7c..527d939 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "numpy",
     "xarray",
     "pandas",
+    "netCDF4",
 ]
 
 [project.urls]
@@ -48,7 +49,7 @@ installer = "uv"
 extra-dependencies = [
   "jupyter",
   "ipykernel",
-  "cartopy"
+  "cartopy",
 ]
 
 [tool.hatch.envs.hatch-static-analysis]
@@ -79,4 +80,4 @@ exclude_lines = [
   "no cov",
   "if __name__ == .__main__.:",
   "if TYPE_CHECKING:",
-]
\ No newline at end of file
+]
diff --git a/src/toargridding/contributors.py b/src/toargridding/contributors.py
index 4bb85a9..07ce3e7 100644
--- a/src/toargridding/contributors.py
+++ b/src/toargridding/contributors.py
@@ -1,11 +1,12 @@
 
+from pandas import DataFrame
 from pandas.core.groupby import DataFrameGroupBy
 from typing import Iterable
 from pathlib import Path
 
 #TODO: maybe create an abstract base class and separate the offline and service implementations
 class contributionsManager:
-    def __init__(self, requestID, contributors_path : Path = None, endpoint=""):
+    def __init__(self, requestID, contributors_path: Path = None, endpoint="https://toar-data.fz-juelich.de/api/v2/request_contributors"):
         self.requestID = requestID
         self.timeseriesIDs = set()
         self.endpoint = endpoint
@@ -32,7 +33,9 @@ class contributionsManager:
     def add_timeseries_ids(self, ids : Iterable[int]) -> None:
         for id in ids:
             self.timeseriesIDs.add(id)
-    def extract_contributors(self, data_grouped_by_cell : DataFrameGroupBy):
+    def extract_contributors_from_data_frame(self, data_frame: DataFrame):
+        self.add_timeseries_ids(data_frame.index.to_list())
+    def extract_contributors_from_grouped_dataframe(self, data_grouped_by_cell: DataFrameGroupBy):
         for _, table in data_grouped_by_cell:
             self.add_timeseries_ids( table.index.to_list() )
 
diff --git a/src/toargridding/grids.py b/src/toargridding/grids.py
index 1e7a003..f5e8180 100644
--- a/src/toargridding/grids.py
+++ b/src/toargridding/grids.py
@@ -137,7 +137,6 @@ class RegularGrid(GridDefinition):
             results of the request, including data, station coordinates and metadata of request
         """
         data_grouped_by_cell = self.group_data_by_cell(data.stations_data, data.stations_coords)
-        data.contributions.extract_contributors(data_grouped_by_cell)
         cell_statistics = self.get_cell_statistics(data_grouped_by_cell)
         dataset = self.create_dataset(cell_statistics, data.metadata)
 
@@ -182,7 +181,7 @@ class RegularGrid(GridDefinition):
 
         return stats
 
-    def create_dataset(self, cell_statistics: dict[str, pd.DataFrame], metadata: Metadata, contributions : contributionsManager) -> xr.Dataset:
+    def create_dataset(self, cell_statistics: dict[str, pd.DataFrame], metadata: Metadata) -> xr.Dataset:
         """creation of data set and filling with results from the gridding
 
         Parameters:
diff --git a/src/toargridding/metadata.py b/src/toargridding/metadata.py
index d6ad9af..63ca02d 100644
--- a/src/toargridding/metadata.py
+++ b/src/toargridding/metadata.py
@@ -96,6 +96,8 @@ class TimeSample:
 @dataclass
 class Metadata:
     """Metadata of a request.
+    This class is responsible for the bookkeeping of all processing steps applied to the data.
+    This starts with the metadata used for the request and includes further aspects like the filters that are applied.
 
     Attributes:
     ----------
@@ -113,7 +115,6 @@ class Metadata:
     time: TimeSample
     statistic: str
     moreOptions: dict = field(default_factory=dict)
-    requestID : str = None
     contributors_metadata_field : str = None
 
     @staticmethod
diff --git a/src/toargridding/toar_rest_client.py b/src/toargridding/toar_rest_client.py
index 3561b8a..57c7981 100644
--- a/src/toargridding/toar_rest_client.py
+++ b/src/toargridding/toar_rest_client.py
@@ -168,7 +168,7 @@ class Cache:
         """! get now as string for saving in cache"""
         return datetime.now().strftime(Cache.__timeFormat)
 
-    def clearCache(self):
+    def clear_cache(self):
         """!Delete all values from the cache, that are older than the maxDaysInCache value"""
         with self.storage_dict() as storage:
             now = datetime.now()
@@ -184,7 +184,7 @@ class Cache:
                 del storage[key]
 
     @staticmethod
-    def setMaxDaysInCache(maxDays: float):
+    def set_max_days_in_cache(maxDays: float):
         """! set the maximum age of entries in the cache.
         Also accepts fractions of a day down to seconds.
         Parameters:
@@ -218,13 +218,13 @@ class Connection:
 
         self.endpoint = endpoint
         self.cache = Cache(cache_dir)
-        self.cache.clearCache()
+        self.cache.clear_cache()
         self.cache_backup = Cache(cache_dir, "status_endpoints.old")
         # max wait time is 30min
         self.wait_seconds = []
-        self.setRequestTimes(5, 30)
+        self.set_request_times(5, 30)
 
-    def setRequestTimes(self, interval_min, maxWait_min):
+    def set_request_times(self, interval_min, max_wait_minutes):
         """set the intervals and maximum duration to wait, before requests to the analysis service are stopped
 
         The waiting intervals determine how long and often the status endpoint is checked if the results are available.
@@ -237,13 +237,13 @@ class Connection:
         maxWait_min:
             maximum duration to wait in minutes.
         """
-        if maxWait_min <= 0:
+        if max_wait_minutes <= 0:
             msg = "The maximum waiting time needs to be larger than 0min."
             raise RuntimeError(msg)
-        elif interval_min < 0 or interval_min > maxWait_min:
-            self.wait_seconds[0] = maxWait_min
+        elif interval_min < 0 or interval_min > max_wait_minutes:
+            self.wait_seconds = [max_wait_minutes * 60]
         else:
-            self.wait_seconds = [interval_min * 60 for _ in range(interval_min, maxWait_min + 1, interval_min)]
+            self.wait_seconds = [interval_min * 60 for _ in range(interval_min, max_wait_minutes + 1, interval_min)]
 
     def get(self, query_options: QueryOptions) -> requests.models.Response:
         """get results for a request.
@@ -419,18 +419,19 @@ class AnalysisService:
 
         Handles requesting and loading of data into memory as soon as they are available.
         In addition the data and coordinates undergo a cleanup.
+        This results in a dataset that only contains timeseries and station coordinates where both are present and valid, i.e. not NaN.
+        A timeseries needs at least one non-NaN value to be considered valid.
 
         Parameters:
         ----------
         metadata:
             meta data for the request.
         return:
-            Requested data and statistics, station coordinates and metadata of the request
+            Requested data and statistics, station coordinates and metadata of the request.
         """
 
         timeseries, timeseries_metadata = self.get_timeseries_and_metadata(metadata)
-        coords = self.get_clean_coords(timeseries_metadata)
-        timeseries = self.get_clean_timeseries(timeseries, metadata)
+        coords, timeseries = self.get_clean_timeseries_and_data(timeseries, timeseries_metadata, metadata)
         return AnalysisRequestResult(timeseries, coords, metadata)
 
     def get_timeseries_and_metadata(self, metadata: Metadata) -> tuple[pd.DataFrame, pd.DataFrame]:
@@ -442,10 +443,16 @@ class AnalysisService:
 
         query_options = QueryOptions.from_metadata(metadata)
         result = self.connection.get(query_options)
-        print(result.request)
-        print(result.request.url)
         timeseries, timeseries_metadata = self.load_data(result.content, metadata)
         return timeseries, timeseries_metadata
+
+    def get_clean_timeseries_and_data(self, timeseries: pd.DataFrame, timeseries_metadata: pd.DataFrame, metadata: Metadata) -> tuple[pd.DataFrame, pd.DataFrame]:
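+        """clean the timeseries and the station coordinates and restrict both to their common stations
+
+        Combines get_clean_coords and get_clean_timeseries and aligns both results on their index
+        (inner join), so only stations with valid coordinates and at least one valid value remain.
+        """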
+        coords = self.get_clean_coords(timeseries_metadata)
+        timeseries = self.get_clean_timeseries(timeseries, metadata)
+
+        timeseries, coords = timeseries.align(coords, join="inner", axis=0)
+        return coords, timeseries
+
 
     def get_clean_coords(self, timeseries_metadata: pd.DataFrame):
         """remove all stations with invalid coordinates and drop unused metadata
@@ -579,8 +586,9 @@ class AnalysisServiceDownload(AnalysisService):
 
         if needs_fresh_download:
             logger.info("Performing request to TOAR DB")
+            if filename.is_file():  # delete the old status endpoint in case a fresh download is requested for a successful request
+                self.connection.cache.remove(query_options.cache_key)
             response = self.connection.get(query_options)
-            print(response.request.url)
             with open(filename, "w+b") as downloaded_file:
                 downloaded_file.write(response.content)
         else:
diff --git a/tests/conversionOfTimestamps.py b/tests/conversionOfTimestamps.py
index a14c4ba..16e3a57 100644
--- a/tests/conversionOfTimestamps.py
+++ b/tests/conversionOfTimestamps.py
@@ -56,7 +56,7 @@ analysis_service = AnalysisServiceDownload(
 
 # maybe adapt the interval for requesting the results and the total duration, before the client pauses the requests.
 # as the requests take about 45min, it is more suitable to wait 60min before timing out the requests than the original 30min.
-analysis_service.connection.setRequestTimes(interval_min=5, maxWait_min=60)
+analysis_service.connection.set_request_times(interval_min=5, max_wait_minutes=60)
 
 createdFiles = []
 
diff --git a/tests/test_cache.py b/tests/test_cache.py
index 11e256c..e856cd2 100644
--- a/tests/test_cache.py
+++ b/tests/test_cache.py
@@ -15,7 +15,7 @@ def test_cache():
     assert "test key 1" in myTestCache
     assert "test key 2" in myTestCache
     assert myTestCache.get("test key 1") == "test content 1"
-    myTestCache.clearCache()
+    myTestCache.clear_cache()
     assert "test key 2" in myTestCache
 
     # edit creation date of key 2:
@@ -24,5 +24,5 @@ def test_cache():
 
     assert "test key 2" in myTestCache
     assert myTestCache.get("test key 2") == "other test content 2"
-    myTestCache.clearCache()
+    myTestCache.clear_cache()
     assert "test key 2" not in myTestCache
-- 
GitLab