From 733ae64e7fdc0ea457d3ca324051a7d06b51ef9c Mon Sep 17 00:00:00 2001
From: Carsten Hinz <c.hinz@fz-juelich.de>
Date: Thu, 27 Jun 2024 17:19:44 +0200
Subject: [PATCH] continued rework of metadata and how the contributors will be
 included

---
 examples/produce_data_withOptional.ipynb | 20 +++++++++++++++++---
 examples/quality_controll.ipynb          | 10 ++++++++--
 src/toargridding/contributors.py         | 20 +++++++++++++-------
 src/toargridding/gridding.py             |  3 +++
 src/toargridding/grids.py                |  8 ++++----
 src/toargridding/metadata.py             | 10 +++++++---
 src/toargridding/toar_rest_client.py     |  8 +++++---
 7 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/examples/produce_data_withOptional.ipynb b/examples/produce_data_withOptional.ipynb
index d1b0031..e73b77e 100644
--- a/examples/produce_data_withOptional.ipynb
+++ b/examples/produce_data_withOptional.ipynb
@@ -2,9 +2,22 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'kwargs' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 8\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtoargridding\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtoar_rest_client\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AnalysisServiceDownload, Connection\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtoargridding\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgrids\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RegularGrid\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtoargridding\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgridding\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_gridded_toar_data\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtoargridding\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetadata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TimeSample\n\u001b[1;32m     11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtoargridding\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdefaultLogging\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m toargridding_defaultLogging\n",
+      "File \u001b[0;32m~/Eigene Daten/FZJ/JSC/workingDirectories/TOAR/toargridding/src/toargridding/gridding.py:21\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtoargridding\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtoar_rest_client\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AnalysisService\n\u001b[1;32m     11\u001b[0m GriddedResult \u001b[38;5;241m=\u001b[39m namedtuple(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGriddedResult\u001b[39m\u001b[38;5;124m\"\u001b[39m, [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdataset\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m     14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_gridded_toar_data\u001b[39m(\n\u001b[1;32m     15\u001b[0m     analysis_service: AnalysisService,\n\u001b[1;32m     16\u001b[0m     grid: GridDefinition,\n\u001b[1;32m     17\u001b[0m     time: TimeSample,\n\u001b[1;32m     18\u001b[0m     variables: \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m     19\u001b[0m     stats: \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m     20\u001b[0m     contributors_path : Path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m---> 21\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[43mkwargs\u001b[49m,\n\u001b[1;32m     22\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mtuple\u001b[39m[\u001b[38;5;28mlist\u001b[39m[xr\u001b[38;5;241m.\u001b[39mDataset], \u001b[38;5;28mlist\u001b[39m[Metadata]]:\n\u001b[1;32m     23\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"API to download data as xarrays\u001b[39;00m\n\u001b[1;32m     24\u001b[0m \n\u001b[1;32m     25\u001b[0m \u001b[38;5;124;03m    The function creates all combinations of the variable and stats 
list\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     42\u001b[0m \u001b[38;5;124;03m        Gridded datasets for each combination of variables and stats and appropriate metadata for each dataset.\u001b[39;00m\n\u001b[1;32m     43\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m     45\u001b[0m     metadatas \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m     46\u001b[0m         Metadata\u001b[38;5;241m.\u001b[39mconstruct(standard_name\u001b[38;5;241m=\u001b[39mvar, time\u001b[38;5;241m=\u001b[39mtime, stat\u001b[38;5;241m=\u001b[39mstat, moreOptions\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[1;32m     47\u001b[0m         \u001b[38;5;28;01mfor\u001b[39;00m var, stat \u001b[38;5;129;01min\u001b[39;00m product(variables, stats)\n\u001b[1;32m     48\u001b[0m     ]\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'kwargs' is not defined"
+     ]
+    }
+   ],
    "source": [
     "import logging\n",
     "from datetime import datetime as dt\n",
@@ -100,6 +113,7 @@
     "        time=config.time,\n",
     "        variables=config.variables,\n",
     "        stats=config.stats,\n",
+    "        contributors_path=result_basepath,\n",
     "        **config.moreOptions\n",
     "    )\n",
     "\n",
@@ -125,7 +139,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.11.7"
   }
  },
  "nbformat": 4,
diff --git a/examples/quality_controll.ipynb b/examples/quality_controll.ipynb
index b3ad13b..35e29fa 100644
--- a/examples/quality_controll.ipynb
+++ b/examples/quality_controll.ipynb
@@ -28,6 +28,12 @@
     "from toargridding.metadata import Metadata, TimeSample, AnalysisRequestResult, Coordinates\n",
     "from toargridding.variables import Coordinate\n",
     "\n",
+    "import logging\n",
+    "from toargridding.defaultLogging import toargridding_defaultLogging\n",
+    "#setup of logging\n",
+    "logger = toargridding_defaultLogging()\n",
+    "logger.addShellLogger(logging.DEBUG)\n",
+    "logger.logExceptions()\n",
     "\n",
     "endpoint = \"https://toar-data.fz-juelich.de/api/v2/analysis/statistics/\"\n",
     "#starts in directory [path/to/toargridding]/tests\n",
@@ -36,10 +42,10 @@
     "cache_dir = toargridding_base_path / \"cache\"\n",
     "data_download_dir = toargridding_base_path / \"results\"\n",
     "\n",
-    "analysis_service = AnalysisServiceDownload(endpoint, cache_dir, data_download_dir)\n",
+    "analysis_service = AnalysisServiceDownload(endpoint, cache_dir, data_download_dir, use_downloaded=False)\n",
     "my_grid = RegularGrid(1.9, 2.5)\n",
     "\n",
-    "time = TimeSample(dt(2016,1,1), dt(2016,12,31), \"monthly\")\n",
+    "time = TimeSample(dt(2016,1,1), dt(2016,2,28), \"daily\")\n",
     "metadata = Metadata.construct(\"mole_fraction_of_ozone_in_air\", time, \"mean\")\n"
    ]
   },
diff --git a/src/toargridding/contributors.py b/src/toargridding/contributors.py
index 11703f6..4bb85a9 100644
--- a/src/toargridding/contributors.py
+++ b/src/toargridding/contributors.py
@@ -1,23 +1,29 @@
 
 from pandas.core.groupby import DataFrameGroupBy
 from typing import Iterable
+from pathlib import Path
 
 #TODO: maybe create an abstract base class and separate the offline and service implementations
 class contributionsManager:
-    def __init__(self, requestID, endpoint="not yet defined"):
+    def __init__(self, requestID, contributors_path : Path = None, endpoint=""):
         self.requestID = requestID
         self.timeseriesIDs = set()
         self.endpoint = endpoint
-        self.runsAsService = False
+        self.runsAsService = True
+        if contributors_path is not None:
+            self.runsAsService = False
+            self.contributors_path = contributors_path
     def setup_contributors_endpoint_for_metadata(self):
         if self.runsAsService:
             return self.setup_contributors_service()
         else:
-            return self.setup_contributors_offline()
-    def setup_contributors_offline(self) -> str:
-        # write time series IDs to file
-        # save file
-        return f"This dataset has been created in standalone mode. To retrieve the contributors from for this dataset execute: curl {self.endpoint} -file {self.requestID}.contributors"
+            return self.setup_contributors_id_file()
+    def setup_contributors_id_file(self) -> str:
+        ext = "contributors"
+        with open(self.contributors_path / f"{self.requestID}.{ext}", "w") as f:
+            for id in self.timeseriesIDs:
+                f.write(f"{id}\n")
+        return f"curl {self.endpoint} -file {self.requestID}.{ext}"
     def setup_contributors_service(self) -> str:
         # TODO: missing implementation
         raise NotImplementedError("This has not been implemented as this package is not yet operated as a service.")
diff --git a/src/toargridding/gridding.py b/src/toargridding/gridding.py
index fcd5e33..94e8dfe 100644
--- a/src/toargridding/gridding.py
+++ b/src/toargridding/gridding.py
@@ -1,5 +1,6 @@
 from collections import namedtuple
 from itertools import product
+from pathlib import Path
 
 import xarray as xr
 
@@ -16,6 +17,7 @@ def get_gridded_toar_data(
     time: TimeSample,
     variables: list[str],
     stats: list[str],
+    contributors_path : Path = None,
     **kwargs,
 ) -> tuple[list[xr.Dataset], list[Metadata]]:
     """API to download data as xarrays
@@ -48,6 +50,7 @@ def get_gridded_toar_data(
     datasets = []
     for metadata in metadatas:  # standard_name ?
         data = analysis_service.get_data(metadata)
+        #TODO add processing of contributors
         ds = grid.as_xarray(data)
         datasets.append(ds)
 
diff --git a/src/toargridding/grids.py b/src/toargridding/grids.py
index 1fe0731..1e7a003 100644
--- a/src/toargridding/grids.py
+++ b/src/toargridding/grids.py
@@ -139,7 +139,7 @@ class RegularGrid(GridDefinition):
         data_grouped_by_cell = self.group_data_by_cell(data.stations_data, data.stations_coords)
         data.contributions.extract_contributors(data_grouped_by_cell)
         cell_statistics = self.get_cell_statistics(data_grouped_by_cell)
-        dataset = self.create_dataset(cell_statistics, data.metadata, data.contributions)
+        dataset = self.create_dataset(cell_statistics, data.metadata)
 
         return dataset
 
@@ -202,7 +202,7 @@ class RegularGrid(GridDefinition):
             step=metadata.time.sampling,
         )
 
-        gridded_ds = self.get_empty_grid(time, metadata, contributions)
+        gridded_ds = self.get_empty_grid(time, metadata)
         for variable, aggregated_data in cell_statistics.items():
             data_array_dict = self.get_data_array_dict(time, aggregated_data, variable, metadata)
             gridded_ds = gridded_ds.assign(data_array_dict)
@@ -286,7 +286,7 @@ class RegularGrid(GridDefinition):
             ids[ids < 0] += maxBin4Wrap
         return ids
 
-    def get_empty_grid(self, time: Variable, metadata: Metadata, contributions: contributionsManager) -> xr.Dataset:  # TODO make CF-compliant => docs
+    def get_empty_grid(self, time: Variable, metadata: Metadata) -> xr.Dataset:  # TODO make CF-compliant => docs
         """creation of an empty dataset without data
 
         Sets up a dataset with its three axis: time, longitude and latitude.
@@ -306,7 +306,7 @@ class RegularGrid(GridDefinition):
             Variables.longitude.name: self.lon.as_data_array(),
         }
 
-        ds = xr.Dataset(coords=coords, attrs=get_global_attributes(metadata, contributions))
+        ds = xr.Dataset(coords=coords, attrs=get_global_attributes(metadata))
 
         return ds
 
diff --git a/src/toargridding/metadata.py b/src/toargridding/metadata.py
index 552e17d..d6ad9af 100644
--- a/src/toargridding/metadata.py
+++ b/src/toargridding/metadata.py
@@ -113,6 +113,8 @@ class Metadata:
     time: TimeSample
     statistic: str
     moreOptions: dict = field(default_factory=dict)
+    requestID : str = None
+    contributors_metadata_field : str = None
 
     @staticmethod
     def construct(standard_name: str, time: TimeSample, stat: str, moreOptions: dict = {}):
@@ -198,14 +200,16 @@ class AnalysisRequestResult:
     stations_data: pd.DataFrame
     stations_coords: pd.DataFrame
     metadata: Metadata
-    contributions: contributionsManager = None
 
 
-def get_global_attributes(metadata: Metadata, contributions: contributionsManager) -> dict:
+def get_global_attributes(metadata: Metadata) -> dict:
     """combination of global metadata with request specific values.
     Also adds all additional options passed to the request as meta data.
     Throws an exception if moreOptions contains an key already in use by the metadata.
     """
+    if metadata.contributors_metadata_field is None:
+        raise ValueError("metadata.contributors_metadata_field must be set before calling get_global_attributes")
+
     dynamic_cf_attributes = {
         "id": metadata.get_id(),
         "title": metadata.get_title(),
@@ -222,7 +226,7 @@ def get_global_attributes(metadata: Metadata, contributions: contributionsManage
         # "time_coverage_duration": 0, # TODO insert durations
         # "time_coverage_resolution": 0,
         "product_version": f"version of toargridding {importlib.metadata.version( __package__ or __name__ )}",
-        "contributors": contributions.setup_contributors_endpoint_for_metadata(),
+        "contributors": metadata.contributors_metadata_field,
     }
     for key, value in metadata.moreOptions.items():
         if key not in dynamic_cf_attributes:
diff --git a/src/toargridding/toar_rest_client.py b/src/toargridding/toar_rest_client.py
index b199238..3561b8a 100644
--- a/src/toargridding/toar_rest_client.py
+++ b/src/toargridding/toar_rest_client.py
@@ -431,8 +431,7 @@ class AnalysisService:
         timeseries, timeseries_metadata = self.get_timeseries_and_metadata(metadata)
         coords = self.get_clean_coords(timeseries_metadata)
         timeseries = self.get_clean_timeseries(timeseries, metadata)
-        contributions = contributionsManager(metadata.get_id())
-        return AnalysisRequestResult(timeseries, coords, metadata, contributions=contributions)
+        return AnalysisRequestResult(timeseries, coords, metadata)
 
     def get_timeseries_and_metadata(self, metadata: Metadata) -> tuple[pd.DataFrame, pd.DataFrame]:
         """obtain data and metadata from TOAR database
@@ -443,11 +442,13 @@ class AnalysisService:
 
         query_options = QueryOptions.from_metadata(metadata)
         result = self.connection.get(query_options)
+        logger.debug(result.request)
+        logger.debug(result.request.url)
         timeseries, timeseries_metadata = self.load_data(result.content, metadata)
         return timeseries, timeseries_metadata
 
     def get_clean_coords(self, timeseries_metadata: pd.DataFrame):
-        """remove all stations with invalid coordinates
+        """remove all stations with invalid coordinates and drop unused metadata
         invalid coordinates are NaN, none etc.
         return:
             stations with valid coordinates
@@ -579,6 +580,7 @@ class AnalysisServiceDownload(AnalysisService):
         if needs_fresh_download:
             logger.info("Performing request to TOAR DB")
             response = self.connection.get(query_options)
+            logger.debug(response.request.url)
             with open(filename, "w+b") as downloaded_file:
                 downloaded_file.write(response.content)
         else:
-- 
GitLab