diff --git a/README.md b/README.md
index ce5c78ddc9b62a217b572c83e331ae0d249cb19a..f3e60ec0533953eda72b1f18cce44aac6e855483 100644
--- a/README.md
+++ b/README.md
@@ -201,11 +201,25 @@
 After a request has finished, the status endpoint will not remain valid forever.
 There is no check whether a request is already running. Therefore, submitting a request multiple times leads to additional load on the system and slows down all requests.
 The TOAR database has only a limited number of workers for performing a statistical analysis. Therefore, it is advised to run one request after another, especially for large requests covering a large number of stations and/or a longer time.
 
+## A brief reminder on timeseries and stations
+The TOAR database uses timeseries, which are associated with a station.
+At an individual station, one or more physical sensors are mounted. These can measure different variables or, in some cases, the same variable with different techniques.
+A station can also be part of different networks that contribute data to the TOAR database.
+A more detailed description of the included data can be found in
+[Chapter Three: The TOAR data processing workflow](https://toar-data.fz-juelich.de/sphinx/TOAR_TG_Vol02_Data_Processing/build/latex/toardataprocessing--technicalguide.pdf).
+
+For gridding, this can lead to systematic errors. For example, the statistical weight of a station is increased if its data are contributed twice.
+
 ## Gridding
 
 The gridding uses a user-defined grid to combine all stations in a cell.
 Per cell, the mean, standard deviation and number of stations are reported in the resulting xarray dataset.
+### Station averaging
+
+The timeseries at each station can be averaged before the gridding is done. This gives every station the same statistical weight.
+Depending on the calculated statistical aggregates, this can introduce or remove systematic errors in the data analysis.
+
 ## Contributors
 
 The contributors include all projects, organizations and persons that are associated with any timeseries of a gridded dataset via the roles "contributor" and "originator". In offline mode, this information is conserved by saving the timeseries IDs in a dedicated file with one ID per line. In the metadata of a dataset, this file name is stated together with the contributors endpoint (at the moment: `https://toar-data.fz-juelich.de/api/v2/timeseries/request_contributors`) to retrieve the actual names. Therefore, the created contributor file needs to be submitted as a POST request.
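The POST submission of the contributor file described above can be done in a few lines of Python. This is a hedged sketch only: the endpoint URL is the one stated in the README, but the payload field name and the contributor file name are assumptions for illustration.

```python
# Hedged sketch: endpoint URL from the README above; the payload field name
# ("ids") and the contributor file name are assumptions, not part of this patch.
from pathlib import Path

import requests

contributors_file = Path("results/contributors_test.txt")  # hypothetical file, one timeseries ID per line
timeseries_ids = contributors_file.read_text()

response = requests.post(
    "https://toar-data.fz-juelich.de/api/v2/timeseries/request_contributors",
    data={"ids": timeseries_ids},
)
response.raise_for_status()
print(response.text)  # names of the contributing projects, organizations and persons
```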
diff --git a/examples/01_produce_data_one_week.ipynb b/examples/01_produce_data_one_week.ipynb
index 7cfc297293a04945c0ed2405eb744a8d6c238b38..db0c98679594361096750df567f9f822997a559a 100644
--- a/examples/01_produce_data_one_week.ipynb
+++ b/examples/01_produce_data_one_week.ipynb
@@ -74,7 +74,7 @@
    "source": [
     "\n",
     "variable = [\"mole_fraction_of_ozone_in_air\"]\n",
-    "time_sampling = TimeSample( start=dt(2000,1,1), end=dt(2000,1,8), sampling=\"daily\")\n",
+    "time_sampling = TimeSample( start=dt(2018,1,1), end=dt(2018,1,8), sampling=\"daily\")\n",
     "statistics = [ \"mean\" ]\n",
     "\n",
     "grid = RegularGrid( lat_resolution=1.9, lon_resolution=2.5 )"
@@ -166,7 +166,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "interactive",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },
@@ -180,7 +180,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.7"
+   "version": "3.11.9"
   }
  },
 "nbformat": 4,
diff --git a/examples/03_produce_data_station_metadata.ipynb b/examples/03_produce_data_station_metadata.ipynb
index cb2ce1af3dd9f0ddb5456a231411105bee4abd83..0b902dfe310a3e7bf9c92440a0ebe5fa36b511cf 100644
--- a/examples/03_produce_data_station_metadata.ipynb
+++ b/examples/03_produce_data_station_metadata.ipynb
@@ -22,7 +22,7 @@
 },
 {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -52,14 +52,15 @@
 },
 {
   "cell_type": "code",
-  "execution_count": 2,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from toargridding.defaultLogging import toargridding_defaultLogging\n",
    "\n",
    "logger = toargridding_defaultLogging()\n",
-   "logger.addShellLogger(logging.INFO)\n",
+   "#logger.addShellLogger(logging.INFO)\n",
+   "logger.addShellLogger(logging.DEBUG)\n",
    "logger.logExceptions()\n",
    "log_path = Path(\"log\")\n",
    "log_path.mkdir(exist_ok=True)\n",
@@ -81,7 +82,7 @@
 },
 {
   "cell_type": "code",
-  "execution_count": 3,
+  "execution_count": null,
  "metadata": {},
   "outputs": [],
   "source": [
@@ -104,7 +105,7 @@
 },
 {
   "cell_type": "code",
-  "execution_count": 4,
+  "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -135,7 +136,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "Config = namedtuple(\"Config\", [\"grid\", \"time\", \"variables\", \"stats\", \"station_metadata\"])\n",
+   "Config = namedtuple(\"Config\", [\"grid\", \"time\", \"variables\", \"stats\", \"station_metadata\", \"average_timeseries_at_station\"])\n",
    "\n",
    "#uncomment, if you want to change the metadata:\n",
    "station_metadata ={\n",
@@ -150,8 +151,10 @@
    "    grid,\n",
    "    TimeSample( start=dt(2012,1,1), end=dt(2012,12,31), sampling=\"daily\"),\n",
    "    [\"mole_fraction_of_ozone_in_air\"],\n",
-   "    [ \"mean\" ], # \"dma8epa_strict\"\n",
-   "    station_metadata\n",
+   "    #[ \"mean\" ], \n",
+   "    [ \"dma8epa_strict\" ],\n",
+   "    station_metadata,\n",
+   "    average_timeseries_at_station=True\n",
    ")\n",
    "configs[f\"test_ta\"] = request_config\n"
@@ -190,6 +193,7 @@
    "        variables=config.variables,\n",
    "        stats=config.stats,\n",
    "        contributors_path=result_basepath,\n",
+   "        average_TS_at_station=config.average_timeseries_at_station,\n",
    "        **config.station_metadata\n",
    "    )\n",
    "\n",
diff --git a/src/toargridding/__about__.py b/src/toargridding/__about__.py
index 1e83593182fb2ecf319029c04ec294ee15d3d168..56c3a35c1b2c7544772f3e5e718054d26c3ef9cd 100644
--- a/src/toargridding/__about__.py
+++ b/src/toargridding/__about__.py
@@ -1 +1 @@
-VERSION = "0.4.3"
+VERSION = "0.4.4"
diff --git a/src/toargridding/gridding.py b/src/toargridding/gridding.py
index b9824e376a1373c636d7b30636ea280bf417509e..f8ca688e9e079c8271ebbccd24e1f258676b8bff 100644
--- a/src/toargridding/gridding.py
+++ b/src/toargridding/gridding.py
@@ -21,6 +21,7 @@ def get_gridded_toar_data(
     stats: list[str],
     contributors_path : Path = None,
     contributors_manager : contributors_manager | None = None,
+    average_TS_at_station : bool = False,
     **kwargs,
 ) -> tuple[list[xr.Dataset], list[Metadata]]:
     """API to download data as xarrays
@@ -44,16 +45,31 @@
     contributors_path:
         path for writing the contributors file. We advise storing the contributor files in the same directory as the resulting data. Without a provided path, it is assumed that toargridding is operated as a service and that the contributors can be provided directly through the contributors endpoint. This is not yet implemented.
+    average_TS_at_station:
+        enable the averaging of all timeseries at a station, so that every station enters the gridding with the same statistical weight. Be careful: depending on your statistical aggregation, this can introduce or remove systematic errors.
+    kwargs:
+        - history: allows a replacement of the history field in the metadata of the resulting datasets.
+        - all remaining kwargs are passed as filters to the request. This allows a refinement of the request.
 
     return:
     -------
     Gridded datasets for each combination of variables and stats and appropriate metadata for each dataset.
     """
+    moreOptions = {}
+    history = None
+    for key, val in kwargs.items():
+        if key == "history":
+            history = val
+        else:
+            moreOptions[key] = val
+
     metadatas = [
-        Metadata.construct(standard_name=var, time=time, stat=stat, moreOptions=kwargs)
+        Metadata.construct(standard_name=var, time=time, stat=stat, average_timeseries_at_station=average_TS_at_station, moreOptions=moreOptions)
         for var, stat in product(variables, stats)
     ]
+    for metadata in metadatas:
+        metadata.history = history
 
     datasets = []
     for metadata in metadatas:
         # standard_name ?
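As a standalone illustration of the kwargs handling added in this hunk: `history` is peeled off for the dataset metadata, every remaining keyword is forwarded as a request filter, and one `Metadata` object is created per (variable, statistic) pair. The filter name `toar1_category` below is only an example value, not something this patch defines.

```python
# Runnable toy version of the kwargs split in get_gridded_toar_data.
from itertools import product

def split_history(**kwargs):
    # dict.pop is equivalent to the explicit loop used in the patch
    history = kwargs.pop("history", None)
    return history, kwargs

history, more_options = split_history(
    history="regridded for comparison with model XYZ",
    toar1_category="RuralLowElevation",  # illustrative filter name only
)
assert history == "regridded for comparison with model XYZ"
assert more_options == {"toar1_category": "RuralLowElevation"}

# one Metadata object (and later one dataset) per (variable, statistic) pair
for variable, stat in product(["mole_fraction_of_ozone_in_air"], ["mean", "dma8epa_strict"]):
    print(variable, stat)
```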
diff --git a/src/toargridding/grids.py b/src/toargridding/grids.py
index 3a4f5804f6e486107b05b925efe5230fc0e27e0d..fb93a255b2fc30861298c6077d5464e5fd3e2deb 100644
--- a/src/toargridding/grids.py
+++ b/src/toargridding/grids.py
@@ -155,7 +155,7 @@ class RegularGrid(GridDefinition):
         """
         cell_indices = self.as_cell_index(coords)
-
+        # will convert cell_indices to float as some NaNs are present
         data_with_indices = data.join(cell_indices.to_frame(GridDefinition.cell_index_name), how="inner")
diff --git a/src/toargridding/metadata.py b/src/toargridding/metadata.py
index fca82a493fc1b0afc6033993c139f61dbd2ac2ce..7fa21c7ab7936d9cf4d1634a13f4ff5eaaf9e8d5 100644
--- a/src/toargridding/metadata.py
+++ b/src/toargridding/metadata.py
@@ -115,9 +115,11 @@ class Metadata:
     statistic: str
     moreOptions: dict = field(default_factory=dict)
     contributors_metadata_field : str = None
+    average_timeseries_at_station : bool = False
+    history : str = None
 
     @staticmethod
-    def construct(standard_name: str, time: TimeSample, stat: str, moreOptions: dict = {}):
+    def construct(standard_name: str, time: TimeSample, stat: str, average_timeseries_at_station : bool = False, moreOptions: dict = {}):
         """constructor
 
         Parameters:
@@ -133,7 +135,7 @@
         """
 
         variable = TOARVariable.get(standard_name)
-        return Metadata(variable, time, stat, moreOptions)
+        return Metadata(variable, time, stat, average_timeseries_at_station=average_timeseries_at_station, moreOptions=moreOptions)
 
     @property
     def statistic(self) -> str:  # TODO make better
@@ -158,6 +160,10 @@
         """
         addition = "_".join(f"{key}-{val}" for key, val in sorted(self.moreOptions.items()))
         addition = addition.replace("/", "%2F")
+        if self.average_timeseries_at_station:
+            if addition != "":
+                addition += "_"
+            addition += "meanTSbyStation"
         return "_".join(
             str(i)
             for i in [
@@ -210,12 +216,15 @@ def get_global_attributes(metadata: Metadata) -> dict:
     if metadata.contributors_metadata_field is None:
         raise ValueError("metadata.contributors_metadata_field must be set before calling get_global_attributes")
 
+    history = f"{date_created}: File created by toargridding package using data from toar-analysis service."
+    if metadata.history is not None:
+        history += " " + metadata.history
 
     dynamic_cf_attributes = {
         "id": metadata.get_id(),
         "title": metadata.get_title(),
         "summary": metadata.get_summary(),
         "date_created": date_created,
-        "history": f"{date_created}: File created by toargridding package using data from toar-analysis service",
+        "history": history,
         # "geospatial_bounds": 0,  # for polygonal description
         "geospatial_lat_min": -90,  # TODO read from grid description
         "geospatial_lat_max": 90,
diff --git a/src/toargridding/toar_rest_client.py b/src/toargridding/toar_rest_client.py
index 853880a4a5aae2edb62d91b127d2bfb9c2565b87..4dc4967cf70a2f6858c31e1116d143c876a26262 100644
--- a/src/toargridding/toar_rest_client.py
+++ b/src/toargridding/toar_rest_client.py
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
 STATION_LAT = "station_coordinates_lat"
 STATION_LON = "station_coordinates_lng"
-COORDS = [STATION_LAT, STATION_LON]
+COORDS = [STATION_LAT, STATION_LON, "station_id"]
 
 class EmptyDataError(ValueError):
@@ -480,6 +480,8 @@
     def get_clean_timeseries_and_data(self, timeseries : pd.DataFrame, timeseries_metadata : pd.DataFrame, metadata : Metadata) -> tuple[pd.DataFrame, pd.DataFrame]:
         coords = self.get_clean_coords(timeseries_metadata)
         timeseries = self.get_clean_timeseries(timeseries, metadata)
+
+        timeseries, coords = self.average_timeseries_at_station(timeseries, coords, metadata)
         timeseries, coords = timeseries.align( coords, join="inner", axis=0 )
         return coords, timeseries
@@ -487,12 +489,13 @@
     def get_clean_coords(self, timeseries_metadata: pd.DataFrame):
         """remove all stations with invalid coordinates and drop unused metadata
+        also adds the station ID as an additional column; it is later used as the index when averaging all timeseries of a station
 
         invalid coordinates are NaN, None etc.
         return:
             stations with valid coordinates
         """
         coords = timeseries_metadata[COORDS]
-        coords.columns = [Coordinates.latitude.name, Coordinates.longitude.name]
+        coords.columns = [Coordinates.latitude.name, Coordinates.longitude.name, "station_id"]
         valid_coords = coords.notna().all(axis=1)
         return coords[valid_coords]
@@ -580,6 +583,37 @@
         with zip_file.open(f"{data_file}.csv") as f:
             s_stream = io.StringIO(f.read().decode("utf-8"))
         return pd.read_csv(s_stream, comment="#", index_col=0)
+
+    def average_timeseries_at_station(self, timeseries : pd.DataFrame, coords : pd.DataFrame, metadata : Metadata) -> tuple[pd.DataFrame, pd.DataFrame]:
+
+        if metadata.average_timeseries_at_station:
+            data_with_sid = timeseries.join(coords["station_id"], how="inner")
+            data_by_station = data_with_sid.groupby("station_id")
+            mean_data_by_station = data_by_station.mean()
+            n_data_by_station = data_by_station.count()
+
+            stations_with_gt2_ts = n_data_by_station[n_data_by_station.gt(2).any(axis=1)]
+            sid_with_gt2_ts = stations_with_gt2_ts.index.get_level_values(0).values
+            if len(sid_with_gt2_ts) > 0:
+                logger.warning(f"Stations with more than 2 contributing timeseries: {sid_with_gt2_ts}")
+                logger.warning("You can check the timeseries for each station by using:")
+                for sid in sid_with_gt2_ts:
+                    logger.warning(f"https://toar-data.fz-juelich.de/api/v2/search/?station_id={sid}&variable_id=5&fields=data_start_date,data_end_date,id")
+
+            # we also need to change the index for the station coordinates.
+ coords.set_index("station_id", inplace=True) + # now lets remove duplicates from the coordinates + coords = coords.loc[ coords.duplicated() == False ] + + return mean_data_by_station, coords + + else: + # ok, we do not need the station Id in the coordinates, so drop it:-) + coords.drop("station_id", axis=1,inplace=True) + return timeseries, coords + + + class AnalysisServiceDownload(AnalysisService):