From 8e25c6728060a099c7a6d09108fa1d1ac6d8dd77 Mon Sep 17 00:00:00 2001
From: lukas leufen <l.leufen@fz-juelich.de>
Date: Wed, 6 Nov 2019 08:05:18 +0100
Subject: [PATCH] added some method descriptions, split load_data method

---
 src/data_preparation.py | 64 ++++++++++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 16 deletions(-)

diff --git a/src/data_preparation.py b/src/data_preparation.py
index 65babf15..c23c9a9f 100644
--- a/src/data_preparation.py
+++ b/src/data_preparation.py
@@ -13,19 +13,35 @@ from typing import Union, List, Iterable
 import datetime as dt
 
 
-# define more general date type for type hinting
+# define a more general date type for type hinting
 date = Union[dt.date, dt.datetime]
 
 
 class DataPrep(object):
     """
-    This class prepares data to be used in neural networks. Especially the following steps can be performed
+    This class prepares data to be used in neural networks. The instance searches for locally stored data that meets
+    the given demands. If no local data is found, the DataPrep instance downloads data from the TOAR database and
+    stores it locally for subsequent use. At the moment, only daily aggregated time series are supported. The
+    aggregation can be set manually and may differ for each variable.
+
+    After data loading, different pre-processing steps can be executed to prepare the data for further applications.
+    Especially the following methods can be used for the pre-processing step:
     - interpolate: interpolate between data points by using xarray's interpolation method
-    - standardise: standardise data to mean=1 and std=1, or just centralise to mean=0
-    - make window history: to present the history (time steps before) for training/ testing; X
-    - make labels: create target vector for training/ testing; y
-    - remove Nans jointly from desired input and output, only keeps time steps where no NaNs are present in X AND y
-    - some other methods to ensure that the functions above are working properly
+    - standardise: standardise data to mean=0 and std=1, or only centralise to mean=0; additional methods like
+      normalisation on the interval [0, 1] are not implemented yet
+    - make window history: represent the history (preceding time steps) for training/testing; X
+    - make labels: create the target vector with the given lead time steps for training/testing; y
+    - remove NaNs jointly from the desired input and output, keeping only time steps where no NaNs are present in X
+      AND y; use this method after the creation of the window history and the labels to clean up the data cube
+
+    To create a DataPrep instance, it is necessary to specify the station by id (e.g. "DEBW107"), its network (e.g.
+    UBA, "Umweltbundesamt") and the variables to use. Further options can be set in the instance:
+    * `statistics_per_var`: define a specific statistic to extract from the TOAR database for each variable
+    * `start`: define a start date for the data cube creation. Default: use the first entry of the time series
+    * `end`: set the end date for the data cube. Default: use the last entry of the time series
+    * `store_data_locally`: store recently downloaded data on the local disk. Default: True
+    * set further parameters for xarray's interpolation method to modify the interpolation scheme
+
     """
 
     def __init__(self, path: str, network: str, station: Union[str, List[str]], variables: List[str], **kwargs):
@@ -45,11 +61,16 @@ class DataPrep(object):
         if self.statistics_per_var is not None:
             self.load_data()
         else:
-            raise NotImplementedError
+            raise NotImplementedError  # hourly data usage is not implemented yet
             # self.data, self.meta = Fkf.read_hourly_data_from_csv_to_xarray(self.path, self.network, self.station,
             #                                                                self.variables, **kwargs)
 
     def load_data(self):
+        """
+        Load data and meta data either from local disk (preferred) or download new data from the TOAR database if no
+        local data is available. In the latter case, the downloaded data is stored locally if wished (default: True).
+        """
+
         self.check_path_and_create()
         file_name = self._set_file_name()
         meta_file = self._set_meta_file_name()
@@ -59,17 +80,28 @@ class DataPrep(object):
             self.meta = pd.read_csv(meta_file, index_col=0)
         except FileNotFoundError as e:
             logging.warning(e)
-            df_all = {}
-            df, self.meta = join.download_join(station_name=self.station, statvar=self.statistics_per_var)
-            df_all[self.station[0]] = df
-            # convert df_all to xarray
-            xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()}
-            xarr = xr.Dataset(xarr).to_array(dim='Stations')
-            data = self._slice_prep(xarr)
+            data, self.meta = self.download_data_from_join(file_name, meta_file)
+            data = self._slice_prep(data)
             self.data = self.check_for_negative_concentrations(data)
+
+    def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]:
+        """
+        Download data from the TOAR database using the JOIN interface.
+
+        :param file_name: path of the local netCDF file the downloaded data is stored in
+        :param meta_file: path of the local csv file the meta data is stored in
+        :return: downloaded data and its meta data
+        """
+        df_all = {}
+        df, meta = join.download_join(station_name=self.station, statvar=self.statistics_per_var)
+        df_all[self.station[0]] = df
+        # convert df_all to xarray
+        xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()}
+        xarr = xr.Dataset(xarr).to_array(dim='Stations')
+
         if self.kwargs.get('store_data_locally', True):
             # save locally as nc/csv file
             xarr.to_netcdf(path=file_name)
-            self.meta.to_csv(meta_file)
+            meta.to_csv(meta_file)
+        return xarr, meta
 
     def _set_file_name(self):
         return os.path.join(self.path, f"{''.join(self.station)}_{'_'.join(sorted(self.variables))}.nc")
--
GitLab
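
Usage sketch for the class documented above. The constructor signature `DataPrep(path, network, station, variables, **kwargs)` and the keyword options `statistics_per_var`, `start`, `end` and `store_data_locally` are taken from the diff; the concrete station id, variable names, statistics codes and paths are illustrative assumptions, not values from this patch.

    from src.data_preparation import DataPrep  # module path as in this repository

    # Station id, network, variables and statistics below are assumed example values.
    # If no matching netCDF/csv files exist under `path`, DataPrep downloads the data
    # from the TOAR database via the JOIN interface and, with store_data_locally=True
    # (the default), keeps a local copy for the next run.
    data_prep = DataPrep(path="data/daily/",
                         network="UBA",
                         station=["DEBW107"],
                         variables=["o3", "temp"],
                         statistics_per_var={"o3": "dma8eu", "temp": "maximum"},
                         start="1997-01-01",  # optional: first date of the data cube
                         end="2017-12-31",    # optional: last date of the data cube
                         store_data_locally=True)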