From 8e25c6728060a099c7a6d09108fa1d1ac6d8dd77 Mon Sep 17 00:00:00 2001
From: lukas leufen <l.leufen@fz-juelich.de>
Date: Wed, 6 Nov 2019 08:05:18 +0100
Subject: [PATCH] added some method descriptions, split load_data method

---
 src/data_preparation.py | 64 ++++++++++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 16 deletions(-)

diff --git a/src/data_preparation.py b/src/data_preparation.py
index 65babf15..c23c9a9f 100644
--- a/src/data_preparation.py
+++ b/src/data_preparation.py
@@ -13,19 +13,35 @@ from typing import Union, List, Iterable
 import datetime as dt
 
 
-# define more general date type for type hinting
+# define a more general date type for type hinting
 date = Union[dt.date, dt.datetime]
 
 
 class DataPrep(object):
     """
-    This class prepares data to be used in neural networks. Especially the following steps can be performed
+    This class prepares data to be used in neural networks. The instance searches for locally stored data that meets
+    the given demands. If no local data is found, the DataPrep instance downloads data from the TOAR database and
+    stores it locally for subsequent use. At the moment, only daily aggregated time series are supported. The
+    aggregation can be set manually and may differ for each variable.
+
+    After data loading, different pre-processing steps can be executed to prepare the data for further applications.
+    Especially the following methods can be used for the pre-processing step:
     - interpolate: interpolate between data points by using xarray's interpolation method
-    - standardise: standardise data to mean=1 and std=1, or just centralise to mean=0
-    - make window history: to present the history (time steps before) for training/ testing; X
-    - make labels: create target vector for training/ testing; y
-    - remove Nans jointly from desired input and output, only keeps time steps where no NaNs are present in X AND y
-    - some other methods to ensure that the functions above are working properly
+    - standardise: standardise data to mean=0 and std=1, or only centralise to mean=0; additional methods like
+      normalisation on the interval [0, 1] are not implemented yet
+    - make window history: represent the history (preceding time steps) for training/testing; X
+    - make labels: create the target vector with the given lead time steps for training/testing; y
+    - remove NaNs jointly from the desired input and output, keeping only time steps where no NaNs are present in X
+      AND y; use this method after the creation of the window history and the labels to clean up the data cube
+
+    To create a DataPrep instance, it is necessary to specify the station by id (e.g. "DEBW107"), its network (e.g.
+    UBA, "Umweltbundesamt") and the variables to use. Further options can be set in the instance:
+    * `statistics_per_var`: define a specific statistic to extract from the TOAR database for each variable
+    * `start`: define a start date for the data cube creation. Default: use the first entry of the time series
+    * `end`: set the end date for the data cube. Default: use the last entry of the time series
+    * `store_data_locally`: store recently downloaded data on the local disk. Default: True
+    * set further parameters for xarray's interpolation method to modify the interpolation scheme
+
     """
 
     def __init__(self, path: str, network: str, station: Union[str, List[str]], variables: List[str], **kwargs):
@@ -45,11 +61,16 @@ class DataPrep(object):
         if self.statistics_per_var is not None:
             self.load_data()
         else:
-            raise NotImplementedError
+            raise NotImplementedError  # hourly data usage is not implemented yet
             # self.data, self.meta = Fkf.read_hourly_data_from_csv_to_xarray(self.path, self.network, self.station,
             #                                                                self.variables, **kwargs)
 
     def load_data(self):
+        """
+        Load data and meta data either from local disk (preferred) or download new data from the TOAR database if no
+        local data is available. In the latter case, the downloaded data is stored locally if wished (default: True).
+        """
+
         self.check_path_and_create()
         file_name = self._set_file_name()
         meta_file = self._set_meta_file_name()
@@ -59,17 +80,28 @@ class DataPrep(object):
             self.meta = pd.read_csv(meta_file, index_col=0)
         except FileNotFoundError as e:
             logging.warning(e)
-            df_all = {}
-            df, self.meta = join.download_join(station_name=self.station, statvar=self.statistics_per_var)
-            df_all[self.station[0]] = df
-            # convert df_all to xarray
-            xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()}
-            xarr = xr.Dataset(xarr).to_array(dim='Stations')
-            data = self._slice_prep(xarr)
+            data, self.meta = self.download_data_from_join(file_name, meta_file)
+            data = self._slice_prep(data)
             self.data = self.check_for_negative_concentrations(data)
+
+    def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]:
+        """
+        Download data from the TOAR database using the JOIN interface.
+
+        :param file_name: path of the local netCDF file the downloaded data is stored in
+        :param meta_file: path of the local csv file the meta data is stored in
+        :return: downloaded data and its meta data
+        """
+        df_all = {}
+        df, meta = join.download_join(station_name=self.station, statvar=self.statistics_per_var)
+        df_all[self.station[0]] = df
+        # convert df_all to xarray
+        xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()}
+        xarr = xr.Dataset(xarr).to_array(dim='Stations')
+
         if self.kwargs.get('store_data_locally', True):
             # save locally as nc/csv file
             xarr.to_netcdf(path=file_name)
-            self.meta.to_csv(meta_file)
+            meta.to_csv(meta_file)
+        return xarr, meta
 
     def _set_file_name(self):
         return os.path.join(self.path, f"{''.join(self.station)}_{'_'.join(sorted(self.variables))}.nc")
--
GitLab
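
Usage sketch for the class documented above. The constructor signature `DataPrep(path, network, station, variables, **kwargs)` and the keyword options `statistics_per_var`, `start`, `end` and `store_data_locally` are taken from the diff; the concrete station id, variable names, statistics codes and paths are illustrative assumptions, not values from this patch.

    from src.data_preparation import DataPrep  # module path as in this repository

    # Station id, network, variables and statistics below are assumed example values.
    # If no matching netCDF/csv files exist under `path`, DataPrep downloads the data
    # from the TOAR database via the JOIN interface and, with store_data_locally=True
    # (the default), keeps a local copy for the next run.
    data_prep = DataPrep(path="data/daily/",
                         network="UBA",
                         station=["DEBW107"],
                         variables=["o3", "temp"],
                         statistics_per_var={"o3": "dma8eu", "temp": "maximum"},
                         start="1997-01-01",  # optional: first date of the data cube
                         end="2017-12-31",    # optional: last date of the data cube
                         store_data_locally=True)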