Skip to content
Snippets Groups Projects
Commit 8e25c672 authored by lukas leufen's avatar lukas leufen
Browse files

added some method descriptions, split load_data method

parent 93db459b
Branches
Tags
2 merge requests!6updated inception model and data prep class,!4data prep class
...@@ -13,19 +13,35 @@ from typing import Union, List, Iterable ...@@ -13,19 +13,35 @@ from typing import Union, List, Iterable
import datetime as dt import datetime as dt
# define more general date type for type hinting # define a more general date type for type hinting
date = Union[dt.date, dt.datetime] date = Union[dt.date, dt.datetime]
class DataPrep(object): class DataPrep(object):
""" """
This class prepares data to be used in neural networks. Especially the following steps can be performed This class prepares data to be used in neural networks. The instance searches for locally stored data that meets the
given requirements. If no local data is found, the DataPrep instance will load data from the TOAR database and store this
data locally to use the next time. For the moment, there is only support for daily aggregated time series. The
aggregation can be set manually and differ for each variable.
After data loading, different data pre-processing steps can be executed to prepare the data for further
applications. Especially the following methods can be used for the pre-processing step:
- interpolate: interpolate between data points by using xarray's interpolation method - interpolate: interpolate between data points by using xarray's interpolation method
- standardise: standardise data to mean=0 and std=1, or just centralise to mean=0 - standardise: standardise data to mean=0 and std=1, centralise to mean=0, additional methods like normalise on
- make window history: to present the history (time steps before) for training/ testing; X interval [0, 1] are not implemented yet.
- make labels: create target vector for training/ testing; y - make window history: represent the history (time steps before) for training/ testing; X
- remove Nans jointly from desired input and output, only keeps time steps where no NaNs are present in X AND y - make labels: create target vector with given leading time steps for training/ testing; y
- some other methods to ensure that the functions above are working properly - remove Nans jointly from desired input and output, only keeps time steps where no NaNs are present in X AND y. Use
this method after the creation of the window history and labels to clean up the data cube.
To create a DataPrep instance, you need to specify the station(s) by id (e.g. "DEBW107"), the network (e.g. UBA,
"Umweltbundesamt") and the variables to use. Further options can be set in the instance.
* `statistics_per_var`: define a specific statistic to extract from the TOAR database for each variable.
* `start`: define a start date for the data cube creation. Default: Use the first entry in time series
* `end`: set the end date for the data cube. Default: Use last date in time series.
* `store_data_locally`: store recently downloaded data on local disk. Default: True
* set further parameters for xarray's interpolation methods to modify the interpolation scheme
""" """
def __init__(self, path: str, network: str, station: Union[str, List[str]], variables: List[str], **kwargs): def __init__(self, path: str, network: str, station: Union[str, List[str]], variables: List[str], **kwargs):
...@@ -45,11 +61,16 @@ class DataPrep(object): ...@@ -45,11 +61,16 @@ class DataPrep(object):
if self.statistics_per_var is not None: if self.statistics_per_var is not None:
self.load_data() self.load_data()
else: else:
raise NotImplementedError raise NotImplementedError # hourly data usage is not implemented yet
# self.data, self.meta = Fkf.read_hourly_data_from_csv_to_xarray(self.path, self.network, self.station, # self.data, self.meta = Fkf.read_hourly_data_from_csv_to_xarray(self.path, self.network, self.station,
# self.variables, **kwargs) # self.variables, **kwargs)
def load_data(self): def load_data(self):
"""
Load data and meta data either from local disk (preferred) or download new data from the TOAR database if no local
data is available. In the latter case, the downloaded data is stored locally if requested (default: yes).
"""
self.check_path_and_create() self.check_path_and_create()
file_name = self._set_file_name() file_name = self._set_file_name()
meta_file = self._set_meta_file_name() meta_file = self._set_meta_file_name()
...@@ -59,17 +80,28 @@ class DataPrep(object): ...@@ -59,17 +80,28 @@ class DataPrep(object):
self.meta = pd.read_csv(meta_file, index_col=0) self.meta = pd.read_csv(meta_file, index_col=0)
except FileNotFoundError as e: except FileNotFoundError as e:
logging.warning(e) logging.warning(e)
data, self.meta = self.download_data_from_join(file_name, meta_file)
data = self._slice_prep(data)
self.data = self.check_for_negative_concentrations(data)
def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]:
"""
Download data from TOAR database using the JOIN interface.
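:param file_name: path of the netCDF file the downloaded data is written to (only if `store_data_locally` is set)
:param meta_file: path of the csv file the downloaded meta data is written to (only if `store_data_locally` is set)
:return: the downloaded data as xarray DataArray and the corresponding meta data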
"""
df_all = {} df_all = {}
df, self.meta = join.download_join(station_name=self.station, statvar=self.statistics_per_var) df, meta = join.download_join(station_name=self.station, statvar=self.statistics_per_var)
df_all[self.station[0]] = df df_all[self.station[0]] = df
# convert df_all to xarray # convert df_all to xarray
xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()} xarr = {k: xr.DataArray(v, dims=['datetime', 'variables']) for k, v in df_all.items()}
xarr = xr.Dataset(xarr).to_array(dim='Stations') xarr = xr.Dataset(xarr).to_array(dim='Stations')
data = self._slice_prep(xarr) if self.kwargs.get('store_data_locally', True):
self.data = self.check_for_negative_concentrations(data)
# save locally as nc/csv file # save locally as nc/csv file
xarr.to_netcdf(path=file_name) xarr.to_netcdf(path=file_name)
self.meta.to_csv(meta_file) meta.to_csv(meta_file)
return xarr, meta
def _set_file_name(self): def _set_file_name(self):
return os.path.join(self.path, f"{''.join(self.station)}_{'_'.join(sorted(self.variables))}.nc") return os.path.join(self.path, f"{''.join(self.station)}_{'_'.join(sorted(self.variables))}.nc")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment