diff --git a/src/__init__.py b/src/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..452d0ed8b95a6300a2a47b65be78a5ddf4e968d6 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -0,0 +1,5 @@
+"""
+Machine learning tools.
+
+This package is all about machine learning tools.
+"""
\ No newline at end of file
diff --git a/src/data_handling/__init__.py b/src/data_handling/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e03e44b2e17389ca64a5589e4185e8abb59c105d 100644
--- a/src/data_handling/__init__.py
+++ b/src/data_handling/__init__.py
@@ -0,0 +1,9 @@
+__author__ = 'Lukas Leufen, Felix Kleinert'
+__date__ = '2020-04-17'
+
+"""
+Data Handling.
+
+The module data_handling contains all methods and classes that are related to data preprocessing,
+postprocessing, loading, and distribution for training.
+"""
diff --git a/src/data_handling/bootstraps.py b/src/data_handling/bootstraps.py
index 46fa7c2be39d3dadb1922a1b710065aa42d9e2d2..bf125d8705b08ff6bdae90e5930a934ed3fb7efd 100644
--- a/src/data_handling/bootstraps.py
+++ b/src/data_handling/bootstraps.py
@@ -1,29 +1,53 @@
+"""
+Collections of bootstrap methods and classes.
+
+How to use
+----------
+
+Create a BootStraps instance to check and create all required shuffled data and to access bootstrap generators,
+labels, and predictions for given stations and variables.
+"""
+
 __author__ = 'Felix Kleinert, Lukas Leufen'
 __date__ = '2020-02-07'
 
-from src.data_handling.data_generator import DataGenerator
-import numpy as np
 import logging
-import keras
-import dask.array as da
-import xarray as xr
 import os
 import re
-from src import helpers
 from typing import List, Union, Pattern, Tuple
 
+import dask.array as da
+import keras
+import numpy as np
+import xarray as xr
+
+from src import helpers
+from src.data_handling.data_generator import DataGenerator
+
 
 class BootStrapGenerator(keras.utils.Sequence):
     """
+    Generator that returns bootstrapped history objects for a given boot index during iteration.
+
     generator for bootstraps as keras sequence inheritance. Initialise with number of boots, the original history, the
     shuffled data, all used variables and the current shuffled variable. While iterating over this generator, it
     returns the bootstrapped history for given boot index (this is the iterator index) in the same format like the
     original history ready to use. Note, that in some cases some samples can contain nan values (in these cases the
     entire data row is null, not only single entries).
     """
+
     def __init__(self, number_of_boots: int, history: xr.DataArray, shuffled: xr.DataArray, variables: List[str],
                  shuffled_variable: str):
+        """
+        Set up the generator.
+
+        :param number_of_boots: number of bootstrap realisations
+        :param history: original history (the ground truth)
+        :param shuffled: the shuffled history
+        :param variables: list with all variables of interest
+        :param shuffled_variable: name of the variable that shall be bootstrapped
+        """
         self.number_of_boots = number_of_boots
         self.variables = variables
         self.history_orig = history
@@ -31,11 +55,17 @@ class BootStrapGenerator(keras.utils.Sequence):
         self.shuffled = shuffled.sel(variables=shuffled_variable)
 
     def __len__(self) -> int:
+        """
+        Return number of bootstraps.
+
+        :return: number of bootstraps
+        """
         return self.number_of_boots
 
     def __getitem__(self, index: int) -> xr.DataArray:
         """
-        return bootstrapped history for given bootstrap index in same index structure like the original history object
+        Return bootstrapped history for given bootstrap index in the same index structure as the original history object.
+
         :param index: boot index e [0, nboots-1]
         :return: bootstrapped history ready to use
         """
@@ -46,7 +76,8 @@ class BootStrapGenerator(keras.utils.Sequence):
 
     def __get_shuffled(self, index: int) -> xr.DataArray:
         """
-        returns shuffled data for given boot index from shuffled attribute
+        Return shuffled data for given boot index from shuffled attribute.
+
         :param index: boot index e [0, nboots-1]
         :return: shuffled data
         """
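For illustration, a minimal sketch of how the BootStrapGenerator added above could be consumed; the toy arrays'
dimension layout, shapes, and coordinates are assumptions based on the docstrings, not taken from this patch:

.. code-block:: python

    import numpy as np
    import xarray as xr

    from src.data_handling.bootstraps import BootStrapGenerator

    # toy history and shuffled data; dimension names and layout are assumed
    coords = {"variables": ["o3", "temp"]}
    history = xr.DataArray(np.random.randn(100, 8, 2),
                           dims=("datetime", "window", "variables"), coords=coords)
    shuffled = xr.DataArray(np.random.randn(100, 8, 2, 20),
                            dims=("datetime", "window", "variables", "boots"),
                            coords={**coords, "boots": np.arange(20)})

    boot_gen = BootStrapGenerator(number_of_boots=20, history=history, shuffled=shuffled,
                                  variables=["o3", "temp"], shuffled_variable="o3")
    for index in range(len(boot_gen)):
        boot_hist = boot_gen[index]  # history with "o3" replaced by the boot realisation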
@@ -56,10 +87,20 @@ class BootStrapGenerator(keras.utils.Sequence):
 
 
 class CreateShuffledData:
     """
-    Verify and create shuffled data for all data contained in given data generator class. Starts automatically on
-    initialisation, no further calls are required. Check and new creations are all performed inside bootstrap_path.
+    Verify and create shuffled data for all data contained in given data generator class.
+
+    Starts automatically on initialisation, no further calls are required. Check and new creations are all performed
+    inside bootstrap_path.
     """
+
     def __init__(self, data: DataGenerator, number_of_bootstraps: int, bootstrap_path: str):
+        """
+        Shuffled data is automatically created on initialisation.
+
+        :param data: data to shuffle
+        :param number_of_bootstraps: number of bootstrap realisations to create
+        :param bootstrap_path: path to find and store the bootstraps
+        """
         self.data = data
         self.number_of_bootstraps = number_of_bootstraps
         self.bootstrap_path = bootstrap_path
@@ -67,9 +108,11 @@ class CreateShuffledData:
 
     def create_shuffled_data(self) -> None:
         """
-        Create shuffled data. Use original test data, add dimension 'boots' with length number of bootstraps and insert
-        randomly selected variables. If there is a suitable local file for requested window size and number of
-        bootstraps, no additional file will be created inside this function.
+        Create shuffled data.
+
+        Use original test data, add dimension 'boots' with length number of bootstraps and insert randomly selected
+        variables. If there is a suitable local file for requested window size and number of bootstraps, no additional
+        file will be created inside this function.
         """
         logging.info("create / check shuffled bootstrap data")
         variables_str = '_'.join(sorted(self.data.variables))
@@ -92,8 +135,11 @@ class CreateShuffledData:
 
     def _set_file_path(self, station: str, variables: str, window: int, nboots: int) -> str:
         """
+        Set file name.
+
         Set file name following naming convention <station>_<var1>_<var2>_..._hist<window>_nboots<nboots>_shuffled.nc
-        and creates joined path using bootstrap_path attribute set on initialisation.
+        and create joined path using bootstrap_path attribute set on initialisation.
+
         :param station: station name
         :param variables: variables already preprocessed as single string with all variables seperated by underscore
         :param window: window length
@@ -105,13 +151,15 @@ class CreateShuffledData:
 
     def valid_bootstrap_file(self, station: str, variables: str, window: int) -> [bool, Union[None, int]]:
         """
-        Compare local bootstrap file with given settings for station, variables, window and number of bootstraps. If a
-        match was found, this method returns a tuple (True, None). In any other case, it returns (False, max_nboot),
-        where max_nboot is the highest boot number found in the local storage. A match is defined so that the window
-        length is ge than given window size form args and the number of boots is also ge than the given number of boots
-        from this class. Furthermore, this functions deletes local files, if the match the station pattern but don't fit
-        the window and bootstrap condition. This is performed, because it is assumed, that the corresponding file will
-        be created with a longer or at the least same window size and numbers of bootstraps.
+        Compare local bootstrap file with given settings for station, variables, window and number of bootstraps.
+
+        If a match was found, this method returns a tuple (True, None). In any other case, it returns (False,
+        max_nboot), where max_nboot is the highest boot number found in the local storage. A match is given if the
+        window length and the number of boots of a file are greater than or equal to the requested values.
+        Furthermore, this function deletes local files if they match the station pattern but do not fit the window and
+        bootstrap condition, because it is assumed that the corresponding file will be created with a longer or at
+        least the same window size and number of bootstraps.
+
         :param station: name of the station to validate
         :param variables: all variables already merged in single string seperated by underscore
         :param window: required window size
@@ -136,21 +184,25 @@ class CreateShuffledData:
     @staticmethod
     def shuffle(data: da.array, chunks: Tuple) -> da.core.Array:
         """
-        Shuffle randomly from given data (draw elements with replacement)
+        Shuffle randomly from given data (draw elements with replacement).
+
         :param data: data to shuffle
         :param chunks: chunk size for dask
         :return: shuffled data as dask core array (not computed yet)
         """
         size = data.shape
-        return da.random.choice(data.reshape(-1,), size=size, chunks=chunks)
+        return da.random.choice(data.reshape(-1, ), size=size, chunks=chunks)
 
 
 class BootStraps:
     """
-    Main class to perform bootstrap operations. This class requires a DataGenerator object and a path, where to find and
-    store all data related to the bootstrap operation. In initialisation, this class will automatically call the class
-    CreateShuffleData to set up the shuffled data sets. How to use BootStraps:
-    * call .get_generator(<station>, <variable>) to get a generator for given station and variable combination that
+    Main class to perform bootstrap operations.
+
+    This class requires a DataGenerator object and a path where to find and store all data related to the bootstrap
+    operation. On initialisation, this class will automatically call the class CreateShuffledData to set up the
+    shuffled data sets. How to use BootStraps:
+
+    * call .get_generator(<station>, <variable>) to get a generator for given station and variable combination that \
       iterates over all bootstrap realisations (as keras sequence)
     * call .get_labels(<station>) to get the measured observations in the same format as bootstrap predictions
    * call .get_bootstrap_predictions(<station>, <variable>) to get the bootstrapped predictions
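The shuffle method in the hunk above draws elements with replacement via dask and stays lazy until computed. A
self-contained sketch of the same pattern on toy data (shapes and chunk sizes are placeholders):

.. code-block:: python

    import dask.array as da
    import numpy as np

    data = np.arange(12.0).reshape(3, 4)
    size = data.shape
    # draw with replacement from the flattened data; lazy until .compute()
    shuffled = da.random.choice(data.reshape(-1, ), size=size, chunks=(3, 4))
    print(shuffled.compute())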
@@ -158,6 +210,13 @@ class BootStraps:
     """
 
     def __init__(self, data: DataGenerator, bootstrap_path: str, number_of_bootstraps: int = 10):
+        """
+        Automatically check and create (if needed) shuffled data on initialisation.
+
+        :param data: a data generator object to get data / history
+        :param bootstrap_path: path to find and store the bootstrap data
+        :param number_of_bootstraps: the number of bootstrap realisations
+        """
         self.data = data
         self.number_of_bootstraps = number_of_bootstraps
         self.bootstrap_path = bootstrap_path
@@ -165,20 +224,38 @@ class BootStraps:
 
     @property
    def stations(self) -> List[str]:
+        """
+        Station property inherits directly from data generator object.
+
+        :return: list with all stations
+        """
         return self.data.stations
 
     @property
     def variables(self) -> List[str]:
+        """
+        Variables property inherits directly from data generator object.
+
+        :return: list with all variables
+        """
         return self.data.variables
 
     @property
     def window_history_size(self) -> int:
+        """
+        Window history size property inherits directly from data generator object.
+
+        :return: the window history size
+        """
         return self.data.window_history_size
 
     def get_generator(self, station: str, variable: str) -> BootStrapGenerator:
         """
-        Returns the actual generator to use for the bootstrap evaluation. The generator requires information on station
-        and bootstrapped variable. There is only a loop on the bootstrap realisation and not on stations or variables.
+        Return the actual generator to use for the bootstrap evaluation.
+
+        The generator requires information on station and bootstrapped variable. There is only a loop on the bootstrap
+        realisation and not on stations or variables.
+
         :param station: name of the station
         :param variable: name of the variable to bootstrap
         :return: BootStrapGenerator class ready to use.
@@ -189,7 +266,8 @@ class BootStraps:
 
     def get_labels(self, station: str) -> np.ndarray:
         """
-        Repeats labels for given key by the number of boots and returns as single array.
+        Repeat labels for given key by the number of boots and return them as a single array.
+
         :param station: name of station
         :return: repeated labels as single array
         """
@@ -198,7 +276,8 @@ class BootStraps:
 
     def get_orig_prediction(self, path: str, file_name: str, prediction_name: str = "CNN") -> np.ndarray:
         """
-        Repeats predictions from given file(_name) in path by the number of boots.
+        Repeat predictions from given file(_name) in path by the number of boots.
+
         :param path: path to file
         :param file_name: file name
         :param prediction_name: name of the prediction to select from loaded file (default CNN)
@@ -211,9 +290,11 @@ class BootStraps:
 
     def _load_shuffled_data(self, station: str, variables: List[str]) -> xr.DataArray:
         """
-        Load shuffled data from bootstrap path. Data is stored as
-        '<station>_<var1>_<var2>_..._hist<histsize>_nboots<nboots>_shuffled.nc', e.g.
+        Load shuffled data from bootstrap path.
+
+        Data is stored as '<station>_<var1>_<var2>_..._hist<histsize>_nboots<nboots>_shuffled.nc', e.g.
         'DEBW107_cloudcover_no_no2_temp_u_v_hist13_nboots20_shuffled.nc'
+
         :param station: name of station
         :param variables: list of variables
         :return: shuffled data as xarray
@@ -224,7 +305,8 @@ class BootStraps:
 
     def _get_shuffled_data_file(self, station: str, variables: List[str]) -> str:
         """
-        Looks for data file using regular expressions and returns found file or raise FileNotFoundError
+        Look for data file using regular expressions and return the found file or raise a FileNotFoundError.
+
         :param station: name of station
         :param variables: name of variables
         :return: found file with complete path
@@ -240,8 +322,11 @@ class BootStraps:
     @staticmethod
     def _create_file_regex(station: str, variables: List[str]) -> Pattern:
         """
-        Creates regex for given station and variables to look for shuffled data with pattern:
+        Create regex for given station and variables.
+
+        With this regex, it is possible to look for shuffled data with pattern:
         `<station>(_<var>)*_hist(<hist>)_nboots(<nboots>)_shuffled.nc`
+
         :param station: station name to use as prefix
         :param variables: variables to add after station
         :return: compiled regular expression
@@ -253,10 +338,13 @@ class BootStraps:
     @staticmethod
     def _filter_files(regex: Pattern, files: List[str], window: int, nboot: int) -> Union[str, None]:
         """
-        Filter list of files by regex. Regex has to be structured to match the following string structure
+        Filter list of files by regex.
+
+        Regex has to be structured to match the following string structure
         `<station>(_<var>)*_hist(<hist>)_nboots(<nboots>)_shuffled.nc`. Hist and nboots values have to be included as
         group. All matches are compared to given window and nboot parameters. A valid file must have the same value
         (or larger) than these parameters and contain all variables.
+
         :param regex: compiled regular expression pattern following the style from method description
         :param files: list of file names to filter
         :param window: minimum length of window to look for
@@ -267,7 +355,7 @@ class BootStraps:
             match = regex.match(f)
             if match:
                 last = match.lastindex
-                if (int(match.group(last-1)) >= window) and (int(match.group(last)) >= nboot):
+                if (int(match.group(last - 1)) >= window) and (int(match.group(last)) >= nboot):
                     return f
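A hedged usage sketch of the workflow listed in the BootStraps docstring; the DataGenerator arguments, the network
name, and both paths are placeholders, not taken from this patch:

.. code-block:: python

    from src.data_handling.bootstraps import BootStraps
    from src.data_handling.data_generator import DataGenerator

    # placeholder data generator; arguments follow the DataGenerator signature below
    data = DataGenerator("/path/to/data", "AIRBASE", ["DEBW107"], ["o3", "temp"],
                         "datetime", "variables", "o3")
    boot_straps = BootStraps(data, "/path/to/bootstraps", number_of_bootstraps=20)

    for station in boot_straps.stations:
        labels = boot_straps.get_labels(station)  # observations repeated per boot
        for variable in boot_straps.variables:
            generator = boot_straps.get_generator(station, variable)
            for index in range(len(generator)):
                boot_hist = generator[index]  # feed into the model's predict method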
diff --git a/src/data_handling/data_distributor.py b/src/data_handling/data_distributor.py
index e8c6044280799ded080ab4bff3627aeb9ffde2db..2600afcbd8948c26a2b4cf37329b424cac69f40a 100644
--- a/src/data_handling/data_distributor.py
+++ b/src/data_handling/data_distributor.py
@@ -1,3 +1,24 @@
+"""
+Data Distribution Module.
+
+How to use
+----------
+
+Create a distributor object from a generator object and pass it to the fit generator method. Provide the number of
+steps per epoch with the distributor's length method.
+
+.. code-block:: python
+
+    model = YourKerasModel()
+    data_generator = DataGenerator(*args, **kwargs)
+    data_distributor = Distributor(data_generator, model, **kwargs)
+    history = model.fit_generator(generator=data_distributor.distribute_on_batches(),
+                                  steps_per_epoch=len(data_distributor),
+                                  epochs=10,)
+
+Additionally, a validation data set can be passed using the length and distribute methods.
+"""
+
 from __future__ import generator_stop
 
 __author__ = "Lukas Leufen, Felix Kleinert"
@@ -12,9 +33,20 @@ from src.data_handling.data_generator import DataGenerator
 
 
 class Distributor(keras.utils.Sequence):
+    """Distribute data generator elements according to mini batch size."""
 
     def __init__(self, generator: DataGenerator, model: keras.models, batch_size: int = 256,
                  permute_data: bool = False, upsampling: bool = False):
+        """
+        Set up distributor.
+
+        :param generator: the generator object must be iterable and return inputs and targets on each iteration
+        :param model: a keras model with one or more output branches
+        :param batch_size: batch size to use
+        :param permute_data: if enabled, data is randomly permuted on each train step
+        :param upsampling: if enabled, upsample data with the extremes data from the generator object and shuffle the
+            data; otherwise use only the standard input data.
+        """
         self.generator = generator
         self.model = model
         self.batch_size = batch_size
@@ -38,7 +70,11 @@ class Distributor(keras.utils.Sequence):
 
     def _permute_data(self, x, y):
         """
-        Permute inputs x and labels y
+        Permute inputs x and labels y if permutation is enabled in instance.
+
+        :param x: inputs
+        :param y: labels
+        :return: permuted or original data
         """
         if self.do_data_permutation:
             p = np.random.permutation(len(x))  # equiv to .shape[0]
@@ -47,6 +83,17 @@ class Distributor(keras.utils.Sequence):
         return x, y
 
     def distribute_on_batches(self, fit_call=True):
+        """
+        Create generator object to distribute mini batches.
+
+        Split data from given generator object (usually for single station) according to the given batch size. Also
+        perform upsampling if enabled and random shuffling (either if data permutation is enabled or if upsampling is
+        enabled). Lastly, multiply targets if the provided model has multiple output branches.
+
+        :param fit_call: switch to exit the while loop after the first iteration. This is used to determine the length
+            of all distributed mini batches. By default, fit_call is True to obtain an infinite loop for training.
+        :return: yields next mini batch
+        """
         while True:
             for k, v in enumerate(self.generator):
                 # get rank of output
@@ -65,15 +112,20 @@ class Distributor(keras.utils.Sequence):
                     num_mini_batches = self._get_number_of_mini_batches(x_total)
                     # permute order for mini-batches
                     x_total, y_total = self._permute_data(x_total, y_total)
-                    for prev, curr in enumerate(range(1, num_mini_batches+1)):
-                        x = x_total[prev*self.batch_size:curr*self.batch_size, ...]
-                        y = [y_total[prev*self.batch_size:curr*self.batch_size, ...] for _ in range(mod_rank)]
+                    for prev, curr in enumerate(range(1, num_mini_batches + 1)):
+                        x = x_total[prev * self.batch_size:curr * self.batch_size, ...]
+                        y = [y_total[prev * self.batch_size:curr * self.batch_size, ...] for _ in range(mod_rank)]
                         if x is not None:  # pragma: no branch
-                            yield (x, y)
+                            yield x, y
                             if (k + 1) == len(self.generator) and curr == num_mini_batches and not fit_call:
                                 return
 
-    def __len__(self):
+    def __len__(self) -> int:
+        """
+        Return the total number of distributed mini batches.
+
+        :return: the length of the distribute on batches object
+        """
        num_batch = 0
        for _ in self.distribute_on_batches(fit_call=False):
            num_batch += 1
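The module docstring notes that a validation data set can be passed using the length and distribute methods. A
sketch of that setup, assuming model, train_generator, and val_generator already exist (they are not part of this
patch):

.. code-block:: python

    from src.data_handling.data_distributor import Distributor

    # train_generator, val_generator, and model are assumed to be set up already
    train_distributor = Distributor(train_generator, model, batch_size=256, upsampling=True)
    val_distributor = Distributor(val_generator, model, batch_size=256)

    history = model.fit_generator(generator=train_distributor.distribute_on_batches(),
                                  steps_per_epoch=len(train_distributor),
                                  epochs=10,
                                  validation_data=val_distributor.distribute_on_batches(),
                                  validation_steps=len(val_distributor))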
diff --git a/src/data_handling/data_generator.py b/src/data_handling/data_generator.py
index 8d10b3e438e185b9fd158259a6ba49a5612737be..d45713c94a5346e1eff7db7b3c6ab663bfd7ebe3 100644
--- a/src/data_handling/data_generator.py
+++ b/src/data_handling/data_generator.py
@@ -1,14 +1,16 @@
+"""Data Generator class to handle large arrays for machine learning."""
+
 __author__ = 'Felix Kleinert, Lukas Leufen'
 __date__ = '2019-11-07'
 
+import logging
 import os
+import pickle
 from typing import Union, List, Tuple, Any, Dict
 
 import dask.array as da
 import keras
 import xarray as xr
-import pickle
-import logging
 
 from src import helpers
 from src.data_handling.data_preparation import DataPrep
@@ -20,17 +22,64 @@ num_or_list = Union[number, List[number]]
 
 class DataGenerator(keras.utils.Sequence):
     """
-    This class is a generator to handle large arrays for machine learning. This class can be used with keras'
-    fit_generator and predict_generator. Individual stations are the iterables. This class uses class Dataprep and
-    returns X, y when an item is called.
-    Item can be called manually by position (integer) or station id (string). Methods also accept lists with exactly
-    one entry of integer or string
+    This class is a generator to handle large arrays for machine learning.
+
+    .. code-block:: python
+
+        data_generator = DataGenerator(*args, **kwargs)
+
+    Data generator item can be called manually by position (integer) or station id (string). Methods also accept lists
+    with exactly one entry of integer or string.
+
+    .. code-block:: python
+
+        # select generator elements by position index
+        first_element = data_generator.get_data_generator([0])  # 1st element
+        n_element = data_generator.get_data_generator([4])  # 5th element
+
+        # select by name
+        station_xy = data_generator.get_data_generator(["station_xy"])  # will raise KeyError if not available
+
+    If used as iterator or directly called by get item method, the data generator class returns transposed labels and
+    history object from underlying data preparation class DataPrep.
+
+    .. code-block:: python
+
+        # select history and label by position
+        hist, labels = data_generator[0]
+        # by name
+        hist, labels = data_generator["station_xy"]
+        # as iterator
+        for (hist, labels) in data_generator:
+            pass
+
+    This class can also be used with keras' fit_generator and predict_generator. Individual stations are the iterables.
     """
 
     def __init__(self, data_path: str, network: str, stations: Union[str, List[str]], variables: List[str],
                  interpolate_dim: str, target_dim: str, target_var: str, station_type: str = None,
                  interpolate_method: str = "linear", limit_nan_fill: int = 1, window_history_size: int = 7,
                  window_lead_time: int = 4, transformation: Dict = None, extreme_values: num_or_list = None,
                  **kwargs):
+        """
+        Set up data generator.
+
+        :param data_path: path to data
+        :param network: the observational network the data should come from
+        :param stations: list with all stations to include
+        :param variables: list with all used variables
+        :param interpolate_dim: dimension along which interpolation is applied
+        :param target_dim: dimension of target variable
+        :param target_var: name of target variable
+        :param station_type: TOAR station type classification (background, traffic)
+        :param interpolate_method: method of interpolation
+        :param limit_nan_fill: maximum gap in data to fill by interpolation
+        :param window_history_size: length of the history window
+        :param window_lead_time: length of the label window
+        :param transformation: transformation method to apply on data
+        :param extreme_values: set up the extreme value upsampling
+        :param kwargs: additional kwargs that are used in either DataPrep (transformation, start / stop period, ...)
+            or extreme values
+        """
         self.data_path = os.path.abspath(data_path)
         self.data_path_tmp = os.path.join(os.path.abspath(data_path), "tmp")
         if not os.path.exists(self.data_path_tmp):
@@ -51,34 +100,30 @@ class DataGenerator(keras.utils.Sequence):
         self.transformation = self.setup_transformation(transformation)
 
     def __repr__(self):
-        """
-        display all class attributes
-        """
+        """Display all class attributes."""
         return f"DataGenerator(path='{self.data_path}', network='{self.network}', stations={self.stations}, " \
                f"variables={self.variables}, station_type={self.station_type}, " \
                f"interpolate_dim='{self.interpolate_dim}', target_dim='{self.target_dim}', " \
                f"target_var='{self.target_var}', **{self.kwargs})"
 
     def __len__(self):
-        """
-        display the number of stations
-        """
+        """Return the number of stations."""
         return len(self.stations)
 
     def __iter__(self) -> "DataGenerator":
         """
-        Define the __iter__ part of the iterator protocol to iterate through this generator. Sets the private attribute
-        `_iterator` to 0.
-        :return:
+        Define the __iter__ part of the iterator protocol to iterate through this generator.
+
+        Sets the private attribute `_iterator` to 0.
         """
         self._iterator = 0
         return self
 
     def __next__(self) -> Tuple[xr.DataArray, xr.DataArray]:
         """
-        This is the implementation of the __next__ method of the iterator protocol. Get the data generator, and return
-        the history and label data of this generator.
-        :return:
+        Get the data generator, and return the history and label data of this generator.
+
+        This is the implementation of the __next__ method of the iterator protocol.
         """
         if self._iterator < self.__len__():
             data = self.get_data_generator()
@@ -92,7 +137,10 @@ class DataGenerator(keras.utils.Sequence):
 
     def __getitem__(self, item: Union[str, int]) -> Tuple[xr.DataArray, xr.DataArray]:
         """
-        Defines the get item method for this generator. Retrieve data from generator and return history and labels.
+        Define the get item method for this generator.
+
+        Retrieve data from generator and return history and labels.
+
         :param item: station key to choose the data generator.
         :return: The generator's time series of history data and its labels
         """
@@ -149,7 +197,7 @@ class DataGenerator(keras.utils.Sequence):
         return mean, std
 
     def calculate_estimated_transformation(self, method):
-        data = [[]]*len(self.variables)
+        data = [[]] * len(self.variables)
         coords = {"variables": self.variables, "Stations": range(0)}
         mean = xr.DataArray(data, coords=coords, dims=["variables", "Stations"])
         std = xr.DataArray(data, coords=coords, dims=["variables", "Stations"])
@@ -168,8 +216,18 @@ class DataGenerator(keras.utils.Sequence):
     def get_data_generator(self, key: Union[str, int] = None, load_local_tmp_storage: bool = True,
                            save_local_tmp_storage: bool = True) -> DataPrep:
         """
-        Select data for given key, create a DataPrep object and interpolate, transform, make history and labels and
-        remove nans.
+        Create DataPrep object and preprocess data for given key.
+
+        Select data for given key, create a DataPrep object and
+        * apply transformation (optional)
+        * interpolate
+        * make history, labels, and observation
+        * remove nans
+        * upsample extremes (optional).
+        Processed data can be stored locally in a .pickle file. If load local tmp storage is enabled, the get data
+        generator tries first to load data from local pickle file and only creates a new DataPrep object if it couldn't
+        load this data from disk.
+
         :param key: station key to choose the data generator.
         :param load_local_tmp_storage: say if data should be processed from scratch or loaded as already processed data
             from tmp pickle file to save computational time (but of course more disk space required).
@@ -201,7 +259,8 @@ class DataGenerator(keras.utils.Sequence):
 
     def _save_pickle_data(self, data: Any):
         """
-        Save given data locally as .pickle in self.data_path_tmp with name '<station>_<var1>_<var2>_..._<varX>.pickle'
+        Save given data locally as .pickle in self.data_path_tmp with name '<station>_<var1>_<var2>_..._<varX>.pickle'.
+
         :param data: any data, that should be saved
         """
         date = f"{self.kwargs.get('start')}_{self.kwargs.get('end')}"
@@ -215,6 +274,7 @@ class DataGenerator(keras.utils.Sequence):
         """
         Load locally saved data from self.data_path_tmp and name '<station>_<var1>_<var2>_..._<varX>.pickle'.
+
         :param station: station to load
         :param variables: list of variables to load
         :return: loaded data
@@ -230,7 +290,8 @@ class DataGenerator(keras.utils.Sequence):
 
     def get_station_key(self, key: Union[None, str, int, List[Union[None, str, int]]]) -> str:
         """
-        Return a valid station key or raise KeyError if this wasn't possible
+        Return a valid station key or raise KeyError if this wasn't possible.
+
         :param key: station key to choose the data generator.
         :return: station key (id from database)
         """
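A hedged sketch of the local tmp storage behaviour described in get_data_generator; paths, network, station, and
period arguments are placeholders, not taken from this patch:

.. code-block:: python

    from src.data_handling.data_generator import DataGenerator

    data_gen = DataGenerator("/path/to/data", "AIRBASE", ["DEBW107"], ["o3", "temp"],
                             "datetime", "variables", "o3", start=2010, end=2017)

    # first call: process from scratch and store a pickle file in <data_path>/tmp
    data_prep = data_gen.get_data_generator("DEBW107", load_local_tmp_storage=False)
    # second call: load the already processed data from the tmp pickle file
    data_prep = data_gen.get_data_generator("DEBW107")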
diff --git a/src/data_handling/data_preparation.py b/src/data_handling/data_preparation.py
index 5628394271918dc5631182d7de610db4ad335b7f..6c89c9c20fd8d88825fc117dcdeecf38776f894a 100644
--- a/src/data_handling/data_preparation.py
+++ b/src/data_handling/data_preparation.py
@@ -2,9 +2,9 @@ __author__ = 'Felix Kleinert, Lukas Leufen'
 __date__ = '2019-10-16'
 
 import datetime as dt
-from functools import reduce
 import logging
 import os
+from functools import reduce
 from typing import Union, List, Iterable, Tuple
 
 import numpy as np
@@ -30,16 +30,18 @@ class DataPrep(object):
     After data loading, different data pre-processing steps can be executed to prepare the data for further
     applications. Especially the following methods can be used for the pre-processing step:
+
     - interpolate: interpolate between data points by using xarray's interpolation method
-    - standardise: standardise data to mean=1 and std=1, centralise to mean=0, additional methods like normalise on
-      interval [0, 1] are not implemented yet.
+    - standardise: standardise data to mean=0 and std=1, centralise to mean=0, additional methods like normalise on \
+      interval [0, 1] are not implemented yet.
     - make window history: represent the history (time steps before) for training/ testing; X
     - make labels: create target vector with given leading time steps for training/ testing; y
-    - remove Nans jointly from desired input and output, only keeps time steps where no NaNs are present in X AND y. Use
-      this method after the creation of the window history and labels to clean up the data cube.
+    - remove Nans jointly from desired input and output, only keeps time steps where no NaNs are present in X AND y. \
+      Use this method after the creation of the window history and labels to clean up the data cube.
 
     To create a DataPrep instance, it is needed to specify the stations by id (e.g. "DEBW107"), its network (e.g. UBA,
     "Umweltbundesamt") and the variables to use. Further options can be set in the instance.
+
     * `statistics_per_var`: define a specific statistic to extract from the TOAR database for each variable.
     * `start`: define a start date for the data cube creation. Default: Use the first entry in time series
     * `end`: set the end date for the data cube. Default: Use last date in time series.
@@ -124,6 +126,7 @@ class DataPrep(object):
     def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]:
         """
         Download data from TOAR database using the JOIN interface.
+
         :param file_name:
         :param meta_file:
         :return:
@@ -206,7 +209,8 @@ class DataPrep(object):
     def inverse_transform(self) -> None:
         """
         Perform inverse transformation
-        :return:
+
+        :return: None
         """
 
         def f_inverse(data, mean, std, method_inverse):
@@ -225,7 +229,8 @@ class DataPrep(object):
         self.data, self.mean, self.std = f_inverse(self.data, self.mean, self.std, self._transform_method)
         self._transform_method = None
 
-    def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False, mean = None, std=None) -> None:
+    def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False, mean=None,
+                  std=None) -> None:
         """
         This function transforms a xarray.dataarray (along dim) or pandas.DataFrame (along axis) either with
         mean=0 and std=1 (`method=standardise`) or centers the data with mean=0 and no change in data scale
@@ -303,6 +308,7 @@ class DataPrep(object):
         """
         This function uses xarray's shift function multiple times to represent history (if window <= 0)
         or lead time (if window > 0)
+
         :param dim: dimension along shift is applied
         :param window: number of steps to shift (corresponds to the window length)
         :return:
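The shift method documented above builds windows by applying xarray's shift repeatedly and concatenating along a new
window dimension. A self-contained sketch of that idea on toy data; the start/end index arithmetic is an assumption,
as the method body is not part of this hunk:

.. code-block:: python

    import numpy as np
    import xarray as xr

    data = xr.DataArray(np.arange(10.0), dims=["datetime"],
                        coords={"datetime": np.arange(10)})
    window = -3  # window <= 0 represents history, window > 0 lead time
    start = window if window <= 0 else 1
    end = 1 if window <= 0 else window + 1
    res = [data.shift(datetime=-w) for w in range(start, end)]
    window_array = xr.DataArray(np.arange(start, end), dims="window")
    history = xr.concat(res, dim=window_array)  # dims: (window, datetime)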
@@ -320,7 +326,8 @@ class DataPrep(object):
         res = xr.concat(res, dim=window_array)
         return res
 
-    def make_labels(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str, window: int) -> None:
+    def make_labels(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str,
+                    window: int) -> None:
         """
         This function creates a xarray.DataArray containing labels
 
@@ -355,7 +362,8 @@ class DataPrep(object):
         non_nan_history = self.history.dropna(dim=dim)
         non_nan_label = self.label.dropna(dim=dim)
         non_nan_observation = self.observation.dropna(dim=dim)
-        intersect = reduce(np.intersect1d, (non_nan_history.coords[dim].values, non_nan_label.coords[dim].values, non_nan_observation.coords[dim].values))
+        intersect = reduce(np.intersect1d, (non_nan_history.coords[dim].values, non_nan_label.coords[dim].values,
+                                            non_nan_observation.coords[dim].values))
 
         min_length = self.kwargs.get("min_length", 0)
         if len(intersect) < max(min_length, 1):
@@ -384,6 +392,7 @@ class DataPrep(object):
     def _slice_prep(self, data: xr.DataArray, coord: str = 'datetime') -> xr.DataArray:
         """
         This function prepares all settings for slicing and executes _slice
+
         :param data:
         :param coord: name of axis to slice
         :return:
@@ -396,6 +405,7 @@ class DataPrep(object):
     def _slice(data: xr.DataArray, start: Union[date, str], end: Union[date, str], coord: str) -> xr.DataArray:
         """
         This function slices through a given data_item (for example select only values of 2011)
+
         :param data:
         :param start:
         :param end:
@@ -407,7 +417,8 @@ class DataPrep(object):
     def check_for_negative_concentrations(self, data: xr.DataArray, minimum: int = 0) -> xr.DataArray:
         """
         This function sets all negative concentrations to zero. Names of all concentrations are extracted from
-        https://join.fz-juelich.de/services/rest/surfacedata/ #2.1 Parameters
+        https://join.fz-juelich.de/services/rest/surfacedata/ \#2.1 Parameters
+
         :param data:
         :param minimum:
         :return:
@@ -465,7 +476,7 @@ class DataPrep(object):
         if (self.extremes_label is None) or (self.extremes_history is None):
             # extract extremes based on occurance in labels
             if extremes_on_right_tail_only:
-                extreme_label_idx = (self.label > extr_val).any(axis=0).values.reshape(-1,)
+                extreme_label_idx = (self.label > extr_val).any(axis=0).values.reshape(-1, )
             else:
                 extreme_label_idx = np.concatenate(((self.label < -extr_val).any(axis=0).values.reshape(-1, 1),
                                                     (self.label > extr_val).any(axis=0).values.reshape(-1, 1)),
@@ -474,15 +485,16 @@ class DataPrep(object):
             extremes_history = self.history[..., extreme_label_idx, :]
             extremes_label.datetime.values += np.timedelta64(*timedelta)
             extremes_history.datetime.values += np.timedelta64(*timedelta)
-            self.extremes_label = extremes_label#.squeeze('Stations').transpose('datetime', 'window')
-            self.extremes_history = extremes_history#.transpose('datetime', 'window', 'Stations', 'variables')
+            self.extremes_label = extremes_label  # .squeeze('Stations').transpose('datetime', 'window')
+            self.extremes_history = extremes_history  # .transpose('datetime', 'window', 'Stations', 'variables')
         else:  # one extr value iteration is done already: self.extremes_label is NOT None...
             if extremes_on_right_tail_only:
                 extreme_label_idx = (self.extremes_label > extr_val).any(axis=0).values.reshape(-1, )
             else:
-                extreme_label_idx = np.concatenate(((self.extremes_label < -extr_val).any(axis=0).values.reshape(-1, 1),
-                                                    (self.extremes_label > extr_val).any(axis=0).values.reshape(-1, 1)
-                                                    ), axis=1).any(axis=1)
+                extreme_label_idx = np.concatenate(
+                    ((self.extremes_label < -extr_val).any(axis=0).values.reshape(-1, 1),
+                     (self.extremes_label > extr_val).any(axis=0).values.reshape(-1, 1)
+                     ), axis=1).any(axis=1)
             # check on existing extracted extremes to minimise computational costs for comparison
             extremes_label = self.extremes_label[..., extreme_label_idx]
             extremes_history = self.extremes_history[..., extreme_label_idx, :]
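A self-contained sketch of the two-tailed extreme selection from the hunk above: flag every sample where any label
exceeds the threshold on either tail, mirroring the np.concatenate pattern (toy shapes assumed):

.. code-block:: python

    import numpy as np

    extr_val = 1.5
    label = np.random.randn(4, 100)  # toy labels with shape (window, datetime)
    extreme_label_idx = np.concatenate(((label < -extr_val).any(axis=0).reshape(-1, 1),
                                        (label > extr_val).any(axis=0).reshape(-1, 1)),
                                       axis=1).any(axis=1)
    # boolean mask over the datetime axis selecting the samples to duplicate
    print(extreme_label_idx.sum(), "extreme samples flagged")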