diff --git a/README.md b/README.md
index 6d9dc21f59923e14b04de8f728ca15a6655abb55..c33aab4b8643d2907b07b5ebcb254076515d03d2 100644
--- a/README.md
+++ b/README.md
@@ -348,6 +348,62 @@ parameter call.
 True
 ```
 
+# Data Handlers
+
+Data handlers are responsible for all tasks related to data like data acquisition, preparation and provision. A data
+handler must inherit from the abstract base class `AbstractDataHandler` and requires the implementation of the
+`__init__()` method and the accessors `get_X()` and `get_Y()`. In the following, we show an example how a custom data
+handler could look like.
+
+```python
+import datetime as dt
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+from mlair.data_handler import AbstractDataHandler
+
+class DummyDataHandler(AbstractDataHandler):
+
+    def __init__(self, name, number_of_samples=None):
+        """Store the name and create random X and Y data. If number_of_samples is None, np.random.randint(100, 150)
+        picks it (100 to 149, upper bound exclusive). X and Y are both indexed hourly, starting today at 00:00."""
+        super().__init__()
+        self.name = name
+        self.number_of_samples = number_of_samples if number_of_samples is not None else np.random.randint(100, 150)
+        self._X = self.create_X()
+        self._Y = self.create_Y()
+
+    def create_X(self):
+        """Inputs are random integers from 0 to 9 with shape (no_samples, window=14, variables=5)."""
+        X = np.random.randint(0, 10, size=(self.number_of_samples, 14, 5))  # samples, window, variables
+        datelist = pd.date_range(dt.datetime.today().date(), periods=self.number_of_samples, freq="H").tolist()
+        return xr.DataArray(X, dims=['datetime', 'window', 'variables'], coords={"datetime": datelist,
+                                                                                 "window": range(14),
+                                                                                 "variables": range(5)})
+
+    def create_Y(self):
+        """Targets are 0.5 * N(0, 1) samples rounded to one decimal, shape (no_samples, window=5, variables=1)."""
+        Y = np.round(0.5 * np.random.randn(self.number_of_samples, 5, 1), 1)  # samples, window, variables
+        datelist = pd.date_range(dt.datetime.today().date(), periods=self.number_of_samples, freq="H").tolist()
+        return xr.DataArray(Y, dims=['datetime', 'window', 'variables'], coords={"datetime": datelist,
+                                                                                 "window": range(5),
+                                                                                 "variables": range(1)})
+
+    def get_X(self, upsampling=False, as_numpy=False):
+        """Return the stored X, or np.copy(X) if as_numpy is True. The upsampling parameter is not used."""
+        return np.copy(self._X) if as_numpy is True else self._X
+
+    def get_Y(self, upsampling=False, as_numpy=False):
+        """Return the stored Y, or np.copy(Y) if as_numpy is True. The upsampling parameter is not used."""
+        return np.copy(self._Y) if as_numpy is True else self._Y
+
+    def __str__(self):
+        return self.name
+
+```
+
+
 # Special Remarks
 
 ## Special instructions for installation on Jülich HPC systems
diff --git a/mlair/data_handler/advanced_data_handler.py b/mlair/data_handler/advanced_data_handler.py
index f0dc874a050c274b0b4b6692073d8f7332d27c1d..bf7defa56709c53e9c11b54baca54efcd105843c 100644
--- a/mlair/data_handler/advanced_data_handler.py
+++ b/mlair/data_handler/advanced_data_handler.py
@@ -305,15 +305,15 @@ class DefaultDataHandler(AbstractDataHandler):
 def run_data_prep():
 
     from .data_preparation_neighbors import DataHandlerNeighbors
-    data = DummyDataSingleStation("main_class")
+    data = DummyDataHandler("main_class")
     data.get_X()
     data.get_Y()
 
     path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata")
-    data_prep = DataHandlerNeighbors(DummyDataSingleStation("main_class"),
+    data_prep = DataHandlerNeighbors(DummyDataHandler("main_class"),
                                      path,
-                                     neighbors=[DummyDataSingleStation("neighbor1"),
-                                                DummyDataSingleStation("neighbor2")],
+                                     neighbors=[DummyDataHandler("neighbor1"),
+                                                DummyDataHandler("neighbor2")],
                                      extreme_values=[1., 1.2])
     data_prep.get_data(upsampling=False)
 
@@ -344,6 +344,45 @@ def create_data_prep():
     return data_prep
 
 
+class DummyDataHandler(AbstractDataHandler):
+
+    def __init__(self, name, number_of_samples=None):
+        """Store the name and create random X and Y data. If number_of_samples is None, np.random.randint(100, 150)
+        picks it (100 to 149, upper bound exclusive). X and Y are both indexed hourly, starting today at 00:00."""
+        super().__init__()
+        self.name = name
+        self.number_of_samples = number_of_samples if number_of_samples is not None else np.random.randint(100, 150)
+        self._X = self.create_X()
+        self._Y = self.create_Y()
+
+    def create_X(self):
+        """Inputs are random integers from 0 to 9 with shape (no_samples, window=14, variables=5)."""
+        X = np.random.randint(0, 10, size=(self.number_of_samples, 14, 5))  # samples, window, variables
+        datelist = pd.date_range(dt.datetime.today().date(), periods=self.number_of_samples, freq="H").tolist()
+        return xr.DataArray(X, dims=['datetime', 'window', 'variables'], coords={"datetime": datelist,
+                                                                                 "window": range(14),
+                                                                                 "variables": range(5)})
+
+    def create_Y(self):
+        """Targets are 0.5 * N(0, 1) samples rounded to one decimal, shape (no_samples, window=5, variables=1)."""
+        Y = np.round(0.5 * np.random.randn(self.number_of_samples, 5, 1), 1)  # samples, window, variables
+        datelist = pd.date_range(dt.datetime.today().date(), periods=self.number_of_samples, freq="H").tolist()
+        return xr.DataArray(Y, dims=['datetime', 'window', 'variables'], coords={"datetime": datelist,
+                                                                                 "window": range(5),
+                                                                                 "variables": range(1)})
+
+    def get_X(self, upsampling=False, as_numpy=False):
+        """Return the stored X, or np.copy(X) if as_numpy is True. The upsampling parameter is not used."""
+        return np.copy(self._X) if as_numpy is True else self._X
+
+    def get_Y(self, upsampling=False, as_numpy=False):
+        """Return the stored Y, or np.copy(Y) if as_numpy is True. The upsampling parameter is not used."""
+        return np.copy(self._Y) if as_numpy is True else self._Y
+
+    def __str__(self):
+        return self.name
+
+
 if __name__ == "__main__":
     from mlair.data_handler.station_preparation import DataHandlerSingleStation
     from mlair.data_handler.iterator import KerasIterator, DataCollection