diff --git a/mlair/configuration/.gitignore b/mlair/configuration/.gitignore
index 8e2358dc56797578fe0de020aa827b1fef8663bf..91eccc695f4ea58374a14a1ba0272f98f210c203 100644
--- a/mlair/configuration/.gitignore
+++ b/mlair/configuration/.gitignore
@@ -1 +1,2 @@
-join_settings.py
\ No newline at end of file
+join_settings.py
+join_rest
\ No newline at end of file
diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py
index d191af2edd8a6fe2c1093b3f1c3f5d419cc42b76..51d4beafbbc0b346331db80567946c3acc702b8e 100644
--- a/mlair/configuration/defaults.py
+++ b/mlair/configuration/defaults.py
@@ -1,6 +1,7 @@
 __author__ = "Lukas Leufen"
 __date__ = '2020-06-25'
 
+from mlair.helpers.statistics import TransformationClass
 
 DEFAULT_STATIONS = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087']
 DEFAULT_VAR_ALL_DICT = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values',
@@ -13,8 +14,7 @@ DEFAULT_START = "1997-01-01"
 DEFAULT_END = "2017-12-31"
 DEFAULT_WINDOW_HISTORY_SIZE = 13
 DEFAULT_OVERWRITE_LOCAL_DATA = False
-# DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise", "mean": "estimate"}
-DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise"}
+DEFAULT_TRANSFORMATION = TransformationClass(inputs_method="standardise", targets_method="standardise")
 DEFAULT_HPC_LOGIN_LIST = ["ju", "hdfmll"]  # ju[wels} #hdfmll(ogin)
 DEFAULT_HPC_HOST_LIST = ["jw", "hdfmlc"]  # first part of node names for Juwels (jw[comp], hdfmlc(ompute).
 DEFAULT_CREATE_NEW_MODEL = True
@@ -46,15 +46,11 @@ DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS = True
 DEFAULT_EVALUATE_BOOTSTRAPS = True
 DEFAULT_CREATE_NEW_BOOTSTRAPS = False
 DEFAULT_NUMBER_OF_BOOTSTRAPS = 20
-#DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries",
-#                     "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles",
-#                     "PlotAvailability"]
-DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore",
+DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries",
                      "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles",
                      "PlotAvailability"]
 
-
 def get_defaults():
     """Return all default parameters set in defaults.py"""
     return {key: value for key, value in globals().items() if key.startswith('DEFAULT')}
diff --git a/mlair/data_handler/__init__.py b/mlair/data_handler/__init__.py
index 01d660031bbbdda08eba80044a08fcb034d8171b..495b6e7c8604a839a084a2b78a54563c13eb06e6 100644
--- a/mlair/data_handler/__init__.py
+++ b/mlair/data_handler/__init__.py
@@ -13,4 +13,4 @@
 from .bootstraps import BootStraps
 from .iterator import KerasIterator, DataCollection
 from .default_data_handler import DefaultDataHandler
 from .abstract_data_handler import AbstractDataHandler
-from .data_preparation_neighbors import DataHandlerNeighbors
+from .data_handler_neighbors import DataHandlerNeighbors
diff --git a/mlair/data_handler/abstract_data_handler.py b/mlair/data_handler/abstract_data_handler.py
index 04b3d4651347759130da15a05056f6ace3d0fc1f..26ccf69c85e999c540e656a2ceac5737390a579e 100644
--- a/mlair/data_handler/abstract_data_handler.py
+++ b/mlair/data_handler/abstract_data_handler.py
@@ -27,7 +27,10 @@ class AbstractDataHandler:
 
     @classmethod
     def own_args(cls, *args):
-        return remove_items(inspect.getfullargspec(cls).args, ["self"] + list(args))
+        """Return all arguments (including kwonlyargs)."""
+        arg_spec = inspect.getfullargspec(cls)
+        list_of_args = arg_spec.args + arg_spec.kwonlyargs
+        return remove_items(list_of_args, ["self"] + list(args))
 
     @classmethod
     def transformation(cls, *args, **kwargs):
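The `own_args` fix above matters for handlers with keyword-only parameters. A minimal sketch (the `Handler` class is made up) of what `inspect` actually reports:

```python
import inspect

class Handler:  # hypothetical stand-in for a data handler
    def __init__(self, station, *args, kz_filter_length, kz_filter_iter, **kwargs):
        pass

spec = inspect.getfullargspec(Handler)
print(spec.args)        # ['self', 'station'] -- keyword-only arguments are NOT listed here
print(spec.kwonlyargs)  # ['kz_filter_length', 'kz_filter_iter']
```

Without the `kwonlyargs` part, requirements derived from `own_args` would silently drop `kz_filter_length` and `kz_filter_iter` of the new kz-filter handler below.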
diff --git a/mlair/data_handler/advanced_data_handler.py b/mlair/data_handler/advanced_data_handler.py
index c2d210bffdb598b23c025f60b903ddef84e4509d..f04748e82f11116b265796afba7f401c1cad9342 100644
--- a/mlair/data_handler/advanced_data_handler.py
+++ b/mlair/data_handler/advanced_data_handler.py
@@ -10,15 +10,18 @@ import datetime as dt
 
 from mlair.data_handler import AbstractDataHandler
 
-from typing import Union, List
+from typing import Union, List, Tuple, Dict
+import logging
+from functools import reduce
+from mlair.helpers.join import EmptyQueryResult
+from mlair.helpers import TimeTracking
 
 number = Union[float, int]
 num_or_list = Union[number, List[number]]
 
 
 def run_data_prep():
-
-    from .data_preparation_neighbors import DataHandlerNeighbors
+    from .data_handler_neighbors import DataHandlerNeighbors
     data = DummyDataHandler("main_class")
     data.get_X()
     data.get_Y()
@@ -33,8 +36,7 @@ def run_data_prep():
 
 
 def create_data_prep():
-
-    from .data_preparation_neighbors import DataHandlerNeighbors
+    from .data_handler_neighbors import DataHandlerNeighbors
     path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata")
     station_type = None
     network = 'UBA'
@@ -98,7 +100,7 @@ class DummyDataHandler(AbstractDataHandler):
 
 
 if __name__ == "__main__":
-    from mlair.data_handler.station_preparation import DataHandlerSingleStation
+    from mlair.data_handler.data_handler_single_station import DataHandlerSingleStation
     from mlair.data_handler.iterator import KerasIterator, DataCollection
     data_prep = create_data_prep()
     data_collection = DataCollection(data_prep)
diff --git a/mlair/data_handler/data_handler_kz_filter.py b/mlair/data_handler/data_handler_kz_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce96a8f5c039b5a232aa56765209927dd4019168
--- /dev/null
+++ b/mlair/data_handler/data_handler_kz_filter.py
@@ -0,0 +1,90 @@
+"""Data Handler using kz-filtered data."""
+
+__author__ = 'Lukas Leufen'
+__date__ = '2020-08-26'
+
+import inspect
+import numpy as np
+import pandas as pd
+import xarray as xr
+from typing import List, Union
+
+from mlair.data_handler.data_handler_single_station import DataHandlerSingleStation
+from mlair.data_handler import DefaultDataHandler
+from mlair.helpers import remove_items, to_list, TimeTrackingWrapper
+from mlair.helpers.statistics import KolmogorovZurbenkoFilterMovingWindow as KZFilter
+
+# define a more general date type for type hinting
+str_or_list = Union[str, List[str]]
+
+
+class DataHandlerKzFilterSingleStation(DataHandlerSingleStation):
+    """Data handler for a single station to be used by a superior data handler. Inputs are kz filtered."""
+
+    _requirements = remove_items(inspect.getfullargspec(DataHandlerSingleStation).args, ["self", "station"])
+
+    def __init__(self, *args, kz_filter_length, kz_filter_iter, **kwargs):
+        assert kwargs.get("sampling") == "hourly"  # this data handler requires hourly data resolution
+        kz_filter_length = to_list(kz_filter_length)
+        kz_filter_iter = to_list(kz_filter_iter)
+        # self.original_data = None  # ToDo: implement here something to store unfiltered data
+        self.kz_filter_length = kz_filter_length
+        self.kz_filter_iter = kz_filter_iter
+        self.cutoff_period = None
+        self.cutoff_period_days = None
+        super().__init__(*args, **kwargs)
+
+    def setup_samples(self):
+        """
+        Set up samples.
+
+        This method prepares and creates samples X, and labels Y.
+        """
+        self.load_data()
+        self.interpolate(dim=self.time_dim, method=self.interpolation_method, limit=self.interpolation_limit)
+        self.set_inputs_and_targets()
+        self.apply_kz_filter()
+        # this is just a code snippet to check the results of the kz filter
+        # import matplotlib
+        # matplotlib.use("TkAgg")
+        # import matplotlib.pyplot as plt
+        # self.input_data.data.sel(filter="74d", variables="temp", Stations="DEBW107").plot()
+        # self.input_data.data.sel(variables="temp", Stations="DEBW107").plot.line(hue="filter")
+        if self.do_transformation is True:
+            self.call_transform()
+        self.make_samples()
+
+    @TimeTrackingWrapper
+    def apply_kz_filter(self):
+        """Apply the Kolmogorov-Zurbenko filter on inputs only."""
+        kz = KZFilter(self.input_data.data, wl=self.kz_filter_length, itr=self.kz_filter_iter, filter_dim="datetime")
+        filtered_data: List[xr.DataArray] = kz.run()
+        self.cutoff_period = kz.period_null()
+        self.cutoff_period_days = kz.period_null_days()
+        self.input_data.data = xr.concat(filtered_data, pd.Index(self.create_filter_index(), name="filter"))
+
+    def create_filter_index(self) -> pd.Index:
+        """
+        Round cut-off periods in days and append 'res' for the residuum index.
+
+        Round small numbers (<10) to a single decimal and higher numbers to int. Transform to a list of str and
+        append 'res' for the residuum index.
+        """
+        index = np.round(self.cutoff_period_days, 1)
+        f = lambda x: int(np.round(x)) if x >= 10 else np.round(x, 1)
+        index = list(map(f, index.tolist()))
+        index = list(map(lambda x: str(x) + "d", index)) + ["res"]
+        return pd.Index(index, name="filter")
+
+    def get_transposed_history(self) -> xr.DataArray:
+        """Return history.
+
+        :return: history with dimensions datetime, window, Stations, variables, filter.
+        """
+        return self.history.transpose("datetime", "window", "Stations", "variables", "filter").copy()
+
+
+class DataHandlerKzFilter(DefaultDataHandler):
+    """Data handler using kz filtered data."""
+
+    data_handler = DataHandlerKzFilterSingleStation
+    data_handler_transformation = DataHandlerKzFilterSingleStation
+    _requirements = data_handler.requirements()
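How `create_filter_index()` renders the cut-off periods, as a standalone sketch (the period values are made up):

```python
import numpy as np
import pandas as pd

cutoff_period_days = np.array([66.73, 5.27, 0.83])  # hypothetical output of period_null_days()
f = lambda x: int(np.round(x)) if x >= 10 else np.round(x, 1)
index = [str(f(x)) + "d" for x in np.round(cutoff_period_days, 1).tolist()] + ["res"]
print(pd.Index(index, name="filter"))  # Index(['67d', '5.3d', '0.8d', 'res'], ...)
```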
diff --git a/mlair/data_handler/data_preparation_neighbors.py b/mlair/data_handler/data_handler_neighbors.py
similarity index 85%
rename from mlair/data_handler/data_preparation_neighbors.py
rename to mlair/data_handler/data_handler_neighbors.py
index 1482bb9fe20afcc2b92d2b91ae523a6dca19c54d..a004e659969232a080d49eb6905007d353bbe99c 100644
--- a/mlair/data_handler/data_preparation_neighbors.py
+++ b/mlair/data_handler/data_handler_neighbors.py
@@ -4,9 +4,9 @@ __date__ = '2020-07-17'
 
 from mlair.helpers import to_list
-from mlair.data_handler.station_preparation import DataHandlerSingleStation
 from mlair.data_handler import DefaultDataHandler
 import os
+import copy
 
 from typing import Union, List
 
@@ -15,6 +15,7 @@ num_or_list = Union[number, List[number]]
 
 
 class DataHandlerNeighbors(DefaultDataHandler):
+    """Data handler including neighboring stations."""
 
     def __init__(self, id_class, data_path, neighbors=None, min_length=0,
                  extreme_values: num_or_list = None, extremes_on_right_tail_only: bool = False):
@@ -24,14 +25,14 @@ class DataHandlerNeighbors(DefaultDataHandler):
 
     @classmethod
     def build(cls, station, **kwargs):
-        sp_keys = {k: kwargs[k] for k in cls._requirements if k in kwargs}
-        sp = DataHandlerSingleStation(station, **sp_keys)
+        sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls._requirements if k in kwargs}
+        sp = cls.data_handler(station, **sp_keys)
         n_list = []
         for neighbor in kwargs.get("neighbors", []):
-            n_list.append(DataHandlerSingleStation(neighbor, **sp_keys))
+            n_list.append(cls.data_handler(neighbor, **sp_keys))
         else:
             kwargs["neighbors"] = n_list if len(n_list) > 0 else None
-        dp_args = {k: kwargs[k] for k in cls.own_args("id_class") if k in kwargs}
+        dp_args = {k: copy.deepcopy(kwargs[k]) for k in cls.own_args("id_class") if k in kwargs}
        return cls(sp, **dp_args)
 
     def _create_collection(self):
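`build()` now deep-copies the shared kwargs before handing them to each station's handler. A short sketch of the aliasing this avoids (dictionary contents are made up):

```python
import copy

kwargs = {"statistics_per_var": {"o3": "dma8eu"}}
shallow = {k: kwargs[k] for k in kwargs}               # old behaviour: shared references
deep = {k: copy.deepcopy(kwargs[k]) for k in kwargs}   # new behaviour: private copies

shallow["statistics_per_var"]["no2"] = "dma8eu"
print(kwargs["statistics_per_var"])  # {'o3': 'dma8eu', 'no2': 'dma8eu'} -- mutation leaked upstream
print(deep["statistics_per_var"])    # {'o3': 'dma8eu'} -- unaffected
```

This is especially relevant now that a mutable `TransformationClass` instance travels through the kwargs.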
diff --git a/mlair/data_handler/station_preparation.py b/mlair/data_handler/data_handler_single_station.py
similarity index 74%
rename from mlair/data_handler/station_preparation.py
rename to mlair/data_handler/data_handler_single_station.py
index f3428e91bae3dc1d94a45dd7ff2bf931cff1fa54..460d1c100dadbc2aea5d43932e902cc080177b27 100644
--- a/mlair/data_handler/station_preparation.py
+++ b/mlair/data_handler/data_handler_single_station.py
@@ -3,6 +3,7 @@
 __author__ = 'Lukas Leufen, Felix Kleinert'
 __date__ = '2020-07-20'
 
+import copy
 import datetime as dt
 import logging
 import os
@@ -15,7 +16,7 @@ import xarray as xr
 
 from mlair.configuration import check_path_and_create
 from mlair import helpers
-from mlair.helpers import join, statistics
+from mlair.helpers import join, statistics, TimeTrackingWrapper
 from mlair.data_handler.abstract_data_handler import AbstractDataHandler
 
 # define a more general date type for type hinting
@@ -53,7 +54,8 @@ class DataHandlerSingleStation(AbstractDataHandler):
         self.station = helpers.to_list(station)
         self.path = os.path.abspath(data_path)
         self.statistics_per_var = statistics_per_var
-        self.transformation = self.setup_transformation(transformation)
+        self.do_transformation = transformation is not None
+        self.input_data, self.target_data = self.setup_transformation(transformation)
         self.station_type = station_type
         self.network = network
@@ -74,20 +76,13 @@ class DataHandlerSingleStation(AbstractDataHandler):
         self.end = end
 
         # internal
-        self.data = None
+        self._data: xr.DataArray = None  # loaded raw data
         self.meta = None
         self.variables = list(statistics_per_var.keys()) if variables is None else variables
         self.history = None
         self.label = None
         self.observation = None
 
-        # internal for transformation
-        self.mean = None
-        self.std = None
-        self.max = None
-        self.min = None
-        self._transform_method = None
-
         # create samples
         self.setup_samples()
@@ -100,7 +95,7 @@ class DataHandlerSingleStation(AbstractDataHandler):
 
     @property
     def shape(self):
-        return self.data.shape, self.get_X().shape, self.get_Y().shape
+        return self._data.shape, self.get_X().shape, self.get_Y().shape
 
     def __repr__(self):
         return f"StationPrep(station={self.station}, data_path='{self.path}', " \
@@ -109,24 +104,7 @@ class DataHandlerSingleStation(AbstractDataHandler):
                f"sampling='{self.sampling}', target_dim='{self.target_dim}', target_var='{self.target_var}', " \
                f"time_dim='{self.time_dim}', window_history_size={self.window_history_size}, " \
                f"window_lead_time={self.window_lead_time}, interpolation_limit={self.interpolation_limit}, " \
-               f"interpolation_method='{self.interpolation_method}', overwrite_local_data={self.overwrite_local_data}, " \
-               f"transformation={self._print_transformation_as_string})"
-
-    @property
-    def _print_transformation_as_string(self):
-        str_name = ''
-        if self.transformation is None:
-            str_name = f'{None}'
-        else:
-            for k, v in self.transformation.items():
-                if v is not None:
-                    try:
-                        v_pr = f"xr.DataArray.from_dict({v.to_dict()})"
-                    except AttributeError:
-                        v_pr = f"'{v}'"
-                    str_name += f"'{k}':{v_pr}, "
-            str_name = f"{{{str_name}}}"
-        return str_name
+               f"interpolation_method='{self.interpolation_method}', overwrite_local_data={self.overwrite_local_data})"
 
     def get_transposed_history(self) -> xr.DataArray:
         """Return history.
@@ -153,29 +131,29 @@ class DataHandlerSingleStation(AbstractDataHandler):
         return coords.rename(index={"station_lon": "lon", "station_lat": "lat"}).to_dict()[str(self)]
 
     def call_transform(self, inverse=False):
-        self.transform(dim=self.time_dim, method=self.transformation["method"],
-                       mean=self.transformation['mean'], std=self.transformation["std"],
-                       min_val=self.transformation["min"], max_val=self.transformation["max"],
-                       inverse=inverse
-                       )
-
-    def set_transformation(self, transformation: dict):
-        if self._transform_method is not None:
-            self.call_transform(inverse=True)
-        self.transformation = self.setup_transformation(transformation)
-        self.call_transform()
-        self.make_samples()
+        kwargs = helpers.remove_items(self.input_data.as_dict(), ["data"])
+        self.transform(self.input_data, dim=self.time_dim, inverse=inverse, **kwargs)
+        kwargs = helpers.remove_items(self.target_data.as_dict(), ["data"])
+        self.transform(self.target_data, dim=self.time_dim, inverse=inverse, **kwargs)
 
+    @TimeTrackingWrapper
     def setup_samples(self):
         """
         Setup samples.
 
         This method prepares and creates samples X, and labels Y.
         """
         self.load_data()
         self.interpolate(dim=self.time_dim, method=self.interpolation_method, limit=self.interpolation_limit)
-        if self.transformation is not None:
+        self.set_inputs_and_targets()
+        if self.do_transformation is True:
             self.call_transform()
         self.make_samples()
 
+    def set_inputs_and_targets(self):
+        inputs = self._data.sel({self.target_dim: helpers.to_list(self.variables)})
+        targets = self._data.sel({self.target_dim: self.target_var})
+        self.input_data.data = inputs
+        self.target_data.data = targets
+
     def make_samples(self):
         self.make_history_window(self.target_dim, self.window_history_size, self.time_dim)
         self.make_labels(self.target_dim, self.target_var, self.time_dim, self.window_lead_time)
@@ -216,7 +194,7 @@ class DataHandlerSingleStation(AbstractDataHandler):
         logging.debug("loading finished")
         # create slices and check for negative concentration.
         data = self._slice_prep(data)
-        self.data = self.check_for_negative_concentrations(data)
+        self._data = self.check_for_negative_concentrations(data)
 
     def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]:
         """
@@ -283,10 +261,11 @@ class DataHandlerSingleStation(AbstractDataHandler):
         data.loc[..., used_chem_vars] = data.loc[..., used_chem_vars].clip(min=minimum)
         return data
 
-    def shift(self, dim: str, window: int) -> xr.DataArray:
+    def shift(self, data: xr.DataArray, dim: str, window: int) -> xr.DataArray:
         """
         Shift data multiple times to represent history (if window <= 0) or lead time (if window > 0).
 
+        :param data: data set to shift
         :param dim: dimension along shift is applied
         :param window: number of steps to shift (corresponds to the window length)
 
@@ -300,7 +279,7 @@ class DataHandlerSingleStation(AbstractDataHandler):
         end = window + 1
         res = []
         for w in range(start, end):
-            res.append(self.data.shift({dim: -w}))
+            res.append(data.shift({dim: -w}))
         window_array = self.create_index_array('window', range(start, end), squeeze_dim=self.target_dim)
         res = xr.concat(res, dim=window_array)
         return res
 
@@ -370,8 +349,8 @@ class DataHandlerSingleStation(AbstractDataHandler):
 
         :return: xarray.DataArray
         """
-        self.data = self.data.interpolate_na(dim=dim, method=method, limit=limit, use_coordinate=use_coordinate,
-                                             **kwargs)
+        self._data = self._data.interpolate_na(dim=dim, method=method, limit=limit, use_coordinate=use_coordinate,
+                                               **kwargs)
 
     def make_history_window(self, dim_name_of_inputs: str, window: int, dim_name_of_shift: str) -> None:
         """
@@ -388,7 +367,8 @@ class DataHandlerSingleStation(AbstractDataHandler):
         :param dim_name_of_shift: Dimension along shift will be applied
         """
         window = -abs(window)
-        self.history = self.shift(dim_name_of_shift, window).sel({dim_name_of_inputs: self.variables})
+        data = self.input_data.data
+        self.history = self.shift(data, dim_name_of_shift, window)
 
     def make_labels(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str,
                     window: int) -> None:
         """
@@ -404,7 +384,8 @@ class DataHandlerSingleStation(AbstractDataHandler):
         :param window: lead time of label
         """
         window = abs(window)
-        self.label = self.shift(dim_name_of_shift, window).sel({dim_name_of_target: target_var})
+        data = self.target_data.data
+        self.label = self.shift(data, dim_name_of_shift, window)
 
     def make_observation(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str) -> None:
         """
@@ -416,7 +397,8 @@ class DataHandlerSingleStation(AbstractDataHandler):
         :param target_var: Name of observation variable(s) in 'dimension'
         :param dim_name_of_shift: Name of dimension on which xarray.DataArray.shift will be applied
         """
-        self.observation = self.shift(dim_name_of_shift, 0).sel({dim_name_of_target: target_var})
+        data = self.target_data.data
+        self.observation = self.shift(data, dim_name_of_shift, 0)
 
     def remove_nan(self, dim: str) -> None:
         """
@@ -490,89 +472,23 @@ class DataHandlerSingleStation(AbstractDataHandler):
         return data
 
     @staticmethod
-    def setup_transformation(transformation: Dict):
+    def setup_transformation(transformation: statistics.TransformationClass):
         """
         Set up transformation by extracting all relevant information.
 
-        Extract all information from transformation dictionary. Possible keys are method, mean, std, min, max.
-        * If a transformation should be applied on base of existing values, these need to be provided in the respective
-          keys "mean" and "std" (again only if required for given method).
-
-        :param transformation: the transformation dictionary as described above.
-
-        :return: updated transformation dictionary
-
-        ## Transformation
-
-        There are two different approaches (called scopes) to transform the data:
-        1) `station`: transform data for each station independently (somehow like batch normalisation)
-        1) `data`: transform all data of each station with shared metrics
-
-        Transformation must be set by the `transformation` attribute. If `transformation = None` is given to `ExperimentSetup`,
-        data is not transformed at all. For all other setups, use the following dictionary structure to specify the
-        transformation.
-        ```
-        transformation = {"scope": <...>,
-                          "method": <...>,
-                          "mean": <...>,
-                          "std": <...>}
-        ExperimentSetup(..., transformation=transformation, ...)
-        ```
-
-        ### scopes
-
-        **station**: mean and std are not used
-
-        **data**: either provide already calculated values for mean and std (if required by transformation method), or choose
-        from different calculation schemes, explained in the mean and std section.
-
-        ### supported transformation methods
-        Currently supported methods are:
-        * standardise (default, if method is not given)
-        * centre
-
-        ### mean and std
-        `"mean"="accurate"`: calculate the accurate values of mean and std (depending on method) by using all data. Although,
-        this method is accurate, it may take some time for the calculation. Furthermore, this could potentially lead to memory
-        issue (not explored yet, but could appear for a very big amount of data)
-
-        `"mean"="estimate"`: estimate mean and std (depending on method). For each station, mean and std are calculated and
-        afterwards aggregated using the mean value over all station-wise metrics. This method is less accurate, especially
-        regarding the std calculation but therefore much faster.
-
-        We recommend to use the later method *estimate* because of following reasons:
-        * much faster calculation
-        * real accuracy of mean and std is less important, because it is "just" a transformation / scaling
-        * accuracy of mean is almost as high as in the *accurate* case, because of
-          $\bar{x_{ij}} = \bar{\left(\bar{x_i}\right)_j}$. The only difference is, that in the *estimate* case, each mean is
-          equally weighted for each station independently of the actual data count of the station.
-        * accuracy of std is lower for *estimate* because of $\var{x_{ij}} \ne \bar{\left(\var{x_i}\right)_j}$, but still the mean of all
-          station-wise std is a decent estimate of the true std.
-
-        `"mean"=<value, e.g. xr.DataArray>`: If mean and std are already calculated or shall be set manually, just add the
-        scaling values instead of the calculation method. For method *centre*, std can still be None, but is required for the
-        *standardise* method. **Important**: Format of given values **must** match internal data format of DataPreparation
-        class: `xr.DataArray` with `dims=["variables"]` and one value for each variable.
-
+        * Either return new empty DataClass instances if the given transformation arg is None,
+        * or return the given object twice if transformation is a DataClass instance,
+        * or return the inputs and targets attributes if transformation is a TransformationClass instance (default
+          design behaviour).
         """
         if transformation is None:
-            return
-        elif not isinstance(transformation, dict):
-            raise TypeError(f"`transformation' must be either `None' or dict like e.g. `{{'method': 'standardise'}},"
-                            f" but transformation is of type {type(transformation)}.")
-        transformation = transformation.copy()
-        method = transformation.get("method", None)
-        mean = transformation.get("mean", None)
-        std = transformation.get("std", None)
-        max_val = transformation.get("max", None)
-        min_val = transformation.get("min", None)
-
-        transformation["method"] = method
-        transformation["mean"] = mean
-        transformation["std"] = std
-        transformation["max"] = max_val
-        transformation["min"] = min_val
-        return transformation
+            return statistics.DataClass(), statistics.DataClass()
+        elif isinstance(transformation, statistics.DataClass):
+            return transformation, transformation
+        elif isinstance(transformation, statistics.TransformationClass):
+            return copy.deepcopy(transformation.inputs), copy.deepcopy(transformation.targets)
+        else:
+            raise NotImplementedError(f"Cannot handle transformation of type {type(transformation)}.")
 
     def load_data(self):
         try:
             self.read_data_from_disk()
         except FileNotFoundError:
             self.download_data()
             self.load_data()
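A sketch of the three dispatch branches of `setup_transformation()`, assuming the imports shown in this file:

```python
from mlair.helpers import statistics
from mlair.data_handler.data_handler_single_station import DataHandlerSingleStation

# None -> two fresh, empty DataClass containers (no transformation is applied later)
inputs, targets = DataHandlerSingleStation.setup_transformation(None)

# TransformationClass -> independent deep copies of its inputs/targets containers
trafo = statistics.TransformationClass(inputs_method="standardise", targets_method="centre")
inputs, targets = DataHandlerSingleStation.setup_transformation(trafo)
print(inputs.transform_method, targets.transform_method)  # standardise centre
```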
 
-    def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: bool = False, mean=None,
-                  std=None, min_val=None, max_val=None) -> None:
+    def transform(self, data_class, dim: Union[str, int] = 0, transform_method: str = 'standardise',
+                  inverse: bool = False, mean=None,
+                  std=None, min=None, max=None) -> None:
         """
         Transform data according to given transformation settings.
 
@@ -602,9 +519,9 @@ class DataHandlerSingleStation(AbstractDataHandler):
             calculated over the data in this class instance.
         :param std: Used for transformation (if required by 'method') based on external data. If 'None' the std is
             calculated over the data in this class instance.
-        :param min_val: Used for transformation (if required by 'method') based on external data. If 'None' min_val is
+        :param min: Used for transformation (if required by 'method') based on external data. If 'None' the min is
             extracted from the data in this class instance.
-        :param max_val: Used for transformation (if required by 'method') based on external data. If 'None' max_val is
+        :param max: Used for transformation (if required by 'method') based on external data. If 'None' the max is
             extracted from the data in this class instance.
 
         :return: xarray.DataArrays or pandas.DataFrames:
@@ -614,36 +531,37 @@ class DataHandlerSingleStation(AbstractDataHandler):
         """
 
         def f(data):
-            if method == 'standardise':
+            if transform_method == 'standardise':
                 return statistics.standardise(data, dim)
-            elif method == 'centre':
+            elif transform_method == 'centre':
                 return statistics.centre(data, dim)
-            elif method == 'normalise':
+            elif transform_method == 'normalise':
                 # use min/max of data or given min/max
                 raise NotImplementedError
             else:
                 raise NotImplementedError
 
         def f_apply(data):
-            if method == "standardise":
+            if transform_method == "standardise":
                 return mean, std, statistics.standardise_apply(data, mean, std)
-            elif method == "centre":
+            elif transform_method == "centre":
                 return mean, None, statistics.centre_apply(data, mean)
             else:
                 raise NotImplementedError
 
         if not inverse:
-            if self._transform_method is not None:
-                raise AssertionError(f"Transform method is already set. Therefore, data was already transformed with "
-                                     f"{self._transform_method}. Please perform inverse transformation of data first.")
+            if data_class._method is not None:
+                raise AssertionError(f"Internal _method is already set. Therefore, data was already transformed with "
+                                     f"{data_class._method}. Please perform inverse transformation of data first.")
             # apply transformation on local data instance (f) if mean is None, else apply by using mean (and std) from
             # external data.
-            self.mean, self.std, self.data = locals()["f" if mean is None else "f_apply"](self.data)
+            data_class.mean, data_class.std, data_class.data = locals()["f" if mean is None else "f_apply"](
+                data_class.data)
             # set transform method to find correct method for inverse transformation.
-            self._transform_method = method
+            data_class._method = transform_method
         else:
-            self.inverse_transform()
+            self.inverse_transform(data_class)
 
     @staticmethod
     def check_inverse_transform_params(mean: data_or_none, std: data_or_none, method: str) -> None:
@@ -665,7 +583,7 @@ class DataHandlerSingleStation(AbstractDataHandler):
         if len(msg) > 0:
             raise AttributeError(f"Inverse transform {method} can not be executed because following is None: {msg}")
 
-    def inverse_transform(self) -> None:
+    def inverse_transform(self, data_class) -> None:
         """
         Perform inverse transformation.
 
@@ -685,36 +603,26 @@ class DataHandlerSingleStation(AbstractDataHandler):
         else:
             raise NotImplementedError
 
-        if self._transform_method is None:
+        if data_class._method is None:
             raise AssertionError("Inverse transformation method is not set. Data cannot be inverse transformed.")
-        self.check_inverse_transform_params(self.mean, self.std, self._transform_method)
-        self.data, self.mean, self.std = f_inverse(self.data, self.mean, self.std, self._transform_method)
-        self._transform_method = None
+        self.check_inverse_transform_params(data_class.mean, data_class.std, data_class._method)
+        data_class.data, data_class.mean, data_class.std = f_inverse(data_class.data, data_class.mean, data_class.std,
+                                                                     data_class._method)
+        data_class._method = None
         # update X and Y
         self.make_samples()
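The transform/inverse pair relies on the statistics helpers; a minimal roundtrip sketch with random toy data:

```python
import numpy as np
import xarray as xr
from mlair.helpers import statistics

data = xr.DataArray(np.random.randn(100, 2) * 5 + 3, dims=("datetime", "variables"),
                    coords={"variables": ["o3", "temp"]})
mean, std, transformed = statistics.standardise(data, "datetime")
restored = statistics.apply_inverse_transformation(transformed, mean, std, "standardise")
np.testing.assert_allclose(restored.values, data.values)  # lossless roundtrip
```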
If a transformation depends only on particular statistics (e.g. only mean is required for centering), the remaining statistics are returned with None as fill value. - :param variable: Variable for which the information on transformation is requested. - :return: mean, standard deviation and transformation method """ - variable = self.target_var if variable is None else variable - try: - mean = self.mean.sel({'variables': variable}).values - except AttributeError: - mean = None - try: - std = self.std.sel({'variables': variable}).values - except AttributeError: - std = None - return mean, std, self._transform_method + return self.target_data.mean, self.target_data.std, self.target_data.transform_method if __name__ == "__main__": @@ -727,7 +635,6 @@ if __name__ == "__main__": time_dim='datetime', window_history_size=7, window_lead_time=3, interpolation_limit=0 ) # transformation={'method': 'standardise'}) - # sp.set_transformation({'method': 'standardise', 'mean': sp.mean+2, 'std': sp.std+1}) sp2 = DataHandlerSingleStation(data_path='/home/felix/PycharmProjects/mlt_new/data/', station='DEBY122', statistics_per_var=statistics_per_var, station_type='background', network='UBA', sampling='daily', target_dim='variables', target_var='o3', diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index 47f63a3e7bcbebd131c2a0da47d2e0833b02efed..e6dde10bf6bd13013fa454eadd1a7976c00dd3e2 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -4,6 +4,7 @@ __date__ = '2020-09-21' import copy import inspect +import gc import logging import os import pickle @@ -15,7 +16,6 @@ import numpy as np import xarray as xr from mlair.data_handler.abstract_data_handler import AbstractDataHandler -from mlair.data_handler.station_preparation import DataHandlerSingleStation from mlair.helpers import remove_items, to_list from mlair.helpers.join import EmptyQueryResult @@ -25,11 +25,14 @@ num_or_list = Union[number, List[number]] class DefaultDataHandler(AbstractDataHandler): + from mlair.data_handler.data_handler_single_station import DataHandlerSingleStation as data_handler + from mlair.data_handler.data_handler_single_station import DataHandlerSingleStation as data_handler_transformation - _requirements = remove_items(inspect.getfullargspec(DataHandlerSingleStation).args, ["self", "station"]) + _requirements = remove_items(inspect.getfullargspec(data_handler).args, ["self", "station"]) - def __init__(self, id_class: DataHandlerSingleStation, data_path: str, min_length: int = 0, - extreme_values: num_or_list = None, extremes_on_right_tail_only: bool = False, name_affix=None): + def __init__(self, id_class: data_handler, data_path: str, min_length: int = 0, + extreme_values: num_or_list = None, extremes_on_right_tail_only: bool = False, name_affix=None, + store_processed_data=True): super().__init__() self.id_class = id_class self.interpolation_dim = "datetime" @@ -43,12 +46,12 @@ class DefaultDataHandler(AbstractDataHandler): self._collection = self._create_collection() self.harmonise_X() self.multiply_extremes(extreme_values, extremes_on_right_tail_only, dim=self.interpolation_dim) - self._store(fresh_store=True) + self._store(fresh_store=True, store_processed_data=store_processed_data) @classmethod def build(cls, station: str, **kwargs): sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls._requirements if k in kwargs} - sp = DataHandlerSingleStation(station, **sp_keys) + sp = cls.data_handler(station, **sp_keys) dp_args = 
 
@@ -61,6 +64,7 @@ class DefaultDataHandler(AbstractDataHandler):
 
     def _reset_data(self):
         self._X, self._Y, self._X_extreme, self._Y_extreme = None, None, None, None
+        gc.collect()
 
     def _cleanup(self):
         directory = os.path.dirname(self._save_file)
@@ -69,13 +73,14 @@ class DefaultDataHandler(AbstractDataHandler):
         if os.path.exists(self._save_file):
             shutil.rmtree(self._save_file, ignore_errors=True)
 
-    def _store(self, fresh_store=False):
-        self._cleanup() if fresh_store is True else None
-        data = {"X": self._X, "Y": self._Y, "X_extreme": self._X_extreme, "Y_extreme": self._Y_extreme}
-        with open(self._save_file, "wb") as f:
-            pickle.dump(data, f)
-        logging.debug(f"save pickle data to {self._save_file}")
-        self._reset_data()
+    def _store(self, fresh_store=False, store_processed_data=True):
+        if store_processed_data is True:
+            self._cleanup() if fresh_store is True else None
+            data = {"X": self._X, "Y": self._Y, "X_extreme": self._X_extreme, "Y_extreme": self._Y_extreme}
+            with open(self._save_file, "wb") as f:
+                pickle.dump(data, f)
+            logging.debug(f"save pickle data to {self._save_file}")
+            self._reset_data()
 
     def _load(self):
         try:
@@ -140,7 +145,7 @@ class DefaultDataHandler(AbstractDataHandler):
         return self.id_class.observation.copy().squeeze()
 
     def get_transformation_Y(self):
-        return self.id_class.get_transformation_information()
+        return self.id_class.get_transformation_targets()
 
     def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False,
                           timedelta: Tuple[int, str] = (1, 'm'), dim="datetime"):
@@ -212,27 +217,55 @@ class DefaultDataHandler(AbstractDataHandler):
 
     @classmethod
     def transformation(cls, set_stations, **kwargs):
+        """
+        ### supported transformation methods
+
+        Currently supported methods are:
+
+        * standardise (default, if method is not given)
+        * centre
+
+        ### mean and std estimation
+
+        Mean and std (depending on the method) are estimated. For each station, mean and std are calculated and
+        afterwards aggregated using the mean over all station-wise metrics. This approach is not exactly accurate,
+        especially regarding the std calculation, but is therefore much faster. Furthermore, it is a weighted mean
+        weighted by the time series length / number of data itself - a longer time series has more influence on the
+        transformation settings than a short time series. The estimation of the std is less accurate, because the
+        unweighted mean of all stds is not equal to the true std, but still the mean of all station-wise stds is a
+        decent estimate. Finally, the real accuracy of mean and std is less important, because it is "just" a
+        transformation / scaling.
+
+        ### mean and std given
+
+        If mean and std are not None, the default data handler expects these parameters to match the data and applies
+        these values to the data. Make sure that all dimensions and/or coordinates are in agreement.
+        """
+
         sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls._requirements if k in kwargs}
-        transformation_dict = sp_keys.pop("transformation")
-        if transformation_dict is None:
+        transformation_class = sp_keys.get("transformation", None)
+        if transformation_class is None:
             return
-        scope = transformation_dict.pop("scope")
-        method = transformation_dict.pop("method")
-        if transformation_dict.pop("mean", None) is not None:
+
+        transformation_inputs = transformation_class.inputs
+        if transformation_inputs.mean is not None:
             return
-        mean, std = None, None
+        means = [None, None]
+        stds = [None, None]
         for station in set_stations:
             try:
-                sp = DataHandlerSingleStation(station, transformation={"method": method}, **sp_keys)
-                mean = sp.mean.copy(deep=True) if mean is None else mean.combine_first(sp.mean)
-                std = sp.std.copy(deep=True) if std is None else std.combine_first(sp.std)
+                sp = cls.data_handler_transformation(station, **sp_keys)
+                for i, data in enumerate([sp.input_data, sp.target_data]):
+                    means[i] = data.mean.copy(deep=True) if means[i] is None else means[i].combine_first(data.mean)
+                    stds[i] = data.std.copy(deep=True) if stds[i] is None else stds[i].combine_first(data.std)
             except (AttributeError, EmptyQueryResult):
                 continue
-        if mean is None:
+        if means[0] is None:
             return None
-        mean_estimated = mean.mean("Stations")
-        std_estimated = std.mean("Stations")
-        return {"scope": scope, "method": method, "mean": mean_estimated, "std": std_estimated}
+        transformation_class.inputs.mean = means[0].mean("Stations")
+        transformation_class.inputs.std = stds[0].mean("Stations")
+        transformation_class.targets.mean = means[1].mean("Stations")
+        transformation_class.targets.std = stds[1].mean("Stations")
+        return transformation_class
 
     def get_coordinates(self):
         return self.id_class.get_coordinates()
\ No newline at end of file
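The station-wise statistics are merged via `combine_first` and then averaged over the `Stations` dimension. A minimal sketch of the aggregation, with made-up means for two stations:

```python
import xarray as xr

mean_a = xr.DataArray([1.0], dims="Stations", coords={"Stations": ["DEBW107"]})
mean_b = xr.DataArray([3.0], dims="Stations", coords={"Stations": ["DEBY081"]})
combined = mean_a.combine_first(mean_b)   # union over both stations
print(float(combined.mean("Stations")))   # 2.0 -> shared transformation mean
```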
diff --git a/mlair/helpers/join.py b/mlair/helpers/join.py
index a3c6876e3ea43ff4d03243430cf6cd791d62dec2..f66b277bbca54bd6190ab0430c9f8c0307b3f5af 100644
--- a/mlair/helpers/join.py
+++ b/mlair/helpers/join.py
@@ -55,7 +55,7 @@ def download_join(station_name: Union[str, List[str]], stat_var: dict, station_t
 
     for var in _lower_list(sorted(vars_dict.keys())):
         if var in stat_var.keys():
-            logging.debug('load: {}'.format(var))
+            logging.debug('load: {}'.format(var))  # ToDo: start here for #206
             # create data link
             opts = {'base': join_url_base, 'service': 'stats', 'id': vars_dict[var], 'statistics': stat_var[var],
@@ -138,6 +138,7 @@ def load_series_information(station_name: List[str], station_type: str_or_none,
     opts = {"base": join_url_base, "service": "series", "station_id": station_name[0], "station_type": station_type,
             "network_name": network_name}
     station_vars = get_data(opts, headers)
+    logging.debug(f"{station_name}: {station_vars}")  # ToDo: start here for #206
     vars_dict = {item[3].lower(): item[0] for item in station_vars}
     return vars_dict
diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py
index 056f92bec25b8d5216988f4dacb8fcd1e5257ab5..3db6618a5e8ebd575d61bc261144ff47ccaf9b53 100644
--- a/mlair/helpers/statistics.py
+++ b/mlair/helpers/statistics.py
@@ -9,10 +9,36 @@ import numpy as np
 import xarray as xr
 import pandas as pd
 from typing import Union, Tuple, Dict
+from matplotlib import pyplot as plt
+
+from mlair.helpers import to_list, remove_items
 
 Data = Union[xr.DataArray, pd.DataFrame]
 
 
+class DataClass:
+
+    def __init__(self, data=None, mean=None, std=None, max=None, min=None, transform_method=None):
+        self.data = data
+        self.mean = mean
+        self.std = std
+        self.max = max
+        self.min = min
+        self.transform_method = transform_method
+        self._method = None
+
+    def as_dict(self):
+        return remove_items(self.__dict__, "_method")
+
+
+class TransformationClass:
+
+    def __init__(self, inputs_mean=None, inputs_std=None, inputs_method=None, targets_mean=None, targets_std=None,
+                 targets_method=None):
+        self.inputs = DataClass(mean=inputs_mean, std=inputs_std, transform_method=inputs_method)
+        self.targets = DataClass(mean=targets_mean, std=targets_std, transform_method=targets_method)
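Usage of the two new container classes:

```python
from mlair.helpers.statistics import TransformationClass

trafo = TransformationClass(inputs_method="standardise", targets_method="standardise")
print(trafo.inputs.transform_method)        # 'standardise'
print(trafo.inputs.mean, trafo.inputs.std)  # None None -> estimated later station-wise
print(sorted(trafo.inputs.as_dict()))       # data/max/mean/min/std/transform_method, no _method
```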
 
 def apply_inverse_transformation(data: Data, mean: Data, std: Data = None, method: str = "standardise") -> Data:
     """
     Apply inverse transformation for given statistics.
@@ -345,3 +371,168 @@ class SkillScores:
             monthly_mean[monthly_mean.index.dt.month == month, :] = mu[mu.month == month].values
 
         return monthly_mean
+
+
+class KolmogorovZurbenkoBaseClass:
+
+    def __init__(self, df, wl, itr, is_child=False, filter_dim="window"):
+        """
+        Create the variables associated with the Kolmogorov-Zurbenko filter.
+
+        Args:
+            df(pd.DataFrame, None): time series of a variable
+            wl(list of int): window length
+            itr(list of int): number of iterations
+        """
+        self.df = df
+        self.filter_dim = filter_dim
+        self.wl = to_list(wl)
+        self.itr = to_list(itr)
+        if abs(len(self.wl) - len(self.itr)) > 0:
+            raise ValueError("Length of lists for wl and itr must agree!")
+        self._isChild = is_child
+        self.child = self.set_child()
+        self.type = type(self).__name__
+
+    def set_child(self):
+        if len(self.wl) > 1:
+            return KolmogorovZurbenkoBaseClass(None, self.wl[1:], self.itr[1:], True, self.filter_dim)
+        else:
+            return None
+
+    def kz_filter(self, df, m, k):
+        pass
+
+    def spectral_calc(self):
+        df_start = self.df
+        kz = self.kz_filter(df_start, self.wl[0], self.itr[0])
+        filtered = self.subtract(df_start, kz)
+        # case I: no child avail -> return kz and remaining
+        if self.child is None:
+            return [kz, filtered]
+        # case II: has child -> return current kz and all child results
+        else:
+            self.child.df = filtered
+            kz_next = self.child.spectral_calc()
+            return [kz] + kz_next
+
+    @staticmethod
+    def subtract(minuend, subtrahend):
+        try:  # pandas implementation
+            return minuend.sub(subtrahend, axis=0)
+        except AttributeError:  # general implementation
+            return minuend - subtrahend
+
+    def run(self):
+        return self.spectral_calc()
+
+    def transfer_function(self):
+        m = self.wl[0]
+        k = self.itr[0]
+        omega = np.linspace(0.00001, 0.15, 5000)
+        return omega, (np.sin(m * np.pi * omega) / (m * np.sin(np.pi * omega))) ** (2 * k)
+
+    def omega_null(self, alpha=0.5):
+        a = np.sqrt(6) / np.pi
+        b = 1 / (2 * np.array(self.itr))
+        c = 1 - alpha ** b
+        d = np.array(self.wl) ** 2 - alpha ** b
+        return a * np.sqrt(c / d)
+
+    def period_null(self, alpha=0.5):
+        return 1. / self.omega_null(alpha)
+
+    def period_null_days(self, alpha=0.5):
+        return self.period_null(alpha) / 24.
+
+    def plot_transfer_function(self, fig=None, name=None):
+        if fig is None:
+            fig = plt.figure()
+        omega, transfer_function = self.transfer_function()
+        if self.child is not None:
+            transfer_function_child = self.child.plot_transfer_function(fig)
+        else:
+            transfer_function_child = transfer_function * 0
+        plt.semilogx(omega, transfer_function - transfer_function_child,
+                     label="m={:3.0f}, k={:3.0f}, T={:6.2f}d".format(self.wl[0],
+                                                                     self.itr[0],
+                                                                     self.period_null_days()))
+        plt.axvline(x=self.omega_null())
+        if not self._isChild:
+            locs, labels = plt.xticks()
+            plt.xticks(locs, np.round(1. / (locs * 24), 1))
+            plt.xlim([0.00001, 0.15])
+            plt.legend()
+            if name is None:
+                plt.show()
+            else:
+                plt.savefig(name)
+        else:
+            return transfer_function
+
+
+class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass):
+
+    def __init__(self, df, wl: Union[list, int], itr: Union[list, int], is_child=False, filter_dim="window",
+                 method="mean", percentile=0.5):
+        """
+        Create the variables associated with the KolmogorovZurbenkoFilterMovingWindow class.
+
+        Args:
+            df(pd.DataFrame, xr.DataArray): time series of a variable
+            wl: window length
+            itr: number of iterations
+        """
+        self.valid_methods = ["mean", "percentile", "median", "max", "min"]
+        if method not in self.valid_methods:
+            raise ValueError("Method '{}' is not supported. Please select from [{}].".format(
+                method, ", ".join(self.valid_methods)))
+        else:
+            self.method = method
+        if percentile > 1 or percentile < 0:
+            raise ValueError("Percentile must be in range [0, 1]. Given was {}!".format(percentile))
+        else:
+            self.percentile = percentile
+        super().__init__(df, wl, itr, is_child, filter_dim)
+
+    def set_child(self):
+        if len(self.wl) > 1:
+            return KolmogorovZurbenkoFilterMovingWindow(self.df, self.wl[1:], self.itr[1:], is_child=True,
+                                                        filter_dim=self.filter_dim, method=self.method,
+                                                        percentile=self.percentile)
+        else:
+            return None
+
+    def kz_filter(self, df, wl, itr):
+        """
+        Apply the moving-window filter; it passes the low-frequency part of the time series.
+
+        Args:
+            wl(int): window length
+            itr(int): number of iterations
+        """
+        df_itr = df.__deepcopy__()
+        try:
+            kwargs = {"min_periods": 1,
+                      "center": True,
+                      self.filter_dim: wl}
+            iter_vars = df_itr.coords["variables"].values
+            for var in iter_vars:
+                df_itr_var = df_itr.sel(variables=[var]).chunk()
+                for _ in np.arange(0, itr):
+                    rolling = df_itr_var.rolling(**kwargs)
+                    if self.method == "median":
+                        df_mv_avg_tmp = rolling.median()
+                    elif self.method == "percentile":
+                        df_mv_avg_tmp = rolling.quantile(self.percentile)
+                    elif self.method == "max":
+                        df_mv_avg_tmp = rolling.max()
+                    elif self.method == "min":
+                        df_mv_avg_tmp = rolling.min()
+                    else:
+                        df_mv_avg_tmp = rolling.mean()
+                    df_itr_var = df_mv_avg_tmp.compute()
+                df_itr = df_itr.drop_sel(variables=var).combine_first(df_itr_var)
+            return df_itr
+        except ValueError:
+            raise
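A usage sketch for the new filter classes (toy hourly data; assumes dask is installed, since `kz_filter()` chunks the data per variable):

```python
import numpy as np
import xarray as xr
from mlair.helpers.statistics import KolmogorovZurbenkoFilterMovingWindow as KZFilter

data = xr.DataArray(np.random.randn(24 * 365, 1), dims=("datetime", "variables"),
                    coords={"variables": ["o3"]})
kz = KZFilter(data, wl=[361], itr=[3], filter_dim="datetime")
low_pass, residuum = kz.run()  # decomposition: filtered component + remainder
print(kz.period_null_days())   # cut-off period in days, roughly [58.4] for wl=361, itr=3
```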
diff --git a/mlair/model_modules/model_class.py b/mlair/model_modules/model_class.py
index c9cc13bd8108e43b5a9f03682942eacdf5a55f04..a603b466e4dab0dc30b6b6b22d10b6c27ee59767 100644
--- a/mlair/model_modules/model_class.py
+++ b/mlair/model_modules/model_class.py
@@ -396,8 +396,66 @@ class MyLittleModel(AbstractModelClass):
     def set_compile_options(self):
         self.initial_lr = 1e-2
         self.optimizer = keras.optimizers.adam(lr=self.initial_lr)
-        self.lr_decay = mlair.model_modules.keras_extensions.LearningRateDecay(base_lr=self.initial_lr, drop=.94,
-                                                                               epochs_drop=10)
+        # self.lr_decay = mlair.model_modules.keras_extensions.LearningRateDecay(base_lr=self.initial_lr, drop=.94,
+        #                                                                        epochs_drop=10)
         self.compile_options = {"loss": [keras.losses.mean_squared_error], "metrics": ["mse", "mae"]}
+
+
+class MyLittleModelHourly(AbstractModelClass):
+    """
+    A customised model with three 1x1 Conv layers (128, 64, 32) and 3 Dense layers (128, 64, window_lead_time), where
+    the last layer is the output layer depending on the window_lead_time parameter. Dropout is used between the
+    Convolutions and the first Dense layer.
+    """
+
+    def __init__(self, input_shape: list, output_shape: list):
+        """
+        Set up model and loss depending on the given arguments.
+
+        :param input_shape: list of input shapes (expect len=1 with shape=(window_hist, station, variables))
+        :param output_shape: list of output shapes (expect len=1 with shape=(window_forecast,))
+        """
+
+        assert len(input_shape) == 1
+        assert len(output_shape) == 1
+        super().__init__(input_shape[0], output_shape[0])
+
+        # settings
+        self.dropout_rate = 0.1
+        self.regularizer = keras.regularizers.l2(0.001)
+        self.activation = keras.layers.PReLU
+
+        # apply to model
+        self.set_model()
+        self.set_compile_options()
+        self.set_custom_objects(loss=self.compile_options['loss'])
+
+    def set_model(self):
+        """
+        Build the model.
+        """
+
+        # add 1 to window_size to include current time step t0
+        x_input = keras.layers.Input(shape=self._input_shape)
+        x_in = keras.layers.Conv2D(128, (1, 1), padding='same', name='{}_Conv_1x1_128'.format("major"))(x_input)
+        x_in = self.activation()(x_in)
+        x_in = keras.layers.Conv2D(64, (1, 1), padding='same', name='{}_Conv_1x1_64'.format("major"))(x_in)
+        x_in = self.activation()(x_in)
+        x_in = keras.layers.Conv2D(32, (1, 1), padding='same', name='{}_Conv_1x1_32'.format("major"))(x_in)
+        x_in = self.activation()(x_in)
+        x_in = keras.layers.Flatten(name='{}'.format("major"))(x_in)
+        x_in = keras.layers.Dropout(self.dropout_rate, name='{}_Dropout_1'.format("major"))(x_in)
+        x_in = keras.layers.Dense(128, name='{}_Dense_128'.format("major"))(x_in)
+        x_in = self.activation()(x_in)
+        x_in = keras.layers.Dense(64, name='{}_Dense_64'.format("major"))(x_in)
+        x_in = self.activation()(x_in)
+        x_in = keras.layers.Dense(self._output_shape, name='{}_Dense'.format("major"))(x_in)
+        out_main = self.activation()(x_in)
+        self.model = keras.Model(inputs=x_input, outputs=[out_main])
+
+    def set_compile_options(self):
+        self.initial_lr = 1e-2
+        self.optimizer = keras.optimizers.SGD(lr=self.initial_lr, momentum=0.9)
         self.compile_options = {"loss": [keras.losses.mean_squared_error], "metrics": ["mse", "mae"]}
diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py
index 675e5ade587011a9ac835e9afb45f89173bc7653..c8682374e0d4c0d724d83a5e36977543ac3a50f8 100644
--- a/mlair/plotting/postprocessing_plotting.py
+++ b/mlair/plotting/postprocessing_plotting.py
@@ -137,15 +137,16 @@ class PlotMonthlySummary(AbstractPlotClass):
             data_cnn = data.sel(type="CNN").squeeze()
             if len(data_cnn.shape) > 1:
-                data_cnn.coords["ahead"].values = [f"{days}d" for days in data_cnn.coords["ahead"].values]
+                data_cnn = data_cnn.assign_coords(ahead=[f"{days}d" for days in data_cnn.coords["ahead"].values])
 
             data_obs = data.sel(type="obs", ahead=1).squeeze()
             data_obs.coords["ahead"] = "obs"
 
             data_concat = xr.concat([data_obs, data_cnn], dim="ahead")
-            data_concat = data_concat.drop("type")
+            data_concat = data_concat.drop_vars("type")
 
-            data_concat.index.values = data_concat.index.values.astype("datetime64[M]").astype(int) % 12 + 1
+            new_index = data_concat.index.values.astype("datetime64[M]").astype(int) % 12 + 1
+            data_concat = data_concat.assign_coords(index=new_index)
             data_concat = data_concat.clip(min=0)
 
             forecasts = xr.concat([forecasts, data_concat], 'index') if forecasts is not None else data_concat
@@ -902,6 +903,7 @@ class PlotAvailability(AbstractPlotClass):
         # create standard Gantt plot for all stations (currently in single pdf file with single page)
         super().__init__(plot_folder, "data_availability")
         self.dim = time_dimension
+        self.linewidth = None
         self.sampling = self._get_sampling(sampling)
         plot_dict = self._prepare_data(generators)
         lgd = self._plot(plot_dict)
@@ -917,11 +919,11 @@ class PlotAvailability(AbstractPlotClass):
         lgd = self._plot(plot_dict_summary)
         self._save(bbox_extra_artists=(lgd,), bbox_inches="tight")
 
-    @staticmethod
-    def _get_sampling(sampling):
+    def _get_sampling(self, sampling):
         if sampling == "daily":
             return "D"
         elif sampling == "hourly":
+            self.linewidth = 0.001
             return "h"
 
     def _prepare_data(self, generators: Dict[str, DataCollection]):
@@ -982,7 +984,7 @@ class PlotAvailability(AbstractPlotClass):
             plt_data = d.get(subset)
             if plt_data is None:
                 continue
-            ax.broken_barh(plt_data, (pos, height), color=color, edgecolor="white")
+            ax.broken_barh(plt_data, (pos, height), color=color, edgecolor="white", linewidth=self.linewidth)
             yticklabels.append(station)
 
         ax.set_ylim([height, number_of_stations + 1])
diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py
index de43f30d929db1de12681d92c9c585df5c07944e..571d3a07d15873af1c1ccedc59e0cc462e07820f 100644
--- a/mlair/run_modules/post_processing.py
+++ b/mlair/run_modules/post_processing.py
@@ -399,10 +399,10 @@ class PostProcessing(RunEnvironment):
         :return: filled data array with ols predictions
         """
         tmp_ols = self.ols_model.predict(input_data)
-        if not normalised:
-            tmp_ols = statistics.apply_inverse_transformation(tmp_ols, mean, std, transformation_method)
         target_shape = ols_prediction.values.shape
         ols_prediction.values = np.swapaxes(tmp_ols, 2, 0) if target_shape != tmp_ols.shape else tmp_ols
+        if not normalised:
+            ols_prediction = statistics.apply_inverse_transformation(ols_prediction, mean, std, transformation_method)
         return ols_prediction
 
     def _create_persistence_forecast(self, data, persistence_prediction: xr.DataArray, mean: xr.DataArray,
@@ -423,9 +423,10 @@ class PostProcessing(RunEnvironment):
         :return: filled data array with persistence predictions
         """
         tmp_persi = data.copy()
-        if not normalised:
-            tmp_persi = statistics.apply_inverse_transformation(tmp_persi, mean, std, transformation_method)
         persistence_prediction.values = np.tile(tmp_persi, (self.window_lead_time, 1)).T
+        if not normalised:
+            persistence_prediction = statistics.apply_inverse_transformation(persistence_prediction, mean, std,
+                                                                             transformation_method)
         return persistence_prediction
 
     def _create_nn_forecast(self, input_data: xr.DataArray, nn_prediction: xr.DataArray, mean: xr.DataArray,
@@ -447,8 +448,6 @@ class PostProcessing(RunEnvironment):
         :return: filled data array with nn predictions
         """
         tmp_nn = self.model.predict(input_data)
-        if not normalised:
-            tmp_nn = statistics.apply_inverse_transformation(tmp_nn, mean, std, transformation_method)
         if isinstance(tmp_nn, list):
             nn_prediction.values = tmp_nn[-1]
         elif tmp_nn.ndim == 3:
@@ -457,6 +456,8 @@ class PostProcessing(RunEnvironment):
             nn_prediction.values = tmp_nn
         else:
             raise NotImplementedError(f"Number of dimension of model output must be 2 or 3, but not {tmp_nn.dims}.")
+        if not normalised:
+            nn_prediction = statistics.apply_inverse_transformation(nn_prediction, mean, std, transformation_method)
         return nn_prediction
 
     @staticmethod
@@ -528,7 +529,7 @@ class PostProcessing(RunEnvironment):
             # external_data = external_data.squeeze("Stations").sel(window=1).drop(["window", "Stations", "variables"])
             external_data = self._create_observation(observation, None, mean, std, transformation_method, normalised=False)
             return external_data.rename({external_data.dims[0]: 'index'})
-        except IndexError:
+        except (IndexError, KeyError):
             return None
 
     def calculate_skill_scores(self) -> Tuple[Dict, Dict]:
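The three forecast builders now fill the labelled DataArray first and inverse-transform afterwards, presumably so mean/std (which are xr.DataArrays over variables since the TransformationClass change) broadcast by dimension name rather than against a bare numpy array. Minimal sketch with made-up statistics:

```python
import numpy as np
import xarray as xr
from mlair.helpers import statistics

prediction = xr.DataArray(np.zeros((5, 3)), dims=("index", "ahead"))
prediction.values = np.random.randn(5, 3)  # step 1: fill normalised values in place
prediction = statistics.apply_inverse_transformation(prediction, mean=3.0, std=5.0)  # step 2
```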
diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py
index ed972896e7a39b0b56df23dbc8a8d1ae64fb4183..82af9cf02cda9401237bac15ccf0a52fa10acdad 100644
--- a/mlair/run_modules/pre_processing.py
+++ b/mlair/run_modules/pre_processing.py
@@ -56,7 +56,8 @@ class PreProcessing(RunEnvironment):
     def _run(self):
         stations = self.data_store.get("stations")
         data_handler = self.data_store.get("data_handler")
-        _, valid_stations = self.validate_station(data_handler, stations, "preprocessing", overwrite_local_data=True)
+        _, valid_stations = self.validate_station(data_handler, stations,
+                                                  "preprocessing")  # , store_processed_data=False)
         if len(valid_stations) == 0:
             raise ValueError("Couldn't find any valid data according to given parameters. Abort experiment run.")
         self.data_store.set("stations", valid_stations)
@@ -192,20 +193,14 @@ class PreProcessing(RunEnvironment):
         self.data_store.set("stations", valid_stations, scope=set_name)
         self.data_store.set("data_collection", collection, scope=set_name)
 
-    def validate_station(self, data_handler: AbstractDataHandler, set_stations, set_name=None, overwrite_local_data=False):
+    def validate_station(self, data_handler: AbstractDataHandler, set_stations, set_name=None,
+                         store_processed_data=True):
         """
         Check if all given stations in `all_stations` are valid.
 
         Valid means, that there is data available for the given time range (is included in `kwargs`). The shape and
         the loading time are logged in debug mode.
 
-        :param args: Dictionary with required parameters for DataGenerator class (`data_path`, `network`, `stations`,
-            `variables`, `time_dim`, `target_dim`, `target_var`).
-        :param kwargs: positional parameters for the DataGenerator class (e.g. `start`, `interpolation_method`,
-            `window_lead_time`).
-        :param all_stations: All stations to check.
-        :param name: name to display in the logging info message
-
         :return: Corrected list containing only valid station IDs.
         """
         t_outer = TimeTracking()
@@ -219,7 +214,8 @@ class PreProcessing(RunEnvironment):
         kwargs = self.data_store.create_args_dict(data_handler.requirements(), scope=set_name)
         for station in set_stations:
             try:
-                dp = data_handler.build(station, name_affix=set_name, **kwargs)
+                dp = data_handler.build(station, name_affix=set_name, store_processed_data=store_processed_data,
+                                        **kwargs)
                 collection.add(dp)
                 valid_stations.append(station)
             except (AttributeError, EmptyQueryResult):
@@ -234,6 +230,3 @@ class PreProcessing(RunEnvironment):
         transformation_dict = data_handler.transformation(stations, **kwargs)
         if transformation_dict is not None:
             self.data_store.set("transformation", transformation_dict)
-
-
-
diff --git a/mlair/workflows/abstract_workflow.py b/mlair/workflows/abstract_workflow.py
index bced90bbe848cc9ebe36c583d05b62549f0ae80b..3a627d9f72a5c1c97c35b464af1b0944bc397ea5 100644
--- a/mlair/workflows/abstract_workflow.py
+++ b/mlair/workflows/abstract_workflow.py
@@ -16,15 +16,17 @@ class Workflow:
     execution but not the dependencies (workflow would probably fail in this case)."""
 
     def __init__(self, name=None):
-        self._registry = OrderedDict()
+        self._registry_kwargs = {}
+        self._registry = []
         self._name = name if name is not None else self.__class__.__name__
 
     def add(self, stage, **kwargs):
         """Add a new stage with optional kwargs."""
-        self._registry[stage] = kwargs
+        self._registry.append(stage)
+        self._registry_kwargs[len(self._registry) - 1] = kwargs
 
     def run(self):
         """Run workflow embedded in a run environment and according to the stage's ordering."""
         with RunEnvironment(name=self._name):
-            for stage, kwargs in self._registry.items():
-                stage(**kwargs)
+            for pos, stage in enumerate(self._registry):
+                stage(**self._registry_kwargs[pos])
diff --git a/mlair/workflows/default_workflow.py b/mlair/workflows/default_workflow.py
index 85d6726b70b699968933bf9af7580895490b8a6d..4d113190fdc90ec852d7db2b33459b9162867a24 100644
--- a/mlair/workflows/default_workflow.py
+++ b/mlair/workflows/default_workflow.py
@@ -14,28 +14,29 @@ class DefaultWorkflow(Workflow):
     the mentioned ordering."""
 
     def __init__(self, stations=None,
-                 train_model=None, create_new_model=None,
-                 window_history_size=None,
-                 experiment_date="testrun",
-                 variables=None, statistics_per_var=None,
-                 start=None, end=None,
-                 target_var=None, target_dim=None,
-                 window_lead_time=None,
-                 dimensions=None,
-                 interpolation_method=None, time_dim=None, limit_nan_fill=None,
-                 train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None,
-                 use_all_stations_on_all_data_sets=None, fraction_of_train=None,
-                 experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None, overwrite_local_data=None,
-                 sampling=None,
-                 permute_data_on_training=None, extreme_values=None, extremes_on_right_tail_only=None,
-                 transformation=None,
-                 train_min_length=None, val_min_length=None, test_min_length=None,
-                 evaluate_bootstraps=None, number_of_bootstraps=None, create_new_bootstraps=None,
-                 plot_list=None,
-                 model=None,
-                 batch_size=None,
-                 epochs=None,
-                 data_preparation=None,
+                 train_model=None, create_new_model=None,
+                 window_history_size=None,
+                 experiment_date="testrun",
+                 variables=None, statistics_per_var=None,
+                 start=None, end=None,
+                 target_var=None, target_dim=None,
+                 window_lead_time=None,
+                 dimensions=None,
+                 interpolation_method=None, time_dim=None, limit_nan_fill=None,
+                 train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None,
+                 use_all_stations_on_all_data_sets=None, fraction_of_train=None,
+                 experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None,
+                 overwrite_local_data=None,
+                 sampling=None,
+                 permute_data_on_training=None, extreme_values=None, extremes_on_right_tail_only=None,
+                 transformation=None,
+                 train_min_length=None, val_min_length=None, test_min_length=None,
+                 evaluate_bootstraps=None, number_of_bootstraps=None, create_new_bootstraps=None,
+                 plot_list=None,
+                 model=None,
+                 batch_size=None,
+                 epochs=None,
+                 data_handler=None,
                  **kwargs):
         super().__init__()
@@ -58,28 +59,29 @@ class DefaultWorkflowHPC(Workflow):
     Training and PostProcessing in exact the mentioned ordering."""
 
     def __init__(self, stations=None,
-                 train_model=None, create_new_model=None,
-                 window_history_size=None,
-                 experiment_date="testrun",
-                 variables=None, statistics_per_var=None,
-                 start=None, end=None,
-                 target_var=None, target_dim=None,
-                 window_lead_time=None,
-                 dimensions=None,
-                 interpolation_method=None, time_dim=None, limit_nan_fill=None,
-                 train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None,
-                 use_all_stations_on_all_data_sets=None, fraction_of_train=None,
-                 experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None, overwrite_local_data=None,
-                 sampling=None,
-                 permute_data_on_training=None, extreme_values=None, extremes_on_right_tail_only=None,
-                 transformation=None,
-                 train_min_length=None, val_min_length=None, test_min_length=None,
-                 evaluate_bootstraps=None, number_of_bootstraps=None, create_new_bootstraps=None,
-                 plot_list=None,
-                 model=None,
-                 batch_size=None,
-                 epochs=None,
-                 data_preparation=None, **kwargs):
+                 train_model=None, create_new_model=None,
+                 window_history_size=None,
+                 experiment_date="testrun",
+                 variables=None, statistics_per_var=None,
+                 start=None, end=None,
+                 target_var=None, target_dim=None,
+                 window_lead_time=None,
+                 dimensions=None,
+                 interpolation_method=None, time_dim=None, limit_nan_fill=None,
+                 train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, test_end=None,
+                 use_all_stations_on_all_data_sets=None, fraction_of_train=None,
+                 experiment_path=None, plot_path=None, forecast_path=None, bootstrap_path=None,
+                 overwrite_local_data=None,
+                 sampling=None,
+                 permute_data_on_training=None, extreme_values=None, extremes_on_right_tail_only=None,
+                 transformation=None,
+                 train_min_length=None, val_min_length=None, test_min_length=None,
+                 evaluate_bootstraps=None, number_of_bootstraps=None, create_new_bootstraps=None,
+                 plot_list=None,
+                 model=None,
+                 batch_size=None,
+                 epochs=None,
+                 data_handler=None, **kwargs):
         super().__init__()
 
         # extract all given kwargs arguments
diff --git a/requirements.txt b/requirements.txt
index be76eab5b74797b039682a292ae8890488c058ec..371bb776e581925e507bf06c60bd866061c52791 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -61,7 +61,7 @@
 typing-extensions
 urllib3==1.25.8
 wcwidth==0.1.8
 Werkzeug==1.0.0
-xarray==0.15.0
+xarray==0.16.1
 zipp==3.1.0
 setuptools~=49.6.0
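With the rename from `data_preparation` to `data_handler` in both workflow signatures above, callers must pass the data handler class under the new keyword; the old keyword would now be swallowed silently by `**kwargs`. A hedged usage example, assuming `DefaultWorkflow` and `DefaultDataHandler` are importable from the package roots as in this repository's layout:

```python
from mlair.workflows import DefaultWorkflow
from mlair.data_handler import DefaultDataHandler

# the recognised keyword is now `data_handler`, not `data_preparation`
workflow = DefaultWorkflow(stations=["DEBW107"], data_handler=DefaultDataHandler)
workflow.run()
```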
"DEFAULT_PLOT_LIST"])) + assert all(map(lambda x: x.startswith("DEFAULT"), defaults.keys())) + + +class TestAllDefaults: + + def test_training_parameters(self): + assert DEFAULT_CREATE_NEW_MODEL is True + assert DEFAULT_TRAIN_MODEL is True + assert DEFAULT_FRACTION_OF_TRAINING == 0.8 + assert DEFAULT_EXTREME_VALUES is None + assert DEFAULT_EXTREMES_ON_RIGHT_TAIL_ONLY is False + assert DEFAULT_PERMUTE_DATA is False + assert DEFAULT_BATCH_SIZE == int(256 * 2) + assert DEFAULT_EPOCHS == 20 + + def test_data_handler_parameters(self): + assert DEFAULT_STATIONS == ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] + assert DEFAULT_VAR_ALL_DICT == {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', + 'u': 'average_values', + 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', + 'cloudcover': 'average_values', + 'pblheight': 'maximum'} + assert DEFAULT_NETWORK == "AIRBASE" + assert DEFAULT_STATION_TYPE == "background" + assert DEFAULT_VARIABLES == DEFAULT_VAR_ALL_DICT.keys() + assert DEFAULT_START == "1997-01-01" + assert DEFAULT_END == "2017-12-31" + assert DEFAULT_WINDOW_HISTORY_SIZE == 13 + assert DEFAULT_OVERWRITE_LOCAL_DATA is False + assert isinstance(DEFAULT_TRANSFORMATION, TransformationClass) + assert DEFAULT_TRANSFORMATION.inputs.transform_method == "standardise" + assert DEFAULT_TRANSFORMATION.targets.transform_method == "standardise" + assert DEFAULT_TARGET_VAR == "o3" + assert DEFAULT_TARGET_DIM == "variables" + assert DEFAULT_WINDOW_LEAD_TIME == 3 + assert DEFAULT_DIMENSIONS == {"new_index": ["datetime", "Stations"]} + assert DEFAULT_TIME_DIM == "datetime" + assert DEFAULT_INTERPOLATION_METHOD == "linear" + assert DEFAULT_INTERPOLATION_LIMIT == 1 + + def test_subset_parameters(self): + assert DEFAULT_TRAIN_START == "1997-01-01" + assert DEFAULT_TRAIN_END == "2007-12-31" + assert DEFAULT_TRAIN_MIN_LENGTH == 90 + assert DEFAULT_VAL_START == "2008-01-01" + assert DEFAULT_VAL_END == "2009-12-31" + assert DEFAULT_VAL_MIN_LENGTH == 90 + assert DEFAULT_TEST_START == "2010-01-01" + assert DEFAULT_TEST_END == "2017-12-31" + assert DEFAULT_TEST_MIN_LENGTH == 90 + assert DEFAULT_TRAIN_VAL_MIN_LENGTH == 180 + assert DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS is True + + def test_hpc_parameters(self): + assert DEFAULT_HPC_HOST_LIST == ["jw", "hdfmlc"] + assert DEFAULT_HPC_LOGIN_LIST == ["ju", "hdfmll"] + + def test_postprocessing_parameters(self): + assert DEFAULT_EVALUATE_BOOTSTRAPS is True + assert DEFAULT_CREATE_NEW_BOOTSTRAPS is False + assert DEFAULT_NUMBER_OF_BOOTSTRAPS == 20 + assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", + "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", + "PlotConditionalQuantiles", "PlotAvailability"] diff --git a/test/test_statistics.py b/test/test_statistics.py index d4a72674ae89ecd106ff1861aa6ee26567da3243..76adc1bdd210e072b4fc9be717269c6ceb951fec 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -3,7 +3,9 @@ import pandas as pd import pytest import xarray as xr -from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, centre_apply, \ +from mlair.helpers.statistics import DataClass, TransformationClass +from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, \ + centre_apply, \ apply_inverse_transformation lazy = pytest.lazy_fixture @@ -113,3 +115,50 @@ class TestCentre: data = centre_apply(data_orig, mean) mean_expected = np.array([2, 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index d4a72674ae89ecd106ff1861aa6ee26567da3243..76adc1bdd210e072b4fc9be717269c6ceb951fec 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -3,7 +3,9 @@ import pandas as pd
 import pytest
 import xarray as xr
 
-from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, centre_apply, \
+from mlair.helpers.statistics import DataClass, TransformationClass
+from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, \
+    centre_apply, \
     apply_inverse_transformation
 
 lazy = pytest.lazy_fixture
@@ -113,3 +115,50 @@ class TestCentre:
         data = centre_apply(data_orig, mean)
         mean_expected = np.array([2, -5, 10]) - np.array([2, 10, 3])
         assert np.testing.assert_almost_equal(data.mean(dim), mean_expected, decimal=1) is None
+
+
+class TestDataClass:
+
+    def test_init(self):
+        dc = DataClass()
+        assert all([obj is None for obj in [dc.data, dc.mean, dc.std, dc.max, dc.min, dc.transform_method, dc._method]])
+
+    def test_init_values(self):
+        dc = DataClass(data=12, mean=2, std="test", max=23.4, min=np.array([3]), transform_method="f")
+        assert dc.data == 12
+        assert dc.mean == 2
+        assert dc.std == "test"
+        assert dc.max == 23.4
+        assert np.testing.assert_array_equal(dc.min, np.array([3])) is None
+        assert dc.transform_method == "f"
+        assert dc._method is None
+
+    def test_as_dict(self):
+        dc = DataClass(std=23)
+        dc._method = "f(x)"
+        assert dc.as_dict() == {"data": None, "mean": None, "std": 23, "max": None, "min": None,
+                                "transform_method": None}
+
+
+class TestTransformationClass:
+
+    def test_init(self):
+        tc = TransformationClass()
+        assert hasattr(tc, "inputs")
+        assert isinstance(tc.inputs, DataClass)
+        assert hasattr(tc, "targets")
+        assert isinstance(tc.targets, DataClass)
+        assert tc.inputs.mean is None
+        assert tc.targets.std is None
+
+    def test_init_values(self):
+        tc = TransformationClass(inputs_mean=1, inputs_std=2, inputs_method="f", targets_mean=3, targets_std=4,
+                                 targets_method="g")
+        assert tc.inputs.mean == 1
+        assert tc.inputs.std == 2
+        assert tc.inputs.transform_method == "f"
+        assert tc.inputs.max is None
+        assert tc.targets.mean == 3
+        assert tc.targets.std == 4
+        assert tc.targets.transform_method == "g"
+        assert tc.inputs.min is None
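Taken together, these tests document the expected shape of `DataClass` and `TransformationClass`. A hedged reimplementation sketch derived only from the assertions above (the real classes live in `mlair.helpers.statistics` and may differ in detail):

```python
class DataClass:
    """Container for one data set plus its transformation properties."""

    def __init__(self, data=None, mean=None, std=None, max=None, min=None, transform_method=None):
        self.data, self.mean, self.std = data, mean, std
        self.max, self.min = max, min  # parameter names shadow builtins to match the tested API
        self.transform_method = transform_method
        self._method = None

    def as_dict(self):
        # private attributes such as _method are excluded, as test_as_dict expects
        return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}


class TransformationClass:
    """Pairs one DataClass for the inputs with one for the targets."""

    def __init__(self, inputs_mean=None, inputs_std=None, inputs_method=None,
                 targets_mean=None, targets_std=None, targets_method=None):
        self.inputs = DataClass(mean=inputs_mean, std=inputs_std, transform_method=inputs_method)
        self.targets = DataClass(mean=targets_mean, std=targets_std, transform_method=targets_method)


# mirrors how DEFAULT_TRANSFORMATION is built in defaults.py
tc = TransformationClass(inputs_method="standardise", targets_method="standardise")
assert tc.inputs.transform_method == tc.targets.transform_method == "standardise"
```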