diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 85cf433427655484f1a29be562245144ae6c9e07..51d4beafbbc0b346331db80567946c3acc702b8e 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -1,6 +1,7 @@ __author__ = "Lukas Leufen" __date__ = '2020-06-25' +from mlair.helpers.statistics import TransformationClass DEFAULT_STATIONS = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] DEFAULT_VAR_ALL_DICT = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', @@ -13,8 +14,7 @@ DEFAULT_START = "1997-01-01" DEFAULT_END = "2017-12-31" DEFAULT_WINDOW_HISTORY_SIZE = 13 DEFAULT_OVERWRITE_LOCAL_DATA = False -# DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise", "mean": "estimate"} -DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise"} +DEFAULT_TRANSFORMATION = TransformationClass(inputs_method="standardise", targets_method="standardise") DEFAULT_HPC_LOGIN_LIST = ["ju", "hdfmll"] # ju[wels} #hdfmll(ogin) DEFAULT_HPC_HOST_LIST = ["jw", "hdfmlc"] # first part of node names for Juwels (jw[comp], hdfmlc(ompute). 
DEFAULT_CREATE_NEW_MODEL = True diff --git a/mlair/data_handler/data_handler_kz_filter.py b/mlair/data_handler/data_handler_kz_filter.py index f2ff23be83c04ff8acbac116329288136ad979ed..a5a9de6701a03bedad95f54fc22fbd97ee041c86 100644 --- a/mlair/data_handler/data_handler_kz_filter.py +++ b/mlair/data_handler/data_handler_kz_filter.py @@ -32,7 +32,6 @@ class DataHandlerKzFilterSingleStation(DataHandlerSingleStation): self.kz_filter_iter = kz_filter_iter self.cutoff_period = None self.cutoff_period_days = None - self.data_target: xr.DataArray = None super().__init__(*args, **kwargs) def setup_samples(self): @@ -41,26 +40,25 @@ class DataHandlerKzFilterSingleStation(DataHandlerSingleStation): """ self.load_data() self.interpolate(dim=self.time_dim, method=self.interpolation_method, limit=self.interpolation_limit) + self.set_inputs_and_targets() import matplotlib matplotlib.use("TkAgg") import matplotlib.pyplot as plt - # self.original_data = self.data # ToDo: implement here something to store unfiltered data self.apply_kz_filter() # self.data.sel(filter="74d", variables="temp", Stations="DEBW107").plot() # self.data.sel(variables="temp", Stations="DEBW107").plot.line(hue="filter") - if self.transformation is not None: + if self.do_transformation is True: self.call_transform() - self.make_samples() # ToDo: target samples are still coming from filtered data + self.make_samples() @TimeTrackingWrapper def apply_kz_filter(self): """Apply kolmogorov zurbenko filter only on inputs.""" - self.data_target = self.data.sel({self.target_dim: [self.target_var]}) - kz = KZFilter(self.data, wl=self.kz_filter_length, itr=self.kz_filter_iter, filter_dim="datetime") + kz = KZFilter(self.input_data.data, wl=self.kz_filter_length, itr=self.kz_filter_iter, filter_dim="datetime") filtered_data: List[xr.DataArray] = kz.run() self.cutoff_period = kz.period_null() self.cutoff_period_days = kz.period_null_days() - self.data = xr.concat(filtered_data, pd.Index(self.create_filter_index(), 
name="filter")) + self.input_data.data = xr.concat(filtered_data, pd.Index(self.create_filter_index(), name="filter")) def create_filter_index(self) -> pd.Index: """ @@ -75,36 +73,6 @@ class DataHandlerKzFilterSingleStation(DataHandlerSingleStation): index = list(map(lambda x: str(x) + "d", index)) + ["res"] return pd.Index(index, name="filter") - def make_labels(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str, - window: int) -> None: - """ - Create a xr.DataArray containing labels. - - Labels are defined as the consecutive target values (t+1, ...t+n) following the current time step t. Set label - attribute. - - :param dim_name_of_target: Name of dimension which contains the target variable - :param target_var: Name of target variable in 'dimension' - :param dim_name_of_shift: Name of dimension on which xarray.DataArray.shift will be applied - :param window: lead time of label - """ - window = abs(window) - data = self.data_target.sel({dim_name_of_target: target_var}) - self.label = self.shift(data, dim_name_of_shift, window) - - def make_observation(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str) -> None: - """ - Create a xr.DataArray containing observations. - - Observations are defined as value of the current time step t. Set observation attribute. - - :param dim_name_of_target: Name of dimension which contains the observation variable - :param target_var: Name of observation variable(s) in 'dimension' - :param dim_name_of_shift: Name of dimension on which xarray.DataArray.shift will be applied - """ - data = self.data_target.sel({dim_name_of_target: target_var}) - self.observation = self.shift(data, dim_name_of_shift, 0) - def get_transposed_history(self) -> xr.DataArray: """Return history. 
diff --git a/mlair/data_handler/data_handler_single_station.py b/mlair/data_handler/data_handler_single_station.py index e4a3f857a01f3f8464fe102a020ba5ee82543d95..460d1c100dadbc2aea5d43932e902cc080177b27 100644 --- a/mlair/data_handler/data_handler_single_station.py +++ b/mlair/data_handler/data_handler_single_station.py @@ -3,6 +3,7 @@ __author__ = 'Lukas Leufen, Felix Kleinert' __date__ = '2020-07-20' +import copy import datetime as dt import logging import os @@ -53,7 +54,8 @@ class DataHandlerSingleStation(AbstractDataHandler): self.station = helpers.to_list(station) self.path = os.path.abspath(data_path) self.statistics_per_var = statistics_per_var - self.transformation = self.setup_transformation(transformation) + self.do_transformation = transformation is not None + self.input_data, self.target_data = self.setup_transformation(transformation) self.station_type = station_type self.network = network @@ -74,20 +76,13 @@ class DataHandlerSingleStation(AbstractDataHandler): self.end = end # internal - self.data: xr.DataArray = None + self._data: xr.DataArray = None # loaded raw data self.meta = None self.variables = list(statistics_per_var.keys()) if variables is None else variables self.history = None self.label = None self.observation = None - # internal for transformation - self.mean = None - self.std = None - self.max = None - self.min = None - self._transform_method = None - # create samples self.setup_samples() @@ -100,7 +95,7 @@ class DataHandlerSingleStation(AbstractDataHandler): @property def shape(self): - return self.data.shape, self.get_X().shape, self.get_Y().shape + return self._data.shape, self.get_X().shape, self.get_Y().shape def __repr__(self): return f"StationPrep(station={self.station}, data_path='{self.path}', " \ @@ -109,24 +104,7 @@ class DataHandlerSingleStation(AbstractDataHandler): f"sampling='{self.sampling}', target_dim='{self.target_dim}', target_var='{self.target_var}', " \ f"time_dim='{self.time_dim}', 
window_history_size={self.window_history_size}, " \ f"window_lead_time={self.window_lead_time}, interpolation_limit={self.interpolation_limit}, " \ - f"interpolation_method='{self.interpolation_method}', overwrite_local_data={self.overwrite_local_data}, " \ - f"transformation={self._print_transformation_as_string})" - - @property - def _print_transformation_as_string(self): - str_name = '' - if self.transformation is None: - str_name = f'{None}' - else: - for k, v in self.transformation.items(): - if v is not None: - try: - v_pr = f"xr.DataArray.from_dict({v.to_dict()})" - except AttributeError: - v_pr = f"'{v}'" - str_name += f"'{k}':{v_pr}, " - str_name = f"{{{str_name}}}" - return str_name + f"interpolation_method='{self.interpolation_method}', overwrite_local_data={self.overwrite_local_data})" def get_transposed_history(self) -> xr.DataArray: """Return history. @@ -153,18 +131,10 @@ class DataHandlerSingleStation(AbstractDataHandler): return coords.rename(index={"station_lon": "lon", "station_lat": "lat"}).to_dict()[str(self)] def call_transform(self, inverse=False): - self.transform(dim=self.time_dim, method=self.transformation["method"], - mean=self.transformation['mean'], std=self.transformation["std"], - min_val=self.transformation["min"], max_val=self.transformation["max"], - inverse=inverse - ) - - def set_transformation(self, transformation: dict): - if self._transform_method is not None: - self.call_transform(inverse=True) - self.transformation = self.setup_transformation(transformation) - self.call_transform() - self.make_samples() + kwargs = helpers.remove_items(self.input_data.as_dict(), ["data"]) + self.transform(self.input_data, dim=self.time_dim, inverse=inverse, **kwargs) + kwargs = helpers.remove_items(self.target_data.as_dict(), ["data"]) + self.transform(self.target_data, dim=self.time_dim, inverse=inverse, **kwargs) @TimeTrackingWrapper def setup_samples(self): @@ -173,10 +143,17 @@ class DataHandlerSingleStation(AbstractDataHandler): """ 
self.load_data() self.interpolate(dim=self.time_dim, method=self.interpolation_method, limit=self.interpolation_limit) - if self.transformation is not None: + self.set_inputs_and_targets() + if self.do_transformation is True: self.call_transform() self.make_samples() + def set_inputs_and_targets(self): + inputs = self._data.sel({self.target_dim: helpers.to_list(self.variables)}) + targets = self._data.sel({self.target_dim: self.target_var}) + self.input_data.data = inputs + self.target_data.data = targets + def make_samples(self): self.make_history_window(self.target_dim, self.window_history_size, self.time_dim) self.make_labels(self.target_dim, self.target_var, self.time_dim, self.window_lead_time) @@ -217,7 +194,7 @@ class DataHandlerSingleStation(AbstractDataHandler): logging.debug("loading finished") # create slices and check for negative concentration. data = self._slice_prep(data) - self.data = self.check_for_negative_concentrations(data) + self._data = self.check_for_negative_concentrations(data) def download_data_from_join(self, file_name: str, meta_file: str) -> [xr.DataArray, pd.DataFrame]: """ @@ -372,8 +349,8 @@ class DataHandlerSingleStation(AbstractDataHandler): :return: xarray.DataArray """ - self.data = self.data.interpolate_na(dim=dim, method=method, limit=limit, use_coordinate=use_coordinate, - **kwargs) + self._data = self._data.interpolate_na(dim=dim, method=method, limit=limit, use_coordinate=use_coordinate, + **kwargs) def make_history_window(self, dim_name_of_inputs: str, window: int, dim_name_of_shift: str) -> None: """ @@ -390,7 +367,7 @@ class DataHandlerSingleStation(AbstractDataHandler): :param dim_name_of_shift: Dimension along shift will be applied """ window = -abs(window) - data = self.data.sel({dim_name_of_inputs: self.variables}) + data = self.input_data.data self.history = self.shift(data, dim_name_of_shift, window) def make_labels(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str, @@ -407,7 +384,7 @@ 
class DataHandlerSingleStation(AbstractDataHandler): :param window: lead time of label """ window = abs(window) - data = self.data.sel({dim_name_of_target: target_var}) + data = self.target_data.data self.label = self.shift(data, dim_name_of_shift, window) def make_observation(self, dim_name_of_target: str, target_var: str_or_list, dim_name_of_shift: str) -> None: @@ -420,7 +397,7 @@ class DataHandlerSingleStation(AbstractDataHandler): :param target_var: Name of observation variable(s) in 'dimension' :param dim_name_of_shift: Name of dimension on which xarray.DataArray.shift will be applied """ - data = self.data.sel({dim_name_of_target: target_var}) + data = self.target_data.data self.observation = self.shift(data, dim_name_of_shift, 0) def remove_nan(self, dim: str) -> None: @@ -495,89 +472,23 @@ class DataHandlerSingleStation(AbstractDataHandler): return data @staticmethod - def setup_transformation(transformation: Dict): + def setup_transformation(transformation: statistics.TransformationClass): """ Set up transformation by extracting all relevant information. - Extract all information from transformation dictionary. Possible keys are method, mean, std, min, max. - * If a transformation should be applied on base of existing values, these need to be provided in the respective - keys "mean" and "std" (again only if required for given method). - - :param transformation: the transformation dictionary as described above. - - :return: updated transformation dictionary - - ## Transformation - - There are two different approaches (called scopes) to transform the data: - 1) `station`: transform data for each station independently (somehow like batch normalisation) - 1) `data`: transform all data of each station with shared metrics - - Transformation must be set by the `transformation` attribute. If `transformation = None` is given to `ExperimentSetup`, - data is not transformed at all. 
For all other setups, use the following dictionary structure to specify the - transformation. - ``` - transformation = {"scope": <...>, - "method": <...>, - "mean": <...>, - "std": <...>} - ExperimentSetup(..., transformation=transformation, ...) - ``` - - ### scopes - - **station**: mean and std are not used - - **data**: either provide already calculated values for mean and std (if required by transformation method), or choose - from different calculation schemes, explained in the mean and std section. - - ### supported transformation methods - Currently supported methods are: - * standardise (default, if method is not given) - * centre - - ### mean and std - `"mean"="accurate"`: calculate the accurate values of mean and std (depending on method) by using all data. Although, - this method is accurate, it may take some time for the calculation. Furthermore, this could potentially lead to memory - issue (not explored yet, but could appear for a very big amount of data) - - `"mean"="estimate"`: estimate mean and std (depending on method). For each station, mean and std are calculated and - afterwards aggregated using the mean value over all station-wise metrics. This method is less accurate, especially - regarding the std calculation but therefore much faster. - - We recommend to use the later method *estimate* because of following reasons: - * much faster calculation - * real accuracy of mean and std is less important, because it is "just" a transformation / scaling - * accuracy of mean is almost as high as in the *accurate* case, because of - $\bar{x_{ij}} = \bar{\left(\bar{x_i}\right)_j}$. The only difference is, that in the *estimate* case, each mean is - equally weighted for each station independently of the actual data count of the station. - * accuracy of std is lower for *estimate* because of $\var{x_{ij}} \ne \bar{\left(\var{x_i}\right)_j}$, but still the mean of all - station-wise std is a decent estimate of the true std. - - `"mean"=<value, e.g. 
xr.DataArray>`: If mean and std are already calculated or shall be set manually, just add the - scaling values instead of the calculation method. For method *centre*, std can still be None, but is required for the - *standardise* method. **Important**: Format of given values **must** match internal data format of DataPreparation - class: `xr.DataArray` with `dims=["variables"]` and one value for each variable. - + * Either return new empty DataClass instances if given transformation arg is None, + * or return given object twice if transformation is a DataClass instance, + * or return the inputs and targets attributes if transformation is a TransformationClass instance (default + design behaviour) """ if transformation is None: - return - elif not isinstance(transformation, dict): - raise TypeError(f"`transformation' must be either `None' or dict like e.g. `{{'method': 'standardise'}}," - f" but transformation is of type {type(transformation)}.") - transformation = transformation.copy() - method = transformation.get("method", None) - mean = transformation.get("mean", None) - std = transformation.get("std", None) - max_val = transformation.get("max", None) - min_val = transformation.get("min", None) - - transformation["method"] = method - transformation["mean"] = mean - transformation["std"] = std - transformation["max"] = max_val - transformation["min"] = min_val - return transformation + return statistics.DataClass(), statistics.DataClass() + elif isinstance(transformation, statistics.DataClass): + return transformation, transformation + elif isinstance(transformation, statistics.TransformationClass): + return copy.deepcopy(transformation.inputs), copy.deepcopy(transformation.targets) + else: + raise NotImplementedError("Cannot handle this.") def load_data(self): try: @@ -586,8 +497,9 @@ class DataHandlerSingleStation(AbstractDataHandler): self.download_data() self.load_data() - def transform(self, dim: Union[str, int] = 0, method: str = 'standardise', inverse: 
bool = False, mean=None, - std=None, min_val=None, max_val=None) -> None: + def transform(self, data_class, dim: Union[str, int] = 0, transform_method: str = 'standardise', + inverse: bool = False, mean=None, + std=None, min=None, max=None) -> None: """ Transform data according to given transformation settings. @@ -607,9 +519,9 @@ class DataHandlerSingleStation(AbstractDataHandler): calculated over the data in this class instance. :param std: Used for transformation (if required by 'method') based on external data. If 'None' the std is calculated over the data in this class instance. - :param min_val: Used for transformation (if required by 'method') based on external data. If 'None' min_val is + :param min: Used for transformation (if required by 'method') based on external data. If 'None' min_val is extracted from the data in this class instance. - :param max_val: Used for transformation (if required by 'method') based on external data. If 'None' max_val is + :param max: Used for transformation (if required by 'method') based on external data. If 'None' max_val is extracted from the data in this class instance. 
:return: xarray.DataArrays or pandas.DataFrames: @@ -619,36 +531,37 @@ class DataHandlerSingleStation(AbstractDataHandler): """ def f(data): - if method == 'standardise': + if transform_method == 'standardise': return statistics.standardise(data, dim) - elif method == 'centre': + elif transform_method == 'centre': return statistics.centre(data, dim) - elif method == 'normalise': + elif transform_method == 'normalise': # use min/max of data or given min/max raise NotImplementedError else: raise NotImplementedError def f_apply(data): - if method == "standardise": + if transform_method == "standardise": return mean, std, statistics.standardise_apply(data, mean, std) - elif method == "centre": + elif transform_method == "centre": return mean, None, statistics.centre_apply(data, mean) else: raise NotImplementedError if not inverse: - if self._transform_method is not None: - raise AssertionError(f"Transform method is already set. Therefore, data was already transformed with " - f"{self._transform_method}. Please perform inverse transformation of data first.") + if data_class._method is not None: + raise AssertionError(f"Internal _method is already set. Therefore, data was already transformed with " + f"{data_class._method}. Please perform inverse transformation of data first.") # apply transformation on local data instance (f) if mean is None, else apply by using mean (and std) from # external data. - self.mean, self.std, self.data = locals()["f" if mean is None else "f_apply"](self.data) + data_class.mean, data_class.std, data_class.data = locals()["f" if mean is None else "f_apply"]( + data_class.data) # set transform method to find correct method for inverse transformation. 
- self._transform_method = method + data_class._method = transform_method else: - self.inverse_transform() + self.inverse_transform(data_class) @staticmethod def check_inverse_transform_params(mean: data_or_none, std: data_or_none, method: str) -> None: @@ -670,7 +583,7 @@ class DataHandlerSingleStation(AbstractDataHandler): if len(msg) > 0: raise AttributeError(f"Inverse transform {method} can not be executed because following is None: {msg}") - def inverse_transform(self) -> None: + def inverse_transform(self, data_class) -> None: """ Perform inverse transformation. @@ -690,36 +603,26 @@ class DataHandlerSingleStation(AbstractDataHandler): else: raise NotImplementedError - if self._transform_method is None: + if data_class._method is None: raise AssertionError("Inverse transformation method is not set. Data cannot be inverse transformed.") - self.check_inverse_transform_params(self.mean, self.std, self._transform_method) - self.data, self.mean, self.std = f_inverse(self.data, self.mean, self.std, self._transform_method) - self._transform_method = None + self.check_inverse_transform_params(data_class.mean, data_class.std, data_class._method) + data_class.data, data_class.mean, data_class.std = f_inverse(data_class.data, data_class.mean, data_class.std, + data_class._method) + data_class._method = None # update X and Y self.make_samples() - def get_transformation_information(self, variable: str = None) -> Tuple[data_or_none, data_or_none, str]: + def get_transformation_targets(self) -> Tuple[data_or_none, data_or_none, str]: """ Extract transformation statistics and method. - Get mean and standard deviation for given variable and the transformation method if set. If a transformation + Get mean and standard deviation for target values and the transformation method if set. If a transformation depends only on particular statistics (e.g. only mean is required for centering), the remaining statistics are returned with None as fill value. 
- :param variable: Variable for which the information on transformation is requested. - :return: mean, standard deviation and transformation method """ - variable = self.target_var if variable is None else variable - try: - mean = self.mean.sel({'variables': variable}).values - except AttributeError: - mean = None - try: - std = self.std.sel({'variables': variable}).values - except AttributeError: - std = None - return mean, std, self._transform_method + return self.target_data.mean, self.target_data.std, self.target_data.transform_method if __name__ == "__main__": @@ -732,7 +635,6 @@ if __name__ == "__main__": time_dim='datetime', window_history_size=7, window_lead_time=3, interpolation_limit=0 ) # transformation={'method': 'standardise'}) - # sp.set_transformation({'method': 'standardise', 'mean': sp.mean+2, 'std': sp.std+1}) sp2 = DataHandlerSingleStation(data_path='/home/felix/PycharmProjects/mlt_new/data/', station='DEBY122', statistics_per_var=statistics_per_var, station_type='background', network='UBA', sampling='daily', target_dim='variables', target_var='o3', diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index 8ed4f8743d90b313d074a3be15cc49fd9ffa07c0..e6dde10bf6bd13013fa454eadd1a7976c00dd3e2 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -145,7 +145,7 @@ class DefaultDataHandler(AbstractDataHandler): return self.id_class.observation.copy().squeeze() def get_transformation_Y(self): - return self.id_class.get_transformation_information() + return self.id_class.get_transformation_targets() def multiply_extremes(self, extreme_values: num_or_list = 1., extremes_on_right_tail_only: bool = False, timedelta: Tuple[int, str] = (1, 'm'), dim="datetime"): @@ -217,27 +217,55 @@ class DefaultDataHandler(AbstractDataHandler): @classmethod def transformation(cls, set_stations, **kwargs): + """ + ### supported transformation methods + + Currently supported 
methods are: + + * standardise (default, if method is not given) + * centre + + ### mean and std estimation + + Mean and std (depending on method) are estimated. For each station, mean and std are calculated and afterwards + aggregated using the mean value over all station-wise metrics. This method is not exactly accurate, especially + regarding the std calculation, but it is much faster. Furthermore, it is a weighted mean weighted by the + time series length / number of data itself - a longer time series has more influence on the transformation + settings than a short time series. The estimation of the std is less accurate, because the unweighted mean of + all stds is not equal to the true std, but still the mean of all station-wise std is a decent estimate. Finally, + the real accuracy of mean and std is less important, because it is "just" a transformation / scaling. + + ### mean and std given + + If mean and std are not None, the default data handler expects these parameters to match the data and applies + these values to the data. Make sure that all dimensions and/or coordinates are in agreement. 
+ """ + sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls._requirements if k in kwargs} - transformation_dict = sp_keys.pop("transformation") - if transformation_dict is None: + transformation_class = sp_keys.get("transformation", None) + if transformation_class is None: return - scope = transformation_dict.pop("scope") - method = transformation_dict.pop("method") - if transformation_dict.pop("mean", None) is not None: + + transformation_inputs = transformation_class.inputs + if transformation_inputs.mean is not None: return - mean, std = None, None + means = [None, None] + stds = [None, None] for station in set_stations: try: - sp = cls.data_handler_transformation(station, transformation={"method": method}, **sp_keys) - mean = sp.mean.copy(deep=True) if mean is None else mean.combine_first(sp.mean) - std = sp.std.copy(deep=True) if std is None else std.combine_first(sp.std) + sp = cls.data_handler_transformation(station, **sp_keys) + for i, data in enumerate([sp.input_data, sp.target_data]): + means[i] = data.mean.copy(deep=True) if means[i] is None else means[i].combine_first(data.mean) + stds[i] = data.std.copy(deep=True) if stds[i] is None else stds[i].combine_first(data.std) except (AttributeError, EmptyQueryResult): continue - if mean is None: + if means[0] is None: return None - mean_estimated = mean.mean("Stations") - std_estimated = std.mean("Stations") - return {"scope": scope, "method": method, "mean": mean_estimated, "std": std_estimated} + transformation_class.inputs.mean = means[0].mean("Stations") + transformation_class.inputs.std = stds[0].mean("Stations") + transformation_class.targets.mean = means[1].mean("Stations") + transformation_class.targets.std = stds[1].mean("Stations") + return transformation_class def get_coordinates(self): return self.id_class.get_coordinates() \ No newline at end of file diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index 
4d51b1915ea4c027ff88a2106de29e572af069a0..3db6618a5e8ebd575d61bc261144ff47ccaf9b53 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -11,11 +11,34 @@ import pandas as pd from typing import Union, Tuple, Dict from matplotlib import pyplot as plt -from mlair.helpers import to_list +from mlair.helpers import to_list, remove_items Data = Union[xr.DataArray, pd.DataFrame] +class DataClass: + + def __init__(self, data=None, mean=None, std=None, max=None, min=None, transform_method=None): + self.data = data + self.mean = mean + self.std = std + self.max = max + self.min = min + self.transform_method = transform_method + self._method = None + + def as_dict(self): + return remove_items(self.__dict__, "_method") + + +class TransformationClass: + + def __init__(self, inputs_mean=None, inputs_std=None, inputs_method=None, targets_mean=None, targets_std=None, + targets_method=None): + self.inputs = DataClass(mean=inputs_mean, std=inputs_std, transform_method=inputs_method) + self.targets = DataClass(mean=targets_mean, std=targets_std, transform_method=targets_method) + + def apply_inverse_transformation(data: Data, mean: Data, std: Data = None, method: str = "standardise") -> Data: """ Apply inverse transformation for given statistics. 
diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index 327dc40de1aa6986c9763186a68a0138be61bb5a..c8682374e0d4c0d724d83a5e36977543ac3a50f8 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -137,15 +137,16 @@ class PlotMonthlySummary(AbstractPlotClass): data_cnn = data.sel(type="CNN").squeeze() if len(data_cnn.shape) > 1: - data_cnn.coords["ahead"].values = [f"{days}d" for days in data_cnn.coords["ahead"].values] + data_cnn = data_cnn.assign_coords(ahead=[f"{days}d" for days in data_cnn.coords["ahead"].values]) data_obs = data.sel(type="obs", ahead=1).squeeze() data_obs.coords["ahead"] = "obs" data_concat = xr.concat([data_obs, data_cnn], dim="ahead") - data_concat = data_concat.drop("type") + data_concat = data_concat.drop_vars("type") - data_concat.index.values = data_concat.index.values.astype("datetime64[M]").astype(int) % 12 + 1 + new_index = data_concat.index.values.astype("datetime64[M]").astype(int) % 12 + 1 + data_concat = data_concat.assign_coords(index=new_index) data_concat = data_concat.clip(min=0) forecasts = xr.concat([forecasts, data_concat], 'index') if forecasts is not None else data_concat diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index 7b1d145541d694b7f7145e42778f91d0716579ba..571d3a07d15873af1c1ccedc59e0cc462e07820f 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -399,10 +399,10 @@ class PostProcessing(RunEnvironment): :return: filled data array with ols predictions """ tmp_ols = self.ols_model.predict(input_data) - if not normalised: - tmp_ols = statistics.apply_inverse_transformation(tmp_ols, mean, std, transformation_method) target_shape = ols_prediction.values.shape ols_prediction.values = np.swapaxes(tmp_ols, 2, 0) if target_shape != tmp_ols.shape else tmp_ols + if not normalised: + ols_prediction = 
statistics.apply_inverse_transformation(ols_prediction, mean, std, transformation_method) return ols_prediction def _create_persistence_forecast(self, data, persistence_prediction: xr.DataArray, mean: xr.DataArray, @@ -423,9 +423,10 @@ class PostProcessing(RunEnvironment): :return: filled data array with persistence predictions """ tmp_persi = data.copy() - if not normalised: - tmp_persi = statistics.apply_inverse_transformation(tmp_persi, mean, std, transformation_method) persistence_prediction.values = np.tile(tmp_persi, (self.window_lead_time, 1)).T + if not normalised: + persistence_prediction = statistics.apply_inverse_transformation(persistence_prediction, mean, std, + transformation_method) return persistence_prediction def _create_nn_forecast(self, input_data: xr.DataArray, nn_prediction: xr.DataArray, mean: xr.DataArray, @@ -447,8 +448,6 @@ class PostProcessing(RunEnvironment): :return: filled data array with nn predictions """ tmp_nn = self.model.predict(input_data) - if not normalised: - tmp_nn = statistics.apply_inverse_transformation(tmp_nn, mean, std, transformation_method) if isinstance(tmp_nn, list): nn_prediction.values = tmp_nn[-1] elif tmp_nn.ndim == 3: @@ -457,6 +456,8 @@ class PostProcessing(RunEnvironment): nn_prediction.values = tmp_nn else: raise NotImplementedError(f"Number of dimension of model output must be 2 or 3, but not {tmp_nn.dims}.") + if not normalised: + nn_prediction = statistics.apply_inverse_transformation(nn_prediction, mean, std, transformation_method) return nn_prediction @staticmethod diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc7199f2d8ed75af2d4f968a1f52ff3ee15baec --- /dev/null +++ b/test/test_configuration/test_defaults.py @@ -0,0 +1,73 @@ +from mlair.configuration.defaults import * + + +class TestGetDefaults: + + def test_get_defaults(self): + defaults = get_defaults() + assert 
isinstance(defaults, dict) + assert all(map(lambda k: k in defaults.keys(), ["DEFAULT_STATIONS", "DEFAULT_BATCH_SIZE", "DEFAULT_PLOT_LIST"])) + assert all(map(lambda x: x.startswith("DEFAULT"), defaults.keys())) + + +class TestAllDefaults: + + def test_training_parameters(self): + assert DEFAULT_CREATE_NEW_MODEL is True + assert DEFAULT_TRAIN_MODEL is True + assert DEFAULT_FRACTION_OF_TRAINING == 0.8 + assert DEFAULT_EXTREME_VALUES is None + assert DEFAULT_EXTREMES_ON_RIGHT_TAIL_ONLY is False + assert DEFAULT_PERMUTE_DATA is False + assert DEFAULT_BATCH_SIZE == int(256 * 2) + assert DEFAULT_EPOCHS == 20 + + def test_data_handler_parameters(self): + assert DEFAULT_STATIONS == ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] + assert DEFAULT_VAR_ALL_DICT == {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', + 'u': 'average_values', + 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', + 'cloudcover': 'average_values', + 'pblheight': 'maximum'} + assert DEFAULT_NETWORK == "AIRBASE" + assert DEFAULT_STATION_TYPE == "background" + assert DEFAULT_VARIABLES == DEFAULT_VAR_ALL_DICT.keys() + assert DEFAULT_START == "1997-01-01" + assert DEFAULT_END == "2017-12-31" + assert DEFAULT_WINDOW_HISTORY_SIZE == 13 + assert DEFAULT_OVERWRITE_LOCAL_DATA is False + assert isinstance(DEFAULT_TRANSFORMATION, TransformationClass) + assert DEFAULT_TRANSFORMATION.inputs.transform_method == "standardise" + assert DEFAULT_TRANSFORMATION.targets.transform_method == "standardise" + assert DEFAULT_TARGET_VAR == "o3" + assert DEFAULT_TARGET_DIM == "variables" + assert DEFAULT_WINDOW_LEAD_TIME == 3 + assert DEFAULT_DIMENSIONS == {"new_index": ["datetime", "Stations"]} + assert DEFAULT_TIME_DIM == "datetime" + assert DEFAULT_INTERPOLATION_METHOD == "linear" + assert DEFAULT_INTERPOLATION_LIMIT == 1 + + def test_subset_parameters(self): + assert DEFAULT_TRAIN_START == "1997-01-01" + assert DEFAULT_TRAIN_END == "2007-12-31" + assert DEFAULT_TRAIN_MIN_LENGTH == 90 + assert 
DEFAULT_VAL_START == "2008-01-01" + assert DEFAULT_VAL_END == "2009-12-31" + assert DEFAULT_VAL_MIN_LENGTH == 90 + assert DEFAULT_TEST_START == "2010-01-01" + assert DEFAULT_TEST_END == "2017-12-31" + assert DEFAULT_TEST_MIN_LENGTH == 90 + assert DEFAULT_TRAIN_VAL_MIN_LENGTH == 180 + assert DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS is True + + def test_hpc_parameters(self): + assert DEFAULT_HPC_HOST_LIST == ["jw", "hdfmlc"] + assert DEFAULT_HPC_LOGIN_LIST == ["ju", "hdfmll"] + + def test_postprocessing_parameters(self): + assert DEFAULT_EVALUATE_BOOTSTRAPS is True + assert DEFAULT_CREATE_NEW_BOOTSTRAPS is False + assert DEFAULT_NUMBER_OF_BOOTSTRAPS == 20 + assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", + "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", + "PlotConditionalQuantiles", "PlotAvailability"] diff --git a/test/test_statistics.py b/test/test_statistics.py index d4a72674ae89ecd106ff1861aa6ee26567da3243..76adc1bdd210e072b4fc9be717269c6ceb951fec 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -3,7 +3,9 @@ import pandas as pd import pytest import xarray as xr -from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, centre_apply, \ +from mlair.helpers.statistics import DataClass, TransformationClass +from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, \ + centre_apply, \ apply_inverse_transformation lazy = pytest.lazy_fixture @@ -113,3 +115,50 @@ class TestCentre: data = centre_apply(data_orig, mean) mean_expected = np.array([2, -5, 10]) - np.array([2, 10, 3]) assert np.testing.assert_almost_equal(data.mean(dim), mean_expected, decimal=1) is None + + +class TestDataClass: + + def test_init(self): + dc = DataClass() + assert all([obj is None for obj in [dc.data, dc.mean, dc.std, dc.max, dc.min, dc.transform_method, dc._method]]) + + def 
test_init_values(self): + dc = DataClass(data=12, mean=2, std="test", max=23.4, min=np.array([3]), transform_method="f") + assert dc.data == 12 + assert dc.mean == 2 + assert dc.std == "test" + assert dc.max == 23.4 + assert np.testing.assert_array_equal(dc.min, np.array([3])) is None + assert dc.transform_method == "f" + assert dc._method is None + + def test_as_dict(self): + dc = DataClass(std=23) + dc._method = "f(x)" + assert dc.as_dict() == {"data": None, "mean": None, "std": 23, "max": None, "min": None, + "transform_method": None} + + +class TestTransformationClass: + + def test_init(self): + tc = TransformationClass() + assert hasattr(tc, "inputs") + assert isinstance(tc.inputs, DataClass) + assert hasattr(tc, "targets") + assert isinstance(tc.targets, DataClass) + assert tc.inputs.mean is None + assert tc.targets.std is None + + def test_init_values(self): + tc = TransformationClass(inputs_mean=1, inputs_std=2, inputs_method="f", targets_mean=3, targets_std=4, + targets_method="g") + assert tc.inputs.mean == 1 + assert tc.inputs.std == 2 + assert tc.inputs.transform_method == "f" + assert tc.inputs.max is None + assert tc.targets.mean == 3 + assert tc.targets.std == 4 + assert tc.targets.transform_method == "g" + assert tc.inputs.min is None