diff --git a/.gitlab/issue_templates/release.md b/.gitlab/issue_templates/release.md index a95cf033eed919339c6c1734638542c3e0cdbc57..c8289cdd1be4ab02a61c6feb0db9db7cb6ca40d3 100644 --- a/.gitlab/issue_templates/release.md +++ b/.gitlab/issue_templates/release.md @@ -14,6 +14,7 @@ vX.Y.Z * [ ] Adjust `changelog.md` (see template for changelog) * [ ] Update version number in `mlair/__ init__.py` * [ ] Create new dist file: `python3 setup.py sdist bdist_wheel` +* [ ] Add new dist file `mlair-X.Y.Z-py3-none-any.whl` to git * [ ] Update file link `distribution file (current version)` in `README.md` * [ ] Update file link in `docs/_source/installation.rst` * [ ] Commit + push diff --git a/CHANGELOG.md b/CHANGELOG.md index b11a169c854465c5ea932f00f5da5a1688df7c18..34795b8333df846d5383fc2d8eca4b40517aab73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,25 @@ # Changelog All notable changes to this project will be documented in this file. -## v1.4.0 - 2021-07-27 - <release description> +## v1.5.0 - 2021-11-11 - new uncertainty estimation + +### general: +* introduces method to estimate sample uncertainty +* improved multiprocessing +* last release with tensorflow v1 support + +### new features: +* test set sample uncertainty estimation during postprocessing (#333) +* support of Kolmogorov Zurbenko filter for data handlers with filters (#334) + +### technical: +* new communication scheme for multiprocessing (#321, #322) +* improved error reporting (#323) +* feature importance returns now unaggregated results (#335) +* error metrics are reported for all competitors (#332) +* minor bugfixes and refacs (#330, #326, #329, #325, #324, #320, #337) + +## v1.4.0 - 2021-07-27 - new model classes and data handlers, improved usability and transparency ### general: * many technical adjustments to improve usability and transparency of MLAir diff --git a/README.md b/README.md index 6d07ecec3bd1ab76f6e29dc969e4339e5521593b..1baf4465a7ad4d55476fec1f4ed8d45a7a531386 --- 
a/README.md +++ b/README.md @@ -31,7 +31,7 @@ HPC systems, see [here](#special-instructions-for-installation-on-jülich-hpc-sy * Installation of **MLAir**: * Either clone MLAir from the [gitlab repository](https://gitlab.version.fz-juelich.de/toar/mlair.git) and use it without installation (beside the requirements) - * or download the distribution file ([current version](https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.4.0-py3-none-any.whl)) + * or download the distribution file ([current version](https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.5.0-py3-none-any.whl)) and install it via `pip install <dist_file>.whl`. In this case, you can simply import MLAir in any python script inside your virtual environment using `import mlair`. * (tf) Currently, TensorFlow-1.13 is mentioned in the requirements. We already tested the TensorFlow-1.15 version and couldn't @@ -518,3 +518,22 @@ add it to `src/join_settings.py` in the hourly data section. Replace the `TOAR_S value. To make sure, that this **sensitive** data is not uploaded to the remote server, use the following command to prevent git from tracking this file: `git update-index --assume-unchanged src/join_settings.py` + +## Known Issues + +### Problem with multiprocessing + +* cpython and python's native multiprocessing can crash when using the multiprocessing approach for preprocessing. This +is caused by an internal limitation in order of 2GB. When using long periods and therefore very big data, +multiprocessing is not able to handle these data correctly: +```shell +File "mlair/mlair/run_modules/pre_processing.py", line X, in validate_station + dh, s = p.get() +File "multiprocessing/pool.py", line 644, in get + raise self._value +multiprocessing.pool.MaybeEncodingError: Error sending result: '(DEMV012, 'DEMV012')'. 
Reason: 'error("'i' format requires -2147483648 <= number <= 2147483647",)' +``` +* to solve this issue, either update your python version to >=3.8 (warning, this version is not tested with MLAir) or +apply the patch that is applied in this commit +https://github.com/python/cpython/commit/bccacd19fa7b56dcf2fbfab15992b6b94ab6666b or as proposed in this comment +https://stackoverflow.com/questions/47776486/python-struct-error-i-format-requires-2147483648-number-2147483647 diff --git a/dist/mlair-1.5.0-py3-none-any.whl b/dist/mlair-1.5.0-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..34495d960b009737fb40bec6dfe3a96effd14c02 Binary files /dev/null and b/dist/mlair-1.5.0-py3-none-any.whl differ diff --git a/docs/_source/installation.rst b/docs/_source/installation.rst index 27543ac109439e487756cc211ecc47be946c586c..c87e64b217b4207185cfc662fdf00d2f7e891cc5 100644 --- a/docs/_source/installation.rst +++ b/docs/_source/installation.rst @@ -26,7 +26,7 @@ Installation of MLAir * Install all requirements from `requirements.txt <https://gitlab.version.fz-juelich.de/toar/machinelearningtools/-/blob/master/requirements.txt>`_ preferably in a virtual environment * Either clone MLAir from the `gitlab repository <https://gitlab.version.fz-juelich.de/toar/machinelearningtools.git>`_ -* or download the distribution file (`current version <https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.4.0-py3-none-any.whl>`_) +* or download the distribution file (`current version <https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.5.0-py3-none-any.whl>`_) and install it via :py:`pip install <dist_file>.whl`. In this case, you can simply import MLAir in any python script inside your virtual environment using :py:`import mlair`. * (tf) Currently, TensorFlow-1.13 is mentioned in the requirements. 
We already tested the TensorFlow-1.15 version and couldn't diff --git a/docs/requirements_docs.txt b/docs/requirements_docs.txt index a39acca8a7cf887237f595d7992960ac10233a85..8ccf3ba6515d31ebd2b35901d3c9e58734d653d8 100644 --- a/docs/requirements_docs.txt +++ b/docs/requirements_docs.txt @@ -3,4 +3,5 @@ sphinx-autoapi==1.3.0 sphinx-autodoc-typehints==1.10.3 sphinx-rtd-theme==0.4.3 #recommonmark==0.6.0 -m2r2==0.2.5 \ No newline at end of file +m2r2==0.2.5 +docutils<0.18 \ No newline at end of file diff --git a/mlair/__init__.py b/mlair/__init__.py index f760f9b0fa4b87bde1f6ee409626f4428083d895..75359e1773edea55ecc47556a83a465510fac6c8 100644 --- a/mlair/__init__.py +++ b/mlair/__init__.py @@ -1,6 +1,6 @@ __version_info__ = { 'major': 1, - 'minor': 4, + 'minor': 5, 'micro': 0, } diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index d61146b61a5ade9118675fa7b895212f310acc71..ca569720dc41d95621d0613a2170cc4d9d46c082 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -13,6 +13,7 @@ DEFAULT_START = "1997-01-01" DEFAULT_END = "2017-12-31" DEFAULT_WINDOW_HISTORY_SIZE = 13 DEFAULT_OVERWRITE_LOCAL_DATA = False +DEFAULT_OVERWRITE_LAZY_DATA = False DEFAULT_HPC_LOGIN_LIST = ["ju", "hdfmll"] # ju[wels} #hdfmll(ogin) DEFAULT_HPC_HOST_LIST = ["jw", "hdfmlc"] # first part of node names for Juwels (jw[comp], hdfmlc(ompute). 
DEFAULT_CREATE_NEW_MODEL = True @@ -43,14 +44,19 @@ DEFAULT_TEST_END = "2017-12-31" DEFAULT_TEST_MIN_LENGTH = 90 DEFAULT_TRAIN_VAL_MIN_LENGTH = 180 DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS = True -DEFAULT_EVALUATE_BOOTSTRAPS = True -DEFAULT_CREATE_NEW_BOOTSTRAPS = False -DEFAULT_NUMBER_OF_BOOTSTRAPS = 20 -DEFAULT_BOOTSTRAP_TYPE = "singleinput" -DEFAULT_BOOTSTRAP_METHOD = "shuffle" +DEFAULT_DO_UNCERTAINTY_ESTIMATE = True +DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH = "1m" +DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS = True +DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS = 1000 +DEFAULT_EVALUATE_FEATURE_IMPORTANCE = True +DEFAULT_FEATURE_IMPORTANCE_CREATE_NEW_BOOTSTRAPS = False +DEFAULT_FEATURE_IMPORTANCE_N_BOOTS = 20 +DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE = "singleinput" +DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD = "shuffle" DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", - "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", - "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram"] + "PlotCompetitiveSkillScore", "PlotFeatureImportanceSkillScore", "PlotConditionalQuantiles", + "PlotAvailability", "PlotAvailabilityHistogram", "PlotDataHistogram", "PlotPeriodogram", + "PlotSampleUncertaintyFromBootstrap"] DEFAULT_SAMPLING = "daily" DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA", "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", "no": "", "no2": "", "o3": "", diff --git a/mlair/configuration/path_config.py b/mlair/configuration/path_config.py index e7418b984dab74b0527b8dca05a9f6c3636ac18f..6b9c799ceb190b9150be3a4cfcd336eaf45aa768 100644 --- a/mlair/configuration/path_config.py +++ b/mlair/configuration/path_config.py @@ -3,6 +3,7 @@ import getpass import logging import os import re +import shutil import socket from typing import Union @@ -112,17 +113,23 @@ def 
set_bootstrap_path(bootstrap_path: str, data_path: str) -> str: return os.path.abspath(bootstrap_path) -def check_path_and_create(path: str) -> None: +def check_path_and_create(path: str, remove_existing: bool = False) -> None: """ Check a given path and create if not existing. :param path: path to check and create + :param remove_existing: if set to true an existing folder is removed and replaced by a new one (default False). """ try: os.makedirs(path) logging.debug(f"Created path: {path}") except FileExistsError: - logging.debug(f"Path already exists: {path}") + if remove_existing is True: + logging.debug(f"Remove / clean path: {path}") + shutil.rmtree(path) + check_path_and_create(path, remove_existing=False) + else: + logging.debug(f"Path already exists: {path}") def get_host(): diff --git a/mlair/data_handler/__init__.py b/mlair/data_handler/__init__.py index 495b6e7c8604a839a084a2b78a54563c13eb06e6..d119977802038155af0d07d99c75c665622a148c 100644 --- a/mlair/data_handler/__init__.py +++ b/mlair/data_handler/__init__.py @@ -9,7 +9,7 @@ __author__ = 'Lukas Leufen, Felix Kleinert' __date__ = '2020-04-17' -from .bootstraps import BootStraps +from .input_bootstraps import Bootstraps from .iterator import KerasIterator, DataCollection from .default_data_handler import DefaultDataHandler from .abstract_data_handler import AbstractDataHandler diff --git a/mlair/data_handler/data_handler_mixed_sampling.py b/mlair/data_handler/data_handler_mixed_sampling.py index 8205ae6c28f3683b1052c292e5d063d8bca555dc..5aefb0368ec1cf544443bb5e0412dd16a97f2a7f 100644 --- a/mlair/data_handler/data_handler_mixed_sampling.py +++ b/mlair/data_handler/data_handler_mixed_sampling.py @@ -12,6 +12,7 @@ from mlair.helpers import remove_items from mlair.configuration.defaults import DEFAULT_SAMPLING, DEFAULT_INTERPOLATION_LIMIT, DEFAULT_INTERPOLATION_METHOD from mlair.helpers.filter import filter_width_kzf +import copy import inspect from typing import Callable import datetime as dt @@ -140,13 
+141,6 @@ class DataHandlerMixedSamplingWithFilterSingleStation(DataHandlerMixedSamplingSi def load_and_interpolate(self, ind) -> [xr.DataArray, pd.DataFrame]: start, end = self.update_start_end(ind) - # if ind == 0: # for inputs - # estimated_filter_width = self.estimate_filter_width() - # start = self._add_time_delta(self.start, -estimated_filter_width) - # end = self._add_time_delta(self.end, estimated_filter_width) - # else: # target - # start, end = self.start, self.end - vars = [self.variables, self.target_var] stats_per_var = helpers.select_from_dict(self.statistics_per_var, vars[ind]) @@ -264,7 +258,83 @@ class DataHandlerMixedSamplingWithClimateFirFilter(DataHandlerClimateFirFilter): data_handler = DataHandlerMixedSamplingWithClimateFirFilterSingleStation data_handler_transformation = DataHandlerMixedSamplingWithClimateFirFilterSingleStation - _requirements = data_handler.requirements() + data_handler_unfiltered = DataHandlerMixedSamplingSingleStation + _requirements = list(set(data_handler.requirements() + data_handler_unfiltered.requirements())) + DEFAULT_FILTER_ADD_UNFILTERED = False + + def __init__(self, *args, data_handler_class_unfiltered: data_handler_unfiltered = None, + filter_add_unfiltered: bool = DEFAULT_FILTER_ADD_UNFILTERED, **kwargs): + self.dh_unfiltered = data_handler_class_unfiltered + self.filter_add_unfiltered = filter_add_unfiltered + super().__init__(*args, **kwargs) + + @classmethod + def own_args(cls, *args): + """Return all arguments (including kwonlyargs).""" + super_own_args = DataHandlerClimateFirFilter.own_args(*args) + arg_spec = inspect.getfullargspec(cls) + list_of_args = arg_spec.args + arg_spec.kwonlyargs + super_own_args + return remove_items(list_of_args, ["self"] + list(args)) + + def _create_collection(self): + if self.filter_add_unfiltered is True and self.dh_unfiltered is not None: + return [self.id_class, self.dh_unfiltered] + else: + return super()._create_collection() + + @classmethod + def build(cls, station: 
str, **kwargs): + sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler.requirements() if k in kwargs} + filter_add_unfiltered = kwargs.get("filter_add_unfiltered", False) + sp_keys = cls.build_update_kwargs(sp_keys, dh_type="filtered") + sp = cls.data_handler(station, **sp_keys) + if filter_add_unfiltered is True: + sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls.data_handler_unfiltered.requirements() if k in kwargs} + sp_keys = cls.build_update_kwargs(sp_keys, dh_type="unfiltered") + sp_unfiltered = cls.data_handler_unfiltered(station, **sp_keys) + else: + sp_unfiltered = None + dp_args = {k: copy.deepcopy(kwargs[k]) for k in cls.own_args("id_class") if k in kwargs} + return cls(sp, data_handler_class_unfiltered=sp_unfiltered, **dp_args) + + @classmethod + def build_update_kwargs(cls, kwargs_dict, dh_type="filtered"): + if "transformation" in kwargs_dict: + trafo_opts = kwargs_dict.get("transformation") + if isinstance(trafo_opts, dict): + kwargs_dict["transformation"] = trafo_opts.get(dh_type) + return kwargs_dict + + @classmethod + def transformation(cls, set_stations, tmp_path=None, **kwargs): + + sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls._requirements if k in kwargs} + if "transformation" not in sp_keys.keys(): + return + + transformation_filtered = super().transformation(set_stations, tmp_path=tmp_path, + dh_transformation=cls.data_handler_transformation, **kwargs) + if kwargs.get("filter_add_unfiltered", False) is False: + return transformation_filtered + else: + transformation_unfiltered = super().transformation(set_stations, tmp_path=tmp_path, + dh_transformation=cls.data_handler_unfiltered, **kwargs) + return {"filtered": transformation_filtered, "unfiltered": transformation_unfiltered} + + def get_X_original(self): + if self.use_filter_branches is True: + X = [] + for data in self._collection: + if hasattr(data, "filter_dim"): + X_total = data.get_X() + filter_dim = data.filter_dim + for filter_name in data.filter_dim_order: + 
X.append(X_total.sel({filter_dim: filter_name}, drop=True)) + else: + X.append(data.get_X()) + return X + else: + return super().get_X_original() class DataHandlerSeparationOfScalesSingleStation(DataHandlerMixedSamplingWithKzFilterSingleStation): diff --git a/mlair/data_handler/data_handler_neighbors.py b/mlair/data_handler/data_handler_neighbors.py index 6c87946eaad5568e1ff59c3988bf8fe469442641..26fe6f8549639019169bf0b894186d354eaddc83 100644 --- a/mlair/data_handler/data_handler_neighbors.py +++ b/mlair/data_handler/data_handler_neighbors.py @@ -1,6 +1,10 @@ __author__ = 'Lukas Leufen' __date__ = '2020-07-17' +""" +WARNING: This data handler is just a prototype and has not been validated to work properly! Use it with caution! +""" + import datetime as dt import numpy as np @@ -19,7 +23,7 @@ number = Union[float, int] num_or_list = Union[number, List[number]] -class DataHandlerNeighbors(DefaultDataHandler): +class DataHandlerNeighbors(DefaultDataHandler): # pragma: no cover """Data handler including neighboring stations.""" def __init__(self, id_class, data_path, neighbors=None, min_length=0, @@ -48,7 +52,7 @@ class DataHandlerNeighbors(DefaultDataHandler): return [super(DataHandlerNeighbors, self).get_coordinates()].append(neighbors) -def run_data_prep(): +def run_data_prep(): # pragma: no cover """Comment: methods just to start write meaningful test routines.""" data = DummyDataHandler("main_class") data.get_X() @@ -63,7 +67,7 @@ def run_data_prep(): data_prep.get_data(upsampling=False) -def create_data_prep(): +def create_data_prep(): # pragma: no cover """Comment: methods just to start write meaningful test routines.""" path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata") station_type = None @@ -91,7 +95,7 @@ def create_data_prep(): return data_prep -class DummyDataHandler(AbstractDataHandler): +class DummyDataHandler(AbstractDataHandler): # pragma: no cover def __init__(self, name, number_of_samples=None): """This data handler takes a 
name argument and the number of samples to generate. If not provided, a random diff --git a/mlair/data_handler/data_handler_single_station.py b/mlair/data_handler/data_handler_single_station.py index 4330efd9ee5d3ae8a64c6eb9b95a0c58e18b3c36..88a57d108e4533968eeb9a65aabf575fae085704 100644 --- a/mlair/data_handler/data_handler_single_station.py +++ b/mlair/data_handler/data_handler_single_station.py @@ -5,6 +5,8 @@ __date__ = '2020-07-20' import copy import datetime as dt +import gc + import dill import hashlib import logging @@ -61,7 +63,7 @@ class DataHandlerSingleStation(AbstractDataHandler): interpolation_method: Union[str, Tuple[str]] = DEFAULT_INTERPOLATION_METHOD, overwrite_local_data: bool = False, transformation=None, store_data_locally: bool = True, min_length: int = 0, start=None, end=None, variables=None, data_origin: Dict = None, - lazy_preprocessing: bool = False, **kwargs): + lazy_preprocessing: bool = False, overwrite_lazy_data=False, **kwargs): super().__init__() self.station = helpers.to_list(station) self.path = self.setup_data_path(data_path, sampling) @@ -92,6 +94,7 @@ class DataHandlerSingleStation(AbstractDataHandler): self.interpolation_method = interpolation_method self.overwrite_local_data = overwrite_local_data + self.overwrite_lazy_data = True if self.overwrite_local_data is True else overwrite_lazy_data self.store_data_locally = store_data_locally self.min_length = min_length self.start = start @@ -107,6 +110,13 @@ class DataHandlerSingleStation(AbstractDataHandler): # create samples self.setup_samples() + self.clean_up() + + def clean_up(self): + self._data = None + self.input_data = None + self.target_data = None + gc.collect() def __str__(self): return self.station[0] @@ -213,7 +223,8 @@ class DataHandlerSingleStation(AbstractDataHandler): elif method == "centre": return statistics.centre_apply(data, mean), {"mean": mean, "method": method} elif method == "min_max": - return statistics.min_max_apply(data, min, max), {"min": min, "max": 
max, "method": method, + kws = {"feature_range": feature_range} if feature_range is not None else {} + return statistics.min_max_apply(data, min, max, **kws), {"min": min, "max": max, "method": method, "feature_range": feature_range} elif method == "log": return statistics.log_apply(data, mean, std), {"mean": mean, "std": std, "method": method} @@ -253,7 +264,7 @@ class DataHandlerSingleStation(AbstractDataHandler): hash = self._get_hash() filename = os.path.join(self.lazy_path, hash + ".pickle") if not os.path.exists(filename): - dill.dump(self._create_lazy_data(), file=open(filename, "wb")) + dill.dump(self._create_lazy_data(), file=open(filename, "wb"), protocol=4) def _create_lazy_data(self): return [self._data, self.meta, self.input_data, self.target_data] @@ -262,6 +273,8 @@ class DataHandlerSingleStation(AbstractDataHandler): hash = self._get_hash() filename = os.path.join(self.lazy_path, hash + ".pickle") try: + if self.overwrite_lazy_data is True: + raise FileNotFoundError with open(filename, "rb") as pickle_file: lazy_data = dill.load(pickle_file) self._extract_lazy(lazy_data) @@ -404,8 +417,7 @@ class DataHandlerSingleStation(AbstractDataHandler): """ chem_vars = ["benzene", "ch4", "co", "ethane", "no", "no2", "nox", "o3", "ox", "pm1", "pm10", "pm2p5", "propane", "so2", "toluene"] - # used_chem_vars = list(set(chem_vars) & set(self.statistics_per_var.keys())) - used_chem_vars = list(set(chem_vars) & set(data.variables.values)) + used_chem_vars = list(set(chem_vars) & set(data.coords[self.target_dim].values)) if len(used_chem_vars) > 0: data.loc[..., used_chem_vars] = data.loc[..., used_chem_vars].clip(min=minimum) return data @@ -451,11 +463,8 @@ class DataHandlerSingleStation(AbstractDataHandler): :return: this array """ ind = pd.DataFrame({'val': index_value}, index=index_value) - # res = xr.Dataset.from_dataframe(ind).to_array().rename({'index': index_name}).squeeze(dim=squeez/e_dim, drop=True) res = 
xr.Dataset.from_dataframe(ind).to_array(squeeze_dim).rename({'index': index_name}).squeeze( - dim=squeeze_dim, - drop=True - ) + dim=squeeze_dim, drop=True) res.name = index_name return res @@ -584,6 +593,12 @@ class DataHandlerSingleStation(AbstractDataHandler): non_nan_history = self.history.dropna(dim=dim) non_nan_label = self.label.dropna(dim=dim) non_nan_observation = self.observation.dropna(dim=dim) + if non_nan_label.coords[dim].shape[0] == 0: + raise ValueError(f'self.label consist of NaNs only - station {self.station} is therefore dropped') + if non_nan_history.coords[dim].shape[0] == 0: + raise ValueError(f'self.history consist of NaNs only - station {self.station} is therefore dropped') + if non_nan_observation.coords[dim].shape[0] == 0: + raise ValueError(f'self.observation consist of NaNs only - station {self.station} is therefore dropped') intersect = reduce(np.intersect1d, (non_nan_history.coords[dim].values, non_nan_label.coords[dim].values, non_nan_observation.coords[dim].values)) @@ -738,8 +753,6 @@ class DataHandlerSingleStation(AbstractDataHandler): if __name__ == "__main__": - # dp = AbstractDataPrep('data/', 'dummy', 'DEBW107', ['o3', 'temp'], statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}) - # print(dp) statistics_per_var = {'o3': 'dma8eu', 'temp-rea-miub': 'maximum'} sp = DataHandlerSingleStation(data_path='/home/felix/PycharmProjects/mlt_new/data/', station='DEBY122', statistics_per_var=statistics_per_var, station_type='background', diff --git a/mlair/data_handler/data_handler_with_filter.py b/mlair/data_handler/data_handler_with_filter.py index e76f396aea80b2db76e01ea5baacf71d024b0d23..4707fd580562a68fd6b2dc0843551905e70d7e50 100644 --- a/mlair/data_handler/data_handler_with_filter.py +++ b/mlair/data_handler/data_handler_with_filter.py @@ -4,6 +4,7 @@ __author__ = 'Lukas Leufen' __date__ = '2020-08-26' import inspect +import copy import numpy as np import pandas as pd import xarray as xr @@ -12,7 +13,7 @@ from functools import 
partial import logging from mlair.data_handler.data_handler_single_station import DataHandlerSingleStation from mlair.data_handler import DefaultDataHandler -from mlair.helpers import remove_items, to_list, TimeTrackingWrapper +from mlair.helpers import remove_items, to_list, TimeTrackingWrapper, statistics from mlair.helpers.filter import KolmogorovZurbenkoFilterMovingWindow as KZFilter from mlair.helpers.filter import FIRFilter, ClimateFIRFilter, omega_null_kzf @@ -126,32 +127,16 @@ class DataHandlerFilter(DefaultDataHandler): list_of_args = arg_spec.args + arg_spec.kwonlyargs + super_own_args return remove_items(list_of_args, ["self"] + list(args)) - def get_X_original(self): - if self.use_filter_branches is True: - X = [] - for data in self._collection: - X_total = data.get_X() - filter_dim = data.filter_dim - for filter_name in data.filter_dim_order: - X.append(X_total.sel({filter_dim: filter_name}, drop=True)) - return X - else: - return super().get_X_original() - class DataHandlerFirFilterSingleStation(DataHandlerFilterSingleStation): """Data handler for a single station to be used by a superior data handler. 
Inputs are FIR filtered.""" _requirements = remove_items(DataHandlerFilterSingleStation.requirements(), "station") - _hash = DataHandlerFilterSingleStation._hash + ["filter_cutoff_period", "filter_order", "filter_window_type", - "_add_unfiltered"] + _hash = DataHandlerFilterSingleStation._hash + ["filter_cutoff_period", "filter_order", "filter_window_type"] DEFAULT_WINDOW_TYPE = ("kaiser", 5) - DEFAULT_ADD_UNFILTERED = False - def __init__(self, *args, filter_cutoff_period, filter_order, filter_window_type=DEFAULT_WINDOW_TYPE, - filter_add_unfiltered=DEFAULT_ADD_UNFILTERED, **kwargs): - # self._check_sampling(**kwargs) + def __init__(self, *args, filter_cutoff_period, filter_order, filter_window_type=DEFAULT_WINDOW_TYPE, **kwargs): # self.original_data = None # ToDo: implement here something to store unfiltered data self.fs = self._get_fs(**kwargs) if filter_window_type == "kzf": @@ -161,7 +146,7 @@ class DataHandlerFirFilterSingleStation(DataHandlerFilterSingleStation): assert len(self.filter_cutoff_period) == (len(filter_order) - len(removed_index)) self.filter_order = self._prepare_filter_order(filter_order, removed_index, self.fs) self.filter_window_type = filter_window_type - self._add_unfiltered = filter_add_unfiltered + self.unfiltered_name = "unfiltered" super().__init__(*args, **kwargs) @staticmethod @@ -223,8 +208,6 @@ class DataHandlerFirFilterSingleStation(DataHandlerFilterSingleStation): self.filter_window_type, self.target_dim) self.fir_coeff = fir.filter_coefficients() fir_data = fir.filtered_data() - if self._add_unfiltered is True: - fir_data.append(self.input_data) self.input_data = xr.concat(fir_data, pd.Index(self.create_filter_index(), name=self.filter_dim)) # this is just a code snippet to check the results of the kz filter # import matplotlib @@ -249,8 +232,6 @@ class DataHandlerFirFilterSingleStation(DataHandlerFilterSingleStation): else: index.append(f"band{band_num}") band_num += 1 - if self._add_unfiltered: - index.append("unfiltered") 
self.filter_dim_order = index return pd.Index(index, name=self.filter_dim) @@ -261,13 +242,89 @@ class DataHandlerFirFilterSingleStation(DataHandlerFilterSingleStation): _data, _meta, _input_data, _target_data, self.fir_coeff, self.filter_dim_order = lazy_data super(__class__, self)._extract_lazy((_data, _meta, _input_data, _target_data)) + def transform(self, data_in, dim: Union[str, int] = 0, inverse: bool = False, opts=None, + transformation_dim=None): + """ + Transform data according to given transformation settings. + + This function transforms a xarray.dataarray (along dim) or pandas.DataFrame (along axis) either with mean=0 + and std=1 (`method=standardise`) or centers the data with mean=0 and no change in data scale + (`method=centre`). Furthermore, this sets an internal instance attribute for later inverse transformation. This + method will raise an AssertionError if an internal transform method was already set ('inverse=False') or if the + internal transform method, internal mean and internal standard deviation weren't set ('inverse=True'). + + :param string/int dim: This param is not used for inverse transformation. + | for xarray.DataArray as string: name of dimension which should be standardised + | for pandas.DataFrame as int: axis of dimension which should be standardised + :param inverse: Switch between transformation and inverse transformation. + + :return: xarray.DataArrays or pandas.DataFrames: + #. mean: Mean of data + #. std: Standard deviation of data + #. 
data: Standardised data + """ + + if transformation_dim is None: + transformation_dim = self.DEFAULT_TARGET_DIM + + def f(data, method="standardise", feature_range=None): + if method == "standardise": + return statistics.standardise(data, dim) + elif method == "centre": + return statistics.centre(data, dim) + elif method == "min_max": + kwargs = {"feature_range": feature_range} if feature_range is not None else {} + return statistics.min_max(data, dim, **kwargs) + elif method == "log": + return statistics.log(data, dim) + else: + raise NotImplementedError + + def f_apply(data, method, **kwargs): + for k, v in kwargs.items(): + if not (isinstance(v, xr.DataArray) or v is None): + _, opts = statistics.min_max(data, dim) + helper = xr.ones_like(opts['min']) + kwargs[k] = helper * v + mean = kwargs.pop('mean', None) + std = kwargs.pop('std', None) + min = kwargs.pop('min', None) + max = kwargs.pop('max', None) + feature_range = kwargs.pop('feature_range', None) + + if method == "standardise": + return statistics.standardise_apply(data, mean, std), {"mean": mean, "std": std, "method": method} + elif method == "centre": + return statistics.centre_apply(data, mean), {"mean": mean, "method": method} + elif method == "min_max": + return statistics.min_max_apply(data, min, max), {"min": min, "max": max, "method": method, + "feature_range": feature_range} + elif method == "log": + return statistics.log_apply(data, mean, std), {"mean": mean, "std": std, "method": method} + else: + raise NotImplementedError + + opts = opts or {} + opts_updated = {} + if not inverse: + transformed_values = [] + for var in data_in.variables.values: + data_var = data_in.sel(**{transformation_dim: [var]}) + var_opts = opts.get(var, {}) + _apply = (var_opts.get("mean", None) is not None) or (var_opts.get("min") is not None) + values, new_var_opts = locals()["f_apply" if _apply else "f"](data_var, **var_opts) + opts_updated[var] = copy.deepcopy(new_var_opts) + transformed_values.append(values) + 
return xr.concat(transformed_values, dim=transformation_dim), opts_updated + else: + return self.inverse_transform(data_in, opts, transformation_dim) + class DataHandlerFirFilter(DataHandlerFilter): """Data handler using FIR filtered data.""" data_handler = DataHandlerFirFilterSingleStation data_handler_transformation = DataHandlerFirFilterSingleStation - _requirements = data_handler.requirements() class DataHandlerKzFilterSingleStation(DataHandlerFilterSingleStation): @@ -393,13 +450,8 @@ class DataHandlerClimateFirFilterSingleStation(DataHandlerFirFilterSingleStation climate_filter.filtered_data] # create input data with filter index - input_data = xr.concat(climate_filter_data, pd.Index(self.create_filter_index(), name=self.filter_dim)) - - # add unfiltered raw data - if self._add_unfiltered is True: - data_raw = self.shift(self.input_data, self.time_dim, -self.window_history_size) - data_raw = data_raw.expand_dims({self.filter_dim: ["unfiltered"]}, -1) - input_data = xr.concat([input_data, data_raw], self.filter_dim) + input_data = xr.concat(climate_filter_data, pd.Index(self.create_filter_index(add_unfiltered_index=False), + name=self.filter_dim)) self.input_data = input_data @@ -410,7 +462,7 @@ class DataHandlerClimateFirFilterSingleStation(DataHandlerFirFilterSingleStation # self.input_data.sel(filter="low", variables="temp", Stations="DEBW107").plot() # self.input_data.sel(variables="temp", Stations="DEBW107").plot.line(hue="filter") - def create_filter_index(self) -> pd.Index: + def create_filter_index(self, add_unfiltered_index=True) -> pd.Index: """ Round cut off periods in days and append 'res' for residuum index. 
@@ -421,8 +473,6 @@ class DataHandlerClimateFirFilterSingleStation(DataHandlerFirFilterSingleStation f = lambda x: int(np.round(x)) if x >= 10 else np.round(x, 1) index = list(map(f, index.tolist())) index = list(map(lambda x: str(x) + "d", index)) + ["res"] - if self._add_unfiltered: - index.append("unfiltered") self.filter_dim_order = index return pd.Index(index, name=self.filter_dim) @@ -491,11 +541,3 @@ class DataHandlerClimateFirFilter(DataHandlerFilter): _requirements = data_handler.requirements() _store_attributes = data_handler.store_attributes() - # def get_X_original(self): - # X = [] - # for data in self._collection: - # X_total = data.get_X() - # filter_dim = data.filter_dim - # for filter in data.filter_dim_order: - # X.append(X_total.sel({filter_dim: filter}, drop=True)) - # return X diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index a17de95407a74d1504877fdce03a82d1c943e868..6fa7952d4bc0a278f17f767073969822924b6d5f 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -8,6 +8,7 @@ import gc import logging import os import pickle +import random import dill import shutil from functools import reduce @@ -52,6 +53,7 @@ class DefaultDataHandler(AbstractDataHandler): self._Y = None self._X_extreme = None self._Y_extreme = None + self._data_intersection = None self._use_multiprocessing = use_multiprocessing self._max_number_multiprocessing = max_number_multiprocessing _name_affix = str(f"{str(self.id_class)}_{name_affix}" if name_affix is not None else id(self)) @@ -92,7 +94,7 @@ class DefaultDataHandler(AbstractDataHandler): data = {"X": self._X, "Y": self._Y, "X_extreme": self._X_extreme, "Y_extreme": self._Y_extreme} data = self._force_dask_computation(data) with open(self._save_file, "wb") as f: - dill.dump(data, f) + dill.dump(data, f, protocol=4) logging.debug(f"save pickle data to {self._save_file}") self._reset_data() @@ -132,7 +134,7 @@ 
class DefaultDataHandler(AbstractDataHandler): return X, Y def __repr__(self): - return ";".join(list(map(lambda x: str(x), self._collection))) + return str(self._collection[0]) def get_X_original(self): X = [] @@ -171,10 +173,15 @@ class DefaultDataHandler(AbstractDataHandler): else: X = list(map(lambda x: x.sel({dim: intersect}), X_original)) Y = Y_original.sel({dim: intersect}) + self._data_intersection = intersect self._X, self._Y = X, Y def get_observation(self): - return self.id_class.observation.copy().squeeze() + dim = self.time_dim + if self._data_intersection is not None: + return self.id_class.observation.sel({dim: self._data_intersection}).copy().squeeze() + else: + return self.id_class.observation.copy().squeeze() def apply_transformation(self, data, base="target", dim=0, inverse=False): return self.id_class.apply_transformation(data, dim=dim, base=base, inverse=inverse) @@ -214,30 +221,29 @@ class DefaultDataHandler(AbstractDataHandler): raise TypeError(f"Elements of list extreme_values have to be {number.__args__}, but at least element " f"{i} is type {type(i)}") + extremes_X, extremes_Y = None, None for extr_val in sorted(extreme_values): # check if some extreme values are already extracted - if (self._X_extreme is None) or (self._Y_extreme is None): - X = self._X - Y = self._Y + if (extremes_X is None) or (extremes_Y is None): + X, Y = self._X, self._Y + extremes_X, extremes_Y = X, Y else: # one extr value iteration is done already: self.extremes_label is NOT None... 
- X = self._X_extreme - Y = self._Y_extreme + X, Y = self._X_extreme, self._Y_extreme # extract extremes based on occurrence in labels other_dims = remove_items(list(Y.dims), dim) if extremes_on_right_tail_only: - extreme_idx = (Y > extr_val).any(dim=other_dims) + extreme_idx = (extremes_Y > extr_val).any(dim=other_dims) else: - extreme_idx = xr.concat([(Y < -extr_val).any(dim=other_dims[0]), - (Y > extr_val).any(dim=other_dims[0])], - dim=other_dims[1]).any(dim=other_dims[1]) + extreme_idx = xr.concat([(extremes_Y < -extr_val).any(dim=other_dims[0]), + (extremes_Y > extr_val).any(dim=other_dims[0])], + dim=other_dims[0]).any(dim=other_dims[0]) - extremes_X = list(map(lambda x: x.sel(**{dim: extreme_idx}), X)) + sel = extreme_idx[extreme_idx].coords[dim].values + extremes_X = list(map(lambda x: x.sel(**{dim: sel}), extremes_X)) self._add_timedelta(extremes_X, dim, timedelta) - # extremes_X = list(map(lambda x: x.coords[dim].values + np.timedelta64(*timedelta), extremes_X)) - - extremes_Y = Y.sel(**{dim: extreme_idx}) - extremes_Y.coords[dim].values += np.timedelta64(*timedelta) + extremes_Y = extremes_Y.sel(**{dim: extreme_idx}) + self._add_timedelta([extremes_Y], dim, timedelta) self._Y_extreme = xr.concat([Y, extremes_Y], dim=dim) self._X_extreme = list(map(lambda x1, x2: xr.concat([x1, x2], dim=dim), X, extremes_X)) @@ -245,10 +251,10 @@ class DefaultDataHandler(AbstractDataHandler): @staticmethod def _add_timedelta(data, dim, timedelta): for d in data: - d.coords[dim].values += np.timedelta64(*timedelta) + d.coords[dim] = d.coords[dim].values + np.timedelta64(*timedelta) @classmethod - def transformation(cls, set_stations, **kwargs): + def transformation(cls, set_stations, tmp_path=None, dh_transformation=None, **kwargs): """ ### supported transformation methods @@ -278,53 +284,45 @@ class DefaultDataHandler(AbstractDataHandler): If min and max are not None, the default data handler expects this parameters to match the data and applies this values to the data. 
Make sure that all dimensions and/or coordinates are in agreement. """ + if dh_transformation is None: + dh_transformation = cls.data_handler_transformation - sp_keys = {k: copy.deepcopy(kwargs[k]) for k in cls._requirements if k in kwargs} + sp_keys = {k: copy.deepcopy(kwargs[k]) for k in dh_transformation.requirements() if k in kwargs} if "transformation" not in sp_keys.keys(): return transformation_dict = ({}, {}) - def _inner(): - """Inner method that is performed in both serial and parallel approach.""" - if dh is not None: - for i, transformation in enumerate(dh._transformation): - for var in transformation.keys(): - if var not in transformation_dict[i].keys(): - transformation_dict[i][var] = {} - opts = transformation[var] - if not transformation_dict[i][var].get("method", opts["method"]) == opts["method"]: - # data handlers with filters are allowed to change transformation method to standardise - assert hasattr(dh, "filter_dim") and opts["method"] == "standardise" - transformation_dict[i][var]["method"] = opts["method"] - for k in ["mean", "std", "min", "max"]: - old = transformation_dict[i][var].get(k, None) - new = opts.get(k) - transformation_dict[i][var][k] = new if old is None else old.combine_first(new) - if "feature_range" in opts.keys(): - transformation_dict[i][var]["feature_range"] = opts.get("feature_range", None) - max_process = kwargs.get("max_number_multiprocessing", 16) n_process = min([psutil.cpu_count(logical=False), len(set_stations), max_process]) # use only physical cpus if n_process > 1 and kwargs.get("use_multiprocessing", True) is True: # parallel solution logging.info("use parallel transformation approach") - pool = multiprocessing.Pool( - min([psutil.cpu_count(logical=False), len(set_stations), 16])) # use only physical cpus + pool = multiprocessing.Pool(n_process) # use only physical cpus logging.info(f"running {getattr(pool, '_processes')} processes in parallel") + sp_keys.update({"tmp_path": tmp_path, "return_strategy": 
"reference"}) output = [ - pool.apply_async(f_proc, args=(cls.data_handler_transformation, station), kwds=sp_keys) + pool.apply_async(f_proc, args=(dh_transformation, station), kwds=sp_keys) for station in set_stations] for p in output: - dh, s = p.get() - _inner() + _res_file, s = p.get() + with open(_res_file, "rb") as f: + dh = dill.load(f) + os.remove(_res_file) + transformation_dict = cls.update_transformation_dict(dh, transformation_dict) pool.close() else: # serial solution logging.info("use serial transformation approach") + sp_keys.update({"return_strategy": "result"}) for station in set_stations: - dh, s = f_proc(cls.data_handler_transformation, station, **sp_keys) - _inner() + dh, s = f_proc(dh_transformation, station, **sp_keys) + transformation_dict = cls.update_transformation_dict(dh, transformation_dict) # aggregate all information iter_dim = sp_keys.get("iter_dim", cls.DEFAULT_ITER_DIM) + transformation_dict = cls.aggregate_transformation(transformation_dict, iter_dim) + return transformation_dict + + @classmethod + def aggregate_transformation(cls, transformation_dict, iter_dim): pop_list = [] for i, transformation in enumerate(transformation_dict): for k in transformation.keys(): @@ -345,19 +343,47 @@ class DefaultDataHandler(AbstractDataHandler): transformation_dict[i].pop(k) return transformation_dict + @classmethod + def update_transformation_dict(cls, dh, transformation_dict): + """Inner method that is performed in both serial and parallel approach.""" + if dh is not None: + for i, transformation in enumerate(dh._transformation): + for var in transformation.keys(): + if var not in transformation_dict[i].keys(): + transformation_dict[i][var] = {} + opts = transformation[var] + if not transformation_dict[i][var].get("method", opts["method"]) == opts["method"]: + # data handlers with filters are allowed to change transformation method to standardise + assert hasattr(dh, "filter_dim") and opts["method"] == "standardise" + 
transformation_dict[i][var]["method"] = opts["method"] + for k in ["mean", "std", "min", "max"]: + old = transformation_dict[i][var].get(k, None) + new = opts.get(k) + transformation_dict[i][var][k] = new if old is None else old.combine_first(new) + if "feature_range" in opts.keys(): + transformation_dict[i][var]["feature_range"] = opts.get("feature_range", None) + return transformation_dict + def get_coordinates(self): return self.id_class.get_coordinates() -def f_proc(data_handler, station, **sp_keys): +def f_proc(data_handler, station, return_strategy="", tmp_path=None, **sp_keys): """ Try to create a data handler for given arguments. If build fails, this station does not fulfil all requirements and therefore f_proc will return None as indication. On a successful build, f_proc returns the built data handler and the station that was used. This function must be implemented globally to work together with multiprocessing. """ + assert return_strategy in ["result", "reference"] try: res = data_handler(station, **sp_keys) except (AttributeError, EmptyQueryResult, KeyError, ValueError) as e: logging.info(f"remove station {station} because it raised an error: {e}") res = None - return res, station + if return_strategy == "result": + return res, station + else: + _tmp_file = os.path.join(tmp_path, f"{station}_{'%032x' % random.getrandbits(128)}.pickle") + with open(_tmp_file, "wb") as f: + dill.dump(res, f, protocol=4) + return _tmp_file, station diff --git a/mlair/data_handler/bootstraps.py b/mlair/data_handler/input_bootstraps.py similarity index 91% rename from mlair/data_handler/bootstraps.py rename to mlair/data_handler/input_bootstraps.py index e03881484bfc9b8275ede8a4432072c74643994a..187f09050bb39a953ac58c2b7fca54b6a207aed1 100644 --- a/mlair/data_handler/bootstraps.py +++ b/mlair/data_handler/input_bootstraps.py @@ -28,12 +28,13 @@ class BootstrapIterator(Iterator): _position: int = None - def __init__(self, data: "BootStraps", method): - assert isinstance(data, 
BootStraps) + def __init__(self, data: "Bootstraps", method, return_reshaped=False): + assert isinstance(data, Bootstraps) self._data = data self._dimension = data.bootstrap_dimension self.boot_dim = "boots" self._method = method + self._return_reshaped = return_reshaped self._collection = self.create_collection(self._data.data, self._dimension) self._position = 0 @@ -46,12 +47,15 @@ class BootstrapIterator(Iterator): raise NotImplementedError def _reshape(self, d): - if isinstance(d, list): - return list(map(lambda x: self._reshape(x), d)) - # return list(map(lambda x: np.rollaxis(x, -1, 0).reshape(x.shape[0] * x.shape[-1], *x.shape[1:-1]), d)) + if self._return_reshaped: + if isinstance(d, list): + return list(map(lambda x: self._reshape(x), d)) + # return list(map(lambda x: np.rollaxis(x, -1, 0).reshape(x.shape[0] * x.shape[-1], *x.shape[1:-1]), d)) + else: + shape = d.shape + return np.rollaxis(d, -1, 0).reshape(shape[0] * shape[-1], *shape[1:-1]) else: - shape = d.shape - return np.rollaxis(d, -1, 0).reshape(shape[0] * shape[-1], *shape[1:-1]) + return d def _to_numpy(self, d): if isinstance(d, list): @@ -75,8 +79,8 @@ class BootstrapIterator(Iterator): class BootstrapIteratorSingleInput(BootstrapIterator): _position: int = None - def __init__(self, *args): - super().__init__(*args) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def __next__(self): """Return next element or stop iteration.""" @@ -107,8 +111,8 @@ class BootstrapIteratorSingleInput(BootstrapIterator): class BootstrapIteratorVariable(BootstrapIterator): - def __init__(self, *args): - super().__init__(*args) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) def __next__(self): """Return next element or stop iteration.""" @@ -140,8 +144,8 @@ class BootstrapIteratorVariable(BootstrapIterator): class BootstrapIteratorBranch(BootstrapIterator): - def __init__(self, *args): - super().__init__(*args) + def __init__(self, *args, **kwargs): + 
super().__init__(*args, **kwargs) def __next__(self): try: @@ -184,7 +188,7 @@ class MeanBootstraps: return np.ones_like(data) * self._mean -class BootStraps(Iterable): +class Bootstraps(Iterable): """ Main class to perform bootstrap operations. diff --git a/mlair/helpers/filter.py b/mlair/helpers/filter.py index a63cef975888162f335e4528c2f99bdfc7a892d5..36c93b04486fc9be013af2c4f34d2b3ee1bd84c2 100644 --- a/mlair/helpers/filter.py +++ b/mlair/helpers/filter.py @@ -173,8 +173,10 @@ class ClimateFIRFilter: # visualize if self.plot_path is not None: - self.PlotClimateFirFilter(self.plot_path, self.plot_data, sampling, plot_name) - # self._plot(sampling, new_dim=new_dim) + try: + self.PlotClimateFirFilter(self.plot_path, self.plot_data, sampling, plot_name) + except Exception as e: + logging.info(f"Could not plot climate fir filter due to following reason:\n{e}") @staticmethod def _minimum_length(order, minimum_length, pos, window): @@ -382,7 +384,7 @@ class ClimateFIRFilter: _end = pd.to_datetime(data.coords[time_dim].max().values).year filt_coll = [] for _year in range(_start, _end + 1): - logging.info(f"{data.coords['Stations'].values[0]} ({var}): year={_year}") + logging.debug(f"{data.coords['Stations'].values[0]} ({var}): year={_year}") time_slice = self._create_time_range_extend(_year, sampling, extend_length_history) d = data.sel({var_dim: [var], time_dim: time_slice}) @@ -504,137 +506,6 @@ class ClimateFIRFilter: res.name = index_name return res - def _plot(self, sampling, new_dim="window"): - h = None - td_type = {"1d": "D", "1H": "h"}.get(sampling) - if self.plot_path is None: - return - plot_folder = os.path.join(os.path.abspath(self.plot_path), "climFIR") - if not os.path.exists(plot_folder): - os.makedirs(plot_folder) - - # set plot parameter - rc_params = {'axes.labelsize': 'large', - 'xtick.labelsize': 'large', - 'ytick.labelsize': 'large', - 'legend.fontsize': 'medium', - 'axes.titlesize': 'large', - } - plt.rcParams.update(rc_params) - - plot_dict = {} 
- for i, o in enumerate(range(len(self.plot_data))): - plot_data = self.plot_data[i] - for p_d in plot_data: - var = p_d.get("var") - t0 = p_d.get("t0") - filter_input = p_d.get("filter_input") - filter_input_nc = p_d.get("filter_input_nc") - valid_range = p_d.get("valid_range") - time_range = p_d.get("time_range") - new_dim = p_d.get("new_dim") - h = p_d.get("h") - plot_dict_var = plot_dict.get(var, {}) - plot_dict_t0 = plot_dict_var.get(t0, {}) - plot_dict_order = {"filter_input": filter_input, - "filter_input_nc": filter_input_nc, - "valid_range": valid_range, - "time_range": time_range, - "order": o, "h": h} - plot_dict_t0[i] = plot_dict_order - plot_dict_var[t0] = plot_dict_t0 - plot_dict[var] = plot_dict_var - - for var, viz_date_dict in plot_dict.items(): - for it0, t0 in enumerate(viz_date_dict.keys()): - viz_data = viz_date_dict[t0] - residuum_true = None - for ifilter in sorted(viz_data.keys()): - data = viz_data[ifilter] - filter_input = data["filter_input"] - filter_input_nc = data["filter_input_nc"] if residuum_true is None else residuum_true.sel( - {new_dim: filter_input.coords[new_dim]}) - valid_range = data["valid_range"] - time_axis = data["time_range"] - # time_axis = pd.date_range(t_minus, t_plus, freq=sampling) - filter_order = data["order"] - h = data["h"] - t_minus = t0 + np.timedelta64(-int(1.5 * valid_range.start), td_type) - t_plus = t0 + np.timedelta64(int(0.5 * valid_range.start), td_type) - fig, ax = plt.subplots() - ax.axvspan(t0 + np.timedelta64(-valid_range.start, td_type), - t0 + np.timedelta64(valid_range.stop - 1, td_type), color="whitesmoke", - label="valid area") - ax.axvline(t0, color="lightgrey", lw=6, label="time of interest ($t_0$)") - - # original data - ax.plot(time_axis, filter_input_nc.values.flatten(), color="darkgrey", linestyle="dashed", - label="original") - - # clim apriori - if ifilter == 0: - d_tmp = filter_input.sel( - {new_dim: slice(0, filter_input.coords[new_dim].values.max())}).values.flatten() - else: - d_tmp 
= filter_input.values.flatten() - ax.plot(time_axis[len(time_axis) - len(d_tmp):], d_tmp, color="darkgrey", linestyle="solid", - label="estimated future") - - # clim filter response - filt = xr.apply_ufunc(fir_filter_convolve, filter_input, - input_core_dims=[[new_dim]], - output_core_dims=[[new_dim]], - vectorize=True, - kwargs={"h": h}, - output_dtypes=[filter_input.dtype]) - ax.plot(time_axis, filt.values.flatten(), color="black", linestyle="solid", - label="clim filter response", linewidth=2) - residuum_estimated = filter_input - filt - - # ideal filter response - filt = xr.apply_ufunc(fir_filter_convolve, filter_input_nc, - input_core_dims=[[new_dim]], - output_core_dims=[[new_dim]], - vectorize=True, - kwargs={"h": h}, - output_dtypes=[filter_input.dtype]) - ax.plot(time_axis, filt.values.flatten(), color="black", linestyle="dashed", - label="ideal filter response", linewidth=2) - residuum_true = filter_input_nc - filt - - # set title, legend, and save plot - ax_start = max(t_minus, time_axis[0]) - ax_end = min(t_plus, time_axis[-1]) - ax.set_xlim((ax_start, ax_end)) - plt.title(f"Input of ClimFilter ({str(var)})") - plt.legend() - fig.autofmt_xdate() - plt.tight_layout() - plot_name = os.path.join(plot_folder, - f"climFIR_{self.plot_name}_{str(var)}_{it0}_{ifilter}.pdf") - plt.savefig(plot_name, dpi=300) - plt.close('all') - - # plot residuum - fig, ax = plt.subplots() - ax.axvspan(t0 + np.timedelta64(-valid_range.start, td_type), - t0 + np.timedelta64(valid_range.stop - 1, td_type), color="whitesmoke", - label="valid area") - ax.axvline(t0, color="lightgrey", lw=6, label="time of interest ($t_0$)") - ax.plot(time_axis, residuum_true.values.flatten(), color="black", linestyle="dashed", - label="ideal filter residuum", linewidth=2) - ax.plot(time_axis, residuum_estimated.values.flatten(), color="black", linestyle="solid", - label="clim filter residuum", linewidth=2) - ax.set_xlim((ax_start, ax_end)) - plt.title(f"Residuum of ClimFilter ({str(var)})") - 
plt.legend(loc="upper left") - fig.autofmt_xdate() - plt.tight_layout() - plot_name = os.path.join(plot_folder, - f"climFIR_{self.plot_name}_{str(var)}_{it0}_{ifilter}_residuum.pdf") - plt.savefig(plot_name, dpi=300) - plt.close('all') - @property def filter_coefficients(self): return self._h @@ -897,6 +768,7 @@ class KolmogorovZurbenkoFilterMovingWindow(KolmogorovZurbenkoBaseClass): def firwin_kzf(m, k): + m, k = int(m), int(k) coef = np.ones(m) for i in range(1, k): t = np.zeros((m, m + i * (m - 1))) diff --git a/mlair/helpers/helpers.py b/mlair/helpers/helpers.py index 5ddaa3ee3fe505eeb7c8082274d9cd888cec720f..1f5a86cde01752b74be82476e2e0fd8cad514a9e 100644 --- a/mlair/helpers/helpers.py +++ b/mlair/helpers/helpers.py @@ -4,6 +4,7 @@ __date__ = '2019-10-21' import inspect import math +import sys import numpy as np import xarray as xr @@ -80,8 +81,10 @@ def remove_items(obj: Union[List, Dict, Tuple], items: Any): def remove_from_list(list_obj, item_list): """Remove implementation for lists.""" - if len(items) > 1: + if len(item_list) > 1: return [e for e in list_obj if e not in item_list] + elif len(item_list) == 0: + return list_obj else: list_obj = list_obj.copy() try: @@ -179,3 +182,34 @@ def convert2xrda(arr: Union[xr.DataArray, xr.Dataset, np.ndarray, int, float], kwargs.update({'dims': dims, 'coords': coords}) return xr.DataArray(arr, **kwargs) + + +# def convert_size(size_bytes): +# if size_bytes == 0: +# return "0B" +# size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") +# i = int(math.floor(math.log(size_bytes, 1024))) +# p = math.pow(1024, i) +# s = round(size_bytes / p, 2) +# return "%s %s" % (s, size_name[i]) +# +# +# def get_size(obj, seen=None): +# """Recursively finds size of objects""" +# size = sys.getsizeof(obj) +# if seen is None: +# seen = set() +# obj_id = id(obj) +# if obj_id in seen: +# return 0 +# # Important mark as seen *before* entering recursion to gracefully handle +# # self-referential objects +# seen.add(obj_id) +# if 
isinstance(obj, dict): +# size += sum([get_size(v, seen) for v in obj.values()]) +# size += sum([get_size(k, seen) for k in obj.keys()]) +# elif hasattr(obj, '__dict__'): +# size += get_size(obj.__dict__, seen) +# elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)): +# size += sum([get_size(i, seen) for i in obj]) +# return size diff --git a/mlair/helpers/statistics.py b/mlair/helpers/statistics.py index a1e713a8c135800d02ff7c27894485a5da7fae37..af7975f3a042163a885f590c6624076fe91f03aa 100644 --- a/mlair/helpers/statistics.py +++ b/mlair/helpers/statistics.py @@ -152,7 +152,10 @@ def min_max_apply(data: Data, _min: Data, _max: Data, feature_range: Data = (0, :param feature_range: scale data to any interval given in feature range. Default is scaling on interval [0, 1]. :return: min/max scaled data """ - return (data - _min) / (_max - _min) * (max(feature_range) - min(feature_range)) + min(feature_range) + if not isinstance(feature_range, xr.DataArray): + return (data - _min) / (_max - _min) * (max(feature_range) - min(feature_range)) + min(feature_range) + else: + return (data - _min) / (_max - _min) * (feature_range.max() - feature_range.min()) + feature_range.min() def log(data: Data, dim: Union[str, int]) -> Tuple[Data, Dict[(str, Data)]]: @@ -284,7 +287,7 @@ class SkillScores: combination_strings = [f"{first}-{second}" for (first, second) in combinations] return combinations, combination_strings - def skill_scores(self) -> pd.DataFrame: + def skill_scores(self) -> [pd.DataFrame, pd.DataFrame]: """ Calculate skill scores for all combinations of model names. 
@@ -293,6 +296,7 @@ class SkillScores: ahead_names = list(self.external_data[self.ahead_dim].data) combinations, combination_strings = self.get_model_name_combinations() skill_score = pd.DataFrame(index=combination_strings) + count = pd.DataFrame(index=combination_strings) for iahead in ahead_names: data = self.external_data.sel({self.ahead_dim: iahead}) skill_score[iahead] = [self.general_skill_score(data, @@ -300,7 +304,12 @@ class SkillScores: reference_name=second, observation_name=self.observation_name) for (first, second) in combinations] - return skill_score + count[iahead] = [self.get_count(data, + forecast_name=first, + reference_name=second, + observation_name=self.observation_name) + for (first, second) in combinations] + return skill_score, count def climatological_skill_scores(self, internal_data: Data, forecast_name: str) -> xr.DataArray: """ @@ -314,7 +323,10 @@ class SkillScores: :return: all CASES as well as all terms """ - ahead_names = list(self.external_data[self.ahead_dim].data) + if self.external_data is not None: + ahead_names = list(self.external_data[self.ahead_dim].data) + else: + ahead_names = list(internal_data[self.ahead_dim].data) all_terms = ['AI', 'AII', 'AIII', 'AIV', 'BI', 'BII', 'BIV', 'CI', 'CIV', 'CASE I', 'CASE II', 'CASE III', 'CASE IV'] @@ -349,7 +361,7 @@ class SkillScores: **kwargs) def general_skill_score(self, data: Data, forecast_name: str, reference_name: str, - observation_name: str = None) -> np.ndarray: + observation_name: str = None, dim: str = "index") -> np.ndarray: r""" Calculate general skill score based on mean squared error. 
@@ -362,14 +374,29 @@ class SkillScores: """ if observation_name is None: observation_name = self.observation_name - data = data.dropna("index") + data = data.dropna(dim) observation = data.sel(type=observation_name) forecast = data.sel(type=forecast_name) reference = data.sel(type=reference_name) mse = mean_squared_error - skill_score = 1 - mse(observation, forecast) / mse(observation, reference) + skill_score = 1 - mse(observation, forecast, dim=dim) / mse(observation, reference, dim=dim) return skill_score.values + def get_count(self, data: Data, forecast_name: str, reference_name: str, + observation_name: str = None) -> np.ndarray: + r""" + Calculate general skill score based on mean squared error. + + :param data: internal data containing data for observation, forecast and reference + :param observation_name: name of observation + :param forecast_name: name of forecast + :param reference_name: name of reference + + :return: skill score of forecast + """ + data = data.dropna("index") + return data.count("index").max().values + def skill_score_pre_calculations(self, data: Data, observation_name: str, forecast_name: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, @@ -475,3 +502,58 @@ class SkillScores: return monthly_mean + +def create_single_bootstrap_realization(data: xr.DataArray, dim_name_time: str) -> xr.DataArray: + """ + Return a bootstraped realization of data + :param data: data from which to draw ONE bootstrap realization + :param dim_name_time: name of time dimension + :return: bootstrapped realization of data + """ + + num_of_blocks = data.coords[dim_name_time].shape[0] + boot_idx = np.random.choice(num_of_blocks, size=num_of_blocks, replace=True) + return data.isel({dim_name_time: boot_idx}) + + +def calculate_average(data: xr.DataArray, **kwargs) -> xr.DataArray: + """ + Calculate mean of data + :param data: data for which to calculate mean + :return: mean of data + """ + return data.mean(**kwargs) + + +def create_n_bootstrap_realizations(data: 
xr.DataArray, dim_name_time: str, dim_name_model: str, n_boots: int = 1000, + dim_name_boots: str = 'boots') -> xr.DataArray: + """ + Create n bootstrap realizations and calculate averages across realizations + + :param data: original data from which to create bootstrap realizations + :param dim_name_time: name of time dimension + :param dim_name_model: name of model dimension + :param n_boots: number of bootstap realizations + :param dim_name_boots: name of bootstap dimension + :return: + """ + res_dims = [dim_name_boots] + dims = list(data.dims) + coords = {dim_name_boots: range(n_boots), dim_name_model: data.coords[dim_name_model] } + if len(dims) > 1: + res_dims = res_dims + dims[1:] + res = xr.DataArray(np.nan, dims=res_dims, coords=coords) + for boot in range(n_boots): + res[boot] = (calculate_average( + create_single_bootstrap_realization(data, dim_name_time=dim_name_time), + dim=dim_name_time, skipna=True)) + return res + + + + + + + + + diff --git a/mlair/plotting/data_insight_plotting.py b/mlair/plotting/data_insight_plotting.py index 513f64f2c174d94cb7230b141387c9a850d678cb..6a837993fcf849a860e029d441de910d55888a1b 100644 --- a/mlair/plotting/data_insight_plotting.py +++ b/mlair/plotting/data_insight_plotting.py @@ -8,6 +8,7 @@ import os import logging import multiprocessing import psutil +import sys import numpy as np import pandas as pd @@ -20,6 +21,8 @@ from mlair.data_handler import DataCollection from mlair.helpers import TimeTrackingWrapper, to_list, remove_items from mlair.plotting.abstract_plot_class import AbstractPlotClass +matplotlib.use("Agg") + @TimeTrackingWrapper class PlotStationMap(AbstractPlotClass): # pragma: no cover @@ -459,7 +462,7 @@ class PlotDataHistogram(AbstractPlotClass): # pragma: no cover """ def __init__(self, generators: Dict[str, DataCollection], plot_folder: str = ".", plot_name="histogram", - variables_dim="variables", time_dim="datetime", window_dim="window"): + variables_dim="variables", time_dim="datetime", 
window_dim="window", upsampling=False): super().__init__(plot_folder, plot_name) self.variables_dim = variables_dim self.time_dim = time_dim @@ -468,6 +471,8 @@ class PlotDataHistogram(AbstractPlotClass): # pragma: no cover self.bins = {} self.interval_width = {} self.bin_edges = {} + if upsampling is True: + self._handle_upsampling(generators) # input plots for branch_pos in range(number_of_branches): @@ -483,6 +488,11 @@ class PlotDataHistogram(AbstractPlotClass): # pragma: no cover self._plot(add_name="target", subset=subset) self._plot_combined(add_name="target") + @staticmethod + def _handle_upsampling(generators): + if "train" in generators: + generators.update({"train_upsampled": generators["train"]}) + @staticmethod def _get_inputs_targets(gens, dim): k = list(gens.keys())[0] @@ -495,11 +505,15 @@ class PlotDataHistogram(AbstractPlotClass): # pragma: no cover def _calculate_hist(self, generators, variables, input_data=True, branch_pos=0): n_bins = 100 for set_type, generator in generators.items(): + upsampling = "upsampled" in set_type tmp_bins = {} tmp_edges = {} end = {} start = {} - f = lambda x: x.get_X(as_numpy=False)[branch_pos] if input_data is True else x.get_Y(as_numpy=False) + if input_data is True: + f = lambda x: x.get_X(as_numpy=False, upsampling=upsampling)[branch_pos] + else: + f = lambda x: x.get_Y(as_numpy=False, upsampling=upsampling) for gen in generator: w = min(abs(f(gen).coords[self.window_dim].values)) data = f(gen).sel({self.window_dim: w}) @@ -536,6 +550,7 @@ class PlotDataHistogram(AbstractPlotClass): # pragma: no cover bin_edges = self.bin_edges[subset] interval_width = self.interval_width[subset] colors = self.get_dataset_colors() + colors.update({"train_upsampled": colors.get("train_val", "#000000")}) for var in bins.keys(): fig, ax = plt.subplots() hist_var = bins[var] @@ -555,6 +570,7 @@ class PlotDataHistogram(AbstractPlotClass): # pragma: no cover pdf_pages = matplotlib.backends.backend_pdf.PdfPages(plot_path) variables = 
self.bins[list(self.bins.keys())[0]].keys() colors = self.get_dataset_colors() + colors.update({"train_upsampled": colors.get("train_val", "#000000")}) for var in variables: fig, ax = plt.subplots() for subset in self.bins.keys(): @@ -618,23 +634,27 @@ class PlotPeriodogram(AbstractPlotClass): # pragma: no cover self._plot_total(raw=True) self._plot_total(raw=False) if multiple > 1: - self._plot_difference(label_names) + self._plot_difference(label_names, plot_name_add="_last") + self._prepare_pgram(generator, pos, multiple, use_multiprocessing=use_multiprocessing, + use_last_input_value=False) + self._plot_difference(label_names, plot_name_add="_first") @staticmethod def _has_filter_dimension(g, pos): - # check if coords raw data differs from input / target data - check_data = g.id_class - if "filter" not in [check_data.input_data, check_data.target_data][pos].coords.dims: + """Inspect if filtered data is provided and return number and labels of filtered components.""" + check_class = g.id_class + check_data = [check_class.get_X(as_numpy=False), check_class.get_Y(as_numpy=False)][pos] + if not hasattr(check_class, "filter_dim"): # data handler has no filtered data return 1, [] else: - if len(set(check_data._data[0].coords.dims).symmetric_difference(check_data.input_data.coords.dims)) > 0: - return g.id_class.input_data.coords["filter"].shape[0], g.id_class.input_data.coords[ - "filter"].values.tolist() - else: + filter_dim = check_class.filter_dim + if filter_dim not in check_data.coords.dims: # current data has no filter (e.g. target data) return 1, [] + else: + return check_data.coords[filter_dim].shape[0], check_data.coords[filter_dim].values.tolist() @TimeTrackingWrapper - def _prepare_pgram(self, generator, pos, multiple=1, use_multiprocessing=False): + def _prepare_pgram(self, generator, pos, multiple=1, use_multiprocessing=False, use_last_input_value=True): """ Create periodogram data. 
""" @@ -648,8 +668,8 @@ class PlotPeriodogram(AbstractPlotClass): # pragma: no cover plot_data_raw_single = dict() plot_data_mean_single = dict() self.f_index = np.logspace(-3, 0 if self._sampling == "daily" else np.log10(24), 1000) - raw_data_single = self._prepare_pgram_parallel_gen(generator, m, pos, use_multiprocessing) - # raw_data_single = self._prepare_pgram_parallel_var(generator, m, pos, use_multiprocessing) + raw_data_single = self._prepare_pgram_parallel_gen(generator, m, pos, use_multiprocessing, + use_last_input_value=use_last_input_value) for var in raw_data_single.keys(): pgram_com = [] pgram_mean = 0 @@ -701,7 +721,7 @@ class PlotPeriodogram(AbstractPlotClass): # pragma: no cover raw_data_single[var_str] = raw_data_single[var_str] + [(f, pgram)] return raw_data_single - def _prepare_pgram_parallel_gen(self, generator, m, pos, use_multiprocessing): + def _prepare_pgram_parallel_gen(self, generator, m, pos, use_multiprocessing, use_last_input_value=True): """Implementation of data preprocessing using parallel generator element processing.""" raw_data_single = dict() res = [] @@ -709,14 +729,15 @@ class PlotPeriodogram(AbstractPlotClass): # pragma: no cover pool = multiprocessing.Pool( min([psutil.cpu_count(logical=False), len(generator), 16])) # use only physical cpus output = [ - pool.apply_async(f_proc_2, args=(g, m, pos, self.variables_dim, self.time_dim, self.f_index)) + pool.apply_async(f_proc_2, args=(g, m, pos, self.variables_dim, self.time_dim, self.f_index, + use_last_input_value)) for g in generator] for i, p in enumerate(output): res.append(p.get()) pool.close() else: for g in generator: - res.append(f_proc_2(g, m, pos, self.variables_dim, self.time_dim, self.f_index)) + res.append(f_proc_2(g, m, pos, self.variables_dim, self.time_dim, self.f_index, use_last_input_value)) for res_dict in res: for k, v in res_dict.items(): if k not in raw_data_single.keys(): @@ -802,8 +823,8 @@ class PlotPeriodogram(AbstractPlotClass): # pragma: no cover 
pdf_pages.close() plt.close('all') - def _plot_difference(self, label_names): - plot_name = f"{self.plot_name}_{self._sampling}_{self._add_text}_filter.pdf" + def _plot_difference(self, label_names, plot_name_add = ""): + plot_name = f"{self.plot_name}_{self._sampling}_{self._add_text}_filter{plot_name_add}.pdf" plot_path = os.path.join(os.path.abspath(self.plot_folder), plot_name) logging.info(f"... plotting {plot_name}") pdf_pages = matplotlib.backends.backend_pdf.PdfPages(plot_path) @@ -832,21 +853,22 @@ class PlotPeriodogram(AbstractPlotClass): # pragma: no cover plt.close('all') -def f_proc(var, d_var, f_index, time_dim="datetime"): # pragma: no cover +def f_proc(var, d_var, f_index, time_dim="datetime", use_last_value=True): # pragma: no cover var_str = str(var) t = (d_var[time_dim] - d_var[time_dim][0]).astype("timedelta64[h]").values / np.timedelta64(1, "D") if len(d_var.shape) > 1: # use only max value if dimensions are remaining (e.g. max(window) -> latest value) to_remove = remove_items(d_var.coords.dims, time_dim) for e in to_list(to_remove): - d_var = d_var.sel({e: d_var[e].max()}) + d_var = d_var.sel({e: d_var[e].max() if use_last_value is True else d_var[e].min()}) pgram = LombScargle(t, d_var.values.flatten(), nterms=1, normalization="psd").power(f_index) # f, pgram = LombScargle(t, d_var.values.flatten(), nterms=1, normalization="psd").autopower() return var_str, f_index, pgram - -def f_proc_2(g, m, pos, variables_dim, time_dim, f_index): # pragma: no cover +def f_proc_2(g, m, pos, variables_dim, time_dim, f_index, use_last_value): # pragma: no cover raw_data_single = dict() + if hasattr(g.id_class, "lazy"): + g.id_class.load_lazy() if g.id_class.lazy is True else None if m == 0: d = g.id_class._data else: @@ -856,8 +878,10 @@ def f_proc_2(g, m, pos, variables_dim, time_dim, f_index): # pragma: no cover d = d[pos] if isinstance(d, tuple) else d for var in d[variables_dim].values: d_var = d.loc[{variables_dim: var}].squeeze().dropna(time_dim) - 
var_str, f, pgram = f_proc(var, d_var, f_index) + var_str, f, pgram = f_proc(var, d_var, f_index, use_last_value=use_last_value) raw_data_single[var_str] = [(f, pgram)] + if hasattr(g.id_class, "lazy"): + g.id_class.clean_up() if g.id_class.lazy is True else None return raw_data_single @@ -888,6 +912,8 @@ class PlotClimateFirFilter(AbstractPlotClass): from mlair.helpers.filter import fir_filter_convolve + logging.info(f"start PlotClimateFirFilter for ({name})") + # adjust default plot parameters rc_params = { 'axes.labelsize': 'large', @@ -951,59 +977,63 @@ class PlotClimateFirFilter(AbstractPlotClass): for it0, t0 in enumerate(viz_date_dict.keys()): viz_data = viz_date_dict[t0] residuum_true = None - for ifilter in sorted(viz_data.keys()): - data = viz_data[ifilter] - filter_input = data["filter_input"] - filter_input_nc = data["filter_input_nc"] if residuum_true is None else residuum_true.sel( - {new_dim: filter_input.coords[new_dim]}) - valid_range = data["valid_range"] - time_axis = data["time_range"] - filter_order = data["order"] - h = data["h"] - fig, ax = plt.subplots() - - # plot backgrounds - self._plot_valid_area(ax, t0, valid_range, td_type) - self._plot_t0(ax, t0) - - # original data - self._plot_original_data(ax, time_axis, filter_input_nc) - - # clim apriori - self._plot_apriori(ax, time_axis, filter_input, new_dim, ifilter) - - # clim filter response - residuum_estimated = self._plot_clim_filter(ax, time_axis, filter_input, new_dim, h, + try: + for ifilter in sorted(viz_data.keys()): + data = viz_data[ifilter] + filter_input = data["filter_input"] + filter_input_nc = data["filter_input_nc"] if residuum_true is None else residuum_true.sel( + {new_dim: filter_input.coords[new_dim]}) + valid_range = data["valid_range"] + time_axis = data["time_range"] + filter_order = data["order"] + h = data["h"] + fig, ax = plt.subplots() + + # plot backgrounds + self._plot_valid_area(ax, t0, valid_range, td_type) + self._plot_t0(ax, t0) + + # original data + 
self._plot_original_data(ax, time_axis, filter_input_nc) + + # clim apriori + self._plot_apriori(ax, time_axis, filter_input, new_dim, ifilter) + + # clim filter response + residuum_estimated = self._plot_clim_filter(ax, time_axis, filter_input, new_dim, h, + output_dtypes=filter_input.dtype) + + # ideal filter response + residuum_true = self._plot_ideal_filter(ax, time_axis, filter_input_nc, new_dim, h, output_dtypes=filter_input.dtype) - # ideal filter response - residuum_true = self._plot_ideal_filter(ax, time_axis, filter_input_nc, new_dim, h, - output_dtypes=filter_input.dtype) - - # set title, legend, and save plot - xlims = self._set_xlim(ax, t0, filter_order, valid_range, td_type, time_axis) - - plt.title(f"Input of ClimFilter ({str(var)})") - plt.legend() - fig.autofmt_xdate() - plt.tight_layout() - self.plot_name = f"climFIR_{self._name}_{str(var)}_{it0}_{ifilter}" - self._save() - - # plot residuum - fig, ax = plt.subplots() - self._plot_valid_area(ax, t0, valid_range, td_type) - self._plot_t0(ax, t0) - self._plot_series(ax, time_axis, residuum_true.values.flatten(), style="ideal") - self._plot_series(ax, time_axis, residuum_estimated.values.flatten(), style="clim") - ax.set_xlim(xlims) - plt.title(f"Residuum of ClimFilter ({str(var)})") - plt.legend(loc="upper left") - fig.autofmt_xdate() - plt.tight_layout() - - self.plot_name = f"climFIR_{self._name}_{str(var)}_{it0}_{ifilter}_residuum" - self._save() + # set title, legend, and save plot + xlims = self._set_xlim(ax, t0, filter_order, valid_range, td_type, time_axis) + + plt.title(f"Input of ClimFilter ({str(var)})") + plt.legend() + fig.autofmt_xdate() + plt.tight_layout() + self.plot_name = f"climFIR_{self._name}_{str(var)}_{it0}_{ifilter}" + self._save() + + # plot residuum + fig, ax = plt.subplots() + self._plot_valid_area(ax, t0, valid_range, td_type) + self._plot_t0(ax, t0) + self._plot_series(ax, time_axis, residuum_true.values.flatten(), style="ideal") + self._plot_series(ax, time_axis, 
residuum_estimated.values.flatten(), style="clim") + ax.set_xlim(xlims) + plt.title(f"Residuum of ClimFilter ({str(var)})") + plt.legend(loc="upper left") + fig.autofmt_xdate() + plt.tight_layout() + + self.plot_name = f"climFIR_{self._name}_{str(var)}_{it0}_{ifilter}_residuum" + self._save() + except Exception as e: + logging.info(f"Could not create plot because of:\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") + pass def _set_xlim(self, ax, t0, order, valid_range, td_type, time_axis): """ diff --git a/mlair/plotting/postprocessing_plotting.py b/mlair/plotting/postprocessing_plotting.py index ecba8a4e0a3369fbb170a7427ef81365d531bc3b..43f1864f7354c1f711bb886f4f97eda56439ab89 100644 --- a/mlair/plotting/postprocessing_plotting.py +++ b/mlair/plotting/postprocessing_plotting.py @@ -15,6 +15,7 @@ import pandas as pd import seaborn as sns import xarray as xr from matplotlib.backends.backend_pdf import PdfPages +from matplotlib.offsetbox import AnchoredText from mlair import helpers from mlair.data_handler.iterator import DataCollection @@ -30,7 +31,7 @@ logging.getLogger('matplotlib').setLevel(logging.WARNING) @TimeTrackingWrapper -class PlotMonthlySummary(AbstractPlotClass): +class PlotMonthlySummary(AbstractPlotClass): # pragma: no cover """ Show a monthly summary over all stations for each lead time ("ahead") as box and whiskers plot. @@ -128,7 +129,7 @@ class PlotMonthlySummary(AbstractPlotClass): logging.debug("... 
start plotting") color_palette = [matplotlib.colors.cnames["green"]] + sns.color_palette("Blues_d", self._window_lead_time).as_hex() - ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1., palette=color_palette, + ax = sns.boxplot(x='index', y='values', hue='ahead', data=data.compute(), whis=1.5, palette=color_palette, flierprops={'marker': '.', 'markersize': 1}, showmeans=True, meanprops={'markersize': 1, 'markeredgecolor': 'k'}) ylabel = self._spell_out_chemical_concentrations(target_var) @@ -137,7 +138,7 @@ class PlotMonthlySummary(AbstractPlotClass): @TimeTrackingWrapper -class PlotConditionalQuantiles(AbstractPlotClass): +class PlotConditionalQuantiles(AbstractPlotClass): # pragma: no cover """ Create cond.quantile plots as originally proposed by Murphy, Brown and Chen (1989) [But in log scale]. @@ -381,7 +382,7 @@ class PlotConditionalQuantiles(AbstractPlotClass): @TimeTrackingWrapper -class PlotClimatologicalSkillScore(AbstractPlotClass): +class PlotClimatologicalSkillScore(AbstractPlotClass): # pragma: no cover """ Create plot of climatological skill score after Murphy (1988) as box plot over all stations. 
@@ -448,7 +449,7 @@ class PlotClimatologicalSkillScore(AbstractPlotClass): fig, ax = plt.subplots() if not score_only: fig.set_size_inches(11.7, 8.27) - sns.boxplot(x="terms", y="data", hue="ahead", data=self._data, ax=ax, whis=1., palette="Blues_d", + sns.boxplot(x="terms", y="data", hue="ahead", data=self._data, ax=ax, whis=1.5, palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) ax.axhline(y=0, color="grey", linewidth=.5) ax.set(ylabel=f"{self._label_add(score_only)}skill score", xlabel="", title="summary of all stations", @@ -473,7 +474,7 @@ class PlotClimatologicalSkillScore(AbstractPlotClass): @TimeTrackingWrapper -class PlotCompetitiveSkillScore(AbstractPlotClass): +class PlotCompetitiveSkillScore(AbstractPlotClass): # pragma: no cover """ Create competitive skill score plot. @@ -491,12 +492,12 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): """ - def __init__(self, data: pd.DataFrame, plot_folder=".", model_setup="NN"): + def __init__(self, data: Dict[str, pd.DataFrame], plot_folder=".", model_setup="NN"): """Initialise.""" super().__init__(plot_folder, f"skill_score_competitive_{model_setup}") self._model_setup = model_setup self._labels = None - self._data = self._prepare_data(data) + self._data = self._prepare_data(helpers.remove_items(data, "total")) default_plot_name = self.plot_name # draw full detail plot self.plot_name = default_plot_name + "_full_detail" @@ -538,7 +539,7 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): fig, ax = plt.subplots(figsize=(size, size * 0.8)) data = self._filter_comparisons(self._data) if single_model_comparison is True else self._data order = self._create_pseudo_order(data) - sns.boxplot(x="comparison", y="data", hue="ahead", data=data, whis=1., ax=ax, palette="Blues_d", + sns.boxplot(x="comparison", y="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_d", showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, 
flierprops={"marker": "."}, order=order) ax.axhline(y=0, color="grey", linewidth=.5) @@ -553,7 +554,7 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): fig, ax = plt.subplots() data = self._filter_comparisons(self._data) if single_model_comparison is True else self._data order = self._create_pseudo_order(data) - sns.boxplot(y="comparison", x="data", hue="ahead", data=data, whis=1., ax=ax, palette="Blues_d", + sns.boxplot(y="comparison", x="data", hue="ahead", data=data, whis=1.5, ax=ax, palette="Blues_d", showmeans=True, meanprops={"markersize": 3, "markeredgecolor": "k"}, flierprops={"marker": "."}, order=order) ax.axvline(x=0, color="grey", linewidth=.5) @@ -590,14 +591,9 @@ class PlotCompetitiveSkillScore(AbstractPlotClass): @TimeTrackingWrapper -class PlotBootstrapSkillScore(AbstractPlotClass): +class PlotFeatureImportanceSkillScore(AbstractPlotClass): # pragma: no cover """ - Create plot of climatological skill score after Murphy (1988) as box plot over all stations. - - A forecast time step (called "ahead") is separately shown to highlight the differences for each prediction time - step. Either each single term is plotted (score_only=False) or only the resulting scores CASE I to IV are displayed - (score_only=True, default). Y-axis is adjusted following the data and not hard coded. The plot is saved under - plot_folder path with name skill_score_clim_{extra_name_tag}{model_setup}.pdf and resolution of 500dpi. + Create plot of feature importance analysis. By passing a list `separate_vars` containing variable names, a second plot is created showing the `separate_vars` and the remaining variables side by side with different scaling. 
@@ -612,7 +608,8 @@ class PlotBootstrapSkillScore(AbstractPlotClass): def __init__(self, data: Dict, plot_folder: str = ".", model_setup: str = "", separate_vars: List = None, sampling: str = "daily", ahead_dim: str = "ahead", bootstrap_type: str = None, - bootstrap_method: str = None): + bootstrap_method: str = None, boot_dim: str = "boots", model_name: str = "NN", + branch_names: list = None, ylim: tuple = None): """ Set attributes and create plot. @@ -625,24 +622,32 @@ class PlotBootstrapSkillScore(AbstractPlotClass): :param bootstrap_annotation: additional information to use in the file name (default: None) """ annotation = ["_".join([s for s in ["", bootstrap_type, bootstrap_method] if s is not None])][0] - super().__init__(plot_folder, f"skill_score_bootstrap_{model_setup}{annotation}") + super().__init__(plot_folder, f"feature_importance_{model_setup}{annotation}") if separate_vars is None: separate_vars = ['o3'] self._labels = None self._x_name = "boot_var" self._ahead_dim = ahead_dim + self._boot_dim = boot_dim self._boot_type = self._set_bootstrap_type(bootstrap_type) self._boot_method = self._set_bootstrap_method(bootstrap_method) + self._number_of_bootstraps = 0 + self._branches_names = branch_names + self._ylim = ylim - self._title = f"Bootstrap analysis ({self._boot_method}, {self._boot_type})" self._data = self._prepare_data(data, sampling) + self._set_title(model_name) if "branch" in self._data.columns: plot_name = self.plot_name for branch in self._data["branch"].unique(): - self._title = f"Bootstrap analysis ({self._boot_method}, {self._boot_type}, {branch})" + self._set_title(model_name, branch) self._plot(branch=branch) self.plot_name = f"{plot_name}_{branch}" self._save() + if len(set(separate_vars).intersection(self._data[self._x_name].unique())) > 0: + self.plot_name += '_separated' + self._plot(branch=branch, separate_vars=separate_vars) + self._save(bbox_inches='tight') else: self._plot() self._save() @@ -655,6 +660,21 @@ class 
PlotBootstrapSkillScore(AbstractPlotClass): def _set_bootstrap_type(boot_type): return {"singleinput": "single input"}.get(boot_type, boot_type) + def _set_title(self, model_name, branch=None): + title_d = {"single input": "Single Inputs", "branch": "Input Branches", "variable": "Variables"} + base_title = f"{model_name}\nImportance of {title_d[self._boot_type]}" + + additional = [] + if branch is not None: + branch_name = self._branches_names[branch] if self._branches_names is not None else branch + additional.append(branch_name) + if self._number_of_bootstraps > 1: + additional.append(f"n={self._number_of_bootstraps}") + additional_title = ", ".join(additional) + if len(additional_title) > 0: + additional_title = f" ({additional_title})" + self._title = base_title + additional_title + @staticmethod def _set_bootstrap_method(boot_method): return {"zero_mean": "zero mean", "shuffle": "shuffled"}.get(boot_method, boot_method) @@ -671,14 +691,16 @@ class PlotBootstrapSkillScore(AbstractPlotClass): """ station_dim = "station" data = helpers.dict_to_xarray(data, station_dim).sortby(self._x_name) + data = data.transpose(station_dim, self._ahead_dim, self._boot_dim, self._x_name) if self._boot_type == "single input": number_tags = self._get_number_tag(data.coords[self._x_name].values, split_by='_') new_boot_coords = self._return_vars_without_number_tag(data.coords[self._x_name].values, split_by='_', keep=1, as_unique=True) - values = data.values.reshape((data.shape[0], len(new_boot_coords), len(number_tags), data.shape[-1])) - data = xr.DataArray(values, coords={station_dim: data.coords["station"], self._x_name: new_boot_coords, - "branch": number_tags, self._ahead_dim: data.coords[self._ahead_dim]}, - dims=[station_dim, self._x_name, "branch", self._ahead_dim]) + values = data.values.reshape((*data.shape[:3], len(number_tags), len(new_boot_coords))) + data = xr.DataArray(values, coords={station_dim: data.coords[station_dim], self._x_name: new_boot_coords, + "branch": 
number_tags, self._ahead_dim: data.coords[self._ahead_dim], + self._boot_dim: data.coords[self._boot_dim]}, + dims=[station_dim, self._ahead_dim, self._boot_dim, "branch", self._x_name]) else: try: new_boot_coords = self._return_vars_without_number_tag(data.coords[self._x_name].values, split_by='_', @@ -690,6 +712,7 @@ class PlotBootstrapSkillScore(AbstractPlotClass): self._labels = [str(i) + sampling_letter for i in data.coords[self._ahead_dim].values] if station_dim not in data.dims: data = data.expand_dims(station_dim) + self._number_of_bootstraps = np.unique(data.coords[self._boot_dim].values).shape[0] return data.to_dataframe("data").reset_index(level=np.arange(len(data.dims)).tolist()) @staticmethod @@ -738,10 +761,10 @@ class PlotBootstrapSkillScore(AbstractPlotClass): if separate_vars is None: self._plot_all_variables(branch) else: - self._plot_selected_variables(separate_vars) + self._plot_selected_variables(separate_vars, branch) - def _plot_selected_variables(self, separate_vars: List): - data = self._data + def _plot_selected_variables(self, separate_vars: List, branch=None): + data = self._data if branch is None else self._data[self._data["branch"] == str(branch)] self.raise_error_if_separate_vars_do_not_exist(data, separate_vars, self._x_name) all_variables = self._get_unique_values_from_column_of_df(data, self._x_name) remaining_vars = helpers.remove_items(all_variables, separate_vars) @@ -749,22 +772,30 @@ class PlotBootstrapSkillScore(AbstractPlotClass): data_second = self._select_data(df=data, variables=remaining_vars, column_name=self._x_name) fig, ax = plt.subplots(nrows=1, ncols=2, gridspec_kw={'width_ratios': [len(separate_vars), - len(remaining_vars)]}) + len(remaining_vars)]}, + figsize=(len(remaining_vars),len(remaining_vars)/2.)) if len(separate_vars) > 1: first_box_width = .8 else: - first_box_width = 2. 
+ first_box_width = .8 - sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_first, ax=ax[0], whis=1., + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_first, ax=ax[0], whis=1.5, palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, - flierprops={"marker": "."}, width=first_box_width) + showfliers=False, width=first_box_width) ax[0].set(ylabel=f"skill score", xlabel="") + if self._ylim is not None: + _ylim = self._ylim if isinstance(self._ylim, tuple) else self._ylim[0] + ax[0].set(ylim=_ylim) - sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_second, ax=ax[1], whis=1., + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=data_second, ax=ax[1], whis=1.5, palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, - flierprops={"marker": "."}) + showfliers=False) ax[1].set(ylabel="", xlabel="") ax[1].yaxis.tick_right() + if self._ylim is not None and isinstance(self._ylim, list): + _ylim = self._ylim[1] + ax[1].set(ylim=_ylim) + handles, _ = ax[1].get_legend_handles_labels() for sax in ax: matplotlib.pyplot.sca(sax) @@ -772,7 +803,8 @@ class PlotBootstrapSkillScore(AbstractPlotClass): plt.xticks(rotation=45, ha='right') sax.legend_.remove() - fig.legend(handles, self._labels, loc='upper center', ncol=len(handles) + 1, ) + # fig.legend(handles, self._labels, loc='upper center', ncol=len(handles) + 1, ) + ax[1].legend(handles, self._labels, loc='lower center', ncol=len(handles) + 1, fontsize="medium") def align_yaxis(ax1, ax2): """ @@ -797,6 +829,7 @@ class PlotBootstrapSkillScore(AbstractPlotClass): align_yaxis(ax[0], ax[1]) align_yaxis(ax[0], ax[1]) + plt.subplots_adjust(right=0.8) plt.title(self._title) @staticmethod @@ -826,12 +859,29 @@ class PlotBootstrapSkillScore(AbstractPlotClass): """ """ - fig, ax = plt.subplots() plot_data = self._data if branch is None else self._data[self._data["branch"] == str(branch)] - 
sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1., palette="Blues_d", - showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, flierprops={"marker": "."}) + if self._boot_type == "branch": + fig, ax = plt.subplots(figsize=(0.5 + 2 / len(plot_data[self._x_name].unique()) + len(plot_data[self._x_name].unique()),4)) + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1., + palette="Blues_d", showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, + showfliers=False, width=0.8) + else: + fig, ax = plt.subplots() + sns.boxplot(x=self._x_name, y="data", hue=self._ahead_dim, data=plot_data, ax=ax, whis=1.5, palette="Blues_d", + showmeans=True, meanprops={"markersize": 1, "markeredgecolor": "k"}, showfliers=False) ax.axhline(y=0, color="grey", linewidth=.5) - plt.xticks(rotation=45) + + if self._ylim is not None: + if isinstance(self._ylim, tuple): + _ylim = self._ylim + else: + _ylim = (min(self._ylim[0][0], self._ylim[1][0]), max(self._ylim[0][1], self._ylim[1][1])) + ax.set(ylim=_ylim) + + if self._boot_type == "branch": + plt.xticks() + else: + plt.xticks(rotation=45) ax.set(ylabel=f"skill score", xlabel="", title=self._title) handles, _ = ax.get_legend_handles_labels() ax.legend(handles, self._labels) @@ -839,7 +889,7 @@ class PlotBootstrapSkillScore(AbstractPlotClass): @TimeTrackingWrapper -class PlotTimeSeries: +class PlotTimeSeries: # pragma: no cover """ Create time series plot. 
@@ -847,13 +897,14 @@ class PlotTimeSeries: """ def __init__(self, stations: List, data_path: str, name: str, window_lead_time: int = None, plot_folder: str = ".", - sampling="daily", model_name="nn", obs_name="obs"): + sampling="daily", model_name="nn", obs_name="obs", ahead_dim="ahead"): """Initialise.""" self._data_path = data_path self._data_name = name self._stations = stations self._model_name = model_name self._obs_name = obs_name + self._ahead_dim = ahead_dim self._window_lead_time = self._get_window_lead_time(window_lead_time) self._sampling = self._get_sampling(sampling) self._plot(plot_folder) @@ -876,7 +927,7 @@ class PlotTimeSeries: :param window_lead_time: lead time from arguments to validate :return: validated lead time, comes either from given argument or from data itself """ - ahead_steps = len(self._load_data(self._stations[0]).ahead) + ahead_steps = len(self._load_data(self._stations[0]).coords[self._ahead_dim]) if window_lead_time is None: window_lead_time = ahead_steps return min(ahead_steps, window_lead_time) @@ -898,10 +949,13 @@ class PlotTimeSeries: data_year = data.sel(index=f"{start + i_year}") for i_half_of_year in range(factor): pos = factor * i_year + i_half_of_year - plot_data = self._create_plot_data(data_year, factor, i_half_of_year) - self._plot_obs(axes[pos], plot_data) - self._plot_ahead(axes[pos], plot_data) - if np.isnan(plot_data.values).all(): + try: + plot_data = self._create_plot_data(data_year, factor, i_half_of_year) + self._plot_obs(axes[pos], plot_data) + self._plot_ahead(axes[pos], plot_data) + if np.isnan(plot_data.values).all(): + nan_list.append(pos) + except Exception: nan_list.append(pos) self._clean_up_axes(nan_list, axes, fig) self._save_page(station, pdf_pages) @@ -938,9 +992,8 @@ class PlotTimeSeries: def _plot_ahead(self, ax, data): color = sns.color_palette("Blues_d", self._window_lead_time).as_hex() - for ahead in data.coords["ahead"].values: - plot_data = data.sel(type=self._model_name, 
ahead=ahead).drop(["type", "ahead"]).squeeze().shift( - index=ahead) + for ahead in data.coords[self._ahead_dim].values: + plot_data = data.sel({"type": self._model_name, self._ahead_dim: ahead}).drop(["type", self._ahead_dim]).squeeze().shift(index=ahead) label = f"{ahead}{self._sampling}" ax.plot(plot_data, color=color[ahead - 1], label=label) @@ -969,7 +1022,7 @@ class PlotTimeSeries: @TimeTrackingWrapper -class PlotSeparationOfScales(AbstractPlotClass): +class PlotSeparationOfScales(AbstractPlotClass): # pragma: no cover def __init__(self, collection: DataCollection, plot_folder: str = ".", time_dim="datetime", window_dim="window", filter_dim="filter", target_dim="variables"): @@ -995,6 +1048,80 @@ class PlotSeparationOfScales(AbstractPlotClass): self._save() +@TimeTrackingWrapper +class PlotSampleUncertaintyFromBootstrap(AbstractPlotClass): # pragma: no cover + + def __init__(self, data: xr.DataArray, plot_folder: str = ".", model_type_dim: str = "type", + error_measure: str = "mse", error_unit: str = None, dim_name_boots: str = 'boots', + block_length: str = None): + super().__init__(plot_folder, "sample_uncertainty_from_bootstrap") + default_name = self.plot_name + self.model_type_dim = model_type_dim + self.error_measure = error_measure + self.dim_name_boots = dim_name_boots + self.error_unit = error_unit + self.block_length = block_length + self.prepare_data(data) + self._plot(orientation="v") + + self.plot_name = default_name + "_horizontal" + self._plot(orientation="h") + + self._apply_root() + + self.plot_name = default_name + "_sqrt" + self._plot(orientation="v") + + self.plot_name = default_name + "_horizontal_sqrt" + self._plot(orientation="h") + + self._data_table = None + self._n_boots = None + + def prepare_data(self, data: xr.DataArray): + self._data_table = data.to_pandas() + if "persi" in self._data_table.columns: + self._data_table["persi"] = self._data_table.pop("persi") + self._n_boots = self._data_table.shape[0] + + def _apply_root(self): + 
self._data_table = np.sqrt(self._data_table) + self.error_measure = f"root {self.error_measure}" + self.error_unit = self.error_unit.replace("$^2$", "") + + def _plot(self, orientation: str = "v"): + data_table = self._data_table + n_boots = self._n_boots + size = len(np.unique(data_table.columns)) + if orientation == "v": + figsize, width = (size, 5), 0.4 + elif orientation == "h": + figsize, width = (6, (1+.5*size)), 0.65 + else: + raise ValueError(f"orientation must be `v' or `h' but is: {orientation}") + fig, ax = plt.subplots(figsize=figsize) + sns.boxplot(data=data_table, ax=ax, whis=1.5, color="white", + showmeans=True, meanprops={"markersize": 6, "markeredgecolor": "k"}, + flierprops={"marker": "o", "markerfacecolor": "black", "markeredgecolor": "none", "markersize": 3}, + boxprops={'facecolor': 'none', 'edgecolor': 'k'}, + width=width, orient=orientation) + if orientation == "v": + ax.set_ylabel(f"{self.error_measure} (in {self.error_unit})") + ax.set_xticklabels(ax.get_xticklabels(), rotation=45) + elif orientation == "h": + ax.set_xlabel(f"{self.error_measure} (in {self.error_unit})") + else: + raise ValueError(f"orientation must be `v' or `h' but is: {orientation}") + text = f"n={n_boots}" if self.block_length is None else f"{self.block_length}, n={n_boots}" + text_box = AnchoredText(text, frameon=True, loc=1, pad=0.5) + plt.setp(text_box.patch, edgecolor='k', facecolor='w') + ax.add_artist(text_box) + plt.setp(ax.lines, color='k') + plt.tight_layout() + self._save() + plt.close("all") + + if __name__ == "__main__": stations = ['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'] path = "../../testrun_network/forecasts" diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index 209859c1ff38efe2667c918aa5b79c96f2524be0..63be6eb4c6e8b5f8d3149df023e07d23805f077f 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -18,10 +18,12 @@ from mlair.configuration.defaults import 
DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_WINDOW_DIM, DEFAULT_DIMENSIONS, DEFAULT_TIME_DIM, DEFAULT_INTERPOLATION_METHOD, DEFAULT_INTERPOLATION_LIMIT, \ DEFAULT_TRAIN_START, DEFAULT_TRAIN_END, DEFAULT_TRAIN_MIN_LENGTH, DEFAULT_VAL_START, DEFAULT_VAL_END, \ DEFAULT_VAL_MIN_LENGTH, DEFAULT_TEST_START, DEFAULT_TEST_END, DEFAULT_TEST_MIN_LENGTH, DEFAULT_TRAIN_VAL_MIN_LENGTH, \ - DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \ - DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM, \ + DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_FEATURE_IMPORTANCE, DEFAULT_FEATURE_IMPORTANCE_CREATE_NEW_BOOTSTRAPS, \ + DEFAULT_FEATURE_IMPORTANCE_N_BOOTS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM, \ DEFAULT_USE_MULTIPROCESSING, DEFAULT_USE_MULTIPROCESSING_ON_DEBUG, DEFAULT_MAX_NUMBER_MULTIPROCESSING, \ - DEFAULT_BOOTSTRAP_TYPE, DEFAULT_BOOTSTRAP_METHOD + DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE, DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD, DEFAULT_OVERWRITE_LAZY_DATA, \ + DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH, DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS, \ + DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS, DEFAULT_DO_UNCERTAINTY_ESTIMATE from mlair.data_handler import DefaultDataHandler from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel @@ -212,13 +214,17 @@ class ExperimentSetup(RunEnvironment): sampling: str = None, create_new_model=None, bootstrap_path=None, permute_data_on_training=None, transformation=None, train_min_length=None, val_min_length=None, test_min_length=None, extreme_values: list = None, - extremes_on_right_tail_only: bool = None, evaluate_bootstraps=None, plot_list=None, - number_of_bootstraps=None, create_new_bootstraps=None, bootstrap_method=None, bootstrap_type=None, + extremes_on_right_tail_only: bool = None, 
evaluate_feature_importance: bool = None, plot_list=None, + feature_importance_n_boots: int = None, feature_importance_create_new_bootstraps: bool = None, + feature_importance_bootstrap_method=None, feature_importance_bootstrap_type=None, data_path: str = None, batch_path: str = None, login_nodes=None, hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None, data_origin: Dict = None, competitors: list = None, competitor_path: str = None, use_multiprocessing: bool = None, use_multiprocessing_on_debug: bool = None, - max_number_multiprocessing: int = None, start_script: Union[Callable, str] = None, **kwargs): + max_number_multiprocessing: int = None, start_script: Union[Callable, str] = None, + overwrite_lazy_data: bool = None, uncertainty_estimate_block_length: str = None, + uncertainty_estimate_evaluate_competitors: bool = None, uncertainty_estimate_n_boots: int = None, + do_uncertainty_estimate: bool = None, **kwargs): # create run framework super().__init__() @@ -287,6 +293,10 @@ class ExperimentSetup(RunEnvironment): self._set_param("logging_path", None, os.path.join(experiment_path, "logging")) path_config.check_path_and_create(self.data_store.get("logging_path")) + # set tmp path + self._set_param("tmp_path", None, os.path.join(experiment_path, "tmp")) + path_config.check_path_and_create(self.data_store.get("tmp_path"), remove_existing=True) + # setup for data self._set_param("stations", stations, default=DEFAULT_STATIONS, apply=helpers.to_list) self._set_param("statistics_per_var", statistics_per_var, default=DEFAULT_VAR_ALL_DICT) @@ -297,6 +307,8 @@ class ExperimentSetup(RunEnvironment): self._set_param("window_history_size", window_history_size, default=DEFAULT_WINDOW_HISTORY_SIZE) self._set_param("overwrite_local_data", overwrite_local_data, default=DEFAULT_OVERWRITE_LOCAL_DATA, scope="preprocessing") + self._set_param("overwrite_lazy_data", overwrite_lazy_data, default=DEFAULT_OVERWRITE_LAZY_DATA, + scope="preprocessing") 
self._set_param("transformation", transformation, default={}) self._set_param("transformation", None, scope="preprocessing") self._set_param("data_handler", data_handler, default=DefaultDataHandler) @@ -342,15 +354,28 @@ class ExperimentSetup(RunEnvironment): default=DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS) # set post-processing instructions - self._set_param("evaluate_bootstraps", evaluate_bootstraps, default=DEFAULT_EVALUATE_BOOTSTRAPS, - scope="general.postprocessing") - create_new_bootstraps = max([self.data_store.get("train_model", "general"), - create_new_bootstraps or DEFAULT_CREATE_NEW_BOOTSTRAPS]) - self._set_param("create_new_bootstraps", create_new_bootstraps, scope="general.postprocessing") - self._set_param("number_of_bootstraps", number_of_bootstraps, default=DEFAULT_NUMBER_OF_BOOTSTRAPS, - scope="general.postprocessing") - self._set_param("bootstrap_method", bootstrap_method, default=DEFAULT_BOOTSTRAP_METHOD) - self._set_param("bootstrap_type", bootstrap_type, default=DEFAULT_BOOTSTRAP_TYPE) + self._set_param("do_uncertainty_estimate", do_uncertainty_estimate, + default=DEFAULT_DO_UNCERTAINTY_ESTIMATE, scope="general.postprocessing") + self._set_param("block_length", uncertainty_estimate_block_length, + default=DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH, scope="uncertainty_estimate") + self._set_param("evaluate_competitors", uncertainty_estimate_evaluate_competitors, + default=DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS, scope="uncertainty_estimate") + self._set_param("n_boots", uncertainty_estimate_n_boots, + default=DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS, scope="uncertainty_estimate") + + self._set_param("evaluate_feature_importance", evaluate_feature_importance, + default=DEFAULT_EVALUATE_FEATURE_IMPORTANCE, scope="general.postprocessing") + feature_importance_create_new_bootstraps = max([self.data_store.get("train_model", "general"), + feature_importance_create_new_bootstraps or + DEFAULT_FEATURE_IMPORTANCE_CREATE_NEW_BOOTSTRAPS]) + 
self._set_param("create_new_bootstraps", feature_importance_create_new_bootstraps, scope="feature_importance") + self._set_param("n_boots", feature_importance_n_boots, default=DEFAULT_FEATURE_IMPORTANCE_N_BOOTS, + scope="feature_importance") + self._set_param("bootstrap_method", feature_importance_bootstrap_method, + default=DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD, scope="feature_importance") + self._set_param("bootstrap_type", feature_importance_bootstrap_type, + default=DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE, scope="feature_importance") + self._set_param("plot_list", plot_list, default=DEFAULT_PLOT_LIST, scope="general.postprocessing") self._set_param("neighbors", ["DEBW030"]) # TODO: just for testing @@ -377,8 +402,10 @@ class ExperimentSetup(RunEnvironment): if len(self.data_store.search_name(k)) == 0: self._set_param(k, v) else: + s = ", ".join([f"{k}({s})={self.data_store.get(k, scope=s)}" + for s in self.data_store.search_name(k)]) raise KeyError(f"Given argument {k} with value {v} cannot be set for this experiment due to a " - f"conflict with an existing entry with same naming: {k}={self.data_store.get(k)}") + f"conflict with an existing entry with same naming: {s}") def _set_param(self, param: str, value: Any, default: Any = None, scope: str = "general", apply: Callable = None) -> Any: diff --git a/mlair/run_modules/post_processing.py b/mlair/run_modules/post_processing.py index f3909fde29b466af1bf64124ab1d57873ae70d18..e3aa2154559622fdd699d430bc4d386499f5114d 100644 --- a/mlair/run_modules/post_processing.py +++ b/mlair/run_modules/post_processing.py @@ -6,6 +6,8 @@ __date__ = '2019-12-11' import inspect import logging import os +import sys +import traceback from typing import Dict, Tuple, Union, List, Callable import keras @@ -14,13 +16,14 @@ import pandas as pd import xarray as xr from mlair.configuration import path_config -from mlair.data_handler import BootStraps, KerasIterator +from mlair.data_handler import Bootstraps, KerasIterator from 
mlair.helpers.datastore import NameNotFoundInDataStore from mlair.helpers import TimeTracking, statistics, extract_value, remove_items, to_list, tables from mlair.model_modules.linear_model import OrdinaryLeastSquaredModel from mlair.model_modules import AbstractModelClass from mlair.plotting.postprocessing_plotting import PlotMonthlySummary, PlotClimatologicalSkillScore, \ - PlotCompetitiveSkillScore, PlotTimeSeries, PlotBootstrapSkillScore, PlotConditionalQuantiles, PlotSeparationOfScales + PlotCompetitiveSkillScore, PlotTimeSeries, PlotFeatureImportanceSkillScore, PlotConditionalQuantiles, \ + PlotSeparationOfScales, PlotSampleUncertaintyFromBootstrap from mlair.plotting.data_insight_plotting import PlotStationMap, PlotAvailability, PlotAvailabilityHistogram, \ PlotPeriodogram, PlotDataHistogram from mlair.run_modules.run_environment import RunEnvironment @@ -46,7 +49,7 @@ class PostProcessing(RunEnvironment): * `target_var` [.] * `sampling` [.] * `output_shape` [model] - * `evaluate_bootstraps` [postprocessing] and if enabled: + * `evaluate_feature_importance` [postprocessing] and if enabled: * `create_new_bootstraps` [postprocessing] * `bootstrap_path` [postprocessing] @@ -81,11 +84,17 @@ class PostProcessing(RunEnvironment): self._sampling = self.data_store.get("sampling") self.window_lead_time = extract_value(self.data_store.get("output_shape", "model")) self.skill_scores = None - self.bootstrap_skill_scores = None + self.feature_importance_skill_scores = None + self.uncertainty_estimate = None self.competitor_path = self.data_store.get("competitor_path") self.competitors = to_list(self.data_store.get_default("competitors", default=[])) self.forecast_indicator = "nn" + self.observation_indicator = "obs" self.ahead_dim = "ahead" + self.boot_var_dim = "boot_var" + self.uncertainty_estimate_boot_dim = "boots" + self.model_type_dim = "type" + self.index_dim = "index" self._run() def _run(self): @@ -99,25 +108,132 @@ class PostProcessing(RunEnvironment): # 
calculate error metrics on test data self.calculate_test_score() - # bootstraps - if self.data_store.get("evaluate_bootstraps", "postprocessing"): - with TimeTracking(name="calculate bootstraps"): - create_new_bootstraps = self.data_store.get("create_new_bootstraps", "postprocessing") - bootstrap_method = self.data_store.get("bootstrap_method", "postprocessing") - bootstrap_type = self.data_store.get("bootstrap_type", "postprocessing") - self.bootstrap_postprocessing(create_new_bootstraps, bootstrap_type=bootstrap_type, - bootstrap_method=bootstrap_method) + # sample uncertainty + if self.data_store.get("do_uncertainty_estimate", "postprocessing"): + self.estimate_sample_uncertainty() + + # feature importance bootstraps + if self.data_store.get("evaluate_feature_importance", "postprocessing"): + with TimeTracking(name="calculate feature importance using bootstraps"): + create_new_bootstraps = self.data_store.get("create_new_bootstraps", "feature_importance") + bootstrap_method = self.data_store.get("bootstrap_method", "feature_importance") + bootstrap_type = self.data_store.get("bootstrap_type", "feature_importance") + self.calculate_feature_importance(create_new_bootstraps, bootstrap_type=bootstrap_type, + bootstrap_method=bootstrap_method) + if self.feature_importance_skill_scores is not None: + self.report_feature_importance_results(self.feature_importance_skill_scores) # skill scores and error metrics with TimeTracking(name="calculate skill scores"): - skill_score_competitive, skill_score_climatological, errors = self.calculate_error_metrics() + skill_score_competitive, _, skill_score_climatological, errors = self.calculate_error_metrics() self.skill_scores = (skill_score_competitive, skill_score_climatological) self.report_error_metrics(errors) - self.report_error_metrics(skill_score_climatological) + self.report_error_metrics({self.forecast_indicator: skill_score_climatological}) + self.report_error_metrics({"skill_score": skill_score_competitive}) # plotting 
self.plot() + def estimate_sample_uncertainty(self, separate_ahead=False): + """ + Estimate sample uncertainty by using a bootstrap approach. Forecasts are split into individual blocks along time + and randomly drawn with replacement. The resulting behaviour of the error indicates the robustness of each + analyzed model to quantify which model might be superior compared to others. + """ + n_boots = self.data_store.get_default("n_boots", default=1000, scope="uncertainty_estimate") + block_length = self.data_store.get_default("block_length", default="1m", scope="uncertainty_estimate") + evaluate_competitors = self.data_store.get_default("evaluate_competitors", default=True, + scope="uncertainty_estimate") + block_mse = self.calculate_block_mse(evaluate_competitors=evaluate_competitors, separate_ahead=separate_ahead, + block_length=block_length) + self.uncertainty_estimate = statistics.create_n_bootstrap_realizations( + block_mse, dim_name_time=self.index_dim, dim_name_model=self.model_type_dim, + dim_name_boots=self.uncertainty_estimate_boot_dim, n_boots=n_boots) + self.report_sample_uncertainty() + + def report_sample_uncertainty(self, percentiles: list = None): + """ + Store raw results of uncertainty estimate and calculate aggregate statistics and store as raw data but also as + markdown and latex. 
+ """ + report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report") + path_config.check_path_and_create(report_path) + + # store raw results as nc + file_name = os.path.join(report_path, "uncertainty_estimate_raw_results.nc") + self.uncertainty_estimate.to_netcdf(path=file_name) + + # store statistics + if percentiles is None: + percentiles = [.05, .1, .25, .5, .75, .9, .95] + df_descr = self.uncertainty_estimate.to_pandas().describe(percentiles=percentiles).astype("float32") + column_format = tables.create_column_format_for_tex(df_descr) + file_name = os.path.join(report_path, "uncertainty_estimate_statistics.%s") + tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df_descr) + tables.save_to_md(report_path, file_name % "md", df=df_descr) + df_descr.to_csv(file_name % "csv", sep=";") + + def calculate_block_mse(self, evaluate_competitors=True, separate_ahead=False, block_length="1m"): + """ + Transform data into blocks along time axis. Block length can be any frequency like '1m' or '7d'. Data are only + split along time axis, which means that a single block can have very diverse quantities regarding the number of + station or actual data contained. This is intended to analyze not only the robustness against the time but also + against the number of observations and diversity of stations. 
+ """ + path = self.data_store.get("forecast_path") + all_stations = self.data_store.get("stations") + start = self.data_store.get("start", "test") + end = self.data_store.get("end", "test") + index_dim = self.index_dim + coll_dim = "station" + collector = [] + for station in all_stations: + # test data + external_data = self._get_external_data(station, path) + if external_data is not None: + pass + # competitors + if evaluate_competitors is True: + competitor = self.load_competitors(station) + combined = self._combine_forecasts(external_data, competitor, dim=self.model_type_dim) + else: + combined = external_data + + if combined is None: + continue + else: + combined = self.create_full_time_dim(combined, index_dim, self._sampling, start, end) + # get squared errors + errors = self.create_error_array(combined) + # calc mse for each block (single station) + mse = errors.resample(indexer={index_dim: block_length}).mean(skipna=True) + collector.append(mse.assign_coords({coll_dim: station})) + # calc mse for each block (average over all stations) + mse_blocks = xr.concat(collector, dim=coll_dim).mean(dim=coll_dim, skipna=True) + # average also on ahead steps + if separate_ahead is False: + mse_blocks = mse_blocks.mean(dim=self.ahead_dim, skipna=True) + return mse_blocks + + def create_error_array(self, data): + """Calculate squared error of all given time series in relation to observation.""" + errors = data.drop_sel({self.model_type_dim: self.observation_indicator}) + errors1 = errors - data.sel({self.model_type_dim: self.observation_indicator}) + errors2 = errors1 ** 2 + return errors2 + + @staticmethod + def create_full_time_dim(data, dim, sampling, start, end): + """Ensure time dimension to be equidistant. 
Sometimes dates are missing if values have been dropped.""" + start_data = data.coords[dim].values[0] + freq = {"daily": "1D", "hourly": "1H"}.get(sampling) + datetime_index = pd.DataFrame(index=pd.date_range(start, end, freq=freq)) + t = data.sel({dim: start_data}, drop=True) + res = xr.DataArray(coords=[datetime_index.index, *[t.coords[c] for c in t.coords]], dims=[dim, *t.coords]) + res = res.transpose(*data.dims) + res.loc[data.coords] = data + return res + def load_competitors(self, station_name: str) -> xr.DataArray: """ Load all requested and available competitors for a given station. Forecasts must be available in the competitor @@ -137,10 +253,10 @@ class PostProcessing(RunEnvironment): except (FileNotFoundError, KeyError): logging.debug(f"No competitor found for combination '{station_name}' and '{competitor_name}'.") continue - return xr.concat(competing_predictions, "type") if len(competing_predictions) > 0 else None + return xr.concat(competing_predictions, self.model_type_dim) if len(competing_predictions) > 0 else None - def bootstrap_postprocessing(self, create_new_bootstraps: bool, _iter: int = 0, bootstrap_type="singleinput", - bootstrap_method="shuffle") -> None: + def calculate_feature_importance(self, create_new_bootstraps: bool, _iter: int = 0, bootstrap_type="singleinput", + bootstrap_method="shuffle") -> None: """ Calculate skill scores of bootstrapped data. @@ -153,52 +269,64 @@ class PostProcessing(RunEnvironment): :param _iter: internal counter to reduce unnecessary recursive calls (maximum number is 2, otherwise something went wrong). 
""" - self.bootstrap_skill_scores = {} + if _iter == 0: + self.feature_importance_skill_scores = {} for boot_type in to_list(bootstrap_type): - self.bootstrap_skill_scores[boot_type] = {} + if _iter == 0: + self.feature_importance_skill_scores[boot_type] = {} for boot_method in to_list(bootstrap_method): try: if create_new_bootstraps: - self.create_bootstrap_forecast(bootstrap_type=boot_type, bootstrap_method=boot_method) - boot_skill_score = self.calculate_bootstrap_skill_scores(bootstrap_type=boot_type, - bootstrap_method=boot_method) - self.bootstrap_skill_scores[boot_type][boot_method] = boot_skill_score - except FileNotFoundError: + self.create_feature_importance_bootstrap_forecast(bootstrap_type=boot_type, + bootstrap_method=boot_method) + boot_skill_score = self.calculate_feature_importance_skill_scores(bootstrap_type=boot_type, + bootstrap_method=boot_method) + self.feature_importance_skill_scores[boot_type][boot_method] = boot_skill_score + except (FileNotFoundError, ValueError): if _iter != 0: - raise RuntimeError(f"bootstrap_postprocessing ({boot_type}, {boot_type}) was called for the 2nd" - f" time. This means, that something internally goes wrong. Please check for " - f"possible errors") - logging.info(f"Could not load all files for bootstrapping ({boot_type}, {boot_type}), restart " - f"bootstrap postprocessing with create_new_bootstraps=True.") - self.bootstrap_postprocessing(True, _iter=1, bootstrap_type=boot_type, bootstrap_method=boot_method) - - def create_bootstrap_forecast(self, bootstrap_type, bootstrap_method) -> None: + raise RuntimeError(f"calculate_feature_importance ({boot_type}, {boot_type}) was called for the " + f"2nd time. This means, that something internally goes wrong. 
Please check " + f"for possible errors") + logging.info(f"Could not load all files for feature importance ({boot_type}, {boot_type}), restart " + f"calculate_feature_importance with create_new_bootstraps=True.") + self.calculate_feature_importance(True, _iter=1, bootstrap_type=boot_type, + bootstrap_method=boot_method) + + def create_feature_importance_bootstrap_forecast(self, bootstrap_type, bootstrap_method) -> None: """ Create bootstrapped predictions for all stations and variables. These forecasts are saved in bootstrap_path with the names `bootstraps_{var}_{station}.nc` and `bootstraps_labels_{station}.nc`. """ + + def _reshape(d, pos): + if isinstance(d, list): + return list(map(lambda x: _reshape(x, pos), d)) + else: + return d[..., pos] + # forecast with TimeTracking(name=f"{inspect.stack()[0].function} ({bootstrap_type}, {bootstrap_method})"): # extract all requirements from data store forecast_path = self.data_store.get("forecast_path") - number_of_bootstraps = self.data_store.get("number_of_bootstraps", "postprocessing") - dims = ["index", self.ahead_dim, "type"] + number_of_bootstraps = self.data_store.get("n_boots", "feature_importance") + dims = [self.uncertainty_estimate_boot_dim, self.index_dim, self.ahead_dim, self.model_type_dim] for station in self.test_data: X, Y = None, None - bootstraps = BootStraps(station, number_of_bootstraps, bootstrap_type=bootstrap_type, + bootstraps = Bootstraps(station, number_of_bootstraps, bootstrap_type=bootstrap_type, bootstrap_method=bootstrap_method) + number_of_bootstraps = bootstraps.number_of_bootstraps for boot in bootstraps: X, Y, (index, dimension) = boot # make bootstrap predictions - bootstrap_predictions = self.model.predict(X) - if isinstance(bootstrap_predictions, list): # if model is branched model - bootstrap_predictions = bootstrap_predictions[-1] + bootstrap_predictions = [self.model.predict(_reshape(X, pos)) for pos in range(number_of_bootstraps)] + if isinstance(bootstrap_predictions[0], list): # 
if model is branched model + bootstrap_predictions = list(map(lambda x: x[-1], bootstrap_predictions)) # save bootstrap predictions separately for each station and variable combination - bootstrap_predictions = np.expand_dims(bootstrap_predictions, axis=-1) - shape = bootstrap_predictions.shape - coords = (range(shape[0]), range(1, shape[1] + 1)) + bootstrap_predictions = list(map(lambda x: np.expand_dims(x, axis=-1), bootstrap_predictions)) + shape = bootstrap_predictions[0].shape + coords = (range(number_of_bootstraps), range(shape[0]), range(1, shape[1] + 1)) var = f"{index}_{dimension}" if index is not None else str(dimension) tmp = xr.DataArray(bootstrap_predictions, coords=(*coords, [var]), dims=dims) file_name = os.path.join(forecast_path, @@ -206,12 +334,12 @@ class PostProcessing(RunEnvironment): tmp.to_netcdf(file_name) else: # store also true labels for each station - labels = np.expand_dims(Y, axis=-1) + labels = np.expand_dims(Y[..., 0], axis=-1) file_name = os.path.join(forecast_path, f"bootstraps_{station}_{bootstrap_method}_labels.nc") - labels = xr.DataArray(labels, coords=(*coords, ["obs"]), dims=dims) + labels = xr.DataArray(labels, coords=(*coords[1:], [self.observation_indicator]), dims=dims[1:]) labels.to_netcdf(file_name) - def calculate_bootstrap_skill_scores(self, bootstrap_type, bootstrap_method) -> Dict[str, xr.DataArray]: + def calculate_feature_importance_skill_scores(self, bootstrap_type, bootstrap_method) -> Dict[str, xr.DataArray]: """ Calculate skill score of bootstrapped variables. 
@@ -224,10 +352,11 @@ class PostProcessing(RunEnvironment): with TimeTracking(name=f"{inspect.stack()[0].function} ({bootstrap_type}, {bootstrap_method})"): # extract all requirements from data store forecast_path = self.data_store.get("forecast_path") - number_of_bootstraps = self.data_store.get("number_of_bootstraps", "postprocessing") + number_of_bootstraps = self.data_store.get("n_boots", "feature_importance") forecast_file = f"forecasts_norm_%s_test.nc" + reference_name = "orig" - bootstraps = BootStraps(self.test_data[0], number_of_bootstraps, bootstrap_type=bootstrap_type, + bootstraps = Bootstraps(self.test_data[0], number_of_bootstraps, bootstrap_type=bootstrap_type, bootstrap_method=bootstrap_method) number_of_bootstraps = bootstraps.number_of_bootstraps bootstrap_iter = bootstraps.bootstraps() @@ -238,16 +367,13 @@ class PostProcessing(RunEnvironment): file_name = os.path.join(forecast_path, f"bootstraps_{str(station)}_{bootstrap_method}_labels.nc") with xr.open_dataarray(file_name) as da: labels = da.load() - shape = labels.shape # get original forecasts - orig = self.get_orig_prediction(forecast_path, forecast_file % str(station), number_of_bootstraps) - orig = orig.reshape(shape) - coords = (range(shape[0]), range(1, shape[1] + 1), ["orig"]) - orig = xr.DataArray(orig, coords=coords, dims=["index", self.ahead_dim, "type"]) + orig = self.get_orig_prediction(forecast_path, forecast_file % str(station), reference_name=reference_name) + orig.coords[self.index_dim] = labels.coords[self.index_dim] # calculate skill scores for each variable - skill = pd.DataFrame(columns=range(1, self.window_lead_time + 1)) + skill = [] for boot_set in bootstrap_iter: boot_var = f"{boot_set[0]}_{boot_set[1]}" if isinstance(boot_set, tuple) else str(boot_set) file_name = os.path.join(forecast_path, @@ -259,27 +385,34 @@ class PostProcessing(RunEnvironment): for ahead in range(1, self.window_lead_time + 1): data = boot_data.sel({self.ahead_dim: ahead}) boot_scores.append( - 
skill_scores.general_skill_score(data, forecast_name=boot_var, reference_name="orig")) - skill.loc[boot_var] = np.array(boot_scores) + skill_scores.general_skill_score(data, forecast_name=boot_var, + reference_name=reference_name, dim=self.index_dim)) + tmp = xr.DataArray(np.expand_dims(np.array(boot_scores), axis=-1), + coords={self.ahead_dim: range(1, self.window_lead_time + 1), + self.uncertainty_estimate_boot_dim: range(number_of_bootstraps), + self.boot_var_dim: [boot_var]}, + dims=[self.ahead_dim, self.uncertainty_estimate_boot_dim, self.boot_var_dim]) + skill.append(tmp) # collect all results in single dictionary - score[str(station)] = xr.DataArray(skill, dims=["boot_var", self.ahead_dim]) + score[str(station)] = xr.concat(skill, dim=self.boot_var_dim) return score - def get_orig_prediction(self, path, file_name, number_of_bootstraps, prediction_name=None): + def get_orig_prediction(self, path, file_name, prediction_name=None, reference_name=None): if prediction_name is None: prediction_name = self.forecast_indicator file = os.path.join(path, file_name) with xr.open_dataarray(file) as da: - prediction = da.load().sel(type=prediction_name).squeeze() - return self.repeat_data(prediction, number_of_bootstraps) + prediction = da.load().sel({self.model_type_dim: [prediction_name]}) + if reference_name is not None: + prediction.coords[self.model_type_dim] = [reference_name] + return prediction.dropna(dim=self.index_dim) @staticmethod def repeat_data(data, number_of_repetition): if isinstance(data, xr.DataArray): data = data.data - vals = np.tile(data, (number_of_repetition, 1)) - return vals[~np.isnan(vals).any(axis=1), :] + return np.repeat(np.expand_dims(data, axis=-1), number_of_repetition, axis=-1) def _get_model_name(self): """Return model name without path information.""" @@ -336,35 +469,40 @@ class PostProcessing(RunEnvironment): PlotSeparationOfScales(self.test_data, plot_folder=self.plot_path, time_dim=time_dim, window_dim=window_dim, 
target_dim=target_dim, **{"filter_dim": filter_dim}) except Exception as e: - logging.error(f"Could not create plot PlotSeparationOfScales due to the following error: {e}") + logging.error(f"Could not create plot PlotSeparationOfScales due to the following error:" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") try: - if (self.bootstrap_skill_scores is not None) and ("PlotBootstrapSkillScore" in plot_list): - for boot_type, boot_data in self.bootstrap_skill_scores.items(): + if (self.feature_importance_skill_scores is not None) and ("PlotFeatureImportanceSkillScore" in plot_list): + for boot_type, boot_data in self.feature_importance_skill_scores.items(): for boot_method, boot_skill_score in boot_data.items(): try: - PlotBootstrapSkillScore(boot_skill_score, plot_folder=self.plot_path, - model_setup=self.forecast_indicator, sampling=self._sampling, - ahead_dim=self.ahead_dim, separate_vars=to_list(self.target_var), - bootstrap_type=boot_type, bootstrap_method=boot_method) + PlotFeatureImportanceSkillScore( + boot_skill_score, plot_folder=self.plot_path, model_setup=self.forecast_indicator, + sampling=self._sampling, ahead_dim=self.ahead_dim, + separate_vars=to_list(self.target_var), bootstrap_type=boot_type, + bootstrap_method=boot_method) except Exception as e: - logging.error(f"Could not create plot PlotBootstrapSkillScore ({boot_type}, {boot_method}) " - f"due to the following error: {e}") + logging.error(f"Could not create plot PlotFeatureImportanceSkillScore ({boot_type}, " + f"{boot_method}) due to the following error:\n{sys.exc_info()[0]}\n" + f"{sys.exc_info()[1]}\n{sys.exc_info()[2]}") except Exception as e: - logging.error(f"Could not create plot PlotBootstrapSkillScore due to the following error: {e}") + logging.error(f"Could not create plot PlotFeatureImportanceSkillScore due to the following error: {e}") try: if "PlotConditionalQuantiles" in plot_list: PlotConditionalQuantiles(self.test_data.keys(), data_pred_path=path, 
plot_folder=self.plot_path) except Exception as e: - logging.error(f"Could not create plot PlotConditionalQuantiles due to the following error: {e}") + logging.error(f"Could not create plot PlotConditionalQuantiles due to the following error:" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") try: if "PlotMonthlySummary" in plot_list: PlotMonthlySummary(self.test_data.keys(), path, r"forecasts_%s_test.nc", self.target_var, plot_folder=self.plot_path) except Exception as e: - logging.error(f"Could not create plot PlotMonthlySummary due to the following error: {e}") + logging.error(f"Could not create plot PlotMonthlySummary due to the following error:" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") try: if "PlotClimatologicalSkillScore" in plot_list: @@ -373,21 +511,24 @@ class PostProcessing(RunEnvironment): PlotClimatologicalSkillScore(self.skill_scores[1], plot_folder=self.plot_path, score_only=False, extra_name_tag="all_terms_", model_setup=self.forecast_indicator) except Exception as e: - logging.error(f"Could not create plot PlotClimatologicalSkillScore due to the following error: {e}") + logging.error(f"Could not create plot PlotClimatologicalSkillScore due to the following error: {e}" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") try: if "PlotCompetitiveSkillScore" in plot_list: PlotCompetitiveSkillScore(self.skill_scores[0], plot_folder=self.plot_path, model_setup=self.forecast_indicator) except Exception as e: - logging.error(f"Could not create plot PlotCompetitiveSkillScore due to the following error: {e}") + logging.error(f"Could not create plot PlotCompetitiveSkillScore due to the following error: {e}" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") try: if "PlotTimeSeries" in plot_list: PlotTimeSeries(self.test_data.keys(), path, r"forecasts_%s_test.nc", plot_folder=self.plot_path, - sampling=self._sampling) + sampling=self._sampling, ahead_dim=self.ahead_dim) 
except Exception as e: - logging.error(f"Could not create plot PlotTimeSeries due to the following error: {e}") + logging.error(f"Could not create plot PlotTimeSeries due to the following error:\n{sys.exc_info()[0]}\n" + f"{sys.exc_info()[1]}\n{sys.exc_info()[2]}\n{traceback.format_exc()}") try: if "PlotStationMap" in plot_list: @@ -404,7 +545,8 @@ class PostProcessing(RunEnvironment): (self.test_data, {"marker": 9, "ms": 9})] PlotStationMap(generators=gens, plot_folder=self.plot_path, plot_name="station_map_var") except Exception as e: - logging.error(f"Could not create plot PlotStationMap due to the following error: {e}") + logging.error(f"Could not create plot PlotStationMap due to the following error: {e}" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") try: if "PlotAvailability" in plot_list: @@ -412,7 +554,8 @@ class PostProcessing(RunEnvironment): PlotAvailability(avail_data, plot_folder=self.plot_path, time_dimension=time_dim, window_dimension=window_dim) except Exception as e: - logging.error(f"Could not create plot PlotAvailability due to the following error: {e}") + logging.error(f"Could not create plot PlotAvailability due to the following error: {e}" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") try: if "PlotAvailabilityHistogram" in plot_list: @@ -420,7 +563,8 @@ class PostProcessing(RunEnvironment): PlotAvailabilityHistogram(avail_data, plot_folder=self.plot_path, station_dim=iter_dim, history_dim=window_dim) except Exception as e: - logging.error(f"Could not create plot PlotAvailabilityHistogram due to the following error: {e}") + logging.error(f"Could not create plot PlotAvailabilityHistogram due to the following error: {e}" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") try: if "PlotPeriodogram" in plot_list: @@ -428,14 +572,29 @@ class PostProcessing(RunEnvironment): variables_dim=target_dim, sampling=self._sampling, use_multiprocessing=use_multiprocessing) except Exception as 
e: - logging.error(f"Could not create plot PlotPeriodogram due to the following error: {e}") + logging.error(f"Could not create plot PlotPeriodogram due to the following error: {e}" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") try: if "PlotDataHistogram" in plot_list: + upsampling = self.data_store.get_default("upsampling", scope="train", default=False) gens = {"train": self.train_data, "val": self.val_data, "test": self.test_data} - PlotDataHistogram(gens, plot_folder=self.plot_path, time_dim=time_dim, variables_dim=target_dim) + PlotDataHistogram(gens, plot_folder=self.plot_path, time_dim=time_dim, variables_dim=target_dim, + upsampling=upsampling) + except Exception as e: + logging.error(f"Could not create plot PlotDataHistogram due to the following error: {e}" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") + + try: + if "PlotSampleUncertaintyFromBootstrap" in plot_list and self.uncertainty_estimate is not None: + block_length = self.data_store.get_default("block_length", default="1m", scope="uncertainty_estimate") + PlotSampleUncertaintyFromBootstrap( + data=self.uncertainty_estimate, plot_folder=self.plot_path, model_type_dim=self.model_type_dim, + dim_name_boots=self.uncertainty_estimate_boot_dim, error_measure="mean squared error", + error_unit=r"ppb$^2$", block_length=block_length) except Exception as e: - logging.error(f"Could not create plot PlotDataHistogram due to the following error: {e}") + logging.error(f"Could not create plot PlotSampleUncertaintyFromBootstrap due to the following error: {e}" + f"\n{sys.exc_info()[0]}\n{sys.exc_info()[1]}\n{sys.exc_info()[2]}") def calculate_test_score(self): """Evaluate test score of model and save locally.""" @@ -496,10 +655,11 @@ class PostProcessing(RunEnvironment): full_index = self.create_fullindex(observation_data.indexes[time_dimension], self._get_frequency()) prediction_dict = {self.forecast_indicator: nn_prediction, "persi": persistence_prediction, - "obs": 
observation, + self.observation_indicator: observation, "ols": ols_prediction} all_predictions = self.create_forecast_arrays(full_index, list(target_data.indexes[window_dim]), time_dimension, ahead_dim=self.ahead_dim, + index_dim=self.index_dim, type_dim=self.model_type_dim, **prediction_dict) # save all forecasts locally @@ -529,7 +689,7 @@ class PostProcessing(RunEnvironment): with xr.open_dataarray(file) as da: data = da.load() forecast = data.sel(type=[self.forecast_indicator]) - forecast.coords["type"] = [competitor_name] + forecast.coords[self.model_type_dim] = [competitor_name] return forecast def _create_observation(self, data, _, transformation_func: Callable, normalised: bool) -> xr.DataArray: @@ -659,7 +819,7 @@ class PostProcessing(RunEnvironment): @staticmethod def create_forecast_arrays(index: pd.DataFrame, ahead_names: List[Union[str, int]], time_dimension, - ahead_dim="ahead", **kwargs): + ahead_dim="ahead", index_dim="index", type_dim="type", **kwargs): """ Combine different forecast types into single xarray. @@ -672,7 +832,7 @@ class PostProcessing(RunEnvironment): """ keys = list(kwargs.keys()) res = xr.DataArray(np.full((len(index.index), len(ahead_names), len(keys)), np.nan), - coords=[index.index, ahead_names, keys], dims=['index', ahead_dim, 'type']) + coords=[index.index, ahead_names, keys], dims=[index_dim, ahead_dim, type_dim]) for k, v in kwargs.items(): intersection = set(res.index.values) & set(v.indexes[time_dimension].values) match_index = np.array(list(intersection)) @@ -711,18 +871,19 @@ class PostProcessing(RunEnvironment): except (IndexError, KeyError, FileNotFoundError): return None - @staticmethod - def _combine_forecasts(forecast, competitor, dim="type"): + def _combine_forecasts(self, forecast, competitor, dim=None): """ Combine forecast and competitor if both are xarray. If competitor is None, this returns forecasts and vise versa. 
""" + if dim is None: + dim = self.model_type_dim try: return xr.concat([forecast, competitor], dim=dim) except (TypeError, AttributeError): return forecast if competitor is None else competitor - def calculate_error_metrics(self) -> Tuple[Dict, Dict, Dict]: + def calculate_error_metrics(self) -> Tuple[Dict, Dict, Dict, Dict]: """ Calculate error metrics and skill scores of NN forecast. @@ -735,6 +896,7 @@ class PostProcessing(RunEnvironment): path = self.data_store.get("forecast_path") all_stations = self.data_store.get("stations") skill_score_competitive = {} + skill_score_competitive_count = {} skill_score_climatological = {} errors = {} for station in all_stations: @@ -742,24 +904,61 @@ class PostProcessing(RunEnvironment): # test errors if external_data is not None: - errors[station] = statistics.calculate_error_metrics(*map(lambda x: external_data.sel(type=x), - [self.forecast_indicator, "obs"]), - dim="index") - # skill score + model_type_list = external_data.coords[self.model_type_dim].values.tolist() + for model_type in remove_items(model_type_list, self.observation_indicator): + if model_type not in errors.keys(): + errors[model_type] = {} + errors[model_type][station] = statistics.calculate_error_metrics( + *map(lambda x: external_data.sel(**{self.model_type_dim: x}), + [model_type, self.observation_indicator]), dim=self.index_dim) + + # load competitors competitor = self.load_competitors(station) - combined = self._combine_forecasts(external_data, competitor, dim="type") - model_list = remove_items(list(combined.type.values), "obs") if combined is not None else None + combined = self._combine_forecasts(external_data, competitor, dim=self.model_type_dim) + if combined is not None: + model_list = remove_items(combined.coords[self.model_type_dim].values.tolist(), + self.observation_indicator) + else: + model_list = None + + # test errors of competitors + for model_type in remove_items(model_list or [], list(errors.keys())): + if self.observation_indicator 
not in combined.coords[self.model_type_dim]: + continue + if model_type not in errors.keys(): + errors[model_type] = {} + errors[model_type][station] = statistics.calculate_error_metrics( + *map(lambda x: combined.sel(**{self.model_type_dim: x}), + [model_type, self.observation_indicator]), dim=self.index_dim) + + # skill score skill_score = statistics.SkillScores(combined, models=model_list, ahead_dim=self.ahead_dim) if external_data is not None: - skill_score_competitive[station] = skill_score.skill_scores() + skill_score_competitive[station], skill_score_competitive_count[station] = skill_score.skill_scores() internal_data = self._get_internal_data(station, path) if internal_data is not None: skill_score_climatological[station] = skill_score.climatological_skill_scores( internal_data, forecast_name=self.forecast_indicator) - errors.update({"total": self.calculate_average_errors(errors)}) - return skill_score_competitive, skill_score_climatological, errors + for model_type in errors.keys(): + errors[model_type].update({"total": self.calculate_average_errors(errors[model_type])}) + skill_score_competitive.update({"total": self.calculate_average_skill_scores(skill_score_competitive, + skill_score_competitive_count)}) + return skill_score_competitive, skill_score_competitive_count, skill_score_climatological, errors + + @staticmethod + def calculate_average_skill_scores(scores, counts): + avg_skill_score = 0 + n_total = None + for vals in counts.values(): + n_total = vals if n_total is None else n_total.add(vals, fill_value=0) + for station, station_scores in scores.items(): + n = counts.get(station) + avg_skill_score = station_scores.mul(n.div(n_total, fill_value=0), fill_value=0).add(avg_skill_score, + fill_value=0) + return avg_skill_score + @staticmethod def calculate_average_errors(errors): @@ -772,28 +971,57 @@ class PostProcessing(RunEnvironment): avg_error[error_metric] = new_val return avg_error + def report_feature_importance_results(self, results): + 
"""Create a csv file containing all results from feature importance.""" + report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report") + path_config.check_path_and_create(report_path) + res = [] + for boot_type, d0 in results.items(): + for boot_method, d1 in d0.items(): + for station_name, vals in d1.items(): + for boot_var in vals.coords[self.boot_var_dim].values.tolist(): + for ahead in vals.coords[self.ahead_dim].values.tolist(): + res.append([boot_type, boot_method, station_name, boot_var, ahead, + *vals.sel({self.boot_var_dim: boot_var, + self.ahead_dim: ahead}).values.round(5).tolist()]) + col_names = [self.model_type_dim, "method", "station", self.boot_var_dim, self.ahead_dim, + *list(range(len(res[0]) - 5))] + df = pd.DataFrame(res, columns=col_names) + file_name = "feature_importance_skill_score_report_raw.csv" + df.to_csv(os.path.join(report_path, file_name), sep=";") + def report_error_metrics(self, errors): report_path = os.path.join(self.data_store.get("experiment_path"), "latex_report") path_config.check_path_and_create(report_path) - metric_collection = {} - for station, station_errors in errors.items(): - if isinstance(station_errors, xr.DataArray): - dim = station_errors.dims[0] - sel_index = [sel for sel in station_errors.coords[dim] if "CASE" in str(sel)] - station_errors = {str(i.values): station_errors.sel(**{dim: i}) for i in sel_index} - for metric, vals in station_errors.items(): - if metric == "n": - continue - pd_vals = pd.DataFrame.from_dict({station: vals}).T - pd_vals.columns = [f"{metric}(t+{x})" for x in vals.coords["ahead"].values] - mc = metric_collection.get(metric, pd.DataFrame()) - mc = mc.append(pd_vals) - metric_collection[metric] = mc - for metric, error_df in metric_collection.items(): - df = error_df.sort_index() - if "total" in df.index: - df.reindex(df.index.drop(["total"]).to_list() + ["total"], ) - column_format = tables.create_column_format_for_tex(df) - file_name = 
f"error_report_{metric}.%s".replace(' ', '_') - tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df) - tables.save_to_md(report_path, file_name % "md", df=df) + for model_type in errors.keys(): + metric_collection = {} + for station, station_errors in errors[model_type].items(): + if isinstance(station_errors, xr.DataArray): + dim = station_errors.dims[0] + sel_index = [sel for sel in station_errors.coords[dim] if "CASE" in str(sel)] + station_errors = {str(i.values): station_errors.sel(**{dim: i}) for i in sel_index} + elif isinstance(station_errors, pd.DataFrame): + sel_index = station_errors.index.tolist() + ahead = station_errors.columns.values + station_errors = {k: xr.DataArray(station_errors[station_errors.index == k].values.flatten(), + dims=["ahead"], coords={"ahead": ahead}).astype(float) + for k in sel_index} + for metric, vals in station_errors.items(): + if metric == "n": + metric = "count" + pd_vals = pd.DataFrame.from_dict({station: vals}).T + pd_vals.columns = [f"{metric}(t+{x})" for x in vals.coords["ahead"].values] + mc = metric_collection.get(metric, pd.DataFrame()) + mc = mc.append(pd_vals) + metric_collection[metric] = mc + for metric, error_df in metric_collection.items(): + df = error_df.sort_index() + if "total" in df.index: + df.reindex(df.index.drop(["total"]).to_list() + ["total"], ) + column_format = tables.create_column_format_for_tex(df) + if model_type == "skill_score": + file_name = f"error_report_{model_type}_{metric}.%s".replace(' ', '_') + else: + file_name = f"error_report_{metric}_{model_type}.%s".replace(' ', '_') + tables.save_to_tex(report_path, file_name % "tex", column_format=column_format, df=df) + tables.save_to_md(report_path, file_name % "md", df=df) diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 08bff85c9c1fe06111ddb47e7a3952404e05c0ac..873919fa93af3e4a43c3b16c382d9746ec26a573 100644 --- a/mlair/run_modules/pre_processing.py +++ 
b/mlair/run_modules/pre_processing.py @@ -10,6 +10,8 @@ from typing import Tuple import multiprocessing import requests import psutil +import random +import dill import pandas as pd @@ -242,6 +244,7 @@ class PreProcessing(RunEnvironment): valid_stations = [] kwargs = self.data_store.create_args_dict(data_handler.requirements(), scope=set_name) use_multiprocessing = self.data_store.get("use_multiprocessing") + tmp_path = self.data_store.get("tmp_path") max_process = self.data_store.get("max_number_multiprocessing") n_process = min([psutil.cpu_count(logical=False), len(set_stations), max_process]) # use only physical cpus @@ -249,18 +252,23 @@ class PreProcessing(RunEnvironment): logging.info("use parallel validate station approach") pool = multiprocessing.Pool(n_process) logging.info(f"running {getattr(pool, '_processes')} processes in parallel") + kwargs.update({"tmp_path": tmp_path, "return_strategy": "reference"}) output = [ pool.apply_async(f_proc, args=(data_handler, station, set_name, store_processed_data), kwds=kwargs) for station in set_stations] for i, p in enumerate(output): - dh, s = p.get() + _res_file, s = p.get() logging.info(f"...finished: {s} ({int((i + 1.) / len(output) * 100)}%)") + with open(_res_file, "rb") as f: + dh = dill.load(f) + os.remove(_res_file) if dh is not None: collection.add(dh) valid_stations.append(s) pool.close() else: # serial solution logging.info("use serial validate station approach") + kwargs.update({"return_strategy": "result"}) for station in set_stations: dh, s = f_proc(data_handler, station, set_name, store_processed_data, **kwargs) if dh is not None: @@ -268,7 +276,7 @@ class PreProcessing(RunEnvironment): valid_stations.append(s) logging.info(f"run for {t_outer} to check {len(set_stations)} station(s). 
Found {len(collection)}/" - f"{len(set_stations)} valid stations.") + f"{len(set_stations)} valid stations ({set_name}).") if set_name == "train": self.store_data_handler_attributes(data_handler, collection) return collection, valid_stations @@ -288,7 +296,8 @@ class PreProcessing(RunEnvironment): def transformation(self, data_handler: AbstractDataHandler, stations): if hasattr(data_handler, "transformation"): kwargs = self.data_store.create_args_dict(data_handler.requirements(), scope="train") - transformation_dict = data_handler.transformation(stations, **kwargs) + tmp_path = self.data_store.get_default("tmp_path", default=None) + transformation_dict = data_handler.transformation(stations, tmp_path=tmp_path, **kwargs) if transformation_dict is not None: self.data_store.set("transformation", transformation_dict) @@ -312,12 +321,13 @@ class PreProcessing(RunEnvironment): logging.info("No preparation required because no competitor was provided to the workflow.") -def f_proc(data_handler, station, name_affix, store, **kwargs): +def f_proc(data_handler, station, name_affix, store, return_strategy="", tmp_path=None, **kwargs): """ Try to create a data handler for given arguments. If build fails, this station does not fulfil all requirements and - therefore f_proc will return None as indication. On a successfull build, f_proc returns the built data handler and + therefore f_proc will return None as indication. On a successful build, f_proc returns the built data handler and the station that was used. This function must be implemented globally to work together with multiprocessing. 
""" + assert return_strategy in ["result", "reference"] try: res = data_handler.build(station, name_affix=name_affix, store_processed_data=store, **kwargs) except (AttributeError, EmptyQueryResult, KeyError, requests.ConnectionError, ValueError, IndexError) as e: @@ -326,7 +336,15 @@ def f_proc(data_handler, station, name_affix, store, **kwargs): f"remove station {station} because it raised an error: {e} -> {' | '.join(f_inspect_error(formatted_lines))}") logging.debug(f"detailed information for removal of station {station}: {traceback.format_exc()}") res = None - return res, station + if return_strategy == "result": + return res, station + else: + if tmp_path is None: + tmp_path = os.getcwd() + _tmp_file = os.path.join(tmp_path, f"{station}_{'%032x' % random.getrandbits(128)}.pickle") + with open(_tmp_file, "wb") as f: + dill.dump(res, f, protocol=4) + return _tmp_file, station def f_inspect_error(formatted): diff --git a/mlair/workflows/default_workflow.py b/mlair/workflows/default_workflow.py index 5894555a6af52299efcd8d88d76c0d3791a1599e..961979cb774e928bda96d4cd1a3a7b0f8565e968 100644 --- a/mlair/workflows/default_workflow.py +++ b/mlair/workflows/default_workflow.py @@ -31,7 +31,6 @@ class DefaultWorkflow(Workflow): permute_data_on_training=None, extreme_values=None, extremes_on_right_tail_only=None, transformation=None, train_min_length=None, val_min_length=None, test_min_length=None, - evaluate_bootstraps=None, number_of_bootstraps=None, create_new_bootstraps=None, plot_list=None, model=None, batch_size=None, diff --git a/run.py b/run.py index eb703e11cf9a3028dda0368ac3f7b7ca1578bd2a..11cc01257fdf4535845a2cfedb065dd27942ef66 100644 --- a/run.py +++ b/run.py @@ -25,7 +25,7 @@ def main(parser_args): # stations=["DEBW087","DEBW013", "DEBW107", "DEBW076"], stations=["DEBW013", "DEBW087", "DEBW107", "DEBW076"], train_model=False, create_new_model=True, network="UBA", - evaluate_bootstraps=False, # plot_list=["PlotCompetitiveSkillScore"], + 
evaluate_feature_importance=False, # plot_list=["PlotCompetitiveSkillScore"], competitors=["test_model", "test_model2"], competitor_path=os.path.join(os.getcwd(), "data", "comp_test"), **parser_args.__dict__, start_script=__file__) diff --git a/run_climate_filter.py b/run_climate_filter.py new file mode 100755 index 0000000000000000000000000000000000000000..7f3fcbaaba94506faeadd884073620496155f8ea --- /dev/null +++ b/run_climate_filter.py @@ -0,0 +1,91 @@ +__author__ = "Lukas Leufen" +__date__ = '2019-11-14' + +import argparse + +from mlair.workflows import DefaultWorkflow +from mlair.data_handler.data_handler_mixed_sampling import DataHandlerMixedSamplingWithClimateFirFilter +from mlair.model_modules.fully_connected_networks import BranchedInputFCN as NN +from mlair.configuration.defaults import DEFAULT_PLOT_LIST, DEFAULT_TRAIN_END + + +STATS = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'maximum', 'u': 'average_values', + 'v': 'average_values', 'no': 'dma8eu', 'no2': 'dma8eu', 'cloudcover': 'average_values', + 'pblheight': 'maximum'} + +DATA_ORIGIN = {"no": "", "no2": "", "o3": "", + "cloudcover": "REA", "pblheight": "REA", "relhum": "REA", + "temp": "REA", "u": "REA", "v": "REA"} + + +def main(parser_args): + # plots = remove_items(DEFAULT_PLOT_LIST, "PlotConditionalQuantiles") + args = dict( + sampling=("hourly", "daily"), # T1A, T1D + stations=["DEBW107", "DEBW013"], # [:5], # T1B, TODO: remove indexing for meaningful experiments + network="UBA", # T1B + variables=["o3", "no", "no2", "cloudcover", "pblheight", "relhum", "temp", "u", "v"], # T1B + statistics_per_var=STATS, # T1A, T1D, T1F + + data_origin=DATA_ORIGIN, + # data_handler=DataHandlerMixedSampling, + # window_history_offset=16, + # window_history_size=6 * 24 + 16, # T1D + data_handler=DataHandlerMixedSamplingWithClimateFirFilter, + filter_cutoff_period=[21], + filter_order=[42], + filter_window_type=("kaiser", 5), + filter_add_unfiltered=True, + apriori_sel_opts=slice(DEFAULT_TRAIN_END), + 
apriori_type="residuum_stats", + apriori_diurnal=True, + use_filter_branches=True, + + window_history_size=2 * 24 + 16, + window_history_offset=16, # T1F + window_lead_time=4, # T1D + target_var="o3", # T1D + interpolation_limit=(24, 2), # T1F + transformation={ + "o3": {"method": "log"}, + "no": {"method": "log"}, + "no2": {"method": "log"}, + "cloudcover": {"method": "min_max", "feature_range": [-1, 1]}, + "pblheight": {"method": "log"}, + "relhum": {"method": "min_max", "feature_range": [-1, 1]}, + "temp": {"method": "standardise"}, + "u": {"method": "standardise"}, + "v": {"method": "standardise"}, }, # T1F, also apply same target transformation + + start="2006-01-01", + train_start="2006-01-01", + end="2011-12-31", + test_end="2011-12-31", + + train_model=False, create_new_model=True, + epochs=1, + model=NN, + evaluate_feature_importance=False, + feature_importance_bootstrap_type=["singleinput", "branch", "variable"], + feature_importance_bootstrap_method=["shuffle", "zero_mean"], + plot_list=DEFAULT_PLOT_LIST, + # competitors=["IntelliO3-ts-v1", "MFCN_64_32_16_climFIR", "FCN_1449_512_32_4"], # "FCN_1449_16_8_4",], + lazy_preprocessing=True, + use_multiprocessing=True, + use_multiprocessing_on_debug=False, + overwrite_local_data=False, + overwrite_lazy_data=False, + max_number_multiprocessing=2, + **parser_args.__dict__ + ) + print(parser_args.__dict__) + workflow = DefaultWorkflow(**args, start_script=__file__) + workflow.run() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default=None, + help="set experiment date as string") + args = parser.parse_args(["--experiment_date", "testrun"]) + main(args) diff --git a/run_mixed_sampling.py b/run_mixed_sampling.py index 819ef51129854b4539632ef91a55e33a2607eb55..784f653fbfb2eb4c78e6e858acf67cd0ae47a593 100644 --- a/run_mixed_sampling.py +++ b/run_mixed_sampling.py @@ -20,7 +20,7 @@ data_origin = {'o3': '', 'no': '', 
'no2': '', def main(parser_args): args = dict(stations=["DEBW107", "DEBW013"], network="UBA", - evaluate_bootstraps=False, plot_list=[], + evaluate_feature_importance=False, plot_list=[], data_origin=data_origin, data_handler=DataHandlerMixedSampling, interpolation_limit=(3, 1), overwrite_local_data=False, sampling=("hourly", "daily"), diff --git a/test/test_configuration/test_defaults.py b/test/test_configuration/test_defaults.py index b6bdd9556f73ff711003b01c3a2b65a1c20c66d3..f6bc6d24724c2620083602d3864bcbca0a709681 100644 --- a/test/test_configuration/test_defaults.py +++ b/test/test_configuration/test_defaults.py @@ -62,10 +62,16 @@ class TestAllDefaults: assert DEFAULT_HPC_LOGIN_LIST == ["ju", "hdfmll"] def test_postprocessing_parameters(self): - assert DEFAULT_EVALUATE_BOOTSTRAPS is True - assert DEFAULT_CREATE_NEW_BOOTSTRAPS is False - assert DEFAULT_NUMBER_OF_BOOTSTRAPS == 20 + assert DEFAULT_DO_UNCERTAINTY_ESTIMATE is True + assert DEFAULT_UNCERTAINTY_ESTIMATE_BLOCK_LENGTH == "1m" + assert DEFAULT_UNCERTAINTY_ESTIMATE_EVALUATE_COMPETITORS is True + assert DEFAULT_UNCERTAINTY_ESTIMATE_N_BOOTS == 1000 + assert DEFAULT_EVALUATE_FEATURE_IMPORTANCE is True + assert DEFAULT_FEATURE_IMPORTANCE_CREATE_NEW_BOOTSTRAPS is False + assert DEFAULT_FEATURE_IMPORTANCE_N_BOOTS == 20 + assert DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_TYPE == "singleinput" + assert DEFAULT_FEATURE_IMPORTANCE_BOOTSTRAP_METHOD == "shuffle" assert DEFAULT_PLOT_LIST == ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", - "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", + "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotFeatureImportanceSkillScore", "PlotConditionalQuantiles", "PlotAvailability", "PlotAvailabilityHistogram", - "PlotDataHistogram", "PlotPeriodogram"] + "PlotDataHistogram", "PlotPeriodogram", "PlotSampleUncertaintyFromBootstrap"] diff --git a/test/test_data_handler/old_t_bootstraps.py b/test/test_data_handler/old_t_bootstraps.py index 
21c18c6c2d6f6a6a38a41250f00d3d14a29ed457..e21af9f614d4cfaaadf946cdab72cc69cd5b19a7 100644 --- a/test/test_data_handler/old_t_bootstraps.py +++ b/test/test_data_handler/old_t_bootstraps.py @@ -7,7 +7,7 @@ import numpy as np import pytest import xarray as xr -from mlair.data_handler.bootstraps import BootStraps +from mlair.data_handler.input_bootstraps import Bootstraps from src.data_handler import DataPrepJoin @@ -171,22 +171,22 @@ class TestBootStraps: @pytest.fixture def bootstrap(self, orig_generator, data_path): - return BootStraps(orig_generator, data_path, 20) + return Bootstraps(orig_generator, data_path, 20) @pytest.fixture @mock.patch("mlair.data_handling.bootstraps.CreateShuffledData", return_value=None) def bootstrap_no_shuffling(self, mock_create_shuffle_data, orig_generator, data_path): shutil.rmtree(data_path) - return BootStraps(orig_generator, data_path, 20) + return Bootstraps(orig_generator, data_path, 20) def test_init_no_shuffling(self, bootstrap_no_shuffling, data_path): - assert isinstance(bootstrap_no_shuffling, BootStraps) + assert isinstance(bootstrap_no_shuffling, Bootstraps) assert bootstrap_no_shuffling.number_of_bootstraps == 20 assert bootstrap_no_shuffling.bootstrap_path == data_path def test_init_with_shuffling(self, orig_generator, data_path, caplog): caplog.set_level(logging.INFO) - BootStraps(orig_generator, data_path, 20) + Bootstraps(orig_generator, data_path, 20) assert caplog.record_tuples[0] == ('root', logging.INFO, "create / check shuffled bootstrap data") def test_stations(self, bootstrap_no_shuffling, orig_generator): @@ -213,9 +213,9 @@ class TestBootStraps: @mock.patch("mlair.data_handling.data_generator.DataGenerator._load_pickle_data", side_effect=FileNotFoundError) def test_get_generator_different_generator(self, mock_load_pickle, data_path, orig_generator): - BootStraps(orig_generator, data_path, 20) # to create + Bootstraps(orig_generator, data_path, 20) # to create orig_generator.window_history_size = 4 - bootstrap 
= BootStraps(orig_generator, data_path, 20) + bootstrap = Bootstraps(orig_generator, data_path, 20) station = bootstrap.stations[0] var = bootstrap.variables[0] var_others = bootstrap.variables[1:] diff --git a/test/test_helpers/test_statistics.py b/test/test_helpers/test_statistics.py index 2a77f0806b886bee1a3961ff0a972e8f0ee62873..f5148cdc293939d5711afb57c2fa009c47b6c86d 100644 --- a/test/test_helpers/test_statistics.py +++ b/test/test_helpers/test_statistics.py @@ -4,7 +4,8 @@ import pytest import xarray as xr from mlair.helpers.statistics import standardise, standardise_inverse, standardise_apply, centre, centre_inverse, \ - centre_apply, apply_inverse_transformation, min_max, min_max_inverse, min_max_apply, log, log_inverse, log_apply + centre_apply, apply_inverse_transformation, min_max, min_max_inverse, min_max_apply, log, log_inverse, log_apply, \ + create_single_bootstrap_realization, calculate_average, create_n_bootstrap_realizations lazy = pytest.lazy_fixture @@ -221,3 +222,36 @@ class TestLog: data_ref, opts = log(data_orig, dim) data_test = log_apply(data_orig, opts["mean"], opts["std"]) assert np.testing.assert_array_almost_equal(data_ref, data_test) is None + + +class TestCreateBootstrapRealizations: + + @pytest.fixture + def data(self): + return xr.DataArray(np.array(range(20)).reshape(2 , -1).T, + dims={'time': range(10), 'model': ['m1', 'm2']}, + coords={'time': range(10), 'model': ['m1', 'm2']}) + + def test_create_single_bootstrap_realization(self, data): + np.random.seed(42) + proc_data = create_single_bootstrap_realization(data, "time") + assert isinstance(proc_data, xr.DataArray) + assert (proc_data.coords['time'].values == np.array([6, 3, 7, 4, 6, 9, 2, 6, 7, 4])).all() + # check if all time index values of proc_data are from data + assert np.in1d(proc_data.indexes['time'].values, data.indexes['time'].values).all() + + def test_calculate_average(self, data): + assert isinstance(data, xr.DataArray) + assert calculate_average(data) == 
data.mean() + assert (calculate_average(data, axis=0) == data.mean(axis=0)).all() + + def test_create_n_bootstrap_realizations(self, data): + boot_data = create_n_bootstrap_realizations(data, dim_name_time='time', dim_name_model='model', + n_boots=1000, dim_name_boots='boots') + assert isinstance(boot_data, xr.DataArray) + assert boot_data.shape == (1000, 2) + + boot_data = create_n_bootstrap_realizations(data.sel(model='m1').squeeze(), dim_name_time='time', + dim_name_model='model', n_boots=1000, dim_name_boots='boots') + assert isinstance(boot_data, xr.DataArray) + assert boot_data.shape == (1000,)