diff --git a/.gitlab/issue_templates/release.md b/.gitlab/issue_templates/release.md index 618738d3184c68514fe32602af32188e001d228b..a95cf033eed919339c6c1734638542c3e0cdbc57 100644 --- a/.gitlab/issue_templates/release.md +++ b/.gitlab/issue_templates/release.md @@ -15,7 +15,7 @@ vX.Y.Z * [ ] Update version number in `mlair/__ init__.py` * [ ] Create new dist file: `python3 setup.py sdist bdist_wheel` * [ ] Update file link `distribution file (current version)` in `README.md` -* [ ] Update file link in `docs/_source/get-started.rst` +* [ ] Update file link in `docs/_source/installation.rst` * [ ] Commit + push * [ ] Merge `release_vX.Y.Z` into `master` * [ ] Create new tag with diff --git a/CHANGELOG.md b/CHANGELOG.md index d3989b65d2206d26aff9778582ef99014e45ce2f..82163eced04bc50d1ebb352bff20ca23dda55711 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,27 @@ # Changelog All notable changes to this project will be documented in this file. +## v1.3.0 - 2021-02-24 - competitors and improved transformation + +### general: +* release of official MLAir logo (#274) +* new transformation schema for better independence of MLAir and data handler (#272) +* competing models can be included in postprocessing for direct comparison (#198) + +### new features: +* new helper functions for geographic issues (#280) +* default data handler and inheritances can use min/max and log transformation (#276, #275) +* include IntelliO3-ts model as reference via automatic download (#131) + +### technical: +* experiment name now always includes target sampling type (#263) +* competitive skill score plot is refactored (#260) +* bug fix for climatological skill scores (#259) +* bug fix for custom objects handling (#277) +* bug fix for monitoring plots when multiple output branches are used (#278) +* update requirements to newer version and dependencies (#262, #273) +* HPC scripts are updated to work properly with parallel data processing (#281) + ## v1.2.1 - 2021-02-08 - bug fix for 
recursive import error ### general: @@ -9,7 +30,7 @@ All notable changes to this project will be documented in this file. ### technical: -* bug fix for recursive import error, #269 +* bug fix for recursive import error, (#269) ## v1.2.0 - 2020-12-18 - parallel preprocessing and improved data handlers diff --git a/README.md b/README.md index 3733882832181c721188005050f775e40ec23878..cbaa61d2632e46ceb44d39bfeceef7423d1b9784 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ HPC systems, see [here](#special-instructions-for-installation-on-jülich-hpc-sy * Installation of **MLAir**: * Either clone MLAir from the [gitlab repository](https://gitlab.version.fz-juelich.de/toar/mlair.git) and use it without installation (beside the requirements) - * or download the distribution file ([current version](https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.2.1-py3-none-any.whl)) + * or download the distribution file ([current version](https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.3.0-py3-none-any.whl)) and install it via `pip install <dist_file>.whl`. In this case, you can simply import MLAir in any python script inside your virtual environment using `import mlair`. * (tf) Currently, TensorFlow-1.13 is mentioned in the requirements. 
We already tested the TensorFlow-1.15 version and couldn't diff --git a/dist/mlair-1.3.0-py3-none-any.whl b/dist/mlair-1.3.0-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..170a938db6dc7309ff9762723aafd815b97dfbeb Binary files /dev/null and b/dist/mlair-1.3.0-py3-none-any.whl differ diff --git a/docs/_source/defaults.rst b/docs/_source/defaults.rst index 775134f5761b27cbdc6927efaf3e3d6fa1dd68cf..e95cf10eb8b53e776a2607dafba52fd1edad98ca 100644 --- a/docs/_source/defaults.rst +++ b/docs/_source/defaults.rst @@ -17,6 +17,7 @@ create_new_model data_handler data_origin data_path +debug - MLAir checks if it is running in debug mode and stores this dimensions end epochs @@ -57,6 +58,7 @@ train_start transformation :py:`{}` implement all further transformation functionality inside your custom data handler use_all_stations_on_all_data_sets +use_multiprocessing :py:`True` is set to False if MLAir is running in debug mode upsampling val_end val_min_length diff --git a/docs/_source/installation.rst b/docs/_source/installation.rst index 7578d9abf49b9e4b67dac19b6263c4bc05110eea..20db920216b2f0cda7568e7a153a6176d441e995 100644 --- a/docs/_source/installation.rst +++ b/docs/_source/installation.rst @@ -26,7 +26,7 @@ Installation of MLAir * Install all requirements from `requirements.txt <https://gitlab.version.fz-juelich.de/toar/machinelearningtools/-/blob/master/requirements.txt>`_ preferably in a virtual environment * Either clone MLAir from the `gitlab repository <https://gitlab.version.fz-juelich.de/toar/machinelearningtools.git>`_ -* or download the distribution file (`current version <https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.2.0-py3-none-any.whl>`_) +* or download the distribution file (`current version <https://gitlab.version.fz-juelich.de/toar/mlair/-/blob/master/dist/mlair-1.3.0-py3-none-any.whl>`_) and install it via :py:`pip install <dist_file>.whl`. 
In this case, you can simply import MLAir in any python script inside your virtual environment using :py:`import mlair`. * (tf) Currently, TensorFlow-1.13 is mentioned in the requirements. We already tested the TensorFlow-1.15 version and couldn't diff --git a/mlair/__init__.py b/mlair/__init__.py index e8ad9509e3fb90826c862d1a17641df071168b18..05e8d504fbf171b8889343161252bbd439e52473 100644 --- a/mlair/__init__.py +++ b/mlair/__init__.py @@ -1,7 +1,7 @@ __version_info__ = { 'major': 1, - 'minor': 2, - 'micro': 1, + 'minor': 3, + 'micro': 0, } from mlair.run_modules import RunEnvironment, ExperimentSetup, PreProcessing, ModelSetup, Training, PostProcessing diff --git a/mlair/configuration/defaults.py b/mlair/configuration/defaults.py index 04e441fe2ec3b421cf5f0ad1469584f5ef2aa668..8805acfc99d2064b656e3fc80c95a6de198acf29 100644 --- a/mlair/configuration/defaults.py +++ b/mlair/configuration/defaults.py @@ -53,6 +53,7 @@ DEFAULT_SAMPLING = "daily" DEFAULT_DATA_ORIGIN = {"cloudcover": "REA", "humidity": "REA", "pblheight": "REA", "press": "REA", "relhum": "REA", "temp": "REA", "totprecip": "REA", "u": "REA", "v": "REA", "no": "", "no2": "", "o3": "", "pm10": "", "so2": ""} +DEFAULT_USE_MULTIPROCESSING = True def get_defaults(): diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py index ff1f13aed4b8d829edc653c6e99b6cff82287476..15271c8041decd8a7aad6f053feb61f8343c82b5 100644 --- a/mlair/data_handler/default_data_handler.py +++ b/mlair/data_handler/default_data_handler.py @@ -39,7 +39,8 @@ class DefaultDataHandler(AbstractDataHandler): def __init__(self, id_class: data_handler, experiment_path: str, min_length: int = 0, extreme_values: num_or_list = None, extremes_on_right_tail_only: bool = False, name_affix=None, - store_processed_data=True, iter_dim=DEFAULT_ITER_DIM, time_dim=DEFAULT_TIME_DIM): + store_processed_data=True, iter_dim=DEFAULT_ITER_DIM, time_dim=DEFAULT_TIME_DIM, + use_multiprocessing=True): 
super().__init__() self.id_class = id_class self.time_dim = time_dim @@ -49,6 +50,7 @@ class DefaultDataHandler(AbstractDataHandler): self._Y = None self._X_extreme = None self._Y_extreme = None + self._use_multiprocessing = use_multiprocessing _name_affix = str(f"{str(self.id_class)}_{name_affix}" if name_affix is not None else id(self)) self._save_file = os.path.join(experiment_path, "data", f"{_name_affix}.pickle") self._collection = self._create_collection() @@ -286,7 +288,7 @@ class DefaultDataHandler(AbstractDataHandler): new = opts.get(k) transformation_dict[i][var][k] = new if old is None else old.combine_first(new) - if multiprocessing.cpu_count() > 1: # parallel solution + if multiprocessing.cpu_count() > 1 and kwargs.get("use_multiprocessing", True) is True: # parallel solution logging.info("use parallel transformation approach") pool = multiprocessing.Pool( min([psutil.cpu_count(logical=False), len(set_stations), 16])) # use only physical cpus diff --git a/mlair/run_modules/experiment_setup.py b/mlair/run_modules/experiment_setup.py index 30672ecc9206319896205d886157b2f2f8977f39..c777bcc4d568862485d733ca42f2ea38c52799eb 100644 --- a/mlair/run_modules/experiment_setup.py +++ b/mlair/run_modules/experiment_setup.py @@ -4,6 +4,7 @@ __date__ = '2019-11-15' import argparse import logging import os +import sys from typing import Union, Dict, Any, List, Callable from mlair.configuration import path_config @@ -17,7 +18,8 @@ from mlair.configuration.defaults import DEFAULT_STATIONS, DEFAULT_VAR_ALL_DICT, DEFAULT_TRAIN_START, DEFAULT_TRAIN_END, DEFAULT_TRAIN_MIN_LENGTH, DEFAULT_VAL_START, DEFAULT_VAL_END, \ DEFAULT_VAL_MIN_LENGTH, DEFAULT_TEST_START, DEFAULT_TEST_END, DEFAULT_TEST_MIN_LENGTH, DEFAULT_TRAIN_VAL_MIN_LENGTH, \ DEFAULT_USE_ALL_STATIONS_ON_ALL_DATA_SETS, DEFAULT_EVALUATE_BOOTSTRAPS, DEFAULT_CREATE_NEW_BOOTSTRAPS, \ - DEFAULT_NUMBER_OF_BOOTSTRAPS, DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM + DEFAULT_NUMBER_OF_BOOTSTRAPS, 
DEFAULT_PLOT_LIST, DEFAULT_SAMPLING, DEFAULT_DATA_ORIGIN, DEFAULT_ITER_DIM, \ + DEFAULT_USE_MULTIPROCESSING from mlair.data_handler import DefaultDataHandler from mlair.run_modules.run_environment import RunEnvironment from mlair.model_modules.fully_connected_networks import FCN_64_32_16 as VanillaModel @@ -62,48 +64,6 @@ class ExperimentSetup(RunEnvironment): * `target_dim` [.] * `window_lead_time` [.] - # interpolation - self._set_param("dimensions", dimensions, default={'new_index': ['datetime', 'Stations']}) - self._set_param("time_dim", time_dim, default='datetime') - self._set_param("interpolation_method", interpolation_method, default='linear') - self._set_param("limit_nan_fill", limit_nan_fill, default=1) - - # train set parameters - self._set_param("start", train_start, default="1997-01-01", scope="train") - self._set_param("end", train_end, default="2007-12-31", scope="train") - self._set_param("min_length", train_min_length, default=90, scope="train") - - # validation set parameters - self._set_param("start", val_start, default="2008-01-01", scope="val") - self._set_param("end", val_end, default="2009-12-31", scope="val") - self._set_param("min_length", val_min_length, default=90, scope="val") - - # test set parameters - self._set_param("start", test_start, default="2010-01-01", scope="test") - self._set_param("end", test_end, default="2017-12-31", scope="test") - self._set_param("min_length", test_min_length, default=90, scope="test") - - # train_val set parameters - self._set_param("start", self.data_store.get("start", "train"), scope="train_val") - self._set_param("end", self.data_store.get("end", "val"), scope="train_val") - train_val_min_length = sum([self.data_store.get("min_length", s) for s in ["train", "val"]]) - self._set_param("min_length", train_val_min_length, default=180, scope="train_val") - - # use all stations on all data sets (train, val, test) - self._set_param("use_all_stations_on_all_data_sets", use_all_stations_on_all_data_sets, 
default=True) - - # set post-processing instructions - self._set_param("evaluate_bootstraps", evaluate_bootstraps, scope="general.postprocessing") - create_new_bootstraps = max([self.data_store.get("train_model", "general"), create_new_bootstraps or False]) - self._set_param("create_new_bootstraps", create_new_bootstraps, scope="general.postprocessing") - self._set_param("number_of_bootstraps", number_of_bootstraps, default=20, scope="general.postprocessing") - self._set_param("plot_list", plot_list, default=DEFAULT_PLOT_LIST, scope="general.postprocessing") - - # check variables, statistics and target variable - self._check_target_var() - self._compare_variables_and_statistics() - - Creates * plot of model architecture in `<model_name>.pdf` @@ -135,8 +95,11 @@ class ExperimentSetup(RunEnvironment): predicted. :param dimensions: :param time_dim: - :param interpolation_method: - :param limit_nan_fill: + :param interpolation_method: The method to use for interpolation. + :param interpolation_limit: The maximum number of subsequent time steps in a gap to fill by interpolation. If the + gap exceeds this number, the gap is not filled by interpolation at all. The value of time steps is an arbitrary + number that is applied depending on the `sampling` frequency. A limit of 2 means that either 2 hours or 2 days + are allowed to be interpolated in dependency of the set sampling rate. :param train_start: :param train_end: :param val_start: @@ -197,6 +160,29 @@ class ExperimentSetup(RunEnvironment): :param data_path: path to find and store meteorological and environmental / air quality data. Leave this parameter empty, if your host system is known and a suitable path was already hardcoded in the program (see :py:func:`prepare host <src.configuration.path_config.prepare_host>`). + :param experiment_date: + :param window_dim: "Temporal" dimension of the input and target data, that is provided for each sample. 
The number + of samples provided in this dimension can be set using `window_history_size` for inputs and `window_lead_time` + on the target side. + :param iter_dim: + :param batch_path: + :param login_nodes: + :param hpc_hosts: + :param model: + :param batch_size: + :param epochs: Number of epochs used in training. If a training is resumed and the number of epochs of the already + (partly) trained model is lower than this parameter, training is continued. In case this number is higher than + the given epochs parameter, no training is resumed. Epochs is set to 20 per default, but this value is just a + placeholder that should be adjusted for a meaningful training. + :param data_handler: + :param data_origin: + :param competitors: Provide names of reference models trained by MLAir that can be found in the `competitor_path`. + These models will be used in the postprocessing for comparison. + :param competitor_path: The path where MLAir can find competing models. If not provided, this path is assumed to be + in the `data_path` directory as a subdirectory called `competitors` (default). + :param use_multiprocessing: Enable parallel preprocessing (postprocessing not implemented yet) by setting this + parameter to `True` (default). If set to `False` the computation is performed in a serial approach. + Multiprocessing is disabled when running in debug mode and cannot be switched on. 
""" @@ -228,7 +214,8 @@ class ExperimentSetup(RunEnvironment): number_of_bootstraps=None, create_new_bootstraps=None, data_path: str = None, batch_path: str = None, login_nodes=None, hpc_hosts=None, model=None, batch_size=None, epochs=None, data_handler=None, - data_origin: Dict = None, competitors: list = None, competitor_path: str = None, **kwargs): + data_origin: Dict = None, competitors: list = None, competitor_path: str = None, + use_multiprocessing: bool = None, **kwargs): # create run framework super().__init__() @@ -265,6 +252,12 @@ class ExperimentSetup(RunEnvironment): logging.info(f"Experiment path is: {experiment_path}") path_config.check_path_and_create(self.data_store.get("experiment_path")) + # host system setup + debug_mode = sys.gettrace() is not None + self._set_param("debug_mode", debug_mode) + use_multiprocessing = False if debug_mode is True else use_multiprocessing + self._set_param("use_multiprocessing", use_multiprocessing, default=DEFAULT_USE_MULTIPROCESSING) + # batch path (temporary) self._set_param("batch_path", batch_path, default=os.path.join(experiment_path, "batch_data")) diff --git a/mlair/run_modules/pre_processing.py b/mlair/run_modules/pre_processing.py index 422ae8eea023351e24492144571a3fdcc455a7f8..d38e3b006fa51254ec09015aacbb51800e099138 100644 --- a/mlair/run_modules/pre_processing.py +++ b/mlair/run_modules/pre_processing.py @@ -245,8 +245,9 @@ class PreProcessing(RunEnvironment): collection = DataCollection(name=set_name) valid_stations = [] kwargs = self.data_store.create_args_dict(data_handler.requirements(), scope=set_name) + use_multiprocessing = self.data_store.get("use_multiprocessing") - if multiprocessing.cpu_count() > 1: # parallel solution + if multiprocessing.cpu_count() > 1 and use_multiprocessing: # parallel solution logging.info("use parallel validate station approach") pool = multiprocessing.Pool( min([psutil.cpu_count(logical=False), len(set_stations), 16])) # use only physical cpus