diff --git a/docs/requirements_docs.txt b/docs/requirements_docs.txt index 772efd78abe7713a5a4ce94a47c0515fc4276c2a..a1294e314d9d04402ba7c063754a56b49deab602 100644 --- a/docs/requirements_docs.txt +++ b/docs/requirements_docs.txt @@ -1,5 +1,5 @@ sphinx==3.0.3 sphinx-autoapi==1.3.0 +sphinx-autodoc-typehints==1.10.3 sphinx-rtd-theme==0.4.3 -recommonmark==0.6.0 -sphinx-autodoc-typehints==1.10.3 \ No newline at end of file +recommonmark==0.6.0 \ No newline at end of file diff --git a/src/configuration/path_config.py b/src/configuration/path_config.py index 6fdbbd2930596d57e953d3f6994face595357c25..289c15821db587b0866eb4808981cfae640cb9a0 100644 --- a/src/configuration/path_config.py +++ b/src/configuration/path_config.py @@ -8,17 +8,19 @@ from typing import Tuple ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) -def prepare_host(create_new=True, sampling="daily") -> str: +def prepare_host(create_new=True, data_path=None, sampling="daily") -> str: """ Set up host path. - Warning: This functions can only handle known hosts. For the moment, please add your hostname hardcoded here. For - future, this will be replace by a more flexible configuration file setup. + INFO: This function is designed to handle known hosts. For proper working, please add your hostname hardcoded here. + Otherwise pass your custom data_path in kwargs. If data_path is provided, hardcoded paths for known hosts will be + ignored! 
:param create_new: Create new path if enabled + :param data_path: Pass your custom path (and therefore ignore preset paths fitting to known hosts) :param sampling: sampling rate to separate data physically by temporal resolution - :return: full path of data + :return: full path to data """ hostname = socket.gethostname() runner_regex = re.compile(r"runner-.*-project-2411-concurrent-\d+") @@ -26,20 +28,23 @@ def prepare_host(create_new=True, sampling="daily") -> str: user = os.getlogin() except OSError: user = "default" - if hostname == "ZAM144": - path = f"/home/{user}/Data/toar_{sampling}/" - elif hostname == "zam347": - path = f"/home/{user}/Data/toar_{sampling}/" - elif hostname == "linux-aa9b": - path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" - elif (len(hostname) > 2) and (hostname[:2] == "jr"): - path = f"/p/project/cjjsc42/{user}/DATA/toar_{sampling}/" - elif (len(hostname) > 2) and (hostname[:2] == "jw"): - path = f"/p/home/jusers/{user}/juwels/intelliaq/DATA/toar_{sampling}/" - elif runner_regex.match(hostname) is not None: - path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" + if data_path is None: + if hostname == "ZAM144": + path = f"/home/{user}/Data/toar_{sampling}/" + elif hostname == "zam347": + path = f"/home/{user}/Data/toar_{sampling}/" + elif hostname == "linux-aa9b": + path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" + elif (len(hostname) > 2) and (hostname[:2] == "jr"): + path = f"/p/project/cjjsc42/{user}/DATA/toar_{sampling}/" + elif (len(hostname) > 2) and (hostname[:2] == "jw"): + path = f"/p/home/jusers/{user}/juwels/intelliaq/DATA/toar_{sampling}/" + elif runner_regex.match(hostname) is not None: + path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/" + else: + raise OSError(f"unknown host '{hostname}'") else: - raise OSError(f"unknown host '{hostname}'") + path = os.path.abspath(data_path) if not os.path.exists(path): try: if create_new: diff --git 
a/src/data_handling/data_generator.py b/src/data_handling/data_generator.py index 0f8b9959c72d5f01c63e85d85e8fbf570ae6e23c..6747e82e0da2d1a68c99a09d75c76cdcd53a05ba 100644 --- a/src/data_handling/data_generator.py +++ b/src/data_handling/data_generator.py @@ -297,7 +297,7 @@ class DataGenerator(keras.utils.Sequence): data.make_labels(self.target_dim, self.target_var, self.interpolate_dim, self.window_lead_time) data.make_observation(self.target_dim, self.target_var, self.interpolate_dim) data.remove_nan(self.interpolate_dim) - if self.extreme_values: + if self.extreme_values is not None: kwargs = {"extremes_on_right_tail_only": self.kwargs.get("extremes_on_right_tail_only", False)} data.multiply_extremes(self.extreme_values, **kwargs) if save_local_tmp_storage: diff --git a/src/plotting/tracker_plot.py b/src/plotting/tracker_plot.py index 2d7b06cb6d7430be80eeae8ecedf811a3f2dc37c..20db5d9d9f22df548b1d499c4e8e0faa3fbfa1ee 100644 --- a/src/plotting/tracker_plot.py +++ b/src/plotting/tracker_plot.py @@ -213,7 +213,7 @@ class TrackChain: class TrackPlot: - def __init__(self, tracker_list, sparse_conn_mode=True, plot_folder: str = ".", skip_run_env=True): + def __init__(self, tracker_list, sparse_conn_mode=True, plot_folder: str = ".", skip_run_env=True, plot_name=None): self.width = 0.6 self.height = 0.5 @@ -229,16 +229,17 @@ class TrackPlot: track_chain_dict = track_chain_obj.create_track_chain() self.set_ypos_anchor(track_chain_obj.scopes, track_chain_obj.dims) self.fig, self.ax = plt.subplots(figsize=(len(tracker_list) * 2, (self.anchor.max() - self.anchor.min()) / 3)) - self._plot(track_chain_dict, sparse_conn_mode, skip_run_env, plot_folder) + self._plot(track_chain_dict, sparse_conn_mode, skip_run_env, plot_folder, plot_name) - def _plot(self, track_chain_dict, sparse_conn_mode, skip_run_env, plot_folder): + def _plot(self, track_chain_dict, sparse_conn_mode, skip_run_env, plot_folder, plot_name=None): stages, v_lines = 
self.create_track_chain_plot(track_chain_dict, sparse_conn_mode=sparse_conn_mode, skip_run_env=skip_run_env) self.set_lims() self.add_variable_names() self.add_stages(v_lines, stages) plt.tight_layout() - plot_name = os.path.join(os.path.abspath(plot_folder), "tracking.pdf") + plot_name = "tracking.pdf" if plot_name is None else plot_name + plot_name = os.path.join(os.path.abspath(plot_folder), plot_name) plt.savefig(plot_name, dpi=600) def line(self, start_x, end_x, y, color="darkgrey"): diff --git a/src/run_modules/experiment_setup.py b/src/run_modules/experiment_setup.py index 97b0ea304e4e7236058609aac93cb3cc16f255df..50db9206bd3c26ff26c90df8c7a85c506e658634 100644 --- a/src/run_modules/experiment_setup.py +++ b/src/run_modules/experiment_setup.py @@ -30,36 +30,124 @@ class ExperimentSetup(RunEnvironment): Set up the model. Schedule of experiment setup: - #. set channels (from variables dimension) - #. build imported model - #. plot model architecture - #. load weights if enabled (e.g. to resume a training) - #. set callbacks and checkpoint - #. compile model + * set up experiment path + * set up data path (according to host system) + * set up forecast, bootstrap and plot path (inside experiment path) + * set all parameters given in args (or use default values) + * check target variable + * check `variables` and `statistics_per_var` parameter for consistency Sets - * `channels` [model] - * `model` [model] - * `hist` [model] - * `callbacks` [model] - * `model_name` [model] - * all settings from model class like `dropout_rate`, `initial_lr`, `batch_size`, and `optimizer` [model] + * `data_path` [.] + * `create_new_model` [.] + * `bootstrap_path` [.] + * `trainable` [.] + * `fraction_of_training` [.] + * `extreme_values` [train] + * `extremes_on_right_tail_only` [train] + * `upsampling` [train] + * `permute_data` [train] + * `experiment_name` [.] + * `experiment_path` [.] + * `plot_path` [.] + * `forecast_path` [.] + * `stations` [.] + * `network` [.] 
+ * `station_type` [.] + * `statistics_per_var` [.] + * `variables` [.] + * `start` [.] + * `end` [.] + * `window_history_size` [.] + * `overwrite_local_data` [preprocessing] + * `sampling` [.] + * `transformation` [., preprocessing] + * `target_var` [.] + * `target_dim` [.] + * `window_lead_time` [.] + + # interpolation + self._set_param("dimensions", dimensions, default={'new_index': ['datetime', 'Stations']}) + self._set_param("interpolate_dim", interpolate_dim, default='datetime') + self._set_param("interpolate_method", interpolate_method, default='linear') + self._set_param("limit_nan_fill", limit_nan_fill, default=1) + + # train set parameters + self._set_param("start", train_start, default="1997-01-01", scope="train") + self._set_param("end", train_end, default="2007-12-31", scope="train") + self._set_param("min_length", train_min_length, default=90, scope="train") + + # validation set parameters + self._set_param("start", val_start, default="2008-01-01", scope="val") + self._set_param("end", val_end, default="2009-12-31", scope="val") + self._set_param("min_length", val_min_length, default=90, scope="val") + + # test set parameters + self._set_param("start", test_start, default="2010-01-01", scope="test") + self._set_param("end", test_end, default="2017-12-31", scope="test") + self._set_param("min_length", test_min_length, default=90, scope="test") + + # train_val set parameters + self._set_param("start", self.data_store.get("start", "train"), scope="train_val") + self._set_param("end", self.data_store.get("end", "val"), scope="train_val") + train_val_min_length = sum([self.data_store.get("min_length", s) for s in ["train", "val"]]) + self._set_param("min_length", train_val_min_length, default=180, scope="train_val") + + # use all stations on all data sets (train, val, test) + self._set_param("use_all_stations_on_all_data_sets", use_all_stations_on_all_data_sets, default=True) + + # set post-processing instructions + self._set_param("evaluate_bootstraps", 
evaluate_bootstraps, scope="general.postprocessing") + create_new_bootstraps = max([self.data_store.get("trainable", "general"), create_new_bootstraps or False]) + self._set_param("create_new_bootstraps", create_new_bootstraps, scope="general.postprocessing") + self._set_param("number_of_bootstraps", number_of_bootstraps, default=20, scope="general.postprocessing") + self._set_param("plot_list", plot_list, default=DEFAULT_PLOT_LIST, scope="general.postprocessing") + + # check variables, statistics and target variable + self._check_target_var() + self._compare_variables_and_statistics() + + + + + + + + Creates * plot of model architecture in `<model_name>.pdf` - :param parser_args: argument parser, currently only accepting experiment_data argument - :param stations: list of stations or single station to use for experiment - :param network: name of network to restrict to use only stations from this measurement network - :param station_type: restrict network type to one of TOAR's categories (background, traffic, industrial) - :param variables: list of all variables to use - :param statistics_per_var: dictionary with statistics to use for variables (if data is daily and loaded from JOIN) - :param start: start date of overall data - :param end: end date of overall data - :param window_history_size: number of time steps to use for input data - :param target_var: target variable to predict by model - :param target_dim: dimension of this variable - :param window_lead_time: number of time steps to predict by model + :param parser_args: argument parser, currently only accepting ``experiment_date argument`` to be used for + experiment's name and path creation. Final experiment's name is derived from given name and the time series + sampling as `<name>_network_<sampling>/` . All interim and final results, logging, plots, ... of this run are + stored in this directory if not explicitly provided in kwargs. 
Only the data itself and data for bootstrap + investigations are stored outside this structure. + :param stations: list of stations or single station to use in experiment. If not provided, stations are set to + :py:const:`default stations <DEFAULT_STATIONS>`. + :param network: name of network to restrict to use only stations from this measurement network. Default is + `AIRBASE` . + :param station_type: restrict network type to one of TOAR's categories (background, traffic, industrial). Default is + `None` to use all categories. + :param variables: list of all variables to use. Valid names can be found in + `Section 2.1 Parameters <https://join.fz-juelich.de/services/rest/surfacedata/>`_. If not provided, this + parameter is filled with keys from ``statistics_per_var``. + :param statistics_per_var: dictionary with statistics to use for variables (if data is daily and loaded from JOIN). + If not provided, :py:const:`default statistics <DEFAULT_VAR_ALL_DICT>` is applied. ``statistics_per_var`` is + compared with given ``variables`` and unused variables are removed. Therefore, statistics at least need to + provide all variables from ``variables``. For more details on available statistics, we refer to + `Section 3.3 List of statistics/metrics for stats service <https://join.fz-juelich.de/services/rest/surfacedata/>`_ + in the JOIN documentation. Valid parameter names can be found in + `Section 2.1 Parameters <https://join.fz-juelich.de/services/rest/surfacedata/>`_. + :param start: start date of overall data (default `"1997-01-01"`) + :param end: end date of overall data (default `"2017-12-31"`) + :param window_history_size: number of time steps to use for input data (default 13). Time steps `t_0 - w` to `t_0` + are used as input data (therefore actual data size is `w+1`). + :param target_var: target variable to predict by model, currently only a single target variable is supported. + Because this framework was originally designed to predict ozone, default is `"o3"`. 
+ :param target_dim: dimension of target variable (default `"variables"`). + :param window_lead_time: number of time steps to predict by model (default 3). Time steps `t_0+1` to `t_0+w` are + predicted. :param dimensions: :param interpolate_dim: :param interpolate_method: @@ -71,26 +159,59 @@ class ExperimentSetup(RunEnvironment): :param test_start: :param test_end: :param use_all_stations_on_all_data_sets: - :param trainable: - :param fraction_of_train: + :param trainable: train a new model from scratch or resume training with existing model if `True` (default) or + freeze loaded model and do not perform any modification on it. ``trainable`` is set to `True` if + ``create_new_model`` is `True`. + :param fraction_of_train: given value is used to split between test data and train data (including validation data). + The value of ``fraction_of_train`` must be in `(0, 1)` but is recommended to be in the interval `[0.6, 0.9]`. + Default value is `0.8`. Split between train and validation is fixed to 80% - 20% and currently not changeable. :param experiment_path: - :param plot_path: - :param forecast_path: - :param overwrite_local_data: - :param sampling: - :param create_new_model: + :param plot_path: path to save all plots. If left blank, this will be included in the experiment path (recommended). + Otherwise customise the location to save all plots. + :param forecast_path: path to save all forecasts in files. It is recommended to leave this parameter blank, all + forecasts will be the directory `forecasts` inside the experiment path (default). For customisation, add your + path here. + :param overwrite_local_data: Reload input and target data from web and replace local data if `True` (default + `False`). + :param sampling: set temporal sampling rate of data. You can choose from daily (default), monthly, seasonal, + vegseason, summer and annual for aggregated values and hourly for the actual values. 
Note, that hourly values on + JOIN are currently not accessible from outside. To access this data, you need to add your personal token in + :py:mod:`join settings <src.configuration.join_settings>` and make sure to untrack this file! + :param create_new_model: determine whether a new model will be created (`True`, default) or not (`False`). If this + parameter is set to `False`, make sure, that a suitable model already exists in the experiment path. This model + must fit in terms of input and output dimensions as well as ``window_history_size`` and ``window_lead_time`` and + must be implemented as a :py:mod:`model class <src.model_modules.model_class>` and imported in + :py:mod:`model setup <src.run_modules.model_setup>`. If ``create_new_model`` is `True`, parameter ``trainable`` + is automatically set to `True` too. :param bootstrap_path: - :param permute_data_on_training: - :param transformation: + :param permute_data_on_training: shuffle train data individually for each station if `True`. This is performed each + iteration for new, so that each sample very likely differs from epoch to epoch. Train data permutation is + disabled (`False`) per default. If the case of extreme value manifolding, data permutation is enabled anyway. + :param transformation: set transformation options in dictionary style. All information about transformation options + can be found in :py:meth:`setup transformation <src.data_handling.data_generator.DataGenerator.setup_transformation>`. + If no transformation is provided, all options are set to :py:const:`default transformation <DEFAULT_TRANSFORMATION>`. :param train_min_length: :param val_min_length: :param test_min_length: - :param extreme_values: - :param extremes_on_right_tail_only: + :param extreme_values: augment target samples with values of lower occurrences indicated by its normalised + deviation from mean by manifolding. These extreme values need to be indicated by a list of thresholds. 
For + each entry in this list, all values outside an +/- interval will be added in the training (and only the + training) set for a second time to the sample. If multiple values are given, a sample is added for each + exceedance once. E.g. a sample with `value=2.5` occurs twice in the training set for given + `extreme_values=[2, 3]`, whereas a sample with `value=5` occurs three times in the training set. By default, + upsampling of extreme values is disabled (`None`). Upsampling can be modified to manifold only values that are + actually larger than given values from ``extreme_values`` (apply only on right side of distribution) by using + ``extremes_on_right_tail_only``. This can be useful for positive skew variables. + :param extremes_on_right_tail_only: applies only if ``extreme_values`` are given. If ``extremes_on_right_tail_only`` + is `True`, only manifold values that are larger than given extremes (apply upsampling only on right side of + distribution). In default mode, this is set to `False` to manifold extremes on both sides. :param evaluate_bootstraps: :param plot_list: :param number_of_bootstraps: :param create_new_bootstraps: + :param data_path: path to find and store meteorological and environmental / air quality data. Leave this parameter + empty, if your host system is known and a suitable path was already hardcoded in the program (see + :py:func:`prepare host <src.configuration.path_config.prepare_host>`). 
""" @@ -111,17 +232,17 @@ class ExperimentSetup(RunEnvironment): interpolate_dim=None, interpolate_method=None, limit_nan_fill=None, train_start=None, train_end=None, val_start=None, val_end=None, test_start=None, - test_end=None, use_all_stations_on_all_data_sets=True, trainable=None, fraction_of_train=None, - experiment_path=None, plot_path=None, forecast_path=None, overwrite_local_data=None, sampling="daily", - create_new_model=None, bootstrap_path=None, permute_data_on_training=False, transformation=None, - train_min_length=None, val_min_length=None, test_min_length=None, extreme_values=None, - extremes_on_right_tail_only=None, evaluate_bootstraps=True, plot_list=None, number_of_bootstraps=None, - create_new_bootstraps=None): + test_end=None, use_all_stations_on_all_data_sets=True, trainable: bool = None, fraction_of_train: float = None, + experiment_path=None, plot_path: str = None, forecast_path: str = None, overwrite_local_data: bool = None, sampling: str = "daily", + create_new_model: bool = None, bootstrap_path=None, permute_data_on_training: bool = None, transformation=None, + train_min_length=None, val_min_length=None, test_min_length=None, extreme_values: list = None, + extremes_on_right_tail_only: bool = None, evaluate_bootstraps=True, plot_list=None, number_of_bootstraps=None, + create_new_bootstraps=None, data_path: str = None): """Set uo experiment.""" super().__init__() # experiment setup - self._set_param("data_path", path_config.prepare_host(sampling=sampling)) + self._set_param("data_path", path_config.prepare_host(data_path=data_path, sampling=sampling)) self._set_param("create_new_model", create_new_model, default=True) if self.data_store.get("create_new_model"): trainable = True @@ -134,7 +255,8 @@ class ExperimentSetup(RunEnvironment): self._set_param("extremes_on_right_tail_only", extremes_on_right_tail_only, default=False, scope="train") self._set_param("upsampling", extreme_values is not None, scope="train") upsampling = 
self.data_store.get("upsampling", "train") - self._set_param("permute_data", max([permute_data_on_training, upsampling]), scope="train") + permute_data = False if permute_data_on_training is None else permute_data_on_training + self._set_param("permute_data", permute_data or upsampling, scope="train") # set experiment name exp_date = self._get_parser_args(parser_args).get("experiment_date") @@ -261,7 +383,6 @@ class ExperimentSetup(RunEnvironment): stat_new = helpers.remove_items(stat, list(unused_vars)) self._set_param("statistics_per_var", stat_new) - def _check_target_var(self): """Check if target variable is in statistics_per_var dictionary.""" target_var = helpers.to_list(self.data_store.get("target_var")) diff --git a/src/run_modules/run_environment.py b/src/run_modules/run_environment.py index 16d21ae9b294a481e87e66a28b67f7ab759bbe78..a0e619f364a060b3ed44639c6057046db197d84b 100644 --- a/src/run_modules/run_environment.py +++ b/src/run_modules/run_environment.py @@ -115,7 +115,7 @@ class RunEnvironment(object): # copy log file and clear data store only if called as base class and not as super class if self.__class__.__name__ == "RunEnvironment": try: - TrackPlot(self.tracker_list, True, plot_folder=self.data_store.get_default("experiment_path", ".")) + self.__plot_tracking() self.__save_tracking() self.__copy_log_file() except FileNotFoundError: @@ -147,6 +147,10 @@ class RunEnvironment(object): with open(new_file, "w") as f: json.dump(tracker, f) + def __plot_tracking(self): + plot_folder, plot_name = os.path.split(self.__find_file_pattern("tracking_%03i.pdf")) + TrackPlot(self.tracker_list, sparse_conn_mode=True, plot_folder=plot_folder, plot_name=plot_name) + def __find_file_pattern(self, name): counter = 0 filename_pattern = os.path.join(self.data_store.get_default("experiment_path", os.path.realpath(".")), name)