training docs

423d59cd · lukas leufen · 538a42c3 · 423d59cd · 423d59cd · 423d59cd
Commit 423d59cd authored 5 years ago by lukas leufen
--- a/src/helpers/statistics.py
+++ b/src/helpers/statistics.py
@@ -117,8 +117,7 @@ class SkillScores:
    r"""
    Calculate different kinds of skill scores.

-    **Skill score on MSE**:
-
+    Skill score on MSE:
        Calculate skill score based on MSE for given forecast, reference and observations.

        .. math::
@@ -131,8 +130,7 @@ class SkillScores:

            skill_scores = SkillScores(None).general_skill_score(data, observation_name, forecast_name, reference_name)

-    **Competitive skill score**:
-
+    Competitive skill score:
        Calculate skill scores to highlight differences between forecasts. This skill score is also based on the MSE.
        Currently required forecasts are CNN, OLS and persi, as well as the observation obs.

@@ -141,12 +139,12 @@ class SkillScores:
            skill_scores_class = SkillScores(internal_data)  # must contain columns CNN, OLS, persi and obs.
            skill_scores = skill_scores_class.skill_scores(window_lead_time=3)

-    **Skill score according to Murphy**:
-
-    Follow climatological skill score definition of Murphy (1988). External data is data from another time period than
-    the internal data set on initialisation. In other terms, this should be the train and validation data whereas the
-    internal data is the test data. This sounds perhaps counter-intuitive, but if a skill score is evaluated to a model
-    to another, this must be performend test data set. Therefore, for this case the foreign data is train and val data.
+    Skill score according to Murphy:
+        Follow climatological skill score definition of Murphy (1988). External data is data from another time period
+        than the internal data set on initialisation. In other terms, this should be the train and validation data
+        whereas the internal data is the test data. This sounds perhaps counter-intuitive, but if a skill score is
+        evaluated to compare one model to another, this must be performed on the test data set. Therefore, for this
+        case the foreign data is train and val data.

        .. code-block:: python


--- a/src/model_modules/__init__.py
+++ b/src/model_modules/__init__.py
+"""Collection of all modules that are related to a model."""
--- a/src/model_modules/keras_extensions.py
+++ b/src/model_modules/keras_extensions.py
@@ -296,7 +296,7 @@ class CallbackHandler:
        else:
            return [clb["callback"] for clb in self._get_callbacks()]

-    def get_callback_by_name(self, obj_name: str) -> Callback:
+    def get_callback_by_name(self, obj_name: str) -> Union[Callback, History]:
        """
        Get single callback by its name.


--- a/src/run_modules/training.py
+++ b/src/run_modules/training.py
+"""Training module."""
+
 __author__ = "Lukas Leufen, Felix Kleinert"
 __date__ = '2019-12-05'

@@ -7,16 +9,54 @@ import os
 from typing import Union

 import keras
+from keras.callbacks import Callback, History

 from src.data_handling.data_distributor import Distributor
-from src.model_modules.keras_extensions import LearningRateDecay, CallbackHandler
+from src.model_modules.keras_extensions import CallbackHandler
 from src.plotting.training_monitoring import PlotModelHistory, PlotModelLearningRate
 from src.run_modules.run_environment import RunEnvironment


 class Training(RunEnvironment):
+    """
+    Perform training.
+        #. set_generators(): set generators for training, validation and testing and distribute according to batch size
+        #. make_predict_function(): create predict function before distribution on multiple nodes (detailed information
+           in method description)
+        #. train(): start or resume training of model and save callbacks
+        #. save_model(): save best model from training as final model
+
+    Required objects [scope] from data store:
+        * `model` [model]
+        * `batch_size` [model]
+        * `epochs` [model]
+        * `callbacks` [model]
+        * `model_name` [model]
+        * `experiment_name` [.]
+        * `experiment_path` [.]
+        * `trainable` [.]
+        * `create_new_model` [.]
+        * `generator` [train, val, test]
+        * `plot_path` [.]
+
+    Optional objects
+        * `permute_data` [train, val, test]
+        * `upsampling` [train, val, test]
+
+    Sets
+        * `best_model` [.]
+
+    Creates
+        * `<exp_name>_model-best.h5`
+        * `<exp_name>_model-best-callbacks-<name>.h5` (all callbacks from CallbackHandler)
+        * `history.json`
+        * `history_lr.json` (optional)
+        * `<exp_name>_history_<name>.pdf` (different monitoring plots depending on loss metrics and callbacks)
+
+    """

    def __init__(self):
+        """Set up training."""
        super().__init__()
        self.model: keras.Model = self.data_store.get("model", "model")
        self.train_set: Union[Distributor, None] = None
@@ -31,17 +71,7 @@ class Training(RunEnvironment):
        self._run()

    def _run(self) -> None:
-        """
-        Perform training
-        1) set_generators():
-            set generators for training, validation and testing and distribute according to batch size
-        2) make_predict_function():
-            create predict function before distribution on multiple nodes (detailed information in method description)
-        3) train():
-            start or resume training of model and save callbacks
-        4) save_model():
-            save best model from training as final model
-        """
+        """Run training. Details in class description."""
        self.set_generators()
        self.make_predict_function()
        if self._trainable:
@@ -52,40 +82,44 @@ class Training(RunEnvironment):

    def make_predict_function(self) -> None:
        """
-        Creates the predict function. Must be called before distributing. This is necessary, because tf will compile
-        the predict function just in the moment it is used the first time. This can cause problems, if the model is
-        distributed on different workers. To prevent this, the function is pre-compiled. See discussion @
+        Create predict function.
+
+        Must be called before distributing. This is necessary because tf compiles the predict function only at the
+        moment it is used for the first time. This can cause problems if the model is distributed on different
+        workers. To prevent this, the function is pre-compiled. See discussion @
        https://stackoverflow.com/questions/40850089/is-keras-thread-safe/43393252#43393252
        """
        self.model._make_predict_function()

    def _set_gen(self, mode: str) -> None:
        """
-        Set and distribute the generators for given mode regarding batch size
+        Set and distribute the generators for given mode regarding batch size.

        :param mode: name of set, should be from ["train", "val", "test"]
        """
        gen = self.data_store.get("generator", mode)
-        # permute_data = self.data_store.get_default("permute_data", mode, default=False)
        kwargs = self.data_store.create_args_dict(["permute_data", "upsampling"], scope=mode)
        setattr(self, f"{mode}_set", Distributor(gen, self.model, self.batch_size, **kwargs))

    def set_generators(self) -> None:
        """
-        Set all generators for training, validation, and testing subsets. The called sub-method will automatically
-        distribute the data according to the batch size. The subsets can be accessed as class variables train_set,
-        val_set, and test_set .
+        Set all generators for training, validation, and testing subsets.
+
+        The called sub-method will automatically distribute the data according to the batch size. The subsets can be
+        accessed as class variables train_set, val_set, and test_set.
        """
        for mode in ["train", "val", "test"]:
            self._set_gen(mode)

    def train(self) -> None:
        """
-        Perform training using keras fit_generator(). Callbacks are stored locally in the experiment directory. Best
-        model from training is saved for class variable model. If the file path of checkpoint is not empty, this method
-        assumes, that this is not a new training starting from the very beginning, but a resumption from a previous
-        started but interrupted training (or a stopped and now continued training). Train will automatically load the
-        locally stored information and the corresponding model and proceed with the already started training.
+        Perform training using keras fit_generator().
+
+        Callbacks are stored locally in the experiment directory. Best model from training is saved for class
+        variable model. If the file path of checkpoint is not empty, this method assumes that this is not a new
+        training starting from the very beginning, but a resumption of a previously started but interrupted training
+        (or a stopped and now continued training). Train will automatically load the locally stored information and the
+        corresponding model and proceed with the already started training.
        """
        logging.info(f"Train with {len(self.train_set)} mini batches.")
        logging.info(f"Train with option upsampling={self.train_set.upsampling}.")
@@ -105,7 +139,7 @@ class Training(RunEnvironment):
            self.callbacks.load_callbacks()
            self.callbacks.update_checkpoint()
            self.model = keras.models.load_model(checkpoint.filepath)
-            hist = self.callbacks.get_callback_by_name("hist")
+            hist: History = self.callbacks.get_callback_by_name("hist")
            initial_epoch = max(hist.epoch) + 1
            _ = self.model.fit_generator(generator=self.train_set.distribute_on_batches(),
                                         steps_per_epoch=len(self.train_set),
@@ -125,9 +159,7 @@ class Training(RunEnvironment):
        self.create_monitoring_plots(history, lr)

    def save_model(self) -> None:
-        """
-        save model in local experiment directory. Model is named as <experiment_name>_<custom_model_name>.h5 .
-        """
+        """Save model in local experiment directory. Model is named as `<experiment_name>_<custom_model_name>.h5`."""
        model_name = self.data_store.get("model_name", "model")
        logging.debug(f"save best model to {model_name}")
        self.model.save(model_name)
@@ -146,13 +178,15 @@ class Training(RunEnvironment):
        except OSError:
            logging.info('no weights to reload...')

-    def save_callbacks_as_json(self, history: keras.callbacks.History, lr_sc: keras.callbacks) -> None:
+    def save_callbacks_as_json(self, history: Callback, lr_sc: Callback) -> None:
        """
        Save callbacks (history, learning rate) of training.
+
        * history.history -> history.json
        * lr_sc.lr -> history_lr.json

        :param history: history object of training
+        :param lr_sc: learning rate object
        """
        logging.debug("saving callbacks")
        path = self.data_store.get("experiment_path")
@@ -162,13 +196,14 @@ class Training(RunEnvironment):
            with open(os.path.join(path, "history_lr.json"), "w") as f:
                json.dump(lr_sc.lr, f)

-    def create_monitoring_plots(self, history: keras.callbacks.History, lr_sc: LearningRateDecay) -> None:
+    def create_monitoring_plots(self, history: Callback, lr_sc: Callback) -> None:
        """
-        Creates the history and learning rate plot in dependence of the number of epochs. The plots are saved in the
-        experiment's plot_path. History plot is named '<exp_name>_history_loss_val_loss.pdf', the learning rate with
-        '<exp_name>_history_learning_rate.pdf'.
+        Create plots of history and learning rate as a function of the number of epochs.
+
+        The plots are saved in the experiment's plot_path. History plot is named `<exp_name>_history_loss_val_loss.pdf`,
+        the learning rate plot `<exp_name>_history_learning_rate.pdf`.

-        :param history: keras history object with losses to plot (must include 'loss' and 'val_loss')
+        :param history: keras history object with losses to plot (must at least include `loss` and `val_loss`)
        :param lr_sc:  learning rate decay object with 'lr' attribute
        """
        path = self.data_store.get("plot_path")