Added a 4th subset "train_val" that consists of the union of train and val. Is...

Added a 4th subset "train_val" that consists of the union of train and val. It is needed for the external skill score calculation. Also added a simple MSE function (no need to install sklearn just for MSE).

Added a 4th subset "train_val" that consists of the union of train and val. Is...
9833edec · lukas leufen · 4a4ba794 · 9833edec · 9833edec · 9833edec
Commit 9833edec authored 5 years ago by lukas leufen
--- a/src/run_modules/experiment_setup.py
+++ b/src/run_modules/experiment_setup.py
@@ -95,6 +95,10 @@ class ExperimentSetup(RunEnvironment):
        self._set_param("start", test_start, default="2010-01-01", scope="general.test")
        self._set_param("end", test_end, default="2017-12-31", scope="general.test")
+        # train_val parameters
+        self._set_param("start", self.data_store.get("start", "general.train"), scope="general.train_val")
+        self._set_param("end", self.data_store.get("end", "general.val"), scope="general.train_val")
        # use all stations on all data sets (train, val, test)
        self._set_param("use_all_stations_on_all_data_sets", use_all_stations_on_all_data_sets, default=True)

--- a/src/run_modules/pre_processing.py
+++ b/src/run_modules/pre_processing.py
@@ -56,26 +56,28 @@ class PreProcessing(RunEnvironment):
    def split_train_val_test(self):
        fraction_of_training = self.data_store.get("fraction_of_training", "general")
        stations = self.data_store.get("stations", "general")
-        train_index, val_index, test_index = self.split_set_indices(len(stations), fraction_of_training)
+        train_index, val_index, test_index, train_val_index = self.split_set_indices(len(stations), fraction_of_training)
-        for (ind, scope) in zip([train_index, val_index, test_index], ["train", "val", "test"]):
+        subset_names = ["train", "val", "test", "train_val"]
+        for (ind, scope) in zip([train_index, val_index, test_index, train_val_index], subset_names):
            self.create_set_split(ind, scope)
    @staticmethod
-    def split_set_indices(total_length: int, fraction: float) -> Tuple[slice, slice, slice]:
+    def split_set_indices(total_length: int, fraction: float) -> Tuple[slice, slice, slice, slice]:
        """
        create the training, validation and test subset slice indices for given total_length. The test data consists on
        (1-fraction) of total_length (fraction*len:end). Train and validation data therefore are made from fraction of
        total_length (0:fraction*len). Train and validation data is split by the factor 0.8 for train and 0.2 for
-        validation.
+        validation. In addition, split_set_indices returns also the combination of training and validation subset.
        :param total_length: list with all objects to split
        :param fraction: ratio between test and union of train/val data
-        :return: slices for each subset in the order: train, val, test
+        :return: slices for each subset in the order: train, val, test, train_val
        """
        pos_test_split = int(total_length * fraction)
        train_index = slice(0, int(pos_test_split * 0.8))
        val_index = slice(int(pos_test_split * 0.8), pos_test_split)
        test_index = slice(pos_test_split, total_length)
-        return train_index, val_index, test_index
+        train_val_index = slice(0, pos_test_split)
+        return train_index, val_index, test_index, train_val_index
    def create_set_split(self, index_list, set_name):
        scope = f"general.{set_name}"

--- a/src/statistics.py
+++ b/src/statistics.py
 __author__ = 'Lukas Leufen'
 __date__ = '2019-10-23'
+import numpy as np
 import xarray as xr
 import pandas as pd
 from typing import Union, Tuple
@@ -70,3 +71,7 @@ def centre_inverse(data: Data, mean: Data) -> Data:
    :return:
    """
    return data + mean
+def mean_squared_error(a, b):
+    return np.square(a - b).mean()