From 9833edec97d75132cf37d0c69c769133573d9968 Mon Sep 17 00:00:00 2001
From: lukas leufen <l.leufen@fz-juelich.de>
Date: Mon, 27 Jan 2020 17:16:53 +0100
Subject: [PATCH] added 4th subset "train_val" that consists of the union of
 train and val. It is needed for the external skill score calculation. Also
 added a simple MSE function (no need to install sklearn just for MSE)

---
 src/run_modules/experiment_setup.py |  4 ++++
 src/run_modules/pre_processing.py   | 14 ++++++++------
 src/statistics.py                   |  5 +++++
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/run_modules/experiment_setup.py b/src/run_modules/experiment_setup.py
index cc2c71f9..12a985f7 100644
--- a/src/run_modules/experiment_setup.py
+++ b/src/run_modules/experiment_setup.py
@@ -95,6 +95,10 @@ class ExperimentSetup(RunEnvironment):
         self._set_param("start", test_start, default="2010-01-01", scope="general.test")
         self._set_param("end", test_end, default="2017-12-31", scope="general.test")
 
+        # train_val parameters
+        self._set_param("start", self.data_store.get("start", "general.train"), scope="general.train_val")
+        self._set_param("end", self.data_store.get("end", "general.val"), scope="general.train_val")
+
         # use all stations on all data sets (train, val, test)
         self._set_param("use_all_stations_on_all_data_sets", use_all_stations_on_all_data_sets, default=True)
 
diff --git a/src/run_modules/pre_processing.py b/src/run_modules/pre_processing.py
index 6ab1f0dd..a32f7075 100644
--- a/src/run_modules/pre_processing.py
+++ b/src/run_modules/pre_processing.py
@@ -56,26 +56,28 @@ class PreProcessing(RunEnvironment):
     def split_train_val_test(self):
         fraction_of_training = self.data_store.get("fraction_of_training", "general")
         stations = self.data_store.get("stations", "general")
-        train_index, val_index, test_index = self.split_set_indices(len(stations), fraction_of_training)
-        for (ind, scope) in zip([train_index, val_index, test_index], ["train", "val", "test"]):
+        train_index, val_index, test_index, train_val_index = self.split_set_indices(len(stations), fraction_of_training)
+        subset_names = ["train", "val", "test", "train_val"]
+        for (ind, scope) in zip([train_index, val_index, test_index, train_val_index], subset_names):
             self.create_set_split(ind, scope)
 
     @staticmethod
-    def split_set_indices(total_length: int, fraction: float) -> Tuple[slice, slice, slice]:
+    def split_set_indices(total_length: int, fraction: float) -> Tuple[slice, slice, slice, slice]:
         """
         create the training, validation and test subset slice indices for given total_length. The test data consists on
         (1-fraction) of total_length (fraction*len:end). Train and validation data therefore are made from fraction of
         total_length (0:fraction*len). Train and validation data is split by the factor 0.8 for train and 0.2 for
-        validation.
+        validation. In addition, split_set_indices also returns the combination of training and validation subsets.
         :param total_length: list with all objects to split
         :param fraction: ratio between test and union of train/val data
-        :return: slices for each subset in the order: train, val, test
+        :return: slices for each subset in the order: train, val, test, train_val
         """
         pos_test_split = int(total_length * fraction)
         train_index = slice(0, int(pos_test_split * 0.8))
         val_index = slice(int(pos_test_split * 0.8), pos_test_split)
         test_index = slice(pos_test_split, total_length)
-        return train_index, val_index, test_index
+        train_val_index = slice(0, pos_test_split)
+        return train_index, val_index, test_index, train_val_index
 
     def create_set_split(self, index_list, set_name):
         scope = f"general.{set_name}"
diff --git a/src/statistics.py b/src/statistics.py
index 6f34187e..7c3caf91 100644
--- a/src/statistics.py
+++ b/src/statistics.py
@@ -1,6 +1,7 @@
 __author__ = 'Lukas Leufen'
 __date__ = '2019-10-23'
 
+import numpy as np
 import xarray as xr
 import pandas as pd
 from typing import Union, Tuple
@@ -70,3 +71,7 @@ def centre_inverse(data: Data, mean: Data) -> Data:
     :return:
     """
     return data + mean
+
+
+def mean_squared_error(a, b):
+    return np.square(a - b).mean()
-- 
GitLab