From 9833edec97d75132cf37d0c69c769133573d9968 Mon Sep 17 00:00:00 2001
From: lukas leufen <l.leufen@fz-juelich.de>
Date: Mon, 27 Jan 2020 17:16:53 +0100
Subject: [PATCH] added 4th subset "train_val" that consists of the union of
 train and val. It is needed for the external skill score calculation. Also
 added a simple MSE function (no need to install sklearn just for mse)

---
 src/run_modules/experiment_setup.py |  4 ++++
 src/run_modules/pre_processing.py   | 14 ++++++++------
 src/statistics.py                   |  5 +++++
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/run_modules/experiment_setup.py b/src/run_modules/experiment_setup.py
index cc2c71f9..12a985f7 100644
--- a/src/run_modules/experiment_setup.py
+++ b/src/run_modules/experiment_setup.py
@@ -95,6 +95,10 @@ class ExperimentSetup(RunEnvironment):
         self._set_param("start", test_start, default="2010-01-01", scope="general.test")
         self._set_param("end", test_end, default="2017-12-31", scope="general.test")
 
+        # train_val parameters
+        self._set_param("start", self.data_store.get("start", "general.train"), scope="general.train_val")
+        self._set_param("end", self.data_store.get("end", "general.val"), scope="general.train_val")
+
         # use all stations on all data sets (train, val, test)
         self._set_param("use_all_stations_on_all_data_sets", use_all_stations_on_all_data_sets, default=True)
 
diff --git a/src/run_modules/pre_processing.py b/src/run_modules/pre_processing.py
index 6ab1f0dd..a32f7075 100644
--- a/src/run_modules/pre_processing.py
+++ b/src/run_modules/pre_processing.py
@@ -56,26 +56,28 @@ class PreProcessing(RunEnvironment):
     def split_train_val_test(self):
         fraction_of_training = self.data_store.get("fraction_of_training", "general")
         stations = self.data_store.get("stations", "general")
-        train_index, val_index, test_index = self.split_set_indices(len(stations), fraction_of_training)
-        for (ind, scope) in zip([train_index, val_index, test_index], ["train", "val", "test"]):
+        train_index, val_index, test_index, train_val_index = self.split_set_indices(len(stations), fraction_of_training)
+        subset_names = ["train", "val", "test", "train_val"]
+        for (ind, scope) in zip([train_index, val_index, test_index, train_val_index], subset_names):
             self.create_set_split(ind, scope)
 
     @staticmethod
-    def split_set_indices(total_length: int, fraction: float) -> Tuple[slice, slice, slice]:
+    def split_set_indices(total_length: int, fraction: float) -> Tuple[slice, slice, slice, slice]:
         """
         create the training, validation and test subset slice indices for given total_length. The test data consists
         on (1-fraction) of total_length (fraction*len:end). Train and validation data therefore are made from fraction
         of total_length (0:fraction*len). Train and validation data is split by the factor 0.8 for train and 0.2 for
-        validation.
+        validation. In addition, split_set_indices also returns the combined training and validation subset.
         :param total_length: list with all objects to split
         :param fraction: ratio between test and union of train/val data
-        :return: slices for each subset in the order: train, val, test
+        :return: slices for each subset in the order: train, val, test, train_val
         """
         pos_test_split = int(total_length * fraction)
         train_index = slice(0, int(pos_test_split * 0.8))
         val_index = slice(int(pos_test_split * 0.8), pos_test_split)
         test_index = slice(pos_test_split, total_length)
-        return train_index, val_index, test_index
+        train_val_index = slice(0, pos_test_split)
+        return train_index, val_index, test_index, train_val_index
 
     def create_set_split(self, index_list, set_name):
         scope = f"general.{set_name}"
diff --git a/src/statistics.py b/src/statistics.py
index 6f34187e..7c3caf91 100644
--- a/src/statistics.py
+++ b/src/statistics.py
@@ -1,6 +1,7 @@
 __author__ = 'Lukas Leufen'
 __date__ = '2019-10-23'
 
+import numpy as np
 import xarray as xr
 import pandas as pd
 from typing import Union, Tuple
@@ -70,3 +71,7 @@ def centre_inverse(data: Data, mean: Data) -> Data:
     :return:
     """
     return data + mean
+
+
+def mean_squared_error(a, b):
+    return np.square(a - b).mean()
-- 
GitLab
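
Quick illustration (not part of the patch itself): the standalone sketch below copies the slicing logic of the patched split_set_indices and the new mean_squared_error helper, pulled out of their class/module, to show what the additional "train_val" subset covers. The station ids, the fraction value of 0.8 and the two example arrays are made-up inputs for demonstration only.

import numpy as np


def split_set_indices(total_length, fraction):
    # same slicing logic as the patched PreProcessing.split_set_indices
    pos_test_split = int(total_length * fraction)
    train_index = slice(0, int(pos_test_split * 0.8))
    val_index = slice(int(pos_test_split * 0.8), pos_test_split)
    test_index = slice(pos_test_split, total_length)
    train_val_index = slice(0, pos_test_split)  # union of train and val
    return train_index, val_index, test_index, train_val_index


def mean_squared_error(a, b):
    # same as the new helper in src/statistics.py
    return np.square(a - b).mean()


stations = [f"station_{i}" for i in range(10)]  # hypothetical station ids
train, val, test, train_val = split_set_indices(len(stations), fraction=0.8)
print(stations[train])      # station_0 ... station_5 (80% of the train/val block)
print(stations[val])        # station_6, station_7 (remaining 20% of that block)
print(stations[test])       # station_8, station_9 (last 20% of all stations)
print(stations[train_val])  # station_0 ... station_7 (train and val combined)
print(mean_squared_error(np.array([1.0, 2.0]), np.array([2.0, 4.0])))  # 2.5

The train_val slice simply spans index 0 up to the test split position, so no data is copied; it reuses the same station list that train and val are taken from, which is what the external skill score calculation needs.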