From 03f191dc39c640d66772808101d8c1e90c656292 Mon Sep 17 00:00:00 2001 From: Felix Kleinert <f.kleinert@fz-juelich.de> Date: Tue, 7 Apr 2020 16:56:27 +0200 Subject: [PATCH] introduce partition check to raise an OSError on login nodes --- run.py | 8 +++++--- setup_venv.sh | 2 ++ src/run_modules/experiment_setup.py | 23 ++++++++++++++++++++--- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/run.py b/run.py index 98097128..0efb0e4d 100644 --- a/run.py +++ b/run.py @@ -4,7 +4,7 @@ __date__ = '2019-11-14' import argparse -from src.run_modules.experiment_setup import ExperimentSetup +from src.run_modules.experiment_setup import ExperimentSetup, PartitionCheck from src.run_modules.model_setup import ModelSetup from src.run_modules.post_processing import PostProcessing from src.run_modules.pre_processing import PreProcessing @@ -16,10 +16,12 @@ def main(parser_args): with RunEnvironment(): ExperimentSetup(parser_args, stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'], - station_type='background', trainable=False, create_new_model=False, window_history_size=6, - create_new_bootstraps=True) + station_type='background', trainable=False, create_new_model=True, window_history_size=6, + create_new_bootstraps=False) PreProcessing() + PartitionCheck() + ModelSetup() Training() diff --git a/setup_venv.sh b/setup_venv.sh index 21733f38..960ee0e0 100755 --- a/setup_venv.sh +++ b/setup_venv.sh @@ -25,4 +25,6 @@ pip install --ignore-installed matplotlib==3.2.0 # export PYTHONPATH=${PWD}/venv/lib/python3.6/site-packages:${PYTHONPATH} # srun python run.py +# create batch run scripts +source create_runscripts_HPC.sh diff --git a/src/run_modules/experiment_setup.py b/src/run_modules/experiment_setup.py index 6e3b69c0..45778074 100644 --- a/src/run_modules/experiment_setup.py +++ b/src/run_modules/experiment_setup.py @@ -24,7 +24,8 @@ DEFAULT_VAR_ALL_DICT = {'o3': 'dma8eu', 'relhum': 'average_values', 'temp': 'max DEFAULT_TRANSFORMATION = {"scope": 
"data", "method": "standardise", "mean": "estimate"} DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "plot_conditional_quantiles"] -DEFAULT_HPC_HOST_LIST = ["jw", "ju", "jr"] #first part of node names for Juwels (jw[comp], ju[login]) and Jureca(jr). +DEFAULT_HPC_LOGIN_LIST = ["ju"] +DEFAULT_HPC_HOST_LIST = ["jw", "jr"] #first part of node names for Juwels (jw[comp], ju[login]) and Jureca(jr). class ExperimentSetup(RunEnvironment): @@ -42,7 +43,7 @@ class ExperimentSetup(RunEnvironment): create_new_model=None, bootstrap_path=None, permute_data_on_training=False, transformation=None, train_min_length=None, val_min_length=None, test_min_length=None, extreme_values=None, extremes_on_right_tail_only=None, evaluate_bootstraps=True, plot_list=None, number_of_bootstraps=None, - create_new_bootstraps=None, data_path=None): + create_new_bootstraps=None, data_path=None, login_nodes=None, hpc_hosts=None): # create run framework super().__init__() @@ -51,7 +52,8 @@ class ExperimentSetup(RunEnvironment): self._set_param("data_path", data_path, default=helpers.prepare_host(sampling=sampling)) self._set_param("hostname", helpers.get_host()) # self._set_param("hostname", "jwc0123") - self._set_param("hpc_hosts", DEFAULT_HPC_HOST_LIST) + self._set_param("hpc_hosts", hpc_hosts, default=DEFAULT_HPC_HOST_LIST + DEFAULT_HPC_LOGIN_LIST) + self._set_param("login_nodes", login_nodes, default=DEFAULT_HPC_LOGIN_LIST) self._set_param("create_new_model", create_new_model, default=True) if self.data_store.get("create_new_model"): trainable = True @@ -185,6 +187,21 @@ class ExperimentSetup(RunEnvironment): self._set_param("statistics_per_var", stat_new) +class PartitionCheck(RunEnvironment): + + """ Checking if running on a HPC login node. The only reason to run on login nodes is to download data. 
Training and validation should happen on compute nodes""" + + def __init__(self): + # create run framework + super().__init__() + + self._run() + + def _run(self): + if self.data_store.get('hostname')[:2] in self.data_store.get('login_nodes'): + raise OSError('You are on a login node to download data. Use compute nodes and run again if you want to train and validate a model.') + + if __name__ == "__main__": formatter = '%(asctime)s - %(levelname)s: %(message)s [%(filename)s:%(funcName)s:%(lineno)s]' -- GitLab