diff --git a/.gitignore b/.gitignore
index 222931f853f9ddf2e25dbfb6c26f1051c456bef4..3554b33c8a9239ac1c7db4958c0dd80cc731fb1c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,9 @@
 *.sql
 *.sqlite
 *.sqlite3
+*-err.[0-9]*
+*-out.[0-9]*
+*.ipynb_checkpoints
 
 # OS generated files #
 ######################
@@ -49,6 +52,7 @@ Thumbs.db
 ##########################
 /tmp/
 /logging/
+/HPC_logging/
 
 # test related data #
 #####################
@@ -61,6 +65,12 @@ report.html
 /TestExperiment/
 /testrun_network*/
 
+
+# experiment path #
+# #################
+????-??-??_????-??_network
+
+
 # secret variables #
 ####################
-/src/join_settings.py
\ No newline at end of file
+/src/join_settings.py
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f3ec1ab98cf8e46b97e2d803518ed57c6cfd4622..484348180c28c8c0e86024be95ff52039466221c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -163,3 +163,4 @@ pages:
       - public/badges/
       - public/coverage/
       - public/test/
+      - public/webpage/
diff --git a/HPC_setup/create_runscripts_HPC.sh b/HPC_setup/create_runscripts_HPC.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1322f47d69e213071b7aaaf4e93c24add8705b9d
--- /dev/null
+++ b/HPC_setup/create_runscripts_HPC.sh
@@ -0,0 +1,135 @@
+#!/bin/bash -x
+
+# __author__ = Felix Kleinert
+# __date__ = '2020-04-30'
+# This script creates run scripts for JUWELS or HDFML
+
+# When you call this script directly you can use
+# $1 which has to be `juwels' or `hdfml'.
+# $2 which is the path where the run scripts should be stored
+
+if [[ $1 != '' ]]; then
+    hpcsys=$1
+else
+    if [[ $HOSTNAME == *"juwels"* ]]; then
+        hpcsys="juwels"
+    elif [[ $HOSTNAME == *"hdfml"* ]]; then
+        hpcsys="hdfml"
+    else
+        echo "Unknown hpc host \`$HOSTNAME\`. Pass 'juwels' or 'hdfml' as first argument."
+        exit
+    fi
+fi
+
+if [[ $2 != '' ]]; then
+    cur=$2
+else
+    cur=$PWD
+fi
+
+echo "############################################################"
+echo "#                                                          #"
+echo "#                user interaction required                 #"
+echo "#                                                          #"
+echo "############################################################"
+echo
+
+echo "This script creates the HPC batch scripts to run mlt on compute nodes on JUWELS or hdfml."
+echo "You can modify the created run scripts afterwards if needed."
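+
+# A usage sketch (hypothetical invocation; assumes the repository root as working directory):
+#   source HPC_setup/create_runscripts_HPC.sh juwels $PWD
+# For JUWELS this writes run_juwels_develgpus.bash and run_juwels_gpus.bash into $PWD.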
+
+echo
+echo
+echo "Creating run script for $hpcsys:"
+echo
+
+budget=''
+while [[ $budget == '' ]]
+do
+    echo
+    read -p "Enter project budget for --account flag: " budget
+done
+
+email=`jutil user show -o json | grep email | cut -f2 -d':' | cut -f1 -d',' | cut -f2 -d'"'`
+echo
+read -p "Enter e-mail address for --mail-user (default: ${email}): " new_email
+
+if [[ -z "$new_email" ]]; then
+    new_email=$email
+fi
+
+# create HPC_logging dir
+hpclogging="HPC_logging/"
+mkdir -p ${cur}/${hpclogging}
+
+
+# ordering for looping:
+# "partition nGPUs timing"
+if [[ $hpcsys = "juwels" ]]; then
+    for i in "develgpus 2 02:00:00" "gpus 4 08:00:00"; do
+        set -- $i
+
+cat <<EOT > ${cur}/run_${hpcsys}_$1.bash
+#!/bin/bash -x
+#SBATCH --account=${budget}
+#SBATCH --nodes=1
+#SBATCH --output=${hpclogging}mlt-out.%j
+#SBATCH --error=${hpclogging}mlt-err.%j
+#SBATCH --time=$3
+#SBATCH --partition=$1
+#SBATCH --gres=gpu:$2
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=${new_email}
+
+source HPC_setup/mlt_modules_${hpcsys}.sh
+source venv_${hpcsys}/bin/activate
+
+timestamp=\`date +"%Y-%m-%d_%H%M-%S"\`
+
+export PYTHONPATH=\${PWD}/venv_${hpcsys}/lib/python3.6/site-packages:\${PYTHONPATH}
+
+srun python run.py --experiment_date=\$timestamp
+EOT
+
+        echo "Created runscript: run_${hpcsys}_$1.bash"
+
+    done
+
+elif [[ $hpcsys = "hdfml" ]]; then
+cat <<EOT > ${cur}/run_${hpcsys}_batch.bash
+#!/bin/bash -x
+#SBATCH --account=${budget}
+#SBATCH --nodes=1
+#SBATCH --output=${hpclogging}mlt-out.%j
+#SBATCH --error=${hpclogging}mlt-err.%j
+#SBATCH --time=08:00:00
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=${new_email}
+
+source HPC_setup/mlt_modules_${hpcsys}.sh
+source venv_${hpcsys}/bin/activate
+
+timestamp=\`date +"%Y-%m-%d_%H%M-%S"\`
+
+export PYTHONPATH=\${PWD}/venv_${hpcsys}/lib/python3.6/site-packages:\${PYTHONPATH}
+
+srun python run.py --experiment_date=\$timestamp
+EOT
+
+fi
+
+echo
+echo "You have to run the following command on a login node to download data:"
+echo " \`python run.py'"
+echo
+
+echo "Please execute the following command to check if the setup went well:"
+if [[ ${hpcsys} = 'juwels' ]]; then
+    echo " \`sbatch run_juwels_develgpus.bash'"
+else
+    echo " \`sbatch run_hdfml_batch.bash'"
+fi
+
diff --git a/HPC_setup/mlt_modules_hdfml.sh b/HPC_setup/mlt_modules_hdfml.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0ecbc13f6bf7284e9a3500e158bfcd8bcfb13804
--- /dev/null
+++ b/HPC_setup/mlt_modules_hdfml.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# __author__ = Felix Kleinert
+# __date__ = '2020-04-29'
+
+# This script loads the required modules for mlt which are available on HDFML.
+# Note that some other packages have to be installed into a venv (see setup_venv_hdfml.sh).
+
+module --force purge
+module use $OTHERSTAGES
+
+ml Stages/2019a
+ml GCCcore/.8.3.0
+ml Python/3.6.8
+ml TensorFlow/1.13.1-GPU-Python-3.6.8
+ml Keras/2.2.4-GPU-Python-3.6.8
+ml SciPy-Stack/2019a-Python-3.6.8
+ml dask/1.1.5-Python-3.6.8
+ml GEOS/3.7.1-Python-3.6.8
+ml Graphviz/2.40.1
+
+
+
+
diff --git a/HPC_setup/mlt_modules_juwels.sh b/HPC_setup/mlt_modules_juwels.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d20b246d4f396363a23e68d64f89b6d3abaee8c4
--- /dev/null
+++ b/HPC_setup/mlt_modules_juwels.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# __author__ = Felix Kleinert
+# __date__ = '2020-04-06'
+
+# This script loads the required modules for mlt which are available on JUWELS.
+# Note that some other packages have to be installed into a venv (see setup_venv_juwels.sh).
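+#
+# A minimal usage sketch (assuming a JUWELS login shell):
+#   source HPC_setup/mlt_modules_juwels.sh
+# After sourcing, `python3 --version` should report the Python/3.6.8 module loaded below.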
+
+module --force purge
+module use $OTHERSTAGES
+
+ml Stages/Devel-2019a
+ml GCCcore/.8.3.0
+
+ml Jupyter/2019a-Python-3.6.8
+ml Python/3.6.8
+ml TensorFlow/1.13.1-GPU-Python-3.6.8
+ml Keras/2.2.4-GPU-Python-3.6.8
+ml SciPy-Stack/2019a-Python-3.6.8
+ml dask/1.1.5-Python-3.6.8
+ml GEOS/3.7.1-Python-3.6.8
+ml Graphviz/2.40.1
\ No newline at end of file
diff --git a/HPC_setup/requirements_HDFML_additionals.txt b/HPC_setup/requirements_HDFML_additionals.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5065149a3e1da7a197834bbf17d74281b815e732
--- /dev/null
+++ b/HPC_setup/requirements_HDFML_additionals.txt
@@ -0,0 +1,19 @@
+coverage==5.0.3
+importlib-metadata==1.5.0
+matplotlib==3.2.0  # in SciPy-Stack
+netcdf4
+pandas==1.0.1  # in SciPy-Stack
+patsy==0.5.1
+py==1.8.1  # ?
+pyproj==2.5.0  # in basemap
+pyshp==2.1.0  # in basemap
+pytest==5.3.5  # in python (but we need a higher version)
+pytest-cov==2.8.1
+pytest-html==2.0.1
+pytest-lazy-fixture==0.6.3
+pytest-metadata==1.8.0
+pytest-sugar
+statsmodels==0.11.1  # in Jupyter, but not on hdfml
+xarray==0.15.0  # in SciPy-Stack (only v0.12.1)
+zipp==3.1.0
+tabulate
diff --git a/HPC_setup/requirements_JUWELS_additionals.txt b/HPC_setup/requirements_JUWELS_additionals.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0a2d6bc0f5cb4ce565b9eb69aad27cd1b3bbaef6
--- /dev/null
+++ b/HPC_setup/requirements_JUWELS_additionals.txt
@@ -0,0 +1,16 @@
+coverage==5.0.3
+importlib-metadata==1.5.0
+matplotlib==3.2.0  # in SciPy-Stack
+pandas==1.0.1  # in SciPy-Stack, but an older version
+py==1.8.1  # ?
+pyproj==2.5.0  # in basemap
+pyshp==2.1.0  # in basemap
+pytest==5.3.5  # in python (but we need a higher version)
+pytest-cov==2.8.1
+pytest-html==2.0.1
+pytest-lazy-fixture==0.6.3
+pytest-metadata==1.8.0
+pytest-sugar
+statsmodels==0.11.1  # in Jupyter, but not found
+xarray==0.15.0  # in SciPy-Stack (only v0.12.1)
+tabulate
diff --git a/HPC_setup/setup_venv_hdfml.sh b/HPC_setup/setup_venv_hdfml.sh
new file mode 100644
index 0000000000000000000000000000000000000000..585e43b10ae2e807eeec8f1345dc65a915c31184
--- /dev/null
+++ b/HPC_setup/setup_venv_hdfml.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# __author__ = Felix Kleinert
+# __date__ = '2020-04-06'
+
+# This script creates a virtual env which contains all modules which are not available via slurm/easybuild (see mlt_modules_hdfml.sh)
+
+# $1 has to be an abs path to HPC_setup. If not provided, $PWD is used
+
+# enter setting dir if called externally
+if [[ $1 != '' ]]; then
+    cur=$1
+else
+    cur=$PWD
+fi
+
+# load existing modules
+source ${cur}/mlt_modules_hdfml.sh
+
+# create venv
+python3 -m venv ${cur}/../venv_hdfml
+
+source ${cur}/../venv_hdfml/bin/activate
+
+# export path for side-packages
+export PYTHONPATH=${cur}/../venv_hdfml/lib/python3.6/site-packages:${PYTHONPATH}
+
+pip install -r ${cur}/requirements_HDFML_additionals.txt
+pip install --ignore-installed matplotlib==3.2.0
+pip install --ignore-installed pandas==1.0.1
+pip install --ignore-installed statsmodels==0.11.1
+pip install --ignore-installed tabulate
+
+# see wiki on hdfml for information on h5py:
+# https://gitlab.version.fz-juelich.de/haf/Wiki/-/wikis/HDF-ML%20System
+
+export CC=mpicc
+export HDF5_MPI="ON"
+pip install --no-binary=h5py h5py
+
+
diff --git a/HPC_setup/setup_venv_juwels.sh b/HPC_setup/setup_venv_juwels.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b543db1ee5ac4bea4f64467e360a084a2156c02a
--- /dev/null
+++ b/HPC_setup/setup_venv_juwels.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# __author__ = Felix Kleinert
+# __date__ = '2020-04-06'
+
+# This script creates a virtual env which contains all modules which are not available via slurm/easybuild (see mlt_modules_juwels.sh)
+
+# $1 has to be an abs path to HPC_setup. If not provided, $PWD is used
+
+# enter setting dir if called externally
+if [[ $1 != '' ]]; then
+    cur=$1
+else
+    cur=$PWD
+fi
+
+# load existing modules
+source ${cur}/mlt_modules_juwels.sh
+
+# create venv
+python3 -m venv ${cur}/../venv_juwels
+
+source ${cur}/../venv_juwels/bin/activate
+
+# export path for side-packages
+export PYTHONPATH=${cur}/../venv_juwels/lib/python3.6/site-packages:${PYTHONPATH}
+
+pip install -r ${cur}/requirements_JUWELS_additionals.txt
+pip install --ignore-installed matplotlib==3.2.0
+pip install --ignore-installed pandas==1.0.1
+
+
+# Comment: Maybe we have to export PYTHONPATH a second time after activating the venv (after job allocation)
+# source venv/bin/activate
+# alloc_develgpu
+# source venv/bin/activate
+# export PYTHONPATH=${PWD}/venv/lib/python3.6/site-packages:${PYTHONPATH}
+# srun python run.py
+
+# create batch run scripts
+# source create_runscripts_HPC.sh
+
diff --git a/README.md b/README.md
index 3467a31f23b7f770d32afb91cb62d5207ccf3d62..baae0af91036da10ba70f154ac875c18908858c3 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,26 @@ and [Network In Network (Lin et al., 2014)](https://arxiv.org/abs/1312.4400).
 * Install __proj__ on your machine using the console. E.g. for opensuse / leap `zypper install proj`
 * c++ compiler required for cartopy installation
 
+## HPC - JUWELS and HDFML setup
+The following instructions guide you through the installation on JUWELS and HDFML.
+* Clone the repo to the HPC system (we recommend placing it in `/p/projects/<project name>`).
+* Set up the venv by executing `source setupHPC.sh`. This script loads all pre-installed modules and creates a venv for
+all other packages. Furthermore, it creates slurm/batch scripts to execute code on compute nodes. <br>
+You have to enter the HPC project's budget name (--account flag).
+* The default external data path on JUWELS and HDFML is set to `/p/project/deepacf/intelliaq/<user>/DATA/toar_<sampling>`.
+<br>To choose a different location open `run.py` and add the following keyword argument to `ExperimentSetup`:
+`data_path=<your>/<custom>/<path>`.
+* Execute `python run.py` on a login node to download example data. The program will raise an OSError after downloading.
+* Execute either `sbatch run_juwels_develgpus.bash` or `sbatch run_hdfml_batch.bash` to verify that the setup went well.
+* Currently, cartopy is not working on our HPC system; therefore, `PlotStationMap` does not create any output.
+
+### HPC JUWELS and HDFML remarks
+Please note that the HPC setup is customised for JUWELS and HDFML. When using another HPC system, you can use the HPC setup files as a skeleton and customise them to your needs.
+
+Note: `PartitionCheck` currently only checks if the hostname starts with `ju` or `hdfmll`.
+Therefore, it might be necessary to adapt the `if` statement in `PartitionCheck._run`.
+
+
 # Security
 * To use hourly data from ToarDB via JOIN interface, a private token is required. Request your personal access token and
@@ -76,4 +96,4 @@ station-wise std is a decent estimate of the true std.
 `"mean"=<value, e.g. xr.DataArray>`: If mean and std are already calculated or shall be set manually, just add the
 scaling values instead of the calculation method. For method *centre*, std can still be None, but is required for the
 *standardise* method. **Important**: Format of given values **must** match internal data format of DataPreparation
-class: `xr.DataArray` with `dims=["variables"]` and one value for each variable.
\ No newline at end of file
+class: `xr.DataArray` with `dims=["variables"]` and one value for each variable.
diff --git a/create_runscripts_HPC.sh b/create_runscripts_HPC.sh
new file mode 100755
index 0000000000000000000000000000000000000000..af657fd11779f67861785c1573acd80235380b53
--- /dev/null
+++ b/create_runscripts_HPC.sh
@@ -0,0 +1,91 @@
+#!/bin/bash -x
+
+echo "############################################################"
+echo "#                                                          #"
+echo "#                user interaction required                 #"
+echo "#                                                          #"
+echo "############################################################"
+echo
+
+echo "This script creates the HPC batch scripts to run mlt on compute nodes (gpus and develgpus)."
+echo "You can modify the created run scripts afterwards if needed."
+
+while true; do
+    read -p "Do you wish to create run scripts for JUWELS? [yes/no] " yn
+    case $yn in
+        [Yy]* ) juwels=True; break;;
+        [Nn]* ) juwels=False; break;;
+        * ) echo "Please answer yes or no.";;
+    esac
+done
+
+while true; do
+    read -p "Do you wish to create run script for HDFML? [yes/no] " yn
[yes/no]" yn + case $yn in + [Yy]* ) hdfml=True; break;; + [Nn]* ) hdfml=False;; + * ) echo "Please answer yes or no.";; + esac +done + + +budget='' +while [[ $budget == '' ]] +do + echo + read -p "Enter project budget for --account flag: " budget +done + +email=`jutil user show -o json | grep email | cut -f2 -d':' | cut -f1 -d',' | cut -f2 -d'"'` +echo +read -p "Enter e-mail address for --mail-user (default: ${email}): " new_email + +if [[ -z "$new_email" ]]; then + new_email=$email +fi + +# create HPC_logging dir +hpclogging="../HPC_logging/" +mkdir -p $hpclogging + + +# ordering for looping: +# "partition nGPUs timing" +if [[ $juwels == True ]]; then + for i in "develgpus 2 02:00:00" "gpus 4 08:00:00"; do + set -- $i + +cat <<EOT > run_$1.bash +#!/bin/bash -x +#SBATCH --account=${budget} +#SBATCH --nodes=1 +#SBATCH --output=${hpclogging}mlt-out.%j +#SBATCH --error=${hpclogging}/mlt-err.%j +#SBATCH --time=$3 +#SBATCH --partition=$1 +#SBATCH --gres=gpu:$2 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=${email} + +source mlt_modules_.sh +source venv/bin/activate + +timestamp=\`date +"%Y-%m-%d_%H%M-%S"\` + +export PYTHONPATH=\${PWD}/venv/lib/python3.6/site-packages:\${PYTHONPATH} + +srun python run.py --experiment_date=\$timestamp +EOT + + echo "Created runscript: run_$1.bash" + + done +fi + +echo +echo "You have to run the the following command on a login node to download data:" +echo " \`python run.py'" +echo +echo "Please execute the following command to check if the setup went well:" +echo " \`sbatch run_develgpus.bash'" + diff --git a/run.py b/run.py index 9809712876dc886007b042a52d7b46c027800faf..bfe1c7ed62d9b2e8a707c117e252ff3f931f339a 100644 --- a/run.py +++ b/run.py @@ -5,6 +5,7 @@ __date__ = '2019-11-14' import argparse from src.run_modules.experiment_setup import ExperimentSetup +from src.run_modules.partition_check import PartitionCheck from src.run_modules.model_setup import ModelSetup from src.run_modules.post_processing import PostProcessing from src.run_modules.pre_processing import PreProcessing @@ -16,22 +17,23 @@ def main(parser_args): with RunEnvironment(): ExperimentSetup(parser_args, stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'], - station_type='background', trainable=False, create_new_model=False, window_history_size=6, - create_new_bootstraps=True) + station_type='background', trainable=False, create_new_model=True, window_history_size=6, + create_new_bootstraps=False) PreProcessing() + PartitionCheck() + ModelSetup() Training() PostProcessing() - if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default=None, + parser.add_argument('--experiment_date', metavar='--exp_date', type=str, default="testrun", help="set experiment date as string") - args = parser.parse_args(["--experiment_date", "testrun"]) + args = parser.parse_args() main(args) + diff --git a/setupHPC.sh b/setupHPC.sh new file mode 100644 index 0000000000000000000000000000000000000000..0248fdc09e658bac1ba6f9742426ce41996e1ade --- /dev/null +++ b/setupHPC.sh @@ -0,0 +1,22 @@ + +basepath=${PWD}/ +settingpath=HPC_setup/ + +if [[ $HOSTNAME == *"juwels"* ]]; then + echo "You are on juwels. Prepare env for juwels..." + hpcsys="juwels" +elif [[ $HOSTNAME == *"hdfml"* ]]; then + echo "You are on hdfml. Prepare env for hdfml..." + hpcsys="hdfml" +else + echo "Unknown hpc host \`$HOSTNAME\`. Script only works on juwels and hdfml." 
+    echo "exit"
+    exit
+fi
+
+echo "execute: HPC_setup/setup_venv_${hpcsys}.sh $basepath$settingpath"
+source HPC_setup/setup_venv_${hpcsys}.sh $basepath$settingpath
+
+echo "execute: HPC_setup/create_runscripts_HPC.sh $hpcsys $basepath"
+source HPC_setup/create_runscripts_HPC.sh $hpcsys $basepath
+
diff --git a/src/helpers.py b/src/helpers.py
index d4180336ec63f4f5477d3f2a149b5cb146be5597..4a6b1b2f657c1cd8eb375489e1cbcc0365b1bde5 100644
--- a/src/helpers.py
+++ b/src/helpers.py
@@ -9,6 +9,7 @@ from functools import wraps
 import logging
 import math
 import os
+import getpass
 import socket
 import time
 import types
@@ -112,13 +113,15 @@ class TimeTracking(object):
         logging.info(f"{self._name} finished after {self}")
 
 
+def get_host():
+    return socket.gethostname()
+
+
 def prepare_host(create_new=True, sampling="daily"):
-    hostname = socket.gethostname()
+
+    hostname = get_host()
+    user = getpass.getuser()
     runner_regex = re.compile(r"runner-.*-project-2411-concurrent-\d+")
-    try:
-        user = os.getlogin()
-    except OSError:
-        user = "default"
     if hostname == "ZAM144":
         path = f"/home/{user}/Data/toar_{sampling}/"
     elif hostname == "zam347":
@@ -127,8 +130,8 @@ def prepare_host(create_new=True, sampling="daily"):
         path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/"
     elif (len(hostname) > 2) and (hostname[:2] == "jr"):
         path = f"/p/project/cjjsc42/{user}/DATA/toar_{sampling}/"
-    elif (len(hostname) > 2) and (hostname[:2] == "jw"):
-        path = f"/p/home/jusers/{user}/juwels/intelliaq/DATA/toar_{sampling}/"
+    elif (len(hostname) > 2) and (hostname[:2] in ['jw', 'ju'] or hostname[:5] in ['hdfml']):
+        path = f"/p/project/deepacf/intelliaq/{user}/DATA/toar_{sampling}/"
     elif runner_regex.match(hostname) is not None:
         path = f"/home/{user}/machinelearningtools/data/toar_{sampling}/"
     else:
diff --git a/src/plotting/postprocessing_plotting.py b/src/plotting/postprocessing_plotting.py
index b61e832c80ac9ad83a5aa4a4b5310b17f6add098..4fcb1f49d828f47c36a5597341585896a19fcc9a 100644
--- a/src/plotting/postprocessing_plotting.py
+++ b/src/plotting/postprocessing_plotting.py
@@ -7,8 +7,7 @@ import os
 import warnings
 from typing import Dict, List, Tuple
 
-import cartopy.crs as ccrs
-import cartopy.feature as cfeature
+
 import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
@@ -137,6 +136,7 @@ class PlotStationMap(AbstractPlotClass):
     background, but this can be adjusted by loading locally stored topography data (not implemented yet).
     The plot is saved under plot_path with the name station_map.pdf
     """
+
     def __init__(self, generators: Dict, plot_folder: str = "."):
         """
         Sets attributes and create plot
@@ -153,6 +153,8 @@ class PlotStationMap(AbstractPlotClass):
         """
         Draw coastline, lakes, ocean, rivers and country borders as background on the map.
         """
+
+        import cartopy.feature as cfeature
         self._ax.add_feature(cfeature.COASTLINE.with_scale("50m"), edgecolor='black')
         self._ax.add_feature(cfeature.LAKES.with_scale("50m"))
         self._ax.add_feature(cfeature.OCEAN.with_scale("50m"))
@@ -166,6 +168,8 @@ class PlotStationMap(AbstractPlotClass):
         :param generators: dictionary with the plot color of each data set as key and the generator containing all
             stations as value.
         """
+
+        import cartopy.crs as ccrs
         if generators is not None:
             for color, gen in generators.items():
                 for k, v in enumerate(gen):
@@ -181,6 +185,8 @@ class PlotStationMap(AbstractPlotClass):
         :param generators: dictionary with the plot color of each data set as key and the generator containing all
             stations as value.
""" + + import cartopy.crs as ccrs fig = plt.figure(figsize=(10, 5)) self._ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree()) self._ax.set_extent([0, 20, 42, 58], crs=ccrs.PlateCarree()) diff --git a/src/run_modules/experiment_setup.py b/src/run_modules/experiment_setup.py index 09b9f143fc0442ee34ef5735366145be86b5fa07..b04c0e2ac2a2262c92c5f7149014206d0d390e18 100644 --- a/src/run_modules/experiment_setup.py +++ b/src/run_modules/experiment_setup.py @@ -6,6 +6,8 @@ import argparse import logging import os from typing import Union, Dict, Any +import socket + from src import helpers from src.run_modules.run_environment import RunEnvironment @@ -23,6 +25,8 @@ DEFAULT_TRANSFORMATION = {"scope": "data", "method": "standardise", "mean": "est DEFAULT_PLOT_LIST = ["PlotMonthlySummary", "PlotStationMap", "PlotClimatologicalSkillScore", "PlotTimeSeries", "PlotCompetitiveSkillScore", "PlotBootstrapSkillScore", "PlotConditionalQuantiles", "PlotAvailability"] +DEFAULT_HPC_LOGIN_LIST = ["ju", "hdfmll"] # ju[wels} #hdfmll(ogin) +DEFAULT_HPC_HOST_LIST = ["jw", "hdfmlc"] # first part of node names for Juwels (jw[comp], hdfmlc(ompute). class ExperimentSetup(RunEnvironment): @@ -40,13 +44,17 @@ class ExperimentSetup(RunEnvironment): create_new_model=None, bootstrap_path=None, permute_data_on_training=False, transformation=None, train_min_length=None, val_min_length=None, test_min_length=None, extreme_values=None, extremes_on_right_tail_only=None, evaluate_bootstraps=True, plot_list=None, number_of_bootstraps=None, - create_new_bootstraps=None): + create_new_bootstraps=None, data_path=None, login_nodes=None, hpc_hosts=None): # create run framework super().__init__() # experiment setup - self._set_param("data_path", helpers.prepare_host(sampling=sampling)) + self._set_param("data_path", data_path, default=helpers.prepare_host(sampling=sampling)) + self._set_param("hostname", helpers.get_host()) + # self._set_param("hostname", "jwc0123") + self._set_param("hpc_hosts", hpc_hosts, default=DEFAULT_HPC_HOST_LIST + DEFAULT_HPC_LOGIN_LIST) + self._set_param("login_nodes", login_nodes, default=DEFAULT_HPC_LOGIN_LIST) self._set_param("create_new_model", create_new_model, default=True) if self.data_store.get("create_new_model"): trainable = True diff --git a/src/run_modules/partition_check.py b/src/run_modules/partition_check.py new file mode 100644 index 0000000000000000000000000000000000000000..8f4c703e6b94f11905121d93c44dd8bf583abdec --- /dev/null +++ b/src/run_modules/partition_check.py @@ -0,0 +1,26 @@ +__author__ = "Felix Kleinert" +__date__ = '2020-04-07' + +from src.run_modules.run_environment import RunEnvironment + + +class PartitionCheck(RunEnvironment): + """ + Checking if running on a HPC login node. The only reason to run on login nodes is to download data. + Training and validation should happen on compute nodes + + Note: This Method is highly customised to the HCP-systems in Juelich (FZJ, JSC). When using an other HPC system, + make sure to double check the indexing of `self.data_store.get('hostname')'. + """ + + def __init__(self): + # create run framework + super().__init__() + + self._run() + + def _run(self): + if (self.data_store.get('hostname')[:2] in self.data_store.get('login_nodes')) or ( + self.data_store.get('hostname')[:6] in self.data_store.get('login_nodes')): + raise OSError( + 'You are on a login node to download data. 
diff --git a/src/run_modules/post_processing.py b/src/run_modules/post_processing.py
index dfeaf06533e8023cf872763e0f34d98c5dd27a01..bc3cdf2653aed86a00a139d963a26d826131b5b6 100644
--- a/src/run_modules/post_processing.py
+++ b/src/run_modules/post_processing.py
@@ -195,10 +195,16 @@ class PostProcessing(RunEnvironment):
         if self.bootstrap_skill_scores is not None and "PlotBootstrapSkillScore" in plot_list:
             PlotBootstrapSkillScore(self.bootstrap_skill_scores, plot_folder=self.plot_path, model_setup="CNN")
+
         if "PlotConditionalQuantiles" in plot_list:
             PlotConditionalQuantiles(self.test_data.stations, data_pred_path=path, plot_folder=self.plot_path)
         if "PlotStationMap" in plot_list:
-            PlotStationMap(generators={'b': self.test_data}, plot_folder=self.plot_path)
+            if self.data_store.get("hostname")[:2] in self.data_store.get("hpc_hosts") or self.data_store.get(
+                    "hostname")[:6] in self.data_store.get("hpc_hosts"):
+                logging.warning(
+                    f"Skip `PlotStationMap` because running on an HPC node: {self.data_store.get('hostname')}")
+            else:
+                PlotStationMap(generators={'b': self.test_data}, plot_folder=self.plot_path)
         if "PlotMonthlySummary" in plot_list:
             PlotMonthlySummary(self.test_data.stations, path, r"forecasts_%s_test.nc", self.target_var,
                                plot_folder=self.plot_path)
diff --git a/test/test_helpers.py b/test/test_helpers.py
index 9c71a53389344083e4e18a83a6aab5838ad678ca..0065a94b7b18d88c2e86e60df5633d47ba15f42a 100644
--- a/test/test_helpers.py
+++ b/test/test_helpers.py
@@ -126,20 +126,33 @@ class TestTimeTracking:
         assert t.end is None
         expression = PyTestRegex(r"my job finished after \d+:\d+:\d+ \(hh:mm:ss\)")
         assert caplog.record_tuples[-1] == ('root', 20, expression)
+
+
+class TestGetHost:
+
+    @mock.patch("socket.gethostname", side_effect=["linux-aa9b", "ZAM144", "zam347", "jrtest", "jwtest",
+                                                   "runner-6HmDp9Qd-project-2411-concurrent"])
+    def test_get_host(self, mock_host):
+        assert get_host() == "linux-aa9b"
+        assert get_host() == "ZAM144"
+        assert get_host() == "zam347"
+        assert get_host() == "jrtest"
+        assert get_host() == "jwtest"
+        assert get_host() == "runner-6HmDp9Qd-project-2411-concurrent"
 
 
 class TestPrepareHost:
 
     @mock.patch("socket.gethostname", side_effect=["linux-aa9b", "ZAM144", "zam347", "jrtest", "jwtest",
                                                    "runner-6HmDp9Qd-project-2411-concurrent-01"])
-    @mock.patch("os.getlogin", return_value="testUser")
+    @mock.patch("getpass.getuser", return_value="testUser")
     @mock.patch("os.path.exists", return_value=True)
     def test_prepare_host(self, mock_host, mock_user, mock_path):
         assert prepare_host() == "/home/testUser/machinelearningtools/data/toar_daily/"
         assert prepare_host() == "/home/testUser/Data/toar_daily/"
         assert prepare_host() == "/home/testUser/Data/toar_daily/"
         assert prepare_host() == "/p/project/cjjsc42/testUser/DATA/toar_daily/"
-        assert prepare_host() == "/p/home/jusers/testUser/juwels/intelliaq/DATA/toar_daily/"
+        assert prepare_host() == "/p/project/deepacf/intelliaq/testUser/DATA/toar_daily/"
         assert prepare_host() == '/home/testUser/machinelearningtools/data/toar_daily/'
 
     @mock.patch("socket.gethostname", return_value="NotExistingHostName")
@@ -149,7 +162,7 @@
             prepare_host()
         assert "unknown host 'NotExistingHostName'" in e.value.args[0]
 
-    @mock.patch("os.getlogin", return_value="zombie21")
+    @mock.patch("getpass.getuser", return_value="zombie21")
     @mock.patch("src.helpers.check_path_and_create", side_effect=PermissionError)
     def test_error_handling(self, mock_cpath, mock_user):
         # if "runner-6HmDp9Qd-project-2411-concurrent" not in platform.node():
@@ -162,26 +175,9 @@
         #     assert "does not exist for host 'linux-aa9b'" in e.value.args[0]
         assert PyTestRegex(r"path '.*' does not exist for host '.*'\.") == e.value.args[0]
 
-    @mock.patch("socket.gethostname", side_effect=["linux-aa9b", "ZAM144", "zam347", "jrtest", "jwtest",
-                                                   "runner-6HmDp9Qd-project-2411-concurrent-01"])
-    @mock.patch("os.getlogin", side_effect=OSError)
-    @mock.patch("os.path.exists", return_value=True)
-    def test_os_error(self, mock_path, mock_user, mock_host):
-        path = prepare_host()
-        assert path == "/home/default/machinelearningtools/data/toar_daily/"
-        path = prepare_host()
-        assert path == "/home/default/Data/toar_daily/"
-        path = prepare_host()
-        assert path == "/home/default/Data/toar_daily/"
-        path = prepare_host()
-        assert path == "/p/project/cjjsc42/default/DATA/toar_daily/"
-        path = prepare_host()
-        assert path == "/p/home/jusers/default/juwels/intelliaq/DATA/toar_daily/"
-        path = prepare_host()
-        assert path == '/home/default/machinelearningtools/data/toar_daily/'
 
     @mock.patch("socket.gethostname", side_effect=["linux-aa9b"])
-    @mock.patch("os.getlogin", return_value="testUser")
+    @mock.patch("getpass.getuser", return_value="testUser")
     @mock.patch("os.path.exists", return_value=False)
     @mock.patch("os.makedirs", side_effect=None)
     def test_os_path_exists(self, mock_host, mock_user, mock_path, mock_check):
diff --git a/test/test_modules/test_partition_check.py b/test/test_modules/test_partition_check.py
new file mode 100644
index 0000000000000000000000000000000000000000..6966b2aadea4075a4cc9bb63abeaeedc25e8bb5f
--- /dev/null
+++ b/test/test_modules/test_partition_check.py
@@ -0,0 +1,68 @@
+import logging
+
+import pytest
+import mock
+from src.run_modules.experiment_setup import ExperimentSetup
+from src.run_modules.partition_check import PartitionCheck
+from src.run_modules.run_environment import RunEnvironment
+from src.helpers import get_host
+from src.helpers import PyTestRegex
+
+
+class TestPartitionCheck:
+
+    @pytest.fixture
+    @mock.patch("src.helpers.get_host", return_value="juwels")
+    @mock.patch("getpass.getuser", return_value="testUser")
+    @mock.patch("os.path.exists", return_value=False)
+    @mock.patch("os.makedirs", side_effect=None)
+    def obj_with_exp_setup_login(self, mock_host, mock_user, mock_path, mock_check):
+        ExperimentSetup(parser_args={}, stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'],
+                        statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, station_type="background")
+        pre = object.__new__(PartitionCheck)
+        super(PartitionCheck, pre).__init__()
+        yield pre
+        RunEnvironment().__del__()
+
+    @pytest.fixture
+    @mock.patch("src.helpers.get_host", return_value="hdfmlc01")
+    @mock.patch("getpass.getuser", return_value="testUser")
+    @mock.patch("os.path.exists", return_value=False)
+    @mock.patch("os.makedirs", side_effect=None)
+    def obj_with_exp_setup_compute(self, mock_host, mock_user, mock_path, mock_check):
+        ExperimentSetup(parser_args={}, stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087', 'DEBW001'],
+                        statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'}, station_type="background")
+        pre = object.__new__(PartitionCheck)
+        super(PartitionCheck, pre).__init__()
+        yield pre
+        RunEnvironment().__del__()
+
+    def test_init(self, caplog):
+        ExperimentSetup(parser_args={}, stations=['DEBW107', 'DEBY081', 'DEBW013', 'DEBW076', 'DEBW087'],
+                        statistics_per_var={'o3': 'dma8eu', 'temp': 'maximum'})
+        caplog.clear()
+        caplog.set_level(logging.INFO)
+        with PartitionCheck():
+            assert caplog.record_tuples[0] == ('root', 20, 'PartitionCheck started')
+
+        RunEnvironment().__del__()
+
+    @mock.patch("src.helpers.get_host", return_value="juwels")
+    @mock.patch("getpass.getuser", return_value="testUser")
+    @mock.patch("os.path.exists", return_value=False)
+    @mock.patch("os.makedirs", side_effect=None)
+    def test_run_login(self, mock_host, mock_user, mock_path, mock_check, obj_with_exp_setup_login, caplog):
+
+        with pytest.raises(OSError) as e:
+            obj_with_exp_setup_login.__next__()._run()
+        assert "You are on a login node to download data. Use compute nodes and run again if you want to train and " \
+               "validate a model." == \
+               e.value.args[0]
+
+    @mock.patch("src.helpers.get_host", return_value="hdfmlc01")
+    @mock.patch("getpass.getuser", return_value="testUser")
+    @mock.patch("os.path.exists", return_value=False)
+    @mock.patch("os.makedirs", side_effect=None)
+    def test_run_compute(self, mock_host, mock_user, mock_path, mock_check, obj_with_exp_setup_compute, caplog):
+
+        assert obj_with_exp_setup_compute.__next__()._run() is None