diff --git a/video_prediction_tools/HPC_scripts/train_model_era5_container_template.sh b/video_prediction_tools/HPC_scripts/train_model_era5_container_template.sh
deleted file mode 100755
index ba4e60c2083785af223f345067cd727236a7dc8d..0000000000000000000000000000000000000000
--- a/video_prediction_tools/HPC_scripts/train_model_era5_container_template.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash -x
-#SBATCH --account=deepacf
-#SBATCH --nodes=1
-#SBATCH --ntasks=1
-#SBATCH --output=train_model_era5_container-out.%j
-#SBATCH --error=train_model_era5_container-err.%j
-#SBATCH --time=24:00:00
-##SBATCH --time=00:20:00
-#SBATCH --gres=gpu:1
-#SBATCH --partition=booster
-#SBATCH --mail-type=ALL
-#SBATCH --mail-user=me@somewhere.com
-
-### Two nodes, 8 GPUs
-##SBATCH --nodes=2
-##SBATCH --ntasks=8
-##SBATCH --ntasks-per-node=4
-##SBATCH --gres=gpu:4
-## Also take care for the job submission with srun below!!!
-
-
-WORK_DIR=`pwd`
-BASE_DIR=$(dirname "$WORK_DIR")
-# Name of virtual environment
-VIRT_ENV_NAME="my_venv"
-# Name of container image (must be available in working directory)
-CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
-
-# clean-up modules to avoid conflicts between host and container settings
-module purge
-
-# declare directory-variables which will be modified appropriately during Preprocessing (invoked by mpi_split_data_multi_years.py)
-source_dir=/my/path/to/tfrecords/files
-destination_dir=/my/model/output/path
-
-# valid identifiers for model-argument are: convLSTM, savp, mcnet and vae
-model=convLSTM
-datasplit_dict=${destination_dir}/data_split.json
-model_hparams=${destination_dir}/model_hparams.json
-
-# run training in container
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-## One node, single GPU
-srun --mpi=pspmix --cpu-bind=none \
-singularity exec --nv ${CONTAINER_IMG} ./wrapper_container.sh ${VIRT_ENV_NAME} python3 ${BASE_DIR}/main_scripts/main_train_models.py \
- --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
-
-## Two nodes, 8 GPUs
-#srun -N 2 -n 8 --ntasks-per-node 4 singularity exec --nv ${CONTAINER_IMG} ./wrapper_container.sh ${VIRT_ENV_NAME} python3 ${BASE_DIR}/main_scripts/main_train_models.py \
-# --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
diff --git a/video_prediction_tools/HPC_scripts/train_model_era5_template.sh b/video_prediction_tools/HPC_scripts/train_model_era5_template.sh
old mode 100644
new mode 100755
index 9c03ae7adde3886fd7e005fec5b17b4c7da84dd9..ba4e60c2083785af223f345067cd727236a7dc8d
--- a/video_prediction_tools/HPC_scripts/train_model_era5_template.sh
+++ b/video_prediction_tools/HPC_scripts/train_model_era5_template.sh
@@ -2,52 +2,49 @@
 #SBATCH --account=deepacf
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
-##SBATCH --ntasks-per-node=1
-#SBATCH --cpus-per-task=1
-#SBATCH --output=train_era5-out.%j
-#SBATCH --error=train_era5-err.%j
-#SBATCH --time=20:00:00
+#SBATCH --output=train_model_era5_container-out.%j
+#SBATCH --error=train_model_era5_container-err.%j
+#SBATCH --time=24:00:00
+##SBATCH --time=00:20:00
 #SBATCH --gres=gpu:1
-#SBATCH --partition=gpus
+#SBATCH --partition=booster
 #SBATCH --mail-type=ALL
-#SBATCH --mail-user=b.gong@fz-juelich.de
-##jutil env activate -p cjjsc42
+#SBATCH --mail-user=me@somewhere.com
 
-######### Template identifier (don't remove) #########
-echo "Do not run the template scripts"
-exit 99
-######### Template identifier (don't remove) #########
+### Two nodes, 8 GPUs
+##SBATCH --nodes=2
+##SBATCH --ntasks=8
+##SBATCH --ntasks-per-node=4
+##SBATCH --gres=gpu:4
+## Take care to adapt the srun job submission below accordingly!
 
-# Name of virtual environment
+
+WORK_DIR="$(pwd)"
+BASE_DIR=$(dirname "$WORK_DIR")
+# Name of virtual environment
 VIRT_ENV_NAME="my_venv"
+# Name of container image (must be available in working directory)
+CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
+
+# clean up modules to avoid conflicts between host and container settings
+module purge
 
-# Loading mouldes
-source ../env_setup/modules_train.sh
-# Activate virtual environment if needed (and possible)
-if [ -z ${VIRTUAL_ENV} ]; then
-  if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
-    echo "Activating virtual environment..."
-    source ../${VIRT_ENV_NAME}/bin/activate
-  else
-    echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
-    exit 1
-  fi
-fi
-
-# declare directory-variables which will be modified by config_runscript.py
-source_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/
-destination_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/models/
+# declare directory variables (adapt these placeholders): input TFRecord files and model output path
+source_dir=/my/path/to/tfrecords/files
+destination_dir=/my/model/output/path
 
 # valid identifiers for model-argument are: convLSTM, savp, mcnet and vae
-# the destination_dir_full cannot end up with "/", this will cause to save all the checkpoints issue in the results_dir
 model=convLSTM
-datasplit_dict=../data_split/cv_test.json
+datasplit_dict=${destination_dir}/data_split.json
 model_hparams=${destination_dir}/model_hparams.json
-dataset=era5
-#If you train savp, Please uncomment the following CUDA configuration
-#CUDA_VISIBLE_DEVICES=1
+# run training in container
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+## One node, single GPU
+srun --mpi=pspmix --cpu-bind=none \
+singularity exec --nv ${CONTAINER_IMG} ./wrapper_container.sh ${VIRT_ENV_NAME} python3 ${BASE_DIR}/main_scripts/main_train_models.py \
+ --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
 
-# run training
-srun python ../main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \
- --dataset ${dataset} --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir} --checkpoint ${destination_dir}
+## Two nodes, 8 GPUs
+#srun -N 2 -n 8 --ntasks-per-node 4 singularity exec --nv ${CONTAINER_IMG} ./wrapper_container.sh ${VIRT_ENV_NAME} python3 ${BASE_DIR}/main_scripts/main_train_models.py \
+# --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
diff --git a/video_prediction_tools/env_setup/create_env.sh b/video_prediction_tools/env_setup/create_env.sh
index ac7ff53179407620556534c527f9a7c53ab2f96c..30b8f96f75e33792833725ed7c7ffdbad4de0bd8 100755
--- a/video_prediction_tools/env_setup/create_env.sh
+++ b/video_prediction_tools/env_setup/create_env.sh
@@ -19,62 +19,44 @@ check_argin() {
     if [[ $argin == *"-base_dir="* ]]; then
       base_outdir=${argin#"-base_dir="}
     fi
-    if [[ $argin == *"-lcontainer"* ]]; then
-      bool_container=1
-    fi
   done
-  if [[ -z "${bool_container}" ]]; then
-    bool_container=0
-  fi
 }
 # **************** Auxiliary functions ****************
 
 # **************** Actual script ****************
 
 # some first sanity checks
-if [[ ${BASH_SOURCE[0]} == ${0} ]]; then
+if [[ ${BASH_SOURCE[0]} == "${0}" ]]; then
   echo "ERROR: 'create_env.sh' must be sourced, i.e. execute by prompting 'source create_env.sh [virt_env_name]'"
   exit 1
 fi
 
 # from now on, just return if something unexpected occurs instead of exiting
 # as the latter would close the terminal including logging out
-if [[ ! -n "$1" ]]; then
+if [[ -z "$1" ]]; then
   echo "ERROR: Provide a name to set up the virtual environment, i.e. execute by prompting 'source create_env.sh [virt_env_name]"
   return
 fi
 
 if [[ "$#" -gt 1 ]]; then
-  check_argin ${@:2}    # sets base_outdir if provided, always sets l_container
+  check_argin ${@:2}    # sets base_outdir if provided
 fi
 
 # set some variables
-HOST_NAME=`hostname`
+HOST_NAME="$(hostname)"
 ENV_NAME=$1
-ENV_SETUP_DIR=`pwd`
-WORKING_DIR="$(dirname "$ENV_SETUP_DIR")"
-EXE_DIR="$(basename "$ENV_SETUP_DIR")"
+THIS_DIR="$(pwd)"
+WORKING_DIR="$(dirname "$THIS_DIR")"
+EXE_DIR="$(basename "$THIS_DIR")"
 ENV_DIR=${WORKING_DIR}/${ENV_NAME}
-TF_CONTAINER=${WORKING_DIR}/env_setup/tensorflow_21.09-tf1-py3.sif
+TF_CONTAINER=${WORKING_DIR}/HPC_scripts/tensorflow_21.09-tf1-py3.sif
 
 ## perform sanity checks
-# correct bool_container if host is Juwels Booster and ensure running singularity
-if [[ "${bool_container}" == 0 ]]; then
-  echo "******************************************** NOTE ********************************************"
-  echo " Set up virtual environment without TF1.15-container. "
-  echo " Note that training without container using GPUs on the Juelich HPC-systems is not possible! "
-  echo "******************************************** NOTE ********************************************"
-fi
 modules_purge=""
-if [[ "${bool_container}" == 1 ]]; then
-  echo "Virtual environment will be set up in TensorFlow 1.15-container."
-  modules_purge=purge
-  # Check if singularity exists
-  if [[ ! -f "${TF_CONTAINER}" ]]; then
-    echo "ERROR: Could not found required TensorFlow 1.15-container under ${TF_CONTAINER}"
-    return
-  fi
+if [[ ! -f "${TF_CONTAINER}" ]]; then
+  echo "ERROR: Cannot find required TF1.15 container image '${TF_CONTAINER}'."
+  return
 fi
 
 # further sanity checks:
@@ -95,16 +77,10 @@ else
 fi
 
 ## check integratability of modules
-if [[ "${HOST_NAME}" == hdfml* || "${HOST_NAME}" == *jwlogin* && ! "${HOST_NAME}" == *jwlogin2[2-4]* ]]; then
+if [[ "${HOST_NAME}" == hdfml* || "${HOST_NAME}" == *jwlogin* ]]; then
   # load modules and check for their availability
   echo "***** Checking modules required during the workflow... *****"
-  source ${ENV_SETUP_DIR}/modules_preprocess.sh purge
-  source ${ENV_SETUP_DIR}/modules_train.sh purge
-  source ${ENV_SETUP_DIR}/modules_postprocess.sh ${modules_purge}
-elif [[ "${HOST_NAME}" == *jwlogin2[2-4]* ]]; then
-  echo "***** Old Stages are not available on Juwels Booster ****"
-  echo "***** To check modules for preprocessing, ****"
-  echo "***** run this script on Juwels or HDF-ML. ****"
+  source "${THIS_DIR}"/modules_preprocess.sh purge
 else
   echo "ERROR: AMBS-workflow is currently only supported on the Juelich HPC-systems HDF-ML, Juwels and Juwels Booster"
   return
@@ -117,8 +93,7 @@ if [[ "$ENV_EXIST" == 0 ]]; then
   # Activate virtual environment and install additional Python packages.
echo "Configuring and activating virtual environment on ${HOST_NAME}" - if [[ "${bool_container}" == 1 ]]; then - singularity exec --nv "${TF_CONTAINER}" ./install_venv_container.sh "${ENV_DIR}" + singularity exec --nv "${TF_CONTAINER}" ./install_venv_container.sh "${ENV_DIR}" else # cretae virtual environemt here python3 -m venv $ENV_DIR diff --git a/video_prediction_tools/env_setup/install_venv_container.sh b/video_prediction_tools/env_setup/install_venv_container.sh index 3ab281cf4aa78cc639920b94ecda1c739109c5b9..ca42d613851ae2fc9779d45eb35ec9e03a0323fa 100755 --- a/video_prediction_tools/env_setup/install_venv_container.sh +++ b/video_prediction_tools/env_setup/install_venv_container.sh @@ -2,17 +2,18 @@ # # __authors__ = Bing Gong, Michael Langguth # __date__ = '2021_10_28' -# __last_update__ = '2021_10_28' by Michael Langguth +# __last_update__ = '2022_01_26' by Michael Langguth # # **************** Description **************** # This auxiliary script sets up the virtual environment within a singularity container. # **************** Description **************** # set some basic variables -BASE_DIR=`pwd` +BASE_DIR="$(pwd)" VENV_BASE=$1 VENV_NAME="$(basename "${VENV_BASE}")" VENV_DIR=${VENV_BASE}/${VENV_NAME} +VENV_REQ=${BASE_DIR}/requirements.txt # sanity checks # check if we are running in a container @@ -34,8 +35,8 @@ if [ -d "$1" ]; then fi # check for requirement-file -if [ ! -f "${BASE_DIR}/requirements_container.txt" ]; then - echo "ERROR: Cannot find requirement-file ${BASE_DIR}/requirements_container.txt to set up virtual environment." +if [ ! -f "${VENV_REQ}" ]; then + echo "ERROR: Cannot find requirement-file '${VENV_REQ}' to set up virtual environment." return fi @@ -55,7 +56,7 @@ source "${VENV_DIR}/bin/activate" # set PYTHONPATH and install packages export PYTHONPATH="/usr/local/lib/python3.8/dist-packages/" echo 'export PYTHONPATH="/usr/local/lib/python3.8/dist-packages/"' >> "${VENV_DIR}/bin/activate" -pip install -r "${BASE_DIR}/requirements_container.txt" +pip install -r "${VENV_REQ}" # get back to basic directory cd "${BASE_DIR}" || exit diff --git a/video_prediction_tools/env_setup/requirements.txt b/video_prediction_tools/env_setup/requirements.txt index 24775be7e7b72788ccf9d98f88ec9f9885fea85b..d3d609fdec3ef7e5de406e21812cba59d4df1d25 100755 --- a/video_prediction_tools/env_setup/requirements.txt +++ b/video_prediction_tools/env_setup/requirements.txt @@ -1,2 +1,5 @@ -opencv-python==4.2.0.34 -hickle +matplotlib==3.3.0 +mpi4py==3.0.3 +pandas==0.25.3 +scikit-image==0.18.1 +opencv-python-headless==4.2.0.34 diff --git a/video_prediction_tools/env_setup/requirements_container.txt b/video_prediction_tools/env_setup/requirements_container.txt deleted file mode 100755 index d3d609fdec3ef7e5de406e21812cba59d4df1d25..0000000000000000000000000000000000000000 --- a/video_prediction_tools/env_setup/requirements_container.txt +++ /dev/null @@ -1,5 +0,0 @@ -matplotlib==3.3.0 -mpi4py==3.0.3 -pandas==0.25.3 -scikit-image==0.18.1 -opencv-python-headless==4.2.0.34