From a3f1ccbaf304b5613769deb0129ed0c28612a9d0 Mon Sep 17 00:00:00 2001 From: Michael <m.langguth@fz-juelich.de> Date: Thu, 27 Jan 2022 14:05:10 +0100 Subject: [PATCH] Adapt runscript-template for postprocessing for using TF1.15-container. --- .../HPC_scripts/train_model_era5_template.sh | 2 +- .../visualize_postprocess_era5_template.sh | 49 +++++++++++-------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/video_prediction_tools/HPC_scripts/train_model_era5_template.sh b/video_prediction_tools/HPC_scripts/train_model_era5_template.sh index aced15f8..a3dc423d 100755 --- a/video_prediction_tools/HPC_scripts/train_model_era5_template.sh +++ b/video_prediction_tools/HPC_scripts/train_model_era5_template.sh @@ -38,7 +38,7 @@ fi # clean-up modules to avoid conflicts between host and container settings module purge -# declare directory-variables +# declare directory-variables which will be modified by generate_runscript.py source_dir=/my/path/to/tfrecords/files destination_dir=/my/model/output/path diff --git a/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh b/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh index a29b8e1b..893a3fa0 100644 --- a/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh +++ b/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh @@ -10,42 +10,49 @@ #SBATCH --gres=gpu:1 #SBATCH --partition=gpus #SBATCH --mail-type=ALL -#SBATCH --mail-user=b.gong@fz-juelich.de -##jutil env activate -p cjjsc42 +#SBATCH --mail-user=me@somewhere.com ######### Template identifier (don't remove) ######### echo "Do not run the template scripts" exit 99 ######### Template identifier (don't remove) ######### +# auxiliary variables +WORK_DIR=`pwd` +BASE_DIR=$(dirname "$WORK_DIR") # Name of virtual environment VIRT_ENV_NAME="my_venv" +# Name of container image (must be available in working directory) +CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" +WRAPPER="${WORK_DIR}/wrapper_container.sh" -# Loading modules -source ../env_setup/modules_postprocess.sh -# Activate virtual environment if needed (and possible) -if [ -z ${VIRTUAL_ENV} ]; then - if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then - echo "Activating virtual environment..." - source ../${VIRT_ENV_NAME}/bin/activate - else - echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." - exit 1 - fi +# sanity checks +if [[ ! -f ${CONTAINER_IMG} ]]; then + echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'." + exit 1 fi -# declare directory-variables which will be modified by config_runscript.py +if [[ ! -f ${WRAPPER} ]]; then + echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image." + exit 1 +fi + +# clean-up modules to avoid conflicts between host and container settings +module purge + +# declare directory-variables which will be modified by generate_runscript.py # Note: source_dir is only needed for retrieving the base-directory source_dir=/my/source/dir/ checkpoint_dir=/my/trained/model/dir results_dir=/my/results/dir lquick="" -# name of model -model=convLSTM - # run postprocessing/generation of model results including evaluation metrics -srun python -u ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \ - --results_dir ${results_dir} --batch_size 4 \ - --num_stochastic_samples 1 ${lquick} \ - > postprocess_era5-out_all.${SLURM_JOB_ID} +export CUDA_VISIBLE_DEVICES=0 +## One node, single GPU +srun --mpi=pspmix --cpu-bind=none \ + singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ + python3 ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \ + --results_dir ${results_dir} --batch_size 4 \ + --num_stochastic_samples 1 ${lquick} \ + > postprocess_era5-out_all."${SLURM_JOB_ID}" -- GitLab