diff --git a/video_prediction_tools/HPC_scripts/train_model_era5_template.sh b/video_prediction_tools/HPC_scripts/train_model_era5_template.sh index aced15f81b69379a9819f6c50d87226f060d4de5..a3dc423d695c18615764a4761fb8546990e02efa 100755 --- a/video_prediction_tools/HPC_scripts/train_model_era5_template.sh +++ b/video_prediction_tools/HPC_scripts/train_model_era5_template.sh @@ -38,7 +38,7 @@ fi # clean-up modules to avoid conflicts between host and container settings module purge -# declare directory-variables +# declare directory-variables which will be modified by generate_runscript.py source_dir=/my/path/to/tfrecords/files destination_dir=/my/model/output/path diff --git a/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh b/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh index a29b8e1b0f297dc986c5633a45857c590d37b514..893a3fa078ce8b563c8e1c53870e74a161fa356f 100644 --- a/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh +++ b/video_prediction_tools/HPC_scripts/visualize_postprocess_era5_template.sh @@ -10,42 +10,49 @@ #SBATCH --gres=gpu:1 #SBATCH --partition=gpus #SBATCH --mail-type=ALL -#SBATCH --mail-user=b.gong@fz-juelich.de -##jutil env activate -p cjjsc42 +#SBATCH --mail-user=me@somewhere.com ######### Template identifier (don't remove) ######### echo "Do not run the template scripts" exit 99 ######### Template identifier (don't remove) ######### +# auxiliary variables +WORK_DIR=`pwd` +BASE_DIR=$(dirname "$WORK_DIR") # Name of virtual environment VIRT_ENV_NAME="my_venv" +# Name of container image (must be available in working directory) +CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif" +WRAPPER="${WORK_DIR}/wrapper_container.sh" -# Loading modules -source ../env_setup/modules_postprocess.sh -# Activate virtual environment if needed (and possible) -if [ -z ${VIRTUAL_ENV} ]; then - if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then - echo "Activating virtual environment..." - source ../${VIRT_ENV_NAME}/bin/activate - else - echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..." - exit 1 - fi +# sanity checks +if [[ ! -f ${CONTAINER_IMG} ]]; then + echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'." + exit 1 fi -# declare directory-variables which will be modified by config_runscript.py +if [[ ! -f ${WRAPPER} ]]; then + echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image." + exit 1 +fi + +# clean-up modules to avoid conflicts between host and container settings +module purge + +# declare directory-variables which will be modified by generate_runscript.py # Note: source_dir is only needed for retrieving the base-directory source_dir=/my/source/dir/ checkpoint_dir=/my/trained/model/dir results_dir=/my/results/dir lquick="" -# name of model -model=convLSTM - # run postprocessing/generation of model results including evaluation metrics -srun python -u ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \ - --results_dir ${results_dir} --batch_size 4 \ - --num_stochastic_samples 1 ${lquick} \ - > postprocess_era5-out_all.${SLURM_JOB_ID} +export CUDA_VISIBLE_DEVICES=0 +## One node, single GPU +srun --mpi=pspmix --cpu-bind=none \ + singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \ + python3 ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \ + --results_dir ${results_dir} --batch_size 4 \ + --num_stochastic_samples 1 ${lquick} \ + > postprocess_era5-out_all."${SLURM_JOB_ID}"