Commit 5687e99f authored by Michael Langguth

Merge branch 'michael_issue#145_training_with_Nvidia_TF1.15_container' into develop

parents e2b50cd5 4d507a09
Pipeline #89945 failed
Showing with 149 additions and 96 deletions
@@ -71,6 +71,9 @@ target/
# Jupyter Notebook
.ipynb_checkpoints
# singularity containers
*.sif
# pyenv
.python-version
@@ -17,22 +17,29 @@ echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
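# Note (assumption, for illustration only): the image name suggests a Singularity build of NVIDIA's
# NGC TensorFlow 1.15 container, which could be obtained e.g. via
#   singularity pull tensorflow_21.09-tf1-py3.sif docker://nvcr.io/nvidia/tensorflow:21.09-tf1-py3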
# Loading modules
source ../env_setup/modules_train.sh
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
echo "Activating virtual environment..."
source ../${VIRT_ENV_NAME}/bin/activate
else
echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by config_runscript.py
source_dir=/my/path/to/pkl/files/
destination_dir=/my/path/to/tfrecords/files
@@ -40,6 +47,11 @@ destination_dir=/my/path/to/tfrecords/files
sequence_length=20
sequences_per_file=10
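# Illustration (not part of the template): with sequences_per_file=10, e.g. 1000 extracted
# sequences of length sequence_length=20 would be distributed over roughly 100 TFRecord files.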
# run preprocessing (step 2, where TFRecords are generated)
srun python ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \
# run preprocessing (step 2) in the TF1.15 container
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \
-sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file}
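# For reference, a minimal sketch of the wrapper invoked above (assumption: the actual
# env_setup/wrapper_container.sh may differ; here it is assumed to take the name of the virtual
# environment as its first argument, activate it inside the container and execute the rest):
#
#   #!/usr/bin/env bash
#   VIRT_ENV_NAME=$1
#   shift
#   # activate the virtual environment built on top of the container's Python
#   # (path relative to the runscript directory; the actual location may differ)
#   if [[ -f "../${VIRT_ENV_NAME}/bin/activate" ]]; then
#     source "../${VIRT_ENV_NAME}/bin/activate"
#   fi
#   # execute the remaining arguments as the actual command, e.g. python3 <script> <options>
#   "$@"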
@@ -37,4 +37,9 @@ source_dir=/my/path/to/mnist/raw/data/
destination_dir=/my/path/to/mnist/tfrecords/
# run preprocessing (step 2, where TFRecords are generated)
srun python ../video_prediction/datasets/moving_mnist.py ${source_dir} ${destination_dir}
# run preprocessing of the Moving MNIST data in the TF1.15 container
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 ../video_prediction/datasets/moving_mnist.py ${source_dir} ${destination_dir}
@@ -2,52 +2,56 @@
#SBATCH --account=deepacf
#SBATCH --nodes=1
#SBATCH --ntasks=1
##SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --output=train_era5-out.%j
#SBATCH --error=train_era5-err.%j
#SBATCH --time=20:00:00
#SBATCH --output=train_model_era5-out.%j
#SBATCH --error=train_model_era5-err.%j
#SBATCH --time=24:00:00
#SBATCH --gres=gpu:1
#SBATCH --partition=gpus
#SBATCH --partition=some_partition
#SBATCH --mail-type=ALL
#SBATCH --mail-user=b.gong@fz-juelich.de
##jutil env activate -p cjjsc42
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# Loading modules
source ../env_setup/modules_train.sh
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
echo "Activating virtual environment..."
source ../${VIRT_ENV_NAME}/bin/activate
else
echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# declare directory-variables which will be modified by config_runscript.py
source_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/
destination_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/models/
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by generate_runscript.py
source_dir=/my/path/to/tfrecords/files
destination_dir=/my/model/output/path
# valid identifiers for model-argument are: convLSTM, savp, mcnet and vae
# Note: destination_dir must not end with "/"; a trailing slash causes the checkpoints to be saved into the results_dir instead
model=convLSTM
datasplit_dict=../data_split/cv_test.json
datasplit_dict=${destination_dir}/data_split.json
model_hparams=${destination_dir}/model_hparams.json
dataset=era5
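# Illustration only (hypothetical contents): model_hparams.json holds the model hyperparameters,
# normally taken from the hparams-templates of the repository (cf. ../hparams/<dataset>/<model>/model_hparams.json
# in the Moving MNIST template below); a minimal example could look like
#   { "batch_size": 8, "lr": 0.001, "max_epochs": 20 }
# where the actual keys depend on the chosen model.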
# If you train SAVP, please uncomment the following CUDA configuration
#CUDA_VISIBLE_DEVICES=1
# run training in container
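# Notes on the invocation below (assumptions about the target system are marked as such):
# - CUDA_VISIBLE_DEVICES=0 exposes only the first GPU granted via --gres=gpu:1 to the application.
# - --mpi=pspmix selects srun's (ParaStation) PMIx plugin and --cpu-bind=none disables CPU pinning
#   (assumption: settings tailored to the JSC clusters these templates target).
# - singularity exec --nv makes the host's NVIDIA driver stack available inside the TF1.15 container.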
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 "${BASE_DIR}"/main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \
--dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
# run training
srun python ../main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \
--dataset ${dataset} --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir} --checkpoint ${destination_dir}
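# Usage sketch (hypothetical file name; the concrete runscript is generated from this template by
# generate_runscript.py, and the template guard above prevents submitting the template itself):
#   sbatch train_model_era5.sh                 # submit the generated runscript
#   squeue -u $USER                            # monitor the job
#   tail -f train_model_era5-out.<jobid>       # follow the SLURM output file defined above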
@@ -18,22 +18,28 @@ echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# Loading modules
source ../env_setup/modules_train.sh
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
echo "Activating virtual environment..."
source ../${VIRT_ENV_NAME}/bin/activate
else
echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified appropriately during Preprocessing (invoked by mpi_split_data_multi_years.py)
@@ -47,5 +53,10 @@ model_hparams=../hparams/${dataset}/${model}/model_hparams.json
destination_dir=${destination_dir}/${model}/"$(date +"%Y%m%dT%H%M")_"$USER""
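# e.g. (illustrative expansion): <destination_dir>/convLSTM/20211117T1430_<user>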
# run training
srun python ../scripts/train_dummy.py --input_dir ${source_dir}/tfrecords/ --dataset moving_mnist --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
# run training in container
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python ../main_scripts/train.py --input_dir ${source_dir}/tfrecords/ --dataset ${dataset} --model ${model} \
--model_hparams_dict ${model_hparams} --output_dir "${destination_dir}"/
@@ -10,42 +10,49 @@
#SBATCH --gres=gpu:1
#SBATCH --partition=gpus
#SBATCH --mail-type=ALL
#SBATCH --mail-user=b.gong@fz-juelich.de
##jutil env activate -p cjjsc42
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# Loading modules
source ../env_setup/modules_postprocess.sh
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
echo "Activating virtual environment..."
source ../${VIRT_ENV_NAME}/bin/activate
else
echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# declare directory-variables which will be modified by config_runscript.py
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by generate_runscript.py
# Note: source_dir is only needed for retrieving the base-directory
source_dir=/my/source/dir/
checkpoint_dir=/my/trained/model/dir
results_dir=/my/results/dir
lquick=""
# name of model
model=convLSTM
# run postprocessing/generation of model results including evaluation metrics
srun python -u ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \
--results_dir ${results_dir} --batch_size 4 \
--num_stochastic_samples 1 ${lquick} \
> postprocess_era5-out_all.${SLURM_JOB_ID}
> postprocess_era5-out_all."${SLURM_JOB_ID}"
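# After completion, the evaluation output can be inspected in ${results_dir} and in the log file
# redirected above, e.g. (illustrative): less postprocess_era5-out_all.<jobid>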
@@ -18,22 +18,29 @@ echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# Loading modules
source ../env_setup/modules_postprocess.sh
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
echo "Activating virtual environment..."
source ../${VIRT_ENV_NAME}/bin/activate
else
echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by config_runscript.py
source_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/moving_mnist
checkpoint_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/models/moving_mnist
@@ -42,7 +49,11 @@ results_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/results/m
model=convLSTM
# run postprocessing/generation of model results including evaluation metrics
srun python -u ../scripts/generate_movingmnist.py \
--input_dir ${source_dir}/ --dataset_hparams sequence_length=20 --checkpoint ${checkpoint_dir}/${model} \
--mode test --model ${model} --results_dir ${results_dir}/${model} --batch_size 2 --dataset era5 > generate_era5-out.out
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 ../scripts/generate_movingmnist.py --input_dir ${source_dir}/ --dataset_hparams sequence_length=20 \
--checkpoint ${checkpoint_dir}/${model} --mode test --model ${model} --results_dir ${results_dir}/${model} \
--batch_size 2 --dataset era5 > generate_era5-out."${SLURM_JOB_ID}"