Commit 5687e99f authored by Michael Langguth

Merge branch 'michael_issue#145_training_with_Nvidia_TF1.15_container' into develop

parents e2b50cd5 4d507a09
Pipeline #89945 failed
Showing with 149 additions and 96 deletions
@@ -71,6 +71,9 @@ target/
# Jupyter Notebook
.ipynb_checkpoints
# singularity containers
*.sif
# pyenv
.python-version
@@ -17,22 +17,29 @@ echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
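# Note (assumption, for illustration only): the image name suggests a Singularity build of NVIDIA's
# NGC TensorFlow 1.15 container, which could be obtained e.g. via
#   singularity pull tensorflow_21.09-tf1-py3.sif docker://nvcr.io/nvidia/tensorflow:21.09-tf1-py3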
# Loading modules
source ../env_setup/modules_train.sh
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
echo "Activating virtual environment..."
source ../${VIRT_ENV_NAME}/bin/activate
else
echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by config_runscript.py
source_dir=/my/path/to/pkl/files/
destination_dir=/my/path/to/tfrecords/files
@@ -40,6 +47,11 @@ destination_dir=/my/path/to/tfrecords/files
sequence_length=20
sequences_per_file=10
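# Illustration (not part of the template): with sequences_per_file=10, e.g. 1000 extracted
# sequences of length sequence_length=20 would be distributed over roughly 100 TFRecord files.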
# run preprocessing (step 2, where TFRecords are generated)
srun python ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \
# run preprocessing (step 2) in the TF1.15 container
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \
-sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file}
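# For reference, a minimal sketch of the wrapper invoked above (assumption: the actual
# env_setup/wrapper_container.sh may differ; here it is assumed to take the name of the virtual
# environment as its first argument, activate it inside the container and execute the rest):
#
#   #!/usr/bin/env bash
#   VIRT_ENV_NAME=$1
#   shift
#   # activate the virtual environment built on top of the container's Python
#   # (path relative to the runscript directory; the actual location may differ)
#   if [[ -f "../${VIRT_ENV_NAME}/bin/activate" ]]; then
#     source "../${VIRT_ENV_NAME}/bin/activate"
#   fi
#   # execute the remaining arguments as the actual command, e.g. python3 <script> <options>
#   "$@"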
@@ -37,4 +37,9 @@ source_dir=/my/path/to/mnist/raw/data/
destination_dir=/my/path/to/mnist/tfrecords/
# run preprocessing (step 2, where TFRecords are generated)
srun python ../video_prediction/datasets/moving_mnist.py ${source_dir} ${destination_dir}
# run preprocessing of the Moving MNIST data in the TF1.15 container
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 ../video_prediction/datasets/moving_mnist.py ${source_dir} ${destination_dir}
@@ -2,52 +2,56 @@
#SBATCH --account=deepacf
#SBATCH --nodes=1
#SBATCH --ntasks=1
##SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --output=train_era5-out.%j
#SBATCH --error=train_era5-err.%j
#SBATCH --time=20:00:00
#SBATCH --output=train_model_era5-out.%j
#SBATCH --error=train_model_era5-err.%j
#SBATCH --time=24:00:00
#SBATCH --gres=gpu:1
#SBATCH --partition=gpus
#SBATCH --partition=some_partition
#SBATCH --mail-type=ALL
#SBATCH --mail-user=b.gong@fz-juelich.de
##jutil env activate -p cjjsc42
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# Loading modules
source ../env_setup/modules_train.sh
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
echo "Activating virtual environment..."
source ../${VIRT_ENV_NAME}/bin/activate
else
echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# declare directory-variables which will be modified by config_runscript.py
source_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/
destination_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/models/
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by generate_runscript.py
source_dir=/my/path/to/tfrecords/files
destination_dir=/my/model/output/path
# valid identifiers for model-argument are: convLSTM, savp, mcnet and vae
# Note: destination_dir must not end with "/"; a trailing slash causes the checkpoints to be saved into the results_dir instead
model=convLSTM
datasplit_dict=../data_split/cv_test.json
datasplit_dict=${destination_dir}/data_split.json
model_hparams=${destination_dir}/model_hparams.json
dataset=era5
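# Illustration only (hypothetical contents): model_hparams.json holds the model hyperparameters,
# normally taken from the hparams-templates of the repository (cf. ../hparams/<dataset>/<model>/model_hparams.json
# in the Moving MNIST template below); a minimal example could look like
#   { "batch_size": 8, "lr": 0.001, "max_epochs": 20 }
# where the actual keys depend on the chosen model.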
# If you train SAVP, please uncomment the following CUDA configuration
#CUDA_VISIBLE_DEVICES=1
# run training in container
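# Notes on the invocation below (assumptions about the target system are marked as such):
# - CUDA_VISIBLE_DEVICES=0 exposes only the first GPU granted via --gres=gpu:1 to the application.
# - --mpi=pspmix selects srun's (ParaStation) PMIx plugin and --cpu-bind=none disables CPU pinning
#   (assumption: settings tailored to the JSC clusters these templates target).
# - singularity exec --nv makes the host's NVIDIA driver stack available inside the TF1.15 container.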
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 "${BASE_DIR}"/main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \
--dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
# run training
srun python ../main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \
--dataset ${dataset} --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir} --checkpoint ${destination_dir}
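# Usage sketch (hypothetical file name; the concrete runscript is generated from this template by
# generate_runscript.py, and the template guard above prevents submitting the template itself):
#   sbatch train_model_era5.sh                 # submit the generated runscript
#   squeue -u $USER                            # monitor the job
#   tail -f train_model_era5-out.<jobid>       # follow the SLURM output file defined above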
@@ -18,22 +18,28 @@ echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# Loading modules
source ../env_setup/modules_train.sh
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
echo "Activating virtual environment..."
source ../${VIRT_ENV_NAME}/bin/activate
else
echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified appropriately during Preprocessing (invoked by mpi_split_data_multi_years.py)
@@ -47,5 +53,10 @@ model_hparams=../hparams/${dataset}/${model}/model_hparams.json
destination_dir=${destination_dir}/${model}/"$(date +"%Y%m%dT%H%M")_"$USER""
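# e.g. (illustrative expansion): <destination_dir>/convLSTM/20211117T1430_<user>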
# run training
srun python ../scripts/train_dummy.py --input_dir ${source_dir}/tfrecords/ --dataset moving_mnist --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
# run training in container
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python ../main_scripts/train.py --input_dir ${source_dir}/tfrecords/ --dataset ${dataset} --model ${model} \
--model_hparams_dict ${model_hparams} --output_dir "${destination_dir}"/
@@ -10,42 +10,49 @@
#SBATCH --gres=gpu:1
#SBATCH --partition=gpus
#SBATCH --mail-type=ALL
#SBATCH --mail-user=b.gong@fz-juelich.de
##jutil env activate -p cjjsc42
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# Loading modules
source ../env_setup/modules_postprocess.sh
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
echo "Activating virtual environment..."
source ../${VIRT_ENV_NAME}/bin/activate
else
echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# declare directory-variables which will be modified by config_runscript.py
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by generate_runscript.py
# Note: source_dir is only needed for retrieving the base-directory
source_dir=/my/source/dir/
checkpoint_dir=/my/trained/model/dir
results_dir=/my/results/dir
lquick=""
# name of model
model=convLSTM
# run postprocessing/generation of model results including evaluation metrics
srun python -u ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 ../main_scripts/main_visualize_postprocess.py --checkpoint ${checkpoint_dir} --mode test \
--results_dir ${results_dir} --batch_size 4 \
--num_stochastic_samples 1 ${lquick} \
> postprocess_era5-out_all.${SLURM_JOB_ID}
> postprocess_era5-out_all."${SLURM_JOB_ID}"
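# After completion, the evaluation output can be inspected in ${results_dir} and in the log file
# redirected above, e.g. (illustrative): less postprocess_era5-out_all.<jobid>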
@@ -18,22 +18,29 @@ echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# auxiliary variables
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment
VIRT_ENV_NAME="my_venv"
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# Loading modules
source ../env_setup/modules_postprocess.sh
# Activate virtual environment if needed (and possible)
if [ -z ${VIRTUAL_ENV} ]; then
if [[ -f ../${VIRT_ENV_NAME}/bin/activate ]]; then
echo "Activating virtual environment..."
source ../${VIRT_ENV_NAME}/bin/activate
else
echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
# sanity checks
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by config_runscript.py
source_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/moving_mnist
checkpoint_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/models/moving_mnist
@@ -42,7 +49,11 @@ results_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/results/m
model=convLSTM
# run postprocessing/generation of model results including evaluation metrics
srun python -u ../scripts/generate_movingmnist.py \
--input_dir ${source_dir}/ --dataset_hparams sequence_length=20 --checkpoint ${checkpoint_dir}/${model} \
--mode test --model ${model} --results_dir ${results_dir}/${model} --batch_size 2 --dataset era5 > generate_era5-out.out
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 ../scripts/generate_movingmnist.py --input_dir ${source_dir}/ --dataset_hparams sequence_length=20 \
--checkpoint ${checkpoint_dir}/${model} --mode test --model ${model} --results_dir ${results_dir}/${model} \
--batch_size 2 --dataset era5 > generate_era5-out."${SLURM_JOB_ID}"