Skip to content
Snippets Groups Projects
Commit 08913a21 authored by gong1's avatar gong1
Browse files

Merge branch 'bing_issue#189_train_modular' into develop

parents 5b3d7a79 55c0478e
Branches
No related tags found
No related merge requests found
Pipeline #125768 failed
Showing
with 0 additions and 573 deletions
#!/bin/bash -x
#SBATCH --account=<your_project>
#SBATCH --nodes=1
#SBATCH --ntasks=13
##SBATCH --ntasks-per-node=13
#SBATCH --cpus-per-task=1
#SBATCH --output=DataPreprocess_era5_step2-out.%j
#SBATCH --error=DataPreprocess_era5_step2-err.%j
#SBATCH --time=04:00:00
#SBATCH --gres=gpu:0
#SBATCH --partition=batch
#SBATCH --mail-type=ALL
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# Slurm batch template for preprocessing step 2: launches
# main_preprocess_data_step2.py via srun inside a Singularity container.
# While the template identifier block above is present, the script only prints
# a notice and exits with status 99, so nothing below it is executed.
#
# auxiliary variables
# Both paths are derived from the current working directory, so the job has to
# be started from the directory that holds the container image.
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment (passed as first argument to the wrapper script)
VIRT_ENV_NAME="my_venv"
# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!!
# For container usage, comment in the following lines
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# sanity checks: abort with status 1 if the image or the wrapper script is missing
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by config_runscript.py
source_dir=/my/path/to/pkl/files/
destination_dir=/my/path/to/tfrecords/files
# forwarded to the Python script as -sequence_length and -sequences_per_file
sequence_length=20
sequences_per_file=10
# run Preprocessing (step 2 where Tf-records are generated)
export CUDA_VISIBLE_DEVICES=0
## NOTE(review): this section was labelled "One node, single GPU" and passes
## --nv to singularity, but the header requests --gres=gpu:0 and 13 tasks;
## confirm which resource setup is intended.
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \
-sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file}
# WITHOUT container usage, comment in the following lines (and comment out the lines above)
# Activate virtual environment if needed (and possible)
#if [ -z ${VIRTUAL_ENV} ]; then
#  if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then
#    echo "Activating virtual environment..."
#    source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate
#  else
#    echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
#    exit 1
#  fi
#fi
#
# Loading modules
#module purge
#source ../env_setup/modules_train.sh
#export CUDA_VISIBLE_DEVICES=0
#
# srun python3 ../main_scripts/main_preprocess_data_step2.py -source_dir ${source_dir} -dest_dir ${destination_dir} \
#  -sequence_length ${sequence_length} -sequences_per_file ${sequences_per_file}
#!/bin/bash -x
#SBATCH --account=<your_project>
#SBATCH --nodes=1
#SBATCH --ntasks=1
##SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --output=DataPreprocess_moving_mnist-out.%j
#SBATCH --error=DataPreprocess_moving_mnist-err.%j
#SBATCH --time=04:00:00
#SBATCH --partition=batch
#SBATCH --mail-type=ALL
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# Slurm batch template: runs video_prediction/datasets/moving_mnist.py via srun
# inside a Singularity container, passing it a source and a destination
# directory. While the template identifier block above is present, the script
# only prints a notice and exits with status 99.

# auxiliary variables
# These must be set before CONTAINER_IMG/WRAPPER are built from them; without
# them both paths would be rooted at "/" and the sanity checks below would
# always fail.
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")

# Name of virtual environment (passed as first argument to the wrapper script)
VIRT_ENV_NAME="my_venv"

# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!!
# For container usage, comment in the following lines
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"

# sanity checks: abort with status 1 if the image or the wrapper script is missing
if [[ ! -f ${CONTAINER_IMG} ]]; then
  echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
  exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
  echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
  exit 1
fi

# clean-up modules to avoid conflicts between host and container settings
module purge

# declare directory-variables which will be modified generate_runscript.py
source_dir=/my/path/to/mnist/raw/data/
destination_dir=/my/path/to/mnist/tfrecords/

# run Preprocessing (step 2 where Tf-records are generated)
export CUDA_VISIBLE_DEVICES=0
## One node, single task
srun --mpi=pspmix --cpu-bind=none \
  singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
  python3 ../video_prediction/datasets/moving_mnist.py ${source_dir} ${destination_dir}

# WITHOUT container usage, comment in the following lines (and comment out the lines above)
# Activate virtual environment if needed (and possible)
#if [ -z ${VIRTUAL_ENV} ]; then
#  if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then
#    echo "Activating virtual environment..."
#    source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate
#  else
#    echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
#    exit 1
#  fi
#fi
#
# Loading modules
#module purge
#source ../env_setup/modules_train.sh
#export CUDA_VISIBLE_DEVICES=0
#
# srun python3 ../video_prediction/datasets/moving_mnist.py ${source_dir} ${destination_dir}
\ No newline at end of file
#!/bin/bash -x
#SBATCH --account=<your_project>
#SBATCH --nodes=1
#SBATCH --ntasks=1
##SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --output=train_moving_mnist-out.%j
#SBATCH --error=train_moving_mnist-err.%j
#SBATCH --time=00:20:00
#SBATCH --gres=gpu:1
#SBATCH --partition=gpus
#SBATCH --mail-type=ALL
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# Slurm batch template: runs main_scripts/train.py for the moving_mnist dataset
# via srun inside a Singularity container. While the template identifier block
# above is present, the script only prints a notice and exits with status 99.

# auxiliary variables
# Both paths are derived from the current working directory, so the job has to
# be started from the directory that holds the container image.
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")

# Name of virtual environment (passed as first argument to the wrapper script)
VIRT_ENV_NAME="my_venv"

# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!!
# For container usage, comment in the following lines
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"

# sanity checks: abort with status 1 if the image or the wrapper script is missing
if [[ ! -f ${CONTAINER_IMG} ]]; then
  echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
  exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
  echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
  exit 1
fi

# clean-up modules to avoid conflicts between host and container settings
module purge

# declare directory-variables which will be modified appropriately during Preprocessing (invoked by mpi_split_data_multi_years.py)
source_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/moving_mnist
destination_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/models/moving_mnist

# for choosing the model, convLSTM,savp, mcnet,vae
model=convLSTM
dataset=moving_mnist
model_hparams=../hparams/${dataset}/${model}/model_hparams.json
# Each run gets its own output directory: <model>/<timestamp>_<user>
destination_dir="${destination_dir}/${model}/$(date +"%Y%m%dT%H%M")_${USER}"

# run training in container
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
# python3 is used here to match the non-container variant below.
srun --mpi=pspmix --cpu-bind=none \
  singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
  python3 ../main_scripts/train.py --input_dir ${source_dir}/tfrecords/ --dataset ${dataset} --model ${model} \
  --model_hparams_dict ${model_hparams} --output_dir "${destination_dir}"/

# WITHOUT container usage, comment in the following lines (and comment out the lines above)
# Activate virtual environment if needed (and possible)
#if [ -z ${VIRTUAL_ENV} ]; then
#  if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then
#    echo "Activating virtual environment..."
#    source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate
#  else
#    echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
#    exit 1
#  fi
#fi
#
# Loading modules
#module purge
#source ../env_setup/modules_train.sh
#export CUDA_VISIBLE_DEVICES=0
#
# srun python3 ../main_scripts/train.py --input_dir ${source_dir}/tfrecords/ --dataset ${dataset} --model ${model} \
#  --model_hparams_dict ${model_hparams} --output_dir "${destination_dir}"/
\ No newline at end of file
#!/bin/bash -x
#SBATCH --account=<your_project>
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --output=train_model_era5-out.%j
#SBATCH --error=train_model_era5-err.%j
#SBATCH --time=24:00:00
#SBATCH --gres=gpu:1
#SBATCH --partition=some_partition
#SBATCH --mail-type=ALL
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# Slurm batch template: runs main_scripts/main_train_models.py via srun inside
# a Singularity container. While the template identifier block above is
# present, the script only prints a notice and exits with status 99.
#
# auxiliary variables
# Both paths are derived from the current working directory, so the job has to
# be started from the directory that holds the container image.
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")
# Name of virtual environment (passed as first argument to the wrapper script)
VIRT_ENV_NAME="my_venv"
# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!!
# For container usage, comment in the following lines
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"
# sanity checks: abort with status 1 if the image or the wrapper script is missing
if [[ ! -f ${CONTAINER_IMG} ]]; then
echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
exit 1
fi
# clean-up modules to avoid conflicts between host and container settings
module purge
# declare directory-variables which will be modified by generate_runscript.py
source_dir=/my/path/to/tfrecords/files
destination_dir=/my/model/output/path
# valid identifiers for model-argument are: convLSTM, savp, mcnet and vae
model=convLSTM
# both JSON files are read from the output directory itself
datasplit_dict=${destination_dir}/data_split.json
model_hparams=${destination_dir}/model_hparams.json
# run training in container
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
## NOTE(review): this call passes "--dataset weatherbench" while the
## non-container variant below passes "--dataset era5"; confirm which one is
## intended and align the two.
srun --mpi=pspmix --cpu-bind=none \
singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
python3 "${BASE_DIR}"/main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \
--dataset weatherbench --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
# WITHOUT container usage, comment in the following lines (and comment out the lines above)
# Activate virtual environment if needed (and possible)
#if [ -z ${VIRTUAL_ENV} ]; then
#  if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then
#    echo "Activating virtual environment..."
#    source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate
#  else
#    echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
#    exit 1
#  fi
#fi
#
# Loading modules
#module purge
#source ../env_setup/modules_train.sh
#export CUDA_VISIBLE_DEVICES=0
#
# srun python3 "${BASE_DIR}"/main_scripts/main_train_models.py --input_dir ${source_dir} --datasplit_dict ${datasplit_dict} \
#  --dataset era5 --model ${model} --model_hparams_dict ${model_hparams} --output_dir ${destination_dir}/
\ No newline at end of file
#!/bin/bash -x
#SBATCH --account=<your_project>
#SBATCH --nodes=1
#SBATCH --ntasks=1
##SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --output=generate_era5-out.%j
#SBATCH --error=generate_era5-err.%j
#SBATCH --time=00:20:00
#SBATCH --gres=gpu:1
#SBATCH --partition=develgpus
#SBATCH --mail-type=ALL
#SBATCH --mail-user=me@somewhere.com
######### Template identifier (don't remove) #########
echo "Do not run the template scripts"
exit 99
######### Template identifier (don't remove) #########
# Slurm batch template for postprocessing: runs scripts/generate_movingmnist.py
# in test mode via srun inside a Singularity container. While the template
# identifier block above is present, the script only prints a notice and exits
# with status 99.
# NOTE(review): the job/output names and the --dataset argument say "era5",
# whereas the directories and the Python script refer to moving_mnist; confirm
# which dataset this template is meant for.

# auxiliary variables
# Both paths are derived from the current working directory, so the job has to
# be started from the directory that holds the container image.
WORK_DIR="$(pwd)"
BASE_DIR=$(dirname "$WORK_DIR")

# Name of virtual environment (passed as first argument to the wrapper script)
VIRT_ENV_NAME="my_venv"

# !!! ADAPT DEPENDING ON USAGE OF CONTAINER !!!
# For container usage, comment in the following lines
# Name of container image (must be available in working directory)
CONTAINER_IMG="${WORK_DIR}/tensorflow_21.09-tf1-py3.sif"
WRAPPER="${BASE_DIR}/env_setup/wrapper_container.sh"

# sanity checks: abort with status 1 if the image or the wrapper script is missing
if [[ ! -f ${CONTAINER_IMG} ]]; then
  echo "ERROR: Cannot find required TF1.15 container image '${CONTAINER_IMG}'."
  exit 1
fi
if [[ ! -f ${WRAPPER} ]]; then
  echo "ERROR: Cannot find wrapper-script '${WRAPPER}' for TF1.15 container image."
  exit 1
fi

# clean-up modules to avoid conflicts between host and container settings
module purge

# declare directory-variables which will be modified by config_runscript.py
source_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/preprocessedData/moving_mnist
checkpoint_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/models/moving_mnist
results_dir=/p/project/deepacf/deeprain/video_prediction_shared_folder/results/moving_mnist

# name of model
model=convLSTM

# run postprocessing/generation of model results including evaluation metrics
export CUDA_VISIBLE_DEVICES=0
## One node, single GPU
# stdout is deliberately NOT redirected: the job's stdout already goes to
# generate_era5-out.%j through the #SBATCH --output directive, and an extra
# '>' redirect to that same file would truncate it while Slurm holds it open.
srun --mpi=pspmix --cpu-bind=none \
  singularity exec --nv "${CONTAINER_IMG}" "${WRAPPER}" ${VIRT_ENV_NAME} \
  python3 ../scripts/generate_movingmnist.py --input_dir ${source_dir}/ --dataset_hparams sequence_length=20 \
  --checkpoint ${checkpoint_dir}/${model} --mode test --model ${model} --results_dir ${results_dir}/${model} \
  --batch_size 2 --dataset era5

# WITHOUT container usage, comment in the following lines (and comment out the lines above)
# Activate virtual environment if needed (and possible)
#if [ -z ${VIRTUAL_ENV} ]; then
#  if [[ -f ../virtual_envs/${VIRT_ENV_NAME}/bin/activate ]]; then
#    echo "Activating virtual environment..."
#    source ../virtual_envs/${VIRT_ENV_NAME}/bin/activate
#  else
#    echo "ERROR: Requested virtual environment ${VIRT_ENV_NAME} not found..."
#    exit 1
#  fi
#fi
#
# Loading modules
#module purge
#source ../env_setup/modules_train.sh
#export CUDA_VISIBLE_DEVICES=0
#
# srun python3 ../scripts/generate_movingmnist.py --input_dir ${source_dir}/ --dataset_hparams sequence_length=20 \
#  --checkpoint ${checkpoint_dir}/${model} --mode test --model ${model} --results_dir ${results_dir}/${model} \
#  --batch_size 2 --dataset era5
\ No newline at end of file
{
"batch_size": 10,
"lr": 0.001,
"max_epochs": 2,
"context_frames": 12
}
{
"batch_size": 16,
"lr": 0.0002,
"beta1": 0.5,
"beta2": 0.999,
"l1_weight": 100.0,
"l2_weight": 0.0,
"kl_weight": 0.0,
"video_sn_vae_gan_weight": 0.0,
"video_sn_gan_weight": 0.1,
"vae_gan_feature_cdist_weight": 0.0,
"gan_feature_cdist_weight": 10.0,
"state_weight": 0.0,
"nz": 32,
"max_epochs":2,
"context_frames":12
}
{
"batch_size": 32,
"lr": 0.001,
"beta1": 0.9,
"beta2": 0.999,
"l1_weight": 1.0,
"l2_weight": 0.0,
"kl_weight": 1e-05,
"video_sn_vae_gan_weight": 0.0,
"video_sn_gan_weight": 0.0,
"state_weight": 0.0,
"nz": 32,
"max_epochs":2,
"context_frames":12
}
{
"batch_size": 32,
"lr": 0.0002,
"beta1": 0.5,
"beta2": 0.999,
"l1_weight": 100.0,
"l2_weight": 0.0,
"kl_weight": 0.01,
"video_sn_vae_gan_weight": 0.1,
"video_sn_gan_weight": 0.1,
"vae_gan_feature_cdist_weight": 10.0,
"gan_feature_cdist_weight": 0.0,
"state_weight": 0.0,
"nz": 16,
"max_epochs":4,
"context_frames": 12,
"opt_var": "0",
"decay_steps":[3000,9000],
"end_lr": 0.00000008
}
{
"batch_size": 10,
"lr": 0.001,
"nz":16,
"max_epochs":2,
"context_frames":12,
"weight_recon":1,
"loss_fun": "rmse",
"shuffle_on_val": true
}
{
"batch_size": 16,
"lr": 0.0002,
"beta1": 0.5,
"beta2": 0.999,
"l1_weight": 100.0,
"l2_weight": 0.0,
"kl_weight": 0.01,
"video_sn_vae_gan_weight": 0.1,
"video_sn_gan_weight": 0.1,
"vae_gan_feature_cdist_weight": 10.0,
"gan_feature_cdist_weight": 0.0,
"state_weight": 0.0,
"nz": 16,
"max_epochs":2,
"context_frames":10,
"sequence_length":30
}
{
"batch_size": 10,
"lr": 0.001,
"max_epochs":20,
"context_frames":10,
"loss_fun":"cross_entropy",
"opt_var": "all"
}
{
"batch_size": 10,
"lr": 0.001,
"max_epochs":20,
"context_frames":10,
"loss_fun":"cross_entropy"
}
{
"batch_size": 10,
"lr": 0.001,
"max_epochs": 2,
"context_frames": 12
}
{
"batch_size": 16,
"lr": 0.0002,
"beta1": 0.5,
"beta2": 0.999,
"l1_weight": 100.0,
"l2_weight": 0.0,
"kl_weight": 0.0,
"video_sn_vae_gan_weight": 0.0,
"video_sn_gan_weight": 0.1,
"vae_gan_feature_cdist_weight": 0.0,
"gan_feature_cdist_weight": 10.0,
"state_weight": 0.0,
"nz": 32,
"max_epochs":2,
"context_frames":12
}
{
"batch_size": 32,
"lr": 0.001,
"beta1": 0.9,
"beta2": 0.999,
"l1_weight": 1.0,
"l2_weight": 0.0,
"kl_weight": 1e-05,
"video_sn_vae_gan_weight": 0.0,
"video_sn_gan_weight": 0.0,
"state_weight": 0.0,
"nz": 32,
"max_epochs":2,
"context_frames":12
}
{
"batch_size": 32,
"lr": 0.0002,
"beta1": 0.5,
"beta2": 0.999,
"l1_weight": 100.0,
"l2_weight": 0.0,
"kl_weight": 0.01,
"video_sn_vae_gan_weight": 0.1,
"video_sn_gan_weight": 0.1,
"vae_gan_feature_cdist_weight": 10.0,
"gan_feature_cdist_weight": 0.0,
"state_weight": 0.0,
"nz": 16,
"max_epochs":4,
"context_frames": 12,
"opt_var": "0",
"decay_steps":[3000,9000],
"end_lr": 0.00000008
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment