From 2751d5ef69e6229b8179a9b025d59d5e6d0217bc Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Fri, 22 Nov 2019 06:45:28 +0100
Subject: [PATCH] Added/updated job scripts for data-distributed training.

---
Notes (ignored by git-am; everything up to the first "diff --git" is commentary):
  * Adds submit_job_jureca.sh: job configured through #SBATCH directives
    (2 nodes, 8 tasks, 4 tasks per node, gpu:4, partition develgpus) and
    launched with srun.
  * Adds submit_job_juron.sh: job configured through #BSUB (LSF) directives
    (8 slots, ptile=4, 4 GPUs) and launched with mpirun.
  * submit_job_juwels.sh: drops the "source activate venv_dl_hpc4ns/bin/activate"
    step; its post-image blob (ad05843) equals the new jureca script's blob,
    i.e. both files have identical content after this patch.
  * NOTE(review): line layout was restored from a whitespace-collapsed copy and
    checked against the hunk line counts; the indentation width of the mpirun
    continuation lines is assumed (4 spaces) -- confirm against blob 4fe6a48.

 horovod_data_distributed/submit_job_jureca.sh | 21 ++++++++++++++++
 horovod_data_distributed/submit_job_juron.sh  | 25 +++++++++++++++++++
 horovod_data_distributed/submit_job_juwels.sh |  3 ---
 3 files changed, 46 insertions(+), 3 deletions(-)
 create mode 100755 horovod_data_distributed/submit_job_jureca.sh
 create mode 100755 horovod_data_distributed/submit_job_juron.sh

diff --git a/horovod_data_distributed/submit_job_jureca.sh b/horovod_data_distributed/submit_job_jureca.sh
new file mode 100755
index 0000000..ad05843
--- /dev/null
+++ b/horovod_data_distributed/submit_job_jureca.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HVD_DATA_DIST
+#SBATCH --gres=gpu:4 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
+
+# Run the program
+srun python -u mnist_data_distributed.py
diff --git a/horovod_data_distributed/submit_job_juron.sh b/horovod_data_distributed/submit_job_juron.sh
new file mode 100755
index 0000000..4fe6a48
--- /dev/null
+++ b/horovod_data_distributed/submit_job_juron.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 8
+#BSUB -R "span[ptile=4]"
+#BSUB -gpu "num=4"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J HVD_DATA_DIST
+
+# Load the required modules
+module load python/3.6.1
+module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
+module load horovod/0.15.2
+
+# Run the program
+mpirun -bind-to none \
+    -map-by slot \
+    -x NCCL_DEBUG=INFO \
+    -x LD_LIBRARY_PATH \
+    -x PATH \
+    -mca pml ob1 \
+    -mca btl ^openib \
+    python -u mnist_data_distributed.py
diff --git a/horovod_data_distributed/submit_job_juwels.sh b/horovod_data_distributed/submit_job_juwels.sh
index f5d9b6f..ad05843 100755
--- a/horovod_data_distributed/submit_job_juwels.sh
+++ b/horovod_data_distributed/submit_job_juwels.sh
@@ -17,8 +17,5 @@ module load MVAPICH2/2.3.1-GDR
 module load TensorFlow/1.13.1-GPU-Python-3.6.8
 module load Horovod/0.16.2-GPU-Python-3.6.8
 
-# Source the virtual environment
-source activate venv_dl_hpc4ns/bin/activate
-
 # Run the program
 srun python -u mnist_data_distributed.py
-- 
GitLab