diff --git a/horovod_data_distributed/submit_job_jureca.sh b/horovod_data_distributed/submit_job_jureca.sh new file mode 100755 index 0000000000000000000000000000000000000000..ad05843f0a1145bc41ead0fd66909172b3d08be1 --- /dev/null +++ b/horovod_data_distributed/submit_job_jureca.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=8 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=HVD_DATA_DIST +#SBATCH --gres=gpu:4 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module load GCC/8.3.0 +module load MVAPICH2/2.3.1-GDR +module load TensorFlow/1.13.1-GPU-Python-3.6.8 +module load Horovod/0.16.2-GPU-Python-3.6.8 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/submit_job_juron.sh b/horovod_data_distributed/submit_job_juron.sh new file mode 100755 index 0000000000000000000000000000000000000000..4fe6a4879f94bcc1b6630f798bee72bb6fe85b45 --- /dev/null +++ b/horovod_data_distributed/submit_job_juron.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 8 +#BSUB -R "span[ptile=4]" +#BSUB -gpu "num=4" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J HVD_DATA_DIST + +# Load the required modules +module load python/3.6.1 +module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 +module load horovod/0.15.2 + +# Run the program +mpirun -bind-to none \ + -map-by slot \ + -x NCCL_DEBUG=INFO \ + -x LD_LIBRARY_PATH \ + -x PATH \ + -mca pml ob1 \ + -mca btl ^openib \ + python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/submit_job_juwels.sh b/horovod_data_distributed/submit_job_juwels.sh index f5d9b6f61796eef52d05286cdcd66bef1ddfbccb..ad05843f0a1145bc41ead0fd66909172b3d08be1 100755 --- a/horovod_data_distributed/submit_job_juwels.sh +++ b/horovod_data_distributed/submit_job_juwels.sh @@ -17,8 +17,5 @@ module load MVAPICH2/2.3.1-GDR module load TensorFlow/1.13.1-GPU-Python-3.6.8 module load Horovod/0.16.2-GPU-Python-3.6.8 -# Source the virtual environment -source activate venv_dl_hpc4ns/bin/activate - # Run the program srun python -u mnist_data_distributed.py