From 1738a3cd79cdc2e763b58e7e7f58862050ad2a0c Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Fri, 22 Nov 2019 09:05:06 +0100 Subject: [PATCH] Updated job submission scripts for all systems after testing. Added a setup script for JURON. --- horovod_data_distributed/README.md | 27 ++++++++++++------- horovod_data_distributed/setup_juron.sh | 24 +++++++++++++++++ horovod_data_distributed/submit_job_jureca.sh | 3 ++- horovod_data_distributed/submit_job_juron.sh | 5 +++- horovod_data_distributed/submit_job_juwels.sh | 3 ++- 5 files changed, 50 insertions(+), 12 deletions(-) create mode 100755 horovod_data_distributed/setup_juron.sh diff --git a/horovod_data_distributed/README.md b/horovod_data_distributed/README.md index ffb8ea2..612d11a 100644 --- a/horovod_data_distributed/README.md +++ b/horovod_data_distributed/README.md @@ -4,21 +4,30 @@ Please see the main docstring in each program for details. # Notes -The `mnist_data_distributed.py` program requires the [`hpc4ns.distribution`]( +On JURECA and JUWELS, the `mnist_data_distributed.py` program requires the [`hpc4ns.distribution`]( https://gitlab.version.fz-juelich.de/hpc4ns/hpc4ns_utils#1-hpc4nsdistribution) -module for distribution of training data filenames across multiple ranks. -Please follow the steps below to install the required package. +module for distribution of training data filenames across multiple ranks. On JURON, multiple additional +packages are required. Please follow the steps below to set up the environment before submitting the +training job. + +Note that a maximum of eight ranks can be used to run `mnist_data_distributed.py`, as there +are eight training files. + +## JURECA and JUWELS 1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` -2. Load the system-wide Python module. - * On JURECA and JUWELS: `module load Python/3.6.8` - * On JURON: `module load Python/3.6.1` +2. 
Load the system-wide Python module: `module load Python/3.6.8` 3. Install the `hpc4ns` package: `pip install --user git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4ns_utils.git` -The job can be submitted once the `hpc4ns` package is installed. +4. Submit the job +## JURON + +1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` +2. Set up a Python virtual environment with the required packages (may take up to 5 minutes): `./setup_juron.sh` +3. Submit the job: `bsub < submit_job_juron.sh` -**Note:** A maximum of eight ranks can be used to run `mnist_data_distributed.py`, as there -are eight training files. \ No newline at end of file +**Note:** The setup is required only once. Unless you explicitly remove the virtual environment, the same +setup can be used to run the example multiple times. diff --git a/horovod_data_distributed/setup_juron.sh b/horovod_data_distributed/setup_juron.sh new file mode 100755 index 0000000..24aa5bc --- /dev/null +++ b/horovod_data_distributed/setup_juron.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Load the Python module +module load python/3.6.1 + +# Create a virtual environment +python -m venv venv_dl_hpc4ns + +# Activate the virtual environment +source venv_dl_hpc4ns/bin/activate + +# Upgrade pip and setuptools +pip install -U pip setuptools + +# Install mpi4py +env MPICC=/gpfs/software/opt/openmpi/3.1.2-gcc_5.4.0-cuda_10.0.130/bin/mpicc pip install mpi4py + +# Install six +pip install six + +# Install hpc4ns +pip install git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4ns_utils.git + +printf "%s\n" "Setup complete." 
diff --git a/horovod_data_distributed/submit_job_jureca.sh b/horovod_data_distributed/submit_job_jureca.sh index ad05843..eedbaca 100755 --- a/horovod_data_distributed/submit_job_jureca.sh +++ b/horovod_data_distributed/submit_job_jureca.sh @@ -13,7 +13,8 @@ # Load the required modules module load GCC/8.3.0 -module load MVAPICH2/2.3.1-GDR +module load MVAPICH2/2.3.2-GDR +module load mpi4py/3.0.1-Python-3.6.8 module load TensorFlow/1.13.1-GPU-Python-3.6.8 module load Horovod/0.16.2-GPU-Python-3.6.8 diff --git a/horovod_data_distributed/submit_job_juron.sh b/horovod_data_distributed/submit_job_juron.sh index 4fe6a48..9ad3e50 100755 --- a/horovod_data_distributed/submit_job_juron.sh +++ b/horovod_data_distributed/submit_job_juron.sh @@ -2,7 +2,7 @@ #BSUB -q normal #BSUB -W 10 -#BSUB -n 8 +#BSUB -n 4 #BSUB -R "span[ptile=4]" #BSUB -gpu "num=4" #BSUB -e "error.%J.er" @@ -14,6 +14,9 @@ module load python/3.6.1 module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 module load horovod/0.15.2 +# Activate the virtual environment +source venv_dl_hpc4ns/bin/activate + # Run the program mpirun -bind-to none \ -map-by slot \ diff --git a/horovod_data_distributed/submit_job_juwels.sh b/horovod_data_distributed/submit_job_juwels.sh index ad05843..eedbaca 100755 --- a/horovod_data_distributed/submit_job_juwels.sh +++ b/horovod_data_distributed/submit_job_juwels.sh @@ -13,7 +13,8 @@ # Load the required modules module load GCC/8.3.0 -module load MVAPICH2/2.3.1-GDR +module load MVAPICH2/2.3.2-GDR +module load mpi4py/3.0.1-Python-3.6.8 module load TensorFlow/1.13.1-GPU-Python-3.6.8 module load Horovod/0.16.2-GPU-Python-3.6.8 -- GitLab