From 1738a3cd79cdc2e763b58e7e7f58862050ad2a0c Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Fri, 22 Nov 2019 09:05:06 +0100
Subject: [PATCH] Updated job submission scripts for all systems after testing.
 Added a setup script for JURON.

---
 horovod_data_distributed/README.md            | 27 ++++++++++++-------
 horovod_data_distributed/setup_juron.sh       | 24 +++++++++++++++++
 horovod_data_distributed/submit_job_jureca.sh |  3 ++-
 horovod_data_distributed/submit_job_juron.sh  |  5 +++-
 horovod_data_distributed/submit_job_juwels.sh |  3 ++-
 5 files changed, 50 insertions(+), 12 deletions(-)
 create mode 100755 horovod_data_distributed/setup_juron.sh

diff --git a/horovod_data_distributed/README.md b/horovod_data_distributed/README.md
index ffb8ea2..612d11a 100644
--- a/horovod_data_distributed/README.md
+++ b/horovod_data_distributed/README.md
@@ -4,21 +4,30 @@ Please see the main docstring in each program for details.
 
 # Notes
 
-The `mnist_data_distributed.py` program requires the [`hpc4ns.distribution`](
+On JURECA and JUWELS, the `mnist_data_distributed.py` program requires the [`hpc4ns.distribution`](
 https://gitlab.version.fz-juelich.de/hpc4ns/hpc4ns_utils#1-hpc4nsdistribution)
-module for distribution of training data filenames across multiple ranks. 
-Please follow the steps below to install the required package.
+module for distribution of training data filenames across multiple ranks. On JURON, several additional
+packages are required. Please follow the steps below to set up the environment before submitting the
+training job.
+
+Note that a maximum of eight ranks can be used to run `mnist_data_distributed.py`, as there
+are eight training files.
+
+## JURECA and JUWELS
 
 1.  Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` 
-2.  Load the system-wide Python module.
-    *  On JURECA and JUWELS: `module load Python/3.6.8`
-    *  On JURON: `module load Python/3.6.1`
+2.  Load the system-wide Python module: `module load Python/3.6.8`    
 3.  Install the `hpc4ns` package: 
 
     `pip install --user git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4ns_utils.git`
 
-The job can be submitted once the `hpc4ns` package is installed.
+4.  Submit the job
 
+## JURON
+
+1.  Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` 
+2.  Set up a Python virtual environment with the required packages (may take up to 5 minutes): `./setup_juron.sh`
+3.  Submit the job: `bsub < submit_job_juron.sh`
 
-**Note:** A maximum of eight ranks can be used to run `mnist_data_distributed.py`, as there
-are eight training files.
\ No newline at end of file
+**Note:** The setup is required only once. Unless you explicitly remove the virtual environment, the same
+setup can be used to run the example multiple times.
diff --git a/horovod_data_distributed/setup_juron.sh b/horovod_data_distributed/setup_juron.sh
new file mode 100755
index 0000000..24aa5bc
--- /dev/null
+++ b/horovod_data_distributed/setup_juron.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Abort on the first failing step so that "Setup complete." is only printed
+# once the virtual environment has actually been built.
+set -e
+
+# Load the Python module
+module load python/3.6.1
+
+# Create and activate the virtual environment
+python -m venv venv_dl_hpc4ns
+source venv_dl_hpc4ns/bin/activate
+
+# Upgrade pip and setuptools
+pip install -U pip setuptools
+
+# Install mpi4py, built with the MPI compiler wrapper given in MPICC
+env MPICC=/gpfs/software/opt/openmpi/3.1.2-gcc_5.4.0-cuda_10.0.130/bin/mpicc pip install mpi4py
+
+# Install six and hpc4ns
+pip install six
+pip install git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4ns_utils.git
+
+printf "%s\n" "Setup complete."
diff --git a/horovod_data_distributed/submit_job_jureca.sh b/horovod_data_distributed/submit_job_jureca.sh
index ad05843..eedbaca 100755
--- a/horovod_data_distributed/submit_job_jureca.sh
+++ b/horovod_data_distributed/submit_job_jureca.sh
@@ -13,7 +13,8 @@
 
 # Load the required modules
 module load GCC/8.3.0
-module load MVAPICH2/2.3.1-GDR
+module load MVAPICH2/2.3.2-GDR
+module load mpi4py/3.0.1-Python-3.6.8
 module load TensorFlow/1.13.1-GPU-Python-3.6.8
 module load Horovod/0.16.2-GPU-Python-3.6.8
 
diff --git a/horovod_data_distributed/submit_job_juron.sh b/horovod_data_distributed/submit_job_juron.sh
index 4fe6a48..9ad3e50 100755
--- a/horovod_data_distributed/submit_job_juron.sh
+++ b/horovod_data_distributed/submit_job_juron.sh
@@ -2,7 +2,7 @@
 
 #BSUB -q normal
 #BSUB -W 10
-#BSUB -n 8
+#BSUB -n 4
 #BSUB -R "span[ptile=4]"
 #BSUB -gpu "num=4"
 #BSUB -e "error.%J.er"
@@ -14,6 +14,9 @@ module load python/3.6.1
 module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
 module load horovod/0.15.2
 
+# Activate the virtual environment
+source venv_dl_hpc4ns/bin/activate
+
 # Run the program
 mpirun -bind-to none \
         -map-by slot \
diff --git a/horovod_data_distributed/submit_job_juwels.sh b/horovod_data_distributed/submit_job_juwels.sh
index ad05843..eedbaca 100755
--- a/horovod_data_distributed/submit_job_juwels.sh
+++ b/horovod_data_distributed/submit_job_juwels.sh
@@ -13,7 +13,8 @@
 
 # Load the required modules
 module load GCC/8.3.0
-module load MVAPICH2/2.3.1-GDR
+module load MVAPICH2/2.3.2-GDR
+module load mpi4py/3.0.1-Python-3.6.8
 module load TensorFlow/1.13.1-GPU-Python-3.6.8
 module load Horovod/0.16.2-GPU-Python-3.6.8
 
-- 
GitLab