From 627d346af39e84a055bb37f2f3c18068b4a54d42 Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Mon, 18 Nov 2019 08:38:21 +0100
Subject: [PATCH] The code sample for proper data-distributed training has been
 moved to a separate directory. A directory-local README.md file contains
 setup instructions. An announcement has been added to the main README.md.

---
 README.md                                     | 11 +++++--
 horovod/keras/.run_mnist_data_dist            |  3 --
 horovod_data_distributed/README.md            | 24 +++++++++++++++
 .../mnist_data_distributed.py                 | 30 +++++++++++--------
 horovod_data_distributed/submit_job_juwels.sh | 24 +++++++++++++++
 5 files changed, 73 insertions(+), 19 deletions(-)
 delete mode 100755 horovod/keras/.run_mnist_data_dist
 create mode 100644 horovod_data_distributed/README.md
 rename {horovod/keras => horovod_data_distributed}/mnist_data_distributed.py (87%)
 create mode 100755 horovod_data_distributed/submit_job_juwels.sh

diff --git a/README.md b/README.md
index a36ef62..dc59596 100644
--- a/README.md
+++ b/README.md
@@ -15,12 +15,17 @@ visit [this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/wikis/E
 
 ### Announcements
 
-*  Tensorflow and Keras examples (with and without Horovod) are now fully functional on JUWELS as well.
-*  Python 2 support has been removed from the tutorial for all frameworks except Caffe.
-*  Even though PyTorch is available as as system-wide module on the JSC supercomputers, all PyTorch 
+*  **November 18, 2019:** A new `horovod_data_distributed` directory has been added. It contains
+code samples that illustrate proper data-distributed training with Horovod, i.e., a distribution
+mechanism in which the training data, rather than the epochs, is distributed across the ranks.
+Further information is available in the directory-local `README.md`.
+*  **September 02, 2019:** Even though PyTorch is available as a system-wide module on the JSC supercomputers, all PyTorch
 examples have been removed from this tutorial. This is due to the fact that the tutorial
 developers are not currently working with PyTorch, and are therefore not in a position to provide
 support for PyTorch related issues.
+*  **August 23, 2019:**
+   *  Tensorflow and Keras examples (with and without Horovod) are now fully functional on JUWELS as well.
+   *  Python 2 support has been removed from the tutorial for all frameworks except Caffe.
 
 # Table of contents
 <!-- TOC -->
diff --git a/horovod/keras/.run_mnist_data_dist b/horovod/keras/.run_mnist_data_dist
deleted file mode 100755
index b9b19fa..0000000
--- a/horovod/keras/.run_mnist_data_dist
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/bin/env bash
-
-PYTHONHASHSEED=0 mpirun -np 1 python -u mnist_data_distributed.py
diff --git a/horovod_data_distributed/README.md b/horovod_data_distributed/README.md
new file mode 100644
index 0000000..c0934fc
--- /dev/null
+++ b/horovod_data_distributed/README.md
@@ -0,0 +1,24 @@
+# Introduction
+
+Please see the main docstring in `mnist_data_distributed.py` for details.
+
+# Notes
+
+The `mnist_data_distributed.py` program requires the [`slns.distribution`](
+https://gitlab.version.fz-juelich.de/hpc4ns/slns_utils#1-slnsdistribution)
+module for distribution of training data filenames across multiple ranks. 
+Please follow the steps below to install the required package (a shell summary and a usage
+sketch follow the steps).
+
+1.  Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed`.
+2.  Load the system-wide Python module.
+    *  On JURECA and JUWELS: `module load Python/3.6.8`
+    *  On JURON: `module load Python/3.6.1`
+3.  Create a Python virtual environment: `python -m venv venv_dl_slns`
+4.  Activate the virtual environment: `source venv_dl_slns/bin/activate`
+5.  Install the `slns` package: `python -m pip install git+https://gitlab.version.fz-juelich.de/hpc4ns/slns_utils.git`
+6.  Open the job submission script you intend to use, and make sure the path to the virtual environment is correct.
+
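+For convenience, here is the entire setup as a single shell session (a minimal sketch for
+JUWELS, using the module version listed above):
+
+```bash
+module load Python/3.6.8
+python -m venv venv_dl_slns
+source venv_dl_slns/bin/activate
+python -m pip install git+https://gitlab.version.fz-juelich.de/hpc4ns/slns_utils.git
+```
+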
+Once all the above steps are completed, the job can be submitted.
+
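+For reference, below is a minimal sketch of how the decorator is used in
+`mnist_data_distributed.py`. The `slns` import path is assumed from the module name linked
+above, and `get_filenames` is a stand-in for a function that returns the complete list of
+training filenames:
+
+```python
+import glob
+import mpi4py.MPI
+from slns.distribution import DataDistributor
+
+def get_filenames(data_dir):
+    # Stand-in: return the complete, sorted list of training filenames
+    return sorted(glob.glob(f'{data_dir}/*'))
+
+# Decorate get_filenames so that each rank receives only the subset
+# of filenames it is to process, rather than the full list.
+dist_decorator = DataDistributor(
+    mpi_comm=mpi4py.MPI.COMM_WORLD, shutdown_on_error=True
+)
+get_rank_local_filenames = dist_decorator(get_filenames)
+```
+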
+**Note:** A maximum of eight ranks can be used to run `mnist_data_distributed.py`, as there
+are only eight training files.
\ No newline at end of file
diff --git a/horovod/keras/mnist_data_distributed.py b/horovod_data_distributed/mnist_data_distributed.py
similarity index 87%
rename from horovod/keras/mnist_data_distributed.py
rename to horovod_data_distributed/mnist_data_distributed.py
index c1632cd..b6add7e 100644
--- a/horovod/keras/mnist_data_distributed.py
+++ b/horovod_data_distributed/mnist_data_distributed.py
@@ -2,14 +2,14 @@
 # This code is licensed under MIT license (see the LICENSE file for details).
 
 """
-    This program  program distributes the partitioned MNIST data across
-    multiple MPI ranks for truly data distributed training of a shallow ANN
-    for handwritten digit classification.
+    This program distributes the partitioned MNIST data across multiple ranks
+    for truly data distributed training of a shallow ANN for handwritten digit
+    classification.
 
-    The Horovod framework is used for seamless distributed training. Instead
-    of distributing epochs, this program distributes data amongst the ranks,
-    so that each rank contributes training based on its local subset of the
-    training data.
+    The Horovod framework is used for seamless distributed training. However,
+    instead of distributing epochs, this program distributes data amongst the
+    ranks, so that each rank contributes training based on its local subset of
+    the training data.
 
 """
 
@@ -93,14 +93,15 @@ def initialize_hvd_and_mpi():
     are no conflicts between Horovod and mpi4py communicator
     initialization.
 
-    :exception: hpcns.errors.MpiInitError is raised in the case
+    :exception: slns.errors.MpiInitError is raised in the case
                 of initialization failure.
     """
 
     # Initialize Horovod.
     hvd.init()
 
-    # Pin the GPU to be used to process local rank (one GPU per process)
+    # Bind the local rank to a specific GPU, so that each rank uses
+    # a different GPU
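+    # (e.g., with four ranks and four GPUs per node, local ranks 0-3
+    # map to GPUs 0-3).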
     tf_config = tf.ConfigProto()
     tf_config.gpu_options.allow_growth = True
     tf_config.gpu_options.visible_device_list = str(hvd.local_rank())
@@ -139,14 +140,19 @@ def main():
     # Flag to indicate whether this is the MPI root
     is_root = hvd.rank() == 0
 
+    # Decorate the get_filenames function so that instead of returning
+    # a list of all filenames, it returns a list of the subset of
+    # filenames that are to be processed by the local rank.
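+    # For example, with eight training files and eight ranks, each
+    # rank receives exactly one filename.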
     dist_decorator = DataDistributor(
         mpi_comm=mpi4py.MPI.COMM_WORLD, shutdown_on_error=True
     )
     get_rank_local_filenames = dist_decorator(get_filenames)
 
+    # Data directory paths
     data_sub_dir = 'mnist/partitioned'
     data_dir = DataValidator.validated_data_dir(data_sub_dir)
 
+    # Prepare training data
     train_filenames = get_rank_local_filenames(
         f'{os.path.join(data_dir, data_sub_dir)}/train')
     x_train, y_train = load_dataset(
@@ -156,6 +162,7 @@ def main():
     x_train = x_train / 255.0
 
     if is_root:
+        # Prepare test data
         test_filenames = get_filenames(
             f'{os.path.join(data_dir, data_sub_dir)}/test')
         x_test, y_test = load_dataset(
@@ -174,7 +181,7 @@ def main():
     # Optimizer
     optimizer = tf.keras.optimizers.Adam()
 
-    # Horovod: add Horovod Distributed Optimizer.
+    # Decorate the optimizer with the Horovod Distributed Optimizer
     optimizer = hvd.DistributedOptimizer(optimizer)
 
     # Compile the model
@@ -189,9 +196,6 @@ def main():
 
     # Training callbacks
     callbacks = [
-        # Horovod: broadcast initial variable states from rank 0 to all other processes.
-        # This is necessary to ensure consistent initialization of all workers when
-        # training is started with random weights or restored from a checkpoint.
         hvd.callbacks.BroadcastGlobalVariablesCallback(0)
     ]
 
diff --git a/horovod_data_distributed/submit_job_juwels.sh b/horovod_data_distributed/submit_job_juwels.sh
new file mode 100755
index 0000000..24f50d8
--- /dev/null
+++ b/horovod_data_distributed/submit_job_juwels.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
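+# This script can be submitted with: sbatch submit_job_juwels.sh
+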
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=8
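+# Note: at most eight tasks can be used, as there are only eight training files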
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.err
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HVD_DATA_DIST
+#SBATCH --gres=gpu:4 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
+
+# Source the virtual environment
+source venv_dl_slns/bin/activate
+
+# Run the program
+srun python -u mnist_data_distributed.py
-- 
GitLab