From ba420e69e7e11e20692dd299574fe4d35baa680a Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Mon, 26 Apr 2021 15:19:58 +0200 Subject: [PATCH 1/8] Added and tested TF2 code on JUWEL Booster. --- caffe/README.md | 43 ----- .../.submit_job_jureca_python2.sh | 22 --- caffe/lenet_python/lenet_auto_solver.prototxt | 24 --- .../lenet_python/submit_job_juron_python2.sh | 17 -- .../lenet_python/submit_job_juron_python3.sh | 17 -- caffe/lenet_python/train_lenet.py | 107 ----------- caffe/mnist_cmd/.submit_job_jureca_python2.sh | 22 --- caffe/mnist_cmd/lenet_solver.prototxt | 25 --- caffe/mnist_cmd/lenet_train_test.prototxt | 168 ------------------ caffe/mnist_cmd/snapshots/.gitkeep | 0 caffe/mnist_cmd/submit_job_juron_python2.sh | 17 -- caffe/mnist_cmd/submit_job_juron_python3.sh | 17 -- horovod_data_distributed/juwels_booster_job | 28 +++ .../mnist_data_distributed.py | 18 +- requirements.txt | 61 ++++--- .../checkpoints}/.gitkeep | 0 tensorflow2/juwels_booster_job | 27 +++ tensorflow2/keras_mnist.py | 107 +++++++++++ tensorflow2/mnist.py | 109 ++++++++++++ 19 files changed, 321 insertions(+), 508 deletions(-) delete mode 100644 caffe/README.md delete mode 100755 caffe/lenet_python/.submit_job_jureca_python2.sh delete mode 100644 caffe/lenet_python/lenet_auto_solver.prototxt delete mode 100755 caffe/lenet_python/submit_job_juron_python2.sh delete mode 100755 caffe/lenet_python/submit_job_juron_python3.sh delete mode 100644 caffe/lenet_python/train_lenet.py delete mode 100755 caffe/mnist_cmd/.submit_job_jureca_python2.sh delete mode 100644 caffe/mnist_cmd/lenet_solver.prototxt delete mode 100644 caffe/mnist_cmd/lenet_train_test.prototxt delete mode 100644 caffe/mnist_cmd/snapshots/.gitkeep delete mode 100755 caffe/mnist_cmd/submit_job_juron_python2.sh delete mode 100755 caffe/mnist_cmd/submit_job_juron_python3.sh create mode 100755 horovod_data_distributed/juwels_booster_job rename {caffe/lenet_python/snapshots => tensorflow2/checkpoints}/.gitkeep (100%) create mode 100755 tensorflow2/juwels_booster_job create mode 100644 tensorflow2/keras_mnist.py create mode 100644 tensorflow2/mnist.py diff --git a/caffe/README.md b/caffe/README.md deleted file mode 100644 index 1804dce..0000000 --- a/caffe/README.md +++ /dev/null @@ -1,43 +0,0 @@ -**Caution:** Caffe is no longer being actively developed, which is why we prefer not to support -it as a system-wide module on the supercomputers for long. This is why Caffe is available with -Python 2 support only on JURECA, while it is not at all supported on JUWELS. The users are advised -to switch to other frameworks such as Tensorflow/Keras and PyTorch. - -# Notes - -There are three ways in which Caffe can be used, -1. As a command line tool with only built-in layers -2. As a library from within a Python program. Either only built-in layers can be used, -or one or more custom layers can be written in Python. -3. As a command line tool with one or more custom C++ layers. - -## Caffe as a command line tool - -The `mnist_cmd` sub-directory contains configuration and job scripts for running -Caffe as a command line tool with only built-in layers. This example represents use -case 1 as described above. The `lenet_solver.prototxt` and `lenet_train_test.prototxt` -were taken from the MNIST examples directory available in the Caffe repository -[here](https://github.com/BVLC/caffe/tree/master/examples/mnist). Minor changes have -been made just so the path to the input dataset is correct. 
The `caffe` command -in the job submission scripts can be modified as follows to run training on -all available GPUs on the node (value for the `-gpu` option has been changed from `0` to `all`): - - caffe train --solver=lenet_solver.prototxt -gpu all - -## Using Caffe within a Python program - -The `lenet_python` sub-directory contains the required files for an example of -using Caffe as a library from within a Python program. This corresponds to use case -2 as described above. The `train_lenet.py` file contains source code adapted from -the IPython notebook `01-learning-lenet.ipynb` available in the Caffe examples -[here](https://github.com/BVLC/caffe/tree/master/examples). Running this example -results in the generation of a learning curve plot in the current directory. - -## Caffe with custom C++ layers - -Working with custom C++ layers requires recompiling Caffe with the custom code. As -this is not possible with a system-wide installation, we have decided not to -include an example of this use case. Nevertheless, if you must work with custom -C++ layers and require assistance, please send an email to the JULAIN mailing list -(more information [here](https://lists.fz-juelich.de/mailman/listinfo/ml)). - diff --git a/caffe/lenet_python/.submit_job_jureca_python2.sh b/caffe/lenet_python/.submit_job_jureca_python2.sh deleted file mode 100755 index 7506925..0000000 --- a/caffe/lenet_python/.submit_job_jureca_python2.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=CAFFE_LENET_PYTHON -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/Devel-2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load Caffe/1.0-Python-2.7.15 - -# Run the program -srun python -u train_lenet.py diff --git a/caffe/lenet_python/lenet_auto_solver.prototxt b/caffe/lenet_python/lenet_auto_solver.prototxt deleted file mode 100644 index 44af3ad..0000000 --- a/caffe/lenet_python/lenet_auto_solver.prototxt +++ /dev/null @@ -1,24 +0,0 @@ -# The train/test net protocol buffer definition -train_net: "lenet_auto_train.prototxt" -test_net: "lenet_auto_test.prototxt" -# test_iter specifies how many forward passes the test should carry out. -# In the case of MNIST, we have test batch size 100 and 100 test iterations, -# covering the full 10,000 testing images. -test_iter: 100 -# Carry out testing every 500 training iterations. -test_interval: 500 -# The base learning rate, momentum and the weight decay of the network. 
-base_lr: 0.01 -momentum: 0.9 -weight_decay: 0.0005 -# The learning rate policy -lr_policy: "inv" -gamma: 0.0001 -power: 0.75 -# Display every 100 iterations -display: 100 -# The maximum number of iterations -max_iter: 10000 -# snapshot intermediate results -snapshot: 5000 -snapshot_prefix: "snapshots/lenet" diff --git a/caffe/lenet_python/submit_job_juron_python2.sh b/caffe/lenet_python/submit_job_juron_python2.sh deleted file mode 100755 index 2025a38..0000000 --- a/caffe/lenet_python/submit_job_juron_python2.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_LENET_PYTHON - -# Load the Python and Caffe modules -module load python/2.7.14 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train LeNet -python -u train_lenet.py diff --git a/caffe/lenet_python/submit_job_juron_python3.sh b/caffe/lenet_python/submit_job_juron_python3.sh deleted file mode 100755 index 7e73776..0000000 --- a/caffe/lenet_python/submit_job_juron_python3.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_LENET_PYTHON - -# Load the Python and Caffe modules -module load python/3.6.1 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train LeNet -python -u train_lenet.py diff --git a/caffe/lenet_python/train_lenet.py b/caffe/lenet_python/train_lenet.py deleted file mode 100644 index ad5cae3..0000000 --- a/caffe/lenet_python/train_lenet.py +++ /dev/null @@ -1,107 +0,0 @@ -import os -import sys -import matplotlib - -# Force matplotlib to not use any Xwindows backend. -matplotlib.use('Agg') -import pylab - -import caffe -from caffe import layers as L, params as P - -# Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - - -# Prepares network specification -def lenet(lmdb, batch_size): - # Caffe's version of LeNet: a series of linear and simple nonlinear transformations - n = caffe.NetSpec() - - n.data, n.label = L.Data(batch_size=batch_size, backend=P.Data.LMDB, source=lmdb, - transform_param=dict(scale=1. 
/ 255), ntop=2) - - n.conv1 = L.Convolution(n.data, kernel_size=5, num_output=20, weight_filler=dict(type='xavier')) - n.pool1 = L.Pooling(n.conv1, kernel_size=2, stride=2, pool=P.Pooling.MAX) - n.conv2 = L.Convolution(n.pool1, kernel_size=5, num_output=50, weight_filler=dict(type='xavier')) - n.pool2 = L.Pooling(n.conv2, kernel_size=2, stride=2, pool=P.Pooling.MAX) - n.fc1 = L.InnerProduct(n.pool2, num_output=500, weight_filler=dict(type='xavier')) - n.relu1 = L.ReLU(n.fc1, in_place=True) - n.score = L.InnerProduct(n.relu1, num_output=10, weight_filler=dict(type='xavier')) - n.loss = L.SoftmaxWithLoss(n.score, n.label) - - return n.to_proto() - - -# Names of the directories containing the LMDB files for TRAIN and TEST phases -test_dir = 'mnist/caffe/mnist_test_lmdb' -train_dir = 'mnist/caffe/mnist_train_lmdb' - -# Validated path to the data root -DataValidator.validated_data_dir(train_dir) -data_dir = DataValidator.validated_data_dir(test_dir) - -# Write the prototxt for TRAIN phase -with open('lenet_auto_train.prototxt', 'w') as f: - f.write(str(lenet(os.path.join(data_dir, train_dir), 64))) - -# Write the prototxt for TEST phase -with open('lenet_auto_test.prototxt', 'w') as f: - f.write(str(lenet(os.path.join(data_dir, test_dir), 100))) - -# Use the GPU for training -caffe.set_device(0) -caffe.set_mode_gpu() - -# Load the solver and create train and test nets -solver = None # ignore this workaround for lmdb data (can't instantiate two solvers on the same data) -solver = caffe.SGDSolver('lenet_auto_solver.prototxt') - -solver.net.forward() # train net -solver.test_nets[0].forward() # test net (there can be more than one) - -niter = 200 -test_interval = 25 -# losses will also be stored in the log -train_loss = pylab.zeros(niter) -test_acc = pylab.zeros(int(pylab.ceil(niter / test_interval))) -output = pylab.zeros((niter, 8, 10)) - -# the main solver loop -for it in range(niter): - solver.step(1) # SGD by Caffe - - # store the train loss - train_loss[it] = solver.net.blobs['loss'].data - - # store the output on the first test batch - # (start the forward pass at conv1 to avoid loading new data) - solver.test_nets[0].forward(start='conv1') - output[it] = solver.test_nets[0].blobs['score'].data[:8] - - # run a full test every so often - # (Caffe can also do this for us and write to a log, but we show here - # how to do it directly in Python, where more complicated things are easier.) - if it % test_interval == 0: - print('Iteration', it, 'testing...') - correct = 0 - for test_it in range(100): - solver.test_nets[0].forward() - correct += sum(solver.test_nets[0].blobs['score'].data.argmax(1) - == solver.test_nets[0].blobs['label'].data) - test_acc[it // test_interval] = correct / 1e4 - -# Plot the training curve -_, ax1 = pylab.subplots() -ax2 = ax1.twinx() -ax1.plot(pylab.arange(niter), train_loss) -ax2.plot(test_interval * pylab.arange(len(test_acc)), test_acc, 'r') -ax1.set_xlabel('iteration') -ax1.set_ylabel('train loss') -ax2.set_ylabel('test accuracy') -ax2.set_title('Test Accuracy: {:.2f}'.format(test_acc[-1])) - -# Save the plot to file. 
Use "bbox_inches='tight'" to remove surrounding whitespace -pylab.savefig('learning_curve.png', bbox_inches='tight') diff --git a/caffe/mnist_cmd/.submit_job_jureca_python2.sh b/caffe/mnist_cmd/.submit_job_jureca_python2.sh deleted file mode 100755 index 029520e..0000000 --- a/caffe/mnist_cmd/.submit_job_jureca_python2.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=CAFFE_MNIST_CMD -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/Devel-2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load Caffe/1.0-Python-2.7.15 - -# Train the model using the 'caffe' binary -srun caffe train --solver=lenet_solver.prototxt -gpu 0 \ No newline at end of file diff --git a/caffe/mnist_cmd/lenet_solver.prototxt b/caffe/mnist_cmd/lenet_solver.prototxt deleted file mode 100644 index 103b2e7..0000000 --- a/caffe/mnist_cmd/lenet_solver.prototxt +++ /dev/null @@ -1,25 +0,0 @@ -# The train/test net protocol buffer definition -net: "lenet_train_test.prototxt" -# test_iter specifies how many forward passes the test should carry out. -# In the case of MNIST, we have test batch size 100 and 100 test iterations, -# covering the full 10,000 testing images. -test_iter: 100 -# Carry out testing every 500 training iterations. -test_interval: 500 -# The base learning rate, momentum and the weight decay of the network. -base_lr: 0.01 -momentum: 0.9 -weight_decay: 0.0005 -# The learning rate policy -lr_policy: "inv" -gamma: 0.0001 -power: 0.75 -# Display every 100 iterations -display: 100 -# The maximum number of iterations -max_iter: 10000 -# snapshot intermediate results -snapshot: 5000 -snapshot_prefix: "snapshots/lenet" -# solver mode: CPU or GPU -solver_mode: GPU diff --git a/caffe/mnist_cmd/lenet_train_test.prototxt b/caffe/mnist_cmd/lenet_train_test.prototxt deleted file mode 100644 index f34ab71..0000000 --- a/caffe/mnist_cmd/lenet_train_test.prototxt +++ /dev/null @@ -1,168 +0,0 @@ -name: "LeNet" -layer { - name: "mnist" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - scale: 0.00390625 - } - data_param { - source: "../../datasets/mnist/caffe/mnist_train_lmdb" - batch_size: 64 - backend: LMDB - } -} -layer { - name: "mnist" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - scale: 0.00390625 - } - data_param { - source: "../../datasets/mnist/caffe/mnist_test_lmdb" - batch_size: 100 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 20 - kernel_size: 5 - stride: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "conv1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 2 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 50 - kernel_size: 5 - stride: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "conv2" - top: "pool2" - 
pooling_param { - pool: MAX - kernel_size: 2 - stride: 2 - } -} -layer { - name: "ip1" - type: "InnerProduct" - bottom: "pool2" - top: "ip1" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - inner_product_param { - num_output: 500 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "ip1" - top: "ip1" -} -layer { - name: "ip2" - type: "InnerProduct" - bottom: "ip1" - top: "ip2" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - inner_product_param { - num_output: 10 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "accuracy" - type: "Accuracy" - bottom: "ip2" - bottom: "label" - top: "accuracy" - include { - phase: TEST - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "ip2" - bottom: "label" - top: "loss" -} diff --git a/caffe/mnist_cmd/snapshots/.gitkeep b/caffe/mnist_cmd/snapshots/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/caffe/mnist_cmd/submit_job_juron_python2.sh b/caffe/mnist_cmd/submit_job_juron_python2.sh deleted file mode 100755 index b5ee63c..0000000 --- a/caffe/mnist_cmd/submit_job_juron_python2.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_MNIST_CMD - -# Load the Python and Caffe modules -module load python/2.7.14 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train a model for MNIST -caffe train --solver=lenet_solver.prototxt -gpu 0 \ No newline at end of file diff --git a/caffe/mnist_cmd/submit_job_juron_python3.sh b/caffe/mnist_cmd/submit_job_juron_python3.sh deleted file mode 100755 index bdac4a2..0000000 --- a/caffe/mnist_cmd/submit_job_juron_python3.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_MNIST_CMD - -# Load the Python and Caffe modules -module load python/3.6.1 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train a model for MNIST -caffe train --solver=lenet_solver.prototxt -gpu 0 diff --git a/horovod_data_distributed/juwels_booster_job b/horovod_data_distributed/juwels_booster_job new file mode 100755 index 0000000..803e764 --- /dev/null +++ b/horovod_data_distributed/juwels_booster_job @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=booster + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/mnist_data_distributed.py b/horovod_data_distributed/mnist_data_distributed.py index d4c68c1..b2335a8 100644 --- a/horovod_data_distributed/mnist_data_distributed.py +++ b/horovod_data_distributed/mnist_data_distributed.py @@ -20,7 +20,6 @@ import mpi4py import numpy as np import tensorflow as tf import 
horovod.tensorflow.keras as hvd -from tensorflow.python.keras import backend as K from hpc4neuro.errors import MpiInitError from hpc4neuro.distribution import DataDistributor @@ -102,10 +101,14 @@ def initialize_hvd_and_mpi(): # Bind the local rank to a specific GPU, so that each rank uses # a different GPU - tf_config = tf.ConfigProto() - tf_config.gpu_options.allow_growth = True - tf_config.gpu_options.visible_device_list = str(hvd.local_rank()) - K.set_session(tf.Session(config=tf_config)) + gpus = tf.config.experimental.list_physical_devices('GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + if gpus: + tf.config.experimental.set_visible_devices( + gpus[hvd.local_rank()], + 'GPU' + ) # Verify that MPI multi-threading is supported. Horovod cannot work # with mpi4py (or any other MPI library) otherwise. @@ -113,8 +116,9 @@ def initialize_hvd_and_mpi(): # https://www.mcs.anl.gov/research/projects/mpi/mpi-standard/mpi-report-2.0/node163.htm#Node163 if not hvd.mpi_threads_supported(): raise MpiInitError( - 'MPI multi-threading is not supported. Horovod cannot work with mpi4py' - 'in this case. Please enable MPI multi-threading and try again.' + 'MPI multi-threading is not supported. Horovod cannot work with ' + 'mpi4py in this case. Please enable MPI multi-threading and try ' + 'again.' ) # Disable automatic MPI initialization on importing mpi4py.MPI, diff --git a/requirements.txt b/requirements.txt index 79144dc..6a4def7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,24 +1,41 @@ -absl-py==0.8.0 -astor==0.8.0 -cffi==1.12.3 -cloudpickle==1.2.1 -gast==0.3.1 -grpcio==1.23.0 +absl-py==0.12.0 +astunparse==1.6.3 +cachetools==4.2.1 +certifi==2020.12.5 +cffi==1.14.5 +chardet==4.0.0 +cloudpickle==1.6.0 +gast==0.3.3 +google-auth==1.29.0 +google-auth-oauthlib==0.4.4 +google-pasta==0.2.0 +grpcio==1.37.0 h5py==2.10.0 -Markdown==3.1.1 -mock==3.0.5 -mpi4py==3.0.2 -numpy==1.17.2 -protobuf==3.9.1 -psutil==5.6.3 -pycparser==2.19 -six==1.12.0 -Werkzeug==0.15.6 -Keras-Applications==1.0.8 -Keras-Preprocessing==1.1.0 -tensorboard==1.13.1 -tensorflow-estimator==1.13.0 -tensorflow-gpu==1.13.1 +horovod==0.20.3 +hpc4neuro @ git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git@57a560b4085dba2ba3262d4d3238ef70991be877 +idna==2.10 +Keras-Preprocessing==1.1.2 +Markdown==3.3.4 +mpi4py==3.0.3 +numpy==1.18.5 +oauthlib==3.1.0 +opt-einsum==3.3.0 +protobuf==3.15.8 +psutil==5.8.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycparser==2.20 +PyYAML==5.4.1 +requests==2.25.1 +requests-oauthlib==1.3.0 +rsa==4.7.2 +six==1.15.0 +tensorboard==2.5.0 +tensorboard-data-server==0.6.0 +tensorboard-plugin-wit==1.8.0 +tensorflow==2.3.1 +tensorflow-estimator==2.3.0 termcolor==1.1.0 -keras==2.3.1 -horovod==0.16.2 \ No newline at end of file +urllib3==1.26.4 +Werkzeug==1.0.1 +wrapt==1.12.1 diff --git a/caffe/lenet_python/snapshots/.gitkeep b/tensorflow2/checkpoints/.gitkeep similarity index 100% rename from caffe/lenet_python/snapshots/.gitkeep rename to tensorflow2/checkpoints/.gitkeep diff --git a/tensorflow2/juwels_booster_job b/tensorflow2/juwels_booster_job new file mode 100755 index 0000000..625afac --- /dev/null +++ b/tensorflow2/juwels_booster_job @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=booster + +# Load the required modules +module load 
GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow2/keras_mnist.py b/tensorflow2/keras_mnist.py new file mode 100644 index 0000000..e444560 --- /dev/null +++ b/tensorflow2/keras_mnist.py @@ -0,0 +1,107 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import sys + +import tensorflow as tf +import horovod.tensorflow.keras as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +(mnist_images, mnist_labels), _ = \ + tf.keras.datasets.mnist.load_data(dataset_file) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), + tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation='softmax') +]) + +# Horovod: adjust learning rate based on number of GPUs. +scaled_lr = 0.001 * hvd.size() +opt = tf.optimizers.Adam(scaled_lr) + +# Horovod: add Horovod DistributedOptimizer. +opt = hvd.DistributedOptimizer(opt) + +# Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow +# uses hvd.DistributedOptimizer() to compute gradients. +mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(), + optimizer=opt, + metrics=['accuracy'], + experimental_run_tf_function=False) + +callbacks = [ + # Horovod: broadcast initial variable states from rank 0 to all other processes. 
+ # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + hvd.callbacks.BroadcastGlobalVariablesCallback(0), + + # Horovod: average metrics among workers at the end of every epoch. + # + # Note: This callback must be in the list before the ReduceLROnPlateau, + # TensorBoard or other metrics-based callbacks. + hvd.callbacks.MetricAverageCallback(), + + # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final + # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during + # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. + hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1), +] + +# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. +if hvd.rank() == 0: + callbacks.append(tf.keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) + +# Horovod: write logs on worker 0. +verbose = 1 if hvd.rank() == 0 else 0 + +# Train the model. +# Horovod: adjust number of steps based on number of GPUs. +mnist_model.fit(dataset, steps_per_epoch=50 // hvd.size(), callbacks=callbacks, epochs=10, verbose=verbose) \ No newline at end of file diff --git a/tensorflow2/mnist.py b/tensorflow2/mnist.py new file mode 100644 index 0000000..53cb1da --- /dev/null +++ b/tensorflow2/mnist.py @@ -0,0 +1,109 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import sys + +import tensorflow as tf +import horovod.tensorflow as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. 
+hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +(mnist_images, mnist_labels), _ = \ + tf.keras.datasets.mnist.load_data(dataset_file) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), + tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation='softmax') +]) +loss = tf.losses.SparseCategoricalCrossentropy() + +# Horovod: adjust learning rate based on number of GPUs. +opt = tf.optimizers.Adam(0.001 * hvd.size()) + +checkpoint_dir = 'checkpoints/' +checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) + + +@tf.function +def training_step(images, labels, first_batch): + with tf.GradientTape() as tape: + probs = mnist_model(images, training=True) + loss_value = loss(labels, probs) + + # Horovod: add Horovod Distributed GradientTape. + tape = hvd.DistributedGradientTape(tape) + + grads = tape.gradient(loss_value, mnist_model.trainable_variables) + opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) + + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + # + # Note: broadcast should be done after the first gradient step to ensure optimizer + # initialization. + if first_batch: + hvd.broadcast_variables(mnist_model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) + + return loss_value + + +# Horovod: adjust number of steps based on number of GPUs. +for batch, (images, labels) in enumerate(dataset.take(1000 // hvd.size())): + loss_value = training_step(images, labels, batch == 0) + + if batch % 10 == 0 and hvd.local_rank() == 0: + print('Step #%d\tLoss: %.6f' % (batch, loss_value)) + +# Horovod: save checkpoints only on worker 0 to prevent other workers from +# corrupting it. +if hvd.rank() == 0: + checkpoint.save(checkpoint_dir) -- GitLab From 315aca15e05fbd95df2a1c66dab09e0089f7c016 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Mon, 26 Apr 2021 19:53:57 +0200 Subject: [PATCH 2/8] The first complete tutorial overhaul with Tensorflow2. 
--- .gitattributes | 4 - .gitignore | 3 + README.md | 335 ++++++-------- datasets/mnist/caffe/mnist_test_lmdb/data.mdb | 3 - datasets/mnist/caffe/mnist_test_lmdb/lock.mdb | 3 - .../mnist/caffe/mnist_train_lmdb/data.mdb | 3 - .../mnist/caffe/mnist_train_lmdb/lock.mdb | 3 - horovod/README.md | 35 -- horovod/keras/mnist.py | 116 ----- horovod/keras/mnist_advanced.py | 149 ------- horovod/keras/run_on_localMachine.sh | 8 - horovod/keras/submit_job_jureca.sh | 22 - horovod/keras/submit_job_juron.sh | 20 - horovod/keras/submit_job_juwels.sh | 22 - horovod/tensorflow/checkpoints/.gitkeep | 0 horovod/tensorflow/mnist.py | 159 ------- horovod/tensorflow/mnist_estimator.py | 214 --------- horovod/tensorflow/run_on_localMachine.sh | 8 - horovod/tensorflow/submit_job_jureca.sh | 22 - horovod/tensorflow/submit_job_juron.sh | 19 - horovod/tensorflow/submit_job_juwels.sh | 22 - horovod/tensorflow/synthetic_benchmark.py | 120 ----- horovod_data_distributed/README.md | 33 -- horovod_data_distributed/setup_juron.sh | 24 - horovod_data_distributed/submit_job_jureca.sh | 22 - horovod_data_distributed/submit_job_juron.sh | 28 -- horovod_data_distributed/submit_job_juwels.sh | 22 - keras/README.md | 13 - keras/mnist.py | 93 ---- keras/run_on_localMachine.sh | 4 - keras/submit_job_jureca.sh | 20 - keras/submit_job_juron.sh | 18 - keras/submit_job_juwels.sh | 20 - tensorflow/README.md | 21 +- .../keras => tensorflow}/checkpoints/.gitkeep | 0 tensorflow/jureca_job.sh | 24 + tensorflow/jusuf_job.sh | 24 + tensorflow/juwels_booster_job.sh | 24 + tensorflow/juwels_job.sh | 24 + {tensorflow2 => tensorflow}/keras_mnist.py | 2 +- tensorflow/mnist.py | 415 +++++------------- tensorflow/run_on_localMachine.sh | 4 - tensorflow/submit_job_jureca.sh | 19 - tensorflow/submit_job_juron.sh | 17 - tensorflow/submit_job_juwels.sh | 19 - tensorflow2/checkpoints/.gitkeep | 0 tensorflow2/mnist.py | 109 ----- training_data_distribution/README.md | 27 ++ .../jureca_job.sh | 5 +- training_data_distribution/jusuf_job.sh | 28 ++ .../juwels_booster_job.sh | 28 ++ .../juwels_job.sh | 2 +- .../mnist_data_distributed.py | 0 utils/data_utils.py | 10 +- 54 files changed, 439 insertions(+), 1950 deletions(-) delete mode 100644 datasets/mnist/caffe/mnist_test_lmdb/data.mdb delete mode 100644 datasets/mnist/caffe/mnist_test_lmdb/lock.mdb delete mode 100644 datasets/mnist/caffe/mnist_train_lmdb/data.mdb delete mode 100644 datasets/mnist/caffe/mnist_train_lmdb/lock.mdb delete mode 100644 horovod/README.md delete mode 100644 horovod/keras/mnist.py delete mode 100644 horovod/keras/mnist_advanced.py delete mode 100644 horovod/keras/run_on_localMachine.sh delete mode 100755 horovod/keras/submit_job_jureca.sh delete mode 100755 horovod/keras/submit_job_juron.sh delete mode 100755 horovod/keras/submit_job_juwels.sh delete mode 100644 horovod/tensorflow/checkpoints/.gitkeep delete mode 100644 horovod/tensorflow/mnist.py delete mode 100644 horovod/tensorflow/mnist_estimator.py delete mode 100644 horovod/tensorflow/run_on_localMachine.sh delete mode 100755 horovod/tensorflow/submit_job_jureca.sh delete mode 100644 horovod/tensorflow/submit_job_juron.sh delete mode 100755 horovod/tensorflow/submit_job_juwels.sh delete mode 100644 horovod/tensorflow/synthetic_benchmark.py delete mode 100644 horovod_data_distributed/README.md delete mode 100755 horovod_data_distributed/setup_juron.sh delete mode 100755 horovod_data_distributed/submit_job_jureca.sh delete mode 100755 horovod_data_distributed/submit_job_juron.sh delete mode 100755 
horovod_data_distributed/submit_job_juwels.sh delete mode 100644 keras/README.md delete mode 100644 keras/mnist.py delete mode 100644 keras/run_on_localMachine.sh delete mode 100755 keras/submit_job_jureca.sh delete mode 100644 keras/submit_job_juron.sh delete mode 100755 keras/submit_job_juwels.sh rename {horovod/keras => tensorflow}/checkpoints/.gitkeep (100%) create mode 100755 tensorflow/jureca_job.sh create mode 100755 tensorflow/jusuf_job.sh create mode 100755 tensorflow/juwels_booster_job.sh create mode 100755 tensorflow/juwels_job.sh rename {tensorflow2 => tensorflow}/keras_mnist.py (97%) delete mode 100644 tensorflow/run_on_localMachine.sh delete mode 100755 tensorflow/submit_job_jureca.sh delete mode 100644 tensorflow/submit_job_juron.sh delete mode 100755 tensorflow/submit_job_juwels.sh delete mode 100644 tensorflow2/checkpoints/.gitkeep delete mode 100644 tensorflow2/mnist.py create mode 100644 training_data_distribution/README.md rename tensorflow2/juwels_booster_job => training_data_distribution/jureca_job.sh (83%) create mode 100755 training_data_distribution/jusuf_job.sh create mode 100755 training_data_distribution/juwels_booster_job.sh rename horovod_data_distributed/juwels_booster_job => training_data_distribution/juwels_job.sh (95%) rename {horovod_data_distributed => training_data_distribution}/mnist_data_distributed.py (100%) diff --git a/.gitattributes b/.gitattributes index dbf6f0e..36df28f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,7 +1,3 @@ -datasets/mnist/caffe/mnist_test_lmdb/data.mdb filter=lfs diff=lfs merge=lfs -text -datasets/mnist/caffe/mnist_test_lmdb/lock.mdb filter=lfs diff=lfs merge=lfs -text -datasets/mnist/caffe/mnist_train_lmdb/data.mdb filter=lfs diff=lfs merge=lfs -text -datasets/mnist/caffe/mnist_train_lmdb/lock.mdb filter=lfs diff=lfs merge=lfs -text datasets/mnist/keras/mnist.npz filter=lfs diff=lfs merge=lfs -text datasets/mnist/pytorch/data/processed/training.pt filter=lfs diff=lfs merge=lfs -text datasets/mnist/pytorch/data/processed/test.pt filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 9c4d6d5..05043c3 100644 --- a/.gitignore +++ b/.gitignore @@ -118,3 +118,6 @@ mnist_convnet_model/ # Error and output files from the supercomputers *.er *.out + +# MacOS +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md index 0990dfd..7ac59ff 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,46 @@ # Getting started with Deep Learning on Supercomputers -This repository is intended to serve as a tutorial for anyone interested in utilizing the supercomputers -available at the Jülich Supercomputing Center (JSC) for deep learning based projects. It is assumed that -the reader is proficient in one or more of the following frameworks: +This repository is intended to serve as a tutorial for anyone interested in +utilizing the supercomputers available at the Jülich Supercomputing Center (JSC) +for deep learning based projects. It is assumed that the reader is proficient in +the following frameworks: * [Tensorflow](https://www.tensorflow.org/) -* [Keras](https://keras.io/) * [Horovod](https://github.com/horovod/horovod) -* [Caffe](http://caffe.berkeleyvision.org/) (limited support) -**Note:** This tutorial is by no means intended as an introduction to deep learning, or to any of the -above mentioned frameworks. If you are interested in educational resources for beginners, please -visit [this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/wikis/Education) page. 
+**Note:** This tutorial is by no means intended as an introduction to deep +learning, or to any of the above mentioned frameworks. If you are interested in +educational resources for beginners, please visit +[this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/-/wikis/home) +page. ### Announcements -* **November 28, 2019:** Slides and code samples for the "Deep Learning on Supercomputers" talk given -as part of the [Introduction to the programming and usage of the supercomputer resources at Jülich](https://www.fz-juelich.de/SharedDocs/Termine/IAS/JSC/EN/courses/2019/supercomputer-2019-11.html?nn=944302) -course are now available in the `course_material` directory. -* **November 22, 2019:** Samples for Caffe are no longer supported on JURECA due to system-wide -MVAPICH2 module changes. -* **November 18, 2019:** The `horovod_data_distributed` directory has been added that contains code -samples to illustrate proper data-distributed training with Horovod, i.e., a distribution mechanism -where the training data is distributed instead of epochs. Further information is available in the -directory-local `README.md`. -* **September 02, 2019:** Even though PyTorch is available as a system-wide module on the JSC supercomputers, all PyTorch -examples have been removed from this tutorial. This is due to the fact that the tutorial -developers are not currently working with PyTorch, and are therefore not in a position to provide -support for PyTorch related issues. +* **April 26, 2021:** The tutorial has been updated to use Tensorflow2. Also, + code samples and datasets that are no longer relevant, e.g., those for Caffe, + have been removed. +* **November 28, 2019:** Slides and code samples for the "Deep Learning on + Supercomputers" talk given as part of the [Introduction to the programming + and usage of the supercomputer resources at Jülich]( + https://www.fz-juelich.de/SharedDocs/Termine/IAS/JSC/EN/courses/2019/supercomputer-2019-11.html?nn=944302) + course are now available in the `course_material` directory. +* **November 22, 2019:** Samples for Caffe are no longer supported on JURECA + due to system-wide MVAPICH2 module changes. +* **November 18, 2019:** The `horovod_data_distributed` directory has been + added that contains code samples to illustrate proper data-distributed + training with Horovod, i.e., a distribution mechanism where the training data + is distributed instead of epochs. Further information is available in the + directory-local `README.md`. +* **September 02, 2019:** Even though PyTorch is available as a system-wide + module on the JSC supercomputers, all PyTorch examples have been removed from + this tutorial. This is due to the fact that the tutorial developers are not + currently working with PyTorch, and are therefore not in a position to + provide support for PyTorch related issues. * **August 23, 2019:** - * Tensorflow and Keras examples (with and without Horovod) are now fully functional on JUWELS as well. - * Python 2 support has been removed from the tutorial for all frameworks except Caffe. + * Tensorflow and Keras examples (with and without Horovod) are now fully + functional on JUWELS as well. + * Python 2 support has been removed from the tutorial for all frameworks + except Caffe. # Table of contents <!-- TOC --> @@ -38,133 +48,97 @@ support for PyTorch related issues. 1. [A word regarding the code samples](#1-a-word-regarding-the-code-samples) 2. 
[Changes made to support loading of pre-downloaded datasets](#2-changes-made-to-support-loading-of-pre-downloaded-datasets) 3. [Applying for user accounts on supercomputers](#3-applying-for-user-accounts-on-supercomputers) - * [3.1. JURECA and JUWELS](#31-jureca-and-juwels) - * [3.2. JURON](#32-juron) 4. [Logging on to the supercomputers](#4-logging-on-to-the-supercomputers) - * [4.1. JURECA and JUWELS](#41-jureca-and-juwels) - * [4.2. JURON](#42-juron) 5. [Cloning the repository](#5-cloning-the-repository) - * [5.1. JURECA and JUWELS](#51-jureca-and-juwels) - * [5.2. JURON](#52-juron) 6. [Running a sample](#6-running-a-sample) - * [6.1. JURECA and JUWELS](#61-jureca-and-juwels) - * [6.2. JURON](#62-juron) -7. [Python 2 support](#7-python-2-support) -8. [Distributed training](#8-distributed-training) -9. [Credits](#9-credits) +7. [Distributed training](#8-distributed-training) +8. [Credits](#9-credits) <!-- /TOC --> ## 1. A word regarding the code samples -Samples for each framework are available in the correspondingly named directory. Each such -directory typically contains at least one code sample, which trains a simple artificial neural -network on the canonical MNIST hand-written digit classification task. Moreover, job submission -scripts are included for all the supercomputers on which this tutorial has been tested. The job -scripts will hopefully make it easier to figure out which modules to load. Finally, -a `README.md` file contains further information about the contents of the directory. +Samples for each framework are available in the correspondingly named directory. +Each such directory typically contains at least one code sample, which trains a +simple artificial neural network on the canonical MNIST hand-written digit +classification task. Moreover, job submission scripts are included for all the +supercomputers on which this tutorial has been tested. The job scripts will +hopefully make it easier to figure out which modules to load. Finally, a +`README.md` file contains further information about the contents of the +directory. -**Disclaimer:** Neither are the samples intended to serve as examples of optimized code, nor do these -represent programming best practices. +**Disclaimer:** Neither are the samples intended to serve as examples of +optimized code, nor do these represent programming best practices. ## 2. Changes made to support loading of pre-downloaded datasets -It is worth mentioning that all the code samples were taken from the corresponding framework's -official samples/tutorials repository, as practitioners are likely familiar with these (links -to the original code samples are included in the directory-local `README.md`). However, the -original examples are designed to automatically download the required dataset in a -framework-defined directory. This is not a feasible option while working with supercomputers as compute nodes -do not have access to the Internet. Therefore, the samples have been slightly modified to load data from -the `datasets` directory included in this repository; specific code changes, at least for now, -have been marked by comments prefixed with the `[HPCNS]` tag. For more information see the `README.md` -available in the `datasets` directory. +It is worth mentioning that all the code samples were taken from the +corresponding framework's official samples/tutorials repository, as +practitioners are likely familiar with these (links to the original code samples +are included in the directory-local `README.md`). 
However, the original examples +are designed to automatically download the required dataset in a +framework-defined directory. This is not a feasible option while working with +supercomputers as compute nodes do not have access to the Internet. Therefore, +the samples have been slightly modified to load data from the `datasets` +directory included in this repository; specific code changes, at least for now, +have been marked by comments prefixed with the `[HPCNS]` tag. For more +information see the `README.md` available in the `datasets` directory. ## 3. Applying for user accounts on supercomputers -In case you do not already have an account on your supercomputer of interest, please take a look at the -instructions provided in the following sub-sections. - -### 3.1 JURECA and JUWELS - -For more information on getting accounts on JURECA and JUWELS, click -[here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/ComputingTime/computingTime_node.html). - -### 3.2 JURON - -To get a user account on JURON, please follow the steps below: - -1. Write an email to [Dirk Pleiter](http://www.fz-juelich.de/SharedDocs/Personen/IAS/JSC/EN/staff/pleiter_d.html?nn=362224), -in which please introduce yourself and mention why you need the account. -2. Apply for the account via the [JuDoor](https://dspserv.zam.kfa-juelich.de/judoor/login) portal -(more information about JuDoor is available [here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/JuDoor.html?nn=945700)). -If your work is related to the Human Brain Project (HBP), please join the `PCP0` and `CPCP0` projects. -Otherwise please join the `PADC` and `CPADC` projects. +In case you do not already have an account on your supercomputer of interest, +please refer to the instructions available [here]( +http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/ComputingTime/computingTime_node.html), +as you will need to apply for computing time before an account is created for you. ## 4. Logging on to the supercomputers -**Note:** From here on it is assumed that you already have an account on your required supercomputer. +**Note:** From here on it is assumed that you already have an account on your +required supercomputer. -### 4.1 JURECA and JUWELS +**Note:** This tutorial is supported for the following supercomputers: JURECA, +JUWELS, JUWELS Booster, and JUSUF. Following are the steps required to login (more information: [JURECA](https://apps.fz-juelich.de/jsc/hps/jureca/access.html#access), -[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/access.html#access)). +[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/access.html#access), +[JUSUF](https://apps.fz-juelich.de/jsc/hps/jusuf/cluster/access.html)). + +For the purpose of this tutorial, we will assume that our system of interest is +JURECA. If you intend to use a different system, you can simply replace the +system name in the commands below; the procedure is precisely the same for all +machines. -1. Use SSH to login. Use one of the following commands, depending on your target system: +1. Use SSH to login: - `ssh <username>@jureca.fz-juelich.de` or `ssh <username>@juwels.fz-juelich.de` + `ssh -i ~/.ssh/<keyfile> <username>@jureca.fz-juelich.de` 2. Upon successful login, activate your project environment: `jutil env activate -p <name of compute project> -A <name of budget>` - **Note:** To view a list of all project and budget names available to you, please use the following command: - `jutil user projects -o columns`. 
Each name under the column titled "project" has a corresponding type under the - column titled "project-type". All projects with "project-type" "C" are compute projects, and - can be used in the `<name of compute project>` field for the command above. The `<name of budget>` field should then - contain the corresponding name under the "budgets" column. Please click [here]( + **Note:** To view a list of all project and budget names available to you, + please use the following command: `jutil user projects -o columns`. Each + name under the column titled "project" has a corresponding type under the + column titled "project-type". All projects with "project-type" "C" are + compute projects, and can be used in the `<name of compute project>` field + for the command above. The `<name of budget>` field should then contain the + corresponding name under the "budgets" column. Please click [here]( http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/NewUsageModel_node.html) for more information. 3. Change to the project directory: `cd $PROJECT` -You should be in your project directory at this point. As the project directory is shared with other project -members, it is recommended to create a new directory with your username, and change to that directory. If -you'd like to clone this repository elsewhere, please change to the directory of your choice. - -### 4.2 JURON - -Following are the steps required to login. - -1. Use SSH to login: - - `ssh <username>@juron.fz-juelich.de` -2. Upon successful login, activate your project environment (more information -[here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/NewUsageModel_node.html)). - - `jutil env activate -p <name of compute project>` - - The `<name of compute project>` can be either `CPCP0` or `CPADC`, depending on whether you are a member - of `CPCP0` or `CPADC` (to view a list of all project names available to you, please use the following - command: `jutil user projects -o columns`). Note that as opposed to the corresponding section on JURECA, - the `<name of budget>` is not included. This is because the `CPCP0` and `CPADC` projects do not support - accounting. -3. Change to the project directory: - - `cd $PROJECT` - -You should be in your project directory at this point. As the `CPCP0` and `CPADC` project directories -are shared amongst many users from different institutes and organizations, it is recommended to create -a personal directory (named after your username) withing the project directory. You can then use your -personal directory for all your work, including cloning this tutorial. +You should be in your project directory at this point. As the project directory +is shared with other project members, it is recommended to create a new +directory with your username, and change to that directory. If you'd like to +clone this repository elsewhere, please change to the directory of your choice. ## 5. Cloning the repository -In order to store the datasets within the repository, we use Git LFS. This makes cloning the -repository a little bit different. Please find below the instructions on how to clone on different -systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-juelich.de/lfs/). - -### 5.1 JURECA and JUWELS +In order to store the datasets within the repository, we use Git LFS. This makes +cloning the repository slightly different. Please find below the instructions +on how to clone the repository. 
To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-juelich.de/lfs/). 1. Load the Git LFS module: @@ -176,107 +150,66 @@ systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-ju `git lfs clone https://gitlab.version.fz-juelich.de/hpc4ns/dl_on_supercomputers.git` -### 5.2 JURON - -The process is simpler on JURON. You can simply clone the repository along with the datasets using -the following command: - - git lfs clone https://gitlab.version.fz-juelich.de/hpc4ns/dl_on_supercomputers.git - ## 6. Running a sample -Let us consider a scenario where you would like to run the `mnist.py` sample available in the `keras` -directory. This sample trains a CNN on MNIST using Keras on a single GPU. The following sub-sections list -the steps required for different supercomputers. - -### 6.1 JURECA and JUWELS +Let us consider a scenario where you would like to run the `keras_mnist.py` +sample available in the `tensorflow` directory. This sample trains a CNN on +MNIST using Tensorflow's Keras API. Following steps can be used to run the +sample: 1. Change directory to the repository root: `cd dl_on_supercomputers` -2. Change to the keras sub-directory: +2. Change to the tensorflow sub-directory: - `cd keras` + `cd tensorflow` 3. Submit the job to run the sample: - `sbatch submit_job_jureca.sh` or `sbatch submit_job_juwels.sh` + `sbatch jureca_job.sh` -That's it; this is all you need for job submission. If you'd like to receive email notifications -regarding the status of the job, add the following statement to the "SLURM job configuration" -block in the `submit_job_jureca.sh` (or `submit_job_juwels.sh`) script (replace `<your email address here>` with your -email address). +That's it; this is all you need for job submission. If you'd like to receive +email notifications regarding the status of the job, add the following statement +to the "SLURM job configuration" block in the `jureca_job.sh`script (replace +`<your email address here>` with your email address). #SBATCH --mail-user=<your email address here> -Output from the job is available in the `error` and `output` files, as specified in the job -configuration. - -**Note:** In the job submission scripts, the `--partition` value is set to `develgpus`, as jobs -are often (but not always) scheduled faster on this partition than the `gpus` partition. However, -resources in `develgpus` are limited -(as described in: [JURECA](https://apps.fz-juelich.de/jsc/hps/jureca/quickintro.html#available-partitions), -[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/quickintro.html#available-partitions)). Therefore, -it is highly recommended that users familiarize themselves with the limitations, and use the `gpus` -partition for all production use, as well as when developing/testing with more resources than are -available on the `develgpus` partition. - -### 6.2 JURON - -1. Change directory to the repository root: - - `cd dl_on_supercomputers` -2. Change to the keras sub-directory: - - `cd keras` -3. Submit the job to run the sample: - - `bsub < submit_job_juron.sh` - -Please note that unlike JURECA and JUWELS, JURON uses LSF for job submission, which is why a different -syntax is required for job configuration and submission. Moreover, email notifications are not -supported on JURON. For more information on how to use LSF on JURON, use the following command: - - man 7 juron-lsf - -Output from the job is available in the `error` and `output` files, as specified in the job -configuration. - -## 7. 
Python 2 support - -As the official support for Python 2 will be be discontinued in 2020, we decided to encourage our -users to make the switch to Python 3 already. This also enables us to provide better support for -Python 3 based modules, as we no longer have to spend time maintaining Python 2 modules. - -The only exception is Caffe, as on JURECA it is available with Python 2 only. Please note however that -other than on JURON, Caffe is only available in the JURECA Stage 2018b, i.e., one of the previous stages. -We do not intend to provide support for Caffe from Stage 2019a and onward. This is due to the fact that -Caffe is no longer being developed. - -## 8. Distributed training - -[Horovod](https://github.com/horovod/horovod) provides a simple and efficient solution for -training artificial neural networks on multiple GPUs across multiple nodes in a cluster. It can -be used with Tensorflow and Keras (some other frameworks are supported as well, but -not Caffe). In this repository, the `horovod` directory contains further sub-directories; one -for each compatible framework that has been tested. E.g., there is a `keras` sub-directory that -contains samples that utilize distributed training with Keras and Horovod (more information is available -in the directory-local `README.md`). - -Please note that Horovod currently only supports a distribution strategy where the entire model is -replicated on every GPU. It is the data that is distributed across the GPUs. If you are interested -in model-parallel training, where the model itself can be split and distributed, a different -solution is required. We hope to add a sample for model-parallel training at a later time. - -Caffe does not support multi-node training. However, it has built-in support for [multi-GPU -training](https://github.com/BVLC/caffe/blob/master/docs/multigpu.md) on a single node (only -via the C/C++ interface). The `mnist_cmd` sample in the `caffe` directory contains the job -script that can be used to train the model on multiple GPUs. Please see the -directory-local `README.md` for further information. - -## 9. Credits +Output from the job is available in the `error` and `output` files, as specified +in the job configuration. + +**Note:** Please note that the job scripts for all systems are almost exactly +the same, except for the `--partition` value. This is because partition names +vary from system to system. Nevertheless, for each system, this tutorial uses +the corresponding development partition, e.g., `dc-gpu-devel` on JURECA. This is +because jobs are often (but not always) scheduled faster on this partition than +the production partition. However, resources in the development partitions are +limited (as described in: [JURECA](https://apps.fz-juelich.de/jsc/hps/jureca/quickintro.html#available-partitions), +[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/quickintro.html#available-partitions), +and [JUSUF](https://apps.fz-juelich.de/jsc/hps/jusuf/cluster/quickintro.html#quick-avail-partitions)). +Therefore, it is highly recommended that users familiarize themselves with the +limitations, and use the production partition for all production use, as well as +when developing/testing with more resources than are available on the +development partition. + +## 7. Distributed training + +[Horovod](https://github.com/horovod/horovod) provides a simple and efficient +solution for training artificial neural networks on multiple GPUs across +multiple nodes in a cluster. 
It can be used with Tensorflow (some +other frameworks are supported as well). Since this tutorial primarily concerns +distributed training, only code samples that utilize Horovod are included. + +Please note that Horovod currently only supports a distribution strategy where +the entire model is replicated on every GPU. It is the data that is distributed +across the GPUs. If you are interested in model-parallel training, where the +model itself can be split and distributed, a different solution is required. We +hope to add a sample for model-parallel training at a later time. + +## 8. Credits * **Created by:** Fahad Khalid (SLNS/HPCNS, JSC) * **Installation of modules on JURON:** Andreas Herten (HPCNS, JSC) -* **Installation of modules on JURECA:** Damian Alvarez (JSC), Rajalekshmi Deepu (SLNS/HPCNS, JSC) -* **Review/suggestions/testing:** Kai Krajsek (SLNS/HPCNS, JSC), Tabea Kirchner (SLNS/HPCNS, JSC), -Susanne Wenzel (INM-1) +* **Installation of modules on JURECA:** Damian Alvarez (JSC), Rajalekshmi + Deepu (SLNS/HPCNS, JSC) +* **Review/suggestions/testing:** Kai Krajsek (SLNS/HPCNS, JSC), Tabea + Kirchner (SLNS/HPCNS, JSC), Susanne Wenzel (INM-1) diff --git a/datasets/mnist/caffe/mnist_test_lmdb/data.mdb b/datasets/mnist/caffe/mnist_test_lmdb/data.mdb deleted file mode 100644 index 760ab42..0000000 --- a/datasets/mnist/caffe/mnist_test_lmdb/data.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a70974534a27eaa5dc42638940ad311981b0259f1f089ea46c695bfd9c1862da -size 8749056 diff --git a/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb b/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb deleted file mode 100644 index eda8c00..0000000 --- a/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e0667461174c505913de02429312bcbd9c6cab774b4495c7a2bbe7061ce3ccea -size 8192 diff --git a/datasets/mnist/caffe/mnist_train_lmdb/data.mdb b/datasets/mnist/caffe/mnist_train_lmdb/data.mdb deleted file mode 100644 index 4432b2e..0000000 --- a/datasets/mnist/caffe/mnist_train_lmdb/data.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3eea94f5e1ea128f16ff0e18f9e287cc2676a54a3218105c525e602f375666c1 -size 50757632 diff --git a/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb b/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb deleted file mode 100644 index d961b47..0000000 --- a/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33569d983c9d6d527cd7d3202c31a2a7395b254fb8076f59b84ecaecb9207906 -size 8192 diff --git a/horovod/README.md b/horovod/README.md deleted file mode 100644 index 3d63a23..0000000 --- a/horovod/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Notes - -All source code samples were taken from the Horovod examples repository -[here](https://github.com/uber/horovod/tree/master/examples) -(last checked: September 02, 2019). The samples that work with MNIST data have been -slightly modified. Our changes are limited to, - -* The data loading mechanism -* A bit of code cleanup -* A few additional comments pertaining to our custom data loading mechanism - -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. All -statements that demonstrate the use of Horovod follow a comment beginning with -`[Horovod]` (as added by Horovod developers). - -## Keras samples - -The following Keras samples are included: - -1. 
`mnist.py`: A simple MNIST processing example with only the essential Horovod code -for distributed training. -2. `mnist_advanced.py`: This sample is primarily the same as `mnist.py`. However, a -few more advanced Horovod features are used. - -## Tensorflow samples - -The following Tensorflow samples are included: - -1. `mnist.py`: Demonstrates distributed training using Horovod with the low-level -Tensorflow API. A simple convolutional neural network is trained on the MNIST dataset. -2. `mnist_estimator.py`: Demonstrates distributed training using Horovod with the -high-level Estimator API in Tensorflow. A simple convolutional neural network is -trained on the MNIST dataset. -3. `synthetic_benchmark.py`: A simple benchmark that can be used to measure performance -of Tensorflow with Horovod without using any external dataset. diff --git a/horovod/keras/mnist.py b/horovod/keras/mnist.py deleted file mode 100644 index 0c46a77..0000000 --- a/horovod/keras/mnist.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). - -from __future__ import print_function -import os -import sys -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K -import math -import tensorflow as tf -import horovod.keras as hvd - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -config.gpu_options.allow_growth = True -config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) - -batch_size = 128 -num_classes = 10 - -# Horovod: adjust number of epochs based on number of GPUs. 
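# Each worker iterates over the full (unsharded) training set every epoch and
# gradients are averaged across workers, so dividing the 16 epochs by
# hvd.size() keeps the total amount of training roughly equivalent to 16
# single-GPU epochs.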
-epochs = int(math.ceil(16.0 / hvd.size())) - -# Input image dimensions -img_rows, img_cols = 28, 28 - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -# [HPCNS] Load MNIST dataset -(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# Convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) - -# Horovod: adjust learning rate based on number of GPUs. -opt = keras.optimizers.Adadelta(1.0 * hvd.size()) - -# Horovod: add Horovod Distributed Optimizer. -opt = hvd.DistributedOptimizer(opt) - -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=opt, - metrics=['accuracy']) - -callbacks = [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - hvd.callbacks.BroadcastGlobalVariablesCallback(0), -] - -# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. -if hvd.rank() == 0: - callbacks.append(keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) - -model.fit(x_train, y_train, - batch_size=batch_size, - callbacks=callbacks, - epochs=epochs, - verbose=1 if hvd.rank() == 0 else 0, - validation_data=(x_test, y_test)) -score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) diff --git a/horovod/keras/mnist_advanced.py b/horovod/keras/mnist_advanced.py deleted file mode 100644 index ba60b6d..0000000 --- a/horovod/keras/mnist_advanced.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/keras_mnist_advanced.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). 
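# Overview: compared to the basic Keras sample above, this variant additionally
# demonstrates averaging metrics across workers (MetricAverageCallback),
# learning-rate warmup over the first epochs (LearningRateWarmupCallback),
# ReduceLROnPlateau, and training via fit_generator on augmented data, with
# each worker randomly sampling its own share of the batches.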
- - -from __future__ import print_function -import os -import sys -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras.preprocessing.image import ImageDataGenerator -from keras import backend as K -import tensorflow as tf -import horovod.keras as hvd - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -config.gpu_options.allow_growth = True -config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) - -batch_size = 128 -num_classes = 10 - -# Enough epochs to demonstrate learning rate warmup and the reduction of -# learning rate when training plateaues. -epochs = 16 - -# Input image dimensions -img_rows, img_cols = 28, 28 - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -# [HPCNS] Load MNIST dataset. -(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) - -# Determine how many batches are there in train and test sets -train_batches = len(x_train) // batch_size -test_batches = len(x_test) // batch_size - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# Convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) - -# Horovod: adjust learning rate based on number of GPUs. -opt = keras.optimizers.Adadelta(lr=1.0 * hvd.size()) - -# Horovod: add Horovod Distributed Optimizer. -opt = hvd.DistributedOptimizer(opt) - -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=opt, - metrics=['accuracy']) - -callbacks = [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - hvd.callbacks.BroadcastGlobalVariablesCallback(0), - - # Horovod: average metrics among workers at the end of every epoch. 
- # - # Note: This callback must be in the list before the ReduceLROnPlateau, - # TensorBoard or other metrics-based callbacks. - hvd.callbacks.MetricAverageCallback(), - - # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final - # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during - # the first five epochs. See https://arxiv.org/abs/1706.02677 for details. - hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1), - - # Reduce the learning rate if training plateaues. - keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1), -] - -# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. -if hvd.rank() == 0: - callbacks.append(keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) - -# Set up ImageDataGenerators to do data augmentation for the training images. -train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3, - height_shift_range=0.08, zoom_range=0.08) -test_gen = ImageDataGenerator() - -# Train the model. -# Horovod: the training will randomly sample 1 / N batches of training data and -# 3 / N batches of validation data on every worker, where N is the number of workers. -# Over-sampling of validation data helps to increase probability that every validation -# example will be evaluated. -model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size), - steps_per_epoch=train_batches // hvd.size(), - callbacks=callbacks, - epochs=epochs, - verbose=1, - validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size), - validation_steps=3 * test_batches // hvd.size()) - -# Evaluate the model on the full data set. -score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) diff --git a/horovod/keras/run_on_localMachine.sh b/horovod/keras/run_on_localMachine.sh deleted file mode 100644 index 9c9afb4..0000000 --- a/horovod/keras/run_on_localMachine.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -mpirun -np 1 -H localhost:1 \ - -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ - -mca pml ob1 -mca btl ^openib \ - python -u mnist.py diff --git a/horovod/keras/submit_job_jureca.sh b/horovod/keras/submit_job_jureca.sh deleted file mode 100755 index 3591bba..0000000 --- a/horovod/keras/submit_job_jureca.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_KERAS_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/keras/submit_job_juron.sh b/horovod/keras/submit_job_juron.sh deleted file mode 100755 index 0318278..0000000 --- a/horovod/keras/submit_job_juron.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 4 -#BSUB -R "span[ptile=2]" -#BSUB -gpu "num=2" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J HOROVOD_KERAS_MNIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module 
load horovod/0.15.2 -module load keras/2.2.4 - -# Run the program -mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \ - -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/keras/submit_job_juwels.sh b/horovod/keras/submit_job_juwels.sh deleted file mode 100755 index 3591bba..0000000 --- a/horovod/keras/submit_job_juwels.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_KERAS_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/tensorflow/checkpoints/.gitkeep b/horovod/tensorflow/checkpoints/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/horovod/tensorflow/mnist.py b/horovod/tensorflow/mnist.py deleted file mode 100644 index 3c780ac..0000000 --- a/horovod/tensorflow/mnist.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). - -import os -import sys -import tensorflow as tf -import horovod.tensorflow as hvd -import numpy as np -import shutil - -from tensorflow import keras - -layers = tf.layers - -tf.logging.set_verbosity(tf.logging.INFO) - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - - -def conv_model(feature, target, mode): - """2-layer convolution model.""" - # Convert the target to a one-hot tensor of shape (batch_size, 10) and - # with a on-value of 1 for each one-hot vector of length 10. - target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0) - - # Reshape feature to 4d tensor with 2nd and 3rd dimensions being - # image width and height final dimension being the number of color channels. - feature = tf.reshape(feature, [-1, 28, 28, 1]) - - # First conv layer will compute 32 features for each 5x5 patch - with tf.variable_scope('conv_layer1'): - h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5], - activation=tf.nn.relu, padding="SAME") - h_pool1 = tf.nn.max_pool( - h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') - - # Second conv layer will compute 64 features for each 5x5 patch. - with tf.variable_scope('conv_layer2'): - h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5], - activation=tf.nn.relu, padding="SAME") - h_pool2 = tf.nn.max_pool( - h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') - # reshape tensor into a batch of vectors - h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) - - # Densely connected layer with 1024 neurons. 
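# (The flattened size of 7 * 7 * 64 comes from two 2x2/stride-2 max-pools
# reducing the 28x28 input to 7x7, with 64 feature maps produced by the second
# convolution; the dropout below is active only in TRAIN mode.)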
- h_fc1 = layers.dropout( - layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu), - rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN) - - # Compute logits (1 per class) and compute loss. - logits = layers.dense(h_fc1, 10, activation=None) - loss = tf.losses.softmax_cross_entropy(target, logits) - - return tf.argmax(logits, 1), loss - - -def train_input_generator(x_train, y_train, batch_size=64): - assert len(x_train) == len(y_train) - while True: - p = np.random.permutation(len(x_train)) - x_train, y_train = x_train[p], y_train[p] - index = 0 - while index <= len(x_train) - batch_size: - yield x_train[index:index + batch_size], \ - y_train[index:index + batch_size], - index += batch_size - - -def main(_): - # Horovod: initialize Horovod. - hvd.init() - - # [HPCNS] Fully qualified dataset file name - dataset_file = os.path.join(data_dir, data_file) - - # [HPCNS] Dataset filename for this rank - dataset_for_rank = os.path.join(data_dir, 'MNIST-data-%d' % hvd.rank()) - - # [HPCNS] Make a copy of the dataset for this rank - shutil.copyfile(dataset_file, dataset_for_rank) - - # [HPCNS] Load MNIST dataset - (x_train, y_train), (x_test, y_test) = \ - keras.datasets.mnist.load_data(dataset_for_rank) - - # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it - # into (-1, 784) to feed into our network. Also, need to normalize the - # features between 0 and 1. - x_train = np.reshape(x_train, (-1, 784)) / 255.0 - x_test = np.reshape(x_test, (-1, 784)) / 255.0 - - # Build model... - with tf.name_scope('input'): - image = tf.placeholder(tf.float32, [None, 784], name='image') - label = tf.placeholder(tf.float32, [None], name='label') - predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN) - - # Horovod: adjust learning rate based on number of GPUs. - opt = tf.train.AdamOptimizer(0.001 * hvd.size()) - - # Horovod: add Horovod Distributed Optimizer. - opt = hvd.DistributedOptimizer(opt) - - global_step = tf.train.get_or_create_global_step() - train_op = opt.minimize(loss, global_step=global_step) - - hooks = [ - # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states - # from rank 0 to all other processes. This is necessary to ensure consistent - # initialization of all workers when training is started with random weights - # or restored from a checkpoint. - hvd.BroadcastGlobalVariablesHook(0), - - # Horovod: adjust number of steps based on number of GPUs. - tf.train.StopAtStepHook(last_step=20000 // hvd.size()), - - tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss}, - every_n_iter=10), - ] - - # Horovod: pin GPU to be used to process local rank (one GPU per process) - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - config.gpu_options.visible_device_list = str(hvd.local_rank()) - - # Horovod: save checkpoints only on worker 0 to prevent other workers from - # corrupting them. - checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None - training_batch_generator = train_input_generator(x_train, - y_train, batch_size=100) - # The MonitoredTrainingSession takes care of session initialization, - # restoring from a checkpoint, saving to a checkpoint, and closing when done - # or an error occurs. - with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, - hooks=hooks, - config=config) as mon_sess: - while not mon_sess.should_stop(): - # Run a training step synchronously. 
- image_, label_ = next(training_batch_generator) - mon_sess.run(train_op, feed_dict={image: image_, label: label_}) - - # [HPCNS] Remove the copied dataset - os.remove(dataset_for_rank) - - -if __name__ == "__main__": - tf.app.run() diff --git a/horovod/tensorflow/mnist_estimator.py b/horovod/tensorflow/mnist_estimator.py deleted file mode 100644 index 792c057..0000000 --- a/horovod/tensorflow/mnist_estimator.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist_estimator.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). - -"""Convolutional Neural Network Estimator for MNIST, built with tf.layers.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import shutil -import numpy as np -import tensorflow as tf -import horovod.tensorflow as hvd - -from tensorflow import keras - -tf.logging.set_verbosity(tf.logging.INFO) - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - - -def cnn_model_fn(features, labels, mode): - """Model function for CNN.""" - # Input Layer - # Reshape X to 4-D tensor: [batch_size, width, height, channels] - # MNIST images are 28x28 pixels, and have one color channel - input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) - - # Convolutional Layer #1 - # Computes 32 features using a 5x5 filter with ReLU activation. - # Padding is added to preserve width and height. - # Input Tensor Shape: [batch_size, 28, 28, 1] - # Output Tensor Shape: [batch_size, 28, 28, 32] - conv1 = tf.layers.conv2d( - inputs=input_layer, - filters=32, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #1 - # First max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 28, 28, 32] - # Output Tensor Shape: [batch_size, 14, 14, 32] - pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) - - # Convolutional Layer #2 - # Computes 64 features using a 5x5 filter. - # Padding is added to preserve width and height. 
- # Input Tensor Shape: [batch_size, 14, 14, 32] - # Output Tensor Shape: [batch_size, 14, 14, 64] - conv2 = tf.layers.conv2d( - inputs=pool1, - filters=64, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #2 - # Second max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 14, 14, 64] - # Output Tensor Shape: [batch_size, 7, 7, 64] - pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) - - # Flatten tensor into a batch of vectors - # Input Tensor Shape: [batch_size, 7, 7, 64] - # Output Tensor Shape: [batch_size, 7 * 7 * 64] - pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) - - # Dense Layer - # Densely connected layer with 1024 neurons - # Input Tensor Shape: [batch_size, 7 * 7 * 64] - # Output Tensor Shape: [batch_size, 1024] - dense = tf.layers.dense(inputs=pool2_flat, units=1024, - activation=tf.nn.relu) - - # Add dropout operation; 0.6 probability that element will be kept - dropout = tf.layers.dropout( - inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) - - # Logits layer - # Input Tensor Shape: [batch_size, 1024] - # Output Tensor Shape: [batch_size, 10] - logits = tf.layers.dense(inputs=dropout, units=10) - - predictions = { - # Generate predictions (for PREDICT and EVAL mode) - "classes": tf.argmax(input=logits, axis=1), - # Add `softmax_tensor` to the graph. It is used for PREDICT and by the - # `logging_hook`. - "probabilities": tf.nn.softmax(logits, name="softmax_tensor") - } - if mode == tf.estimator.ModeKeys.PREDICT: - return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) - - # Calculate Loss (for both TRAIN and EVAL modes) - onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10) - loss = tf.losses.softmax_cross_entropy( - onehot_labels=onehot_labels, logits=logits) - - # Configure the Training Op (for TRAIN mode) - if mode == tf.estimator.ModeKeys.TRAIN: - # Horovod: scale learning rate by the number of workers. - optimizer = tf.train.MomentumOptimizer( - learning_rate=0.001 * hvd.size(), momentum=0.9) - - # Horovod: add Horovod Distributed Optimizer. - optimizer = hvd.DistributedOptimizer(optimizer) - - train_op = optimizer.minimize( - loss=loss, - global_step=tf.train.get_global_step()) - return tf.estimator.EstimatorSpec(mode=mode, loss=loss, - train_op=train_op) - - # Add evaluation metrics (for EVAL mode) - eval_metric_ops = { - "accuracy": tf.metrics.accuracy( - labels=labels, predictions=predictions["classes"])} - return tf.estimator.EstimatorSpec( - mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) - - -def main(unused_argv): - # Horovod: initialize Horovod. - hvd.init() - - # [HPCNS] Fully qualified dataset file name - dataset_file = os.path.join(data_dir, data_file) - - # [HPCNS] Dataset filename for this rank - dataset_for_rank = os.path.join(data_dir, 'MNIST-data-%d' % hvd.rank()) - - # [HPCNS] Make a copy of the dataset for this rank - shutil.copyfile(dataset_file, dataset_for_rank) - - # [HPCNS] Load MNIST dataset - (train_data, train_labels), (eval_data, eval_labels) = \ - keras.datasets.mnist.load_data(dataset_for_rank) - - # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it - # into (-1, 784) to feed into our network. Also, need to normalize the - # features between 0 and 1. 
- train_data = np.reshape(train_data, (-1, 784)) / 255.0 - eval_data = np.reshape(eval_data, (-1, 784)) / 255.0 - - # Horovod: pin GPU to be used to process local rank (one GPU per process) - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - config.gpu_options.visible_device_list = str(hvd.local_rank()) - - # Horovod: save checkpoints only on worker 0 to prevent other workers from - # corrupting them. - model_dir = 'checkpoints/mnist_convnet_model' if hvd.rank() == 0 else None - - # Create the Estimator - mnist_classifier = tf.estimator.Estimator( - model_fn=cnn_model_fn, model_dir=model_dir, - config=tf.estimator.RunConfig(session_config=config)) - - # Set up logging for predictions - # Log the values in the "Softmax" tensor with label "probabilities" - tensors_to_log = {"probabilities": "softmax_tensor"} - logging_hook = tf.train.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=500) - - # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from - # rank 0 to all other processes. This is necessary to ensure consistent - # initialization of all workers when training is started with random weights or - # restored from a checkpoint. - bcast_hook = hvd.BroadcastGlobalVariablesHook(0) - - # Train the model - train_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": train_data}, - y=train_labels, - batch_size=100, - num_epochs=None, - shuffle=True) - - # Horovod: adjust number of steps based on number of GPUs. - mnist_classifier.train( - input_fn=train_input_fn, - steps=500 // hvd.size(), - hooks=[logging_hook, bcast_hook]) - - # Evaluate the model and print results - eval_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": eval_data}, - y=eval_labels, - num_epochs=1, - shuffle=False) - eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) - print(eval_results) - - # [HPCNS] Remove the copied dataset - os.remove(dataset_for_rank) - - -if __name__ == "__main__": - tf.app.run() diff --git a/horovod/tensorflow/run_on_localMachine.sh b/horovod/tensorflow/run_on_localMachine.sh deleted file mode 100644 index 9c9afb4..0000000 --- a/horovod/tensorflow/run_on_localMachine.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -mpirun -np 1 -H localhost:1 \ - -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ - -mca pml ob1 -mca btl ^openib \ - python -u mnist.py diff --git a/horovod/tensorflow/submit_job_jureca.sh b/horovod/tensorflow/submit_job_jureca.sh deleted file mode 100755 index fd12487..0000000 --- a/horovod/tensorflow/submit_job_jureca.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_TFLOW_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/tensorflow/submit_job_juron.sh b/horovod/tensorflow/submit_job_juron.sh deleted file mode 100644 index 0107547..0000000 --- a/horovod/tensorflow/submit_job_juron.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 4 -#BSUB -R "span[ptile=2]" -#BSUB -gpu "num=2" -#BSUB -e 
"error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J HOROVOD_TFLOW_MNIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module load horovod/0.15.2 - -# Run the program -mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \ - -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/tensorflow/submit_job_juwels.sh b/horovod/tensorflow/submit_job_juwels.sh deleted file mode 100755 index fd12487..0000000 --- a/horovod/tensorflow/submit_job_juwels.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_TFLOW_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/tensorflow/synthetic_benchmark.py b/horovod/tensorflow/synthetic_benchmark.py deleted file mode 100644 index ee401a5..0000000 --- a/horovod/tensorflow/synthetic_benchmark.py +++ /dev/null @@ -1,120 +0,0 @@ -from __future__ import absolute_import, division, print_function - -import argparse -import os -import numpy as np -import timeit - -import tensorflow as tf -import horovod.tensorflow as hvd -from tensorflow.keras import applications - -# Benchmark settings -parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') - -parser.add_argument('--model', type=str, default='ResNet50', - help='model to benchmark') -parser.add_argument('--batch-size', type=int, default=32, - help='input batch size') - -parser.add_argument('--num-warmup-batches', type=int, default=10, - help='number of warm-up batches that don\'t count towards benchmark') -parser.add_argument('--num-batches-per-iter', type=int, default=10, - help='number of batches per benchmark iteration') -parser.add_argument('--num-iters', type=int, default=10, - help='number of benchmark iterations') - -parser.add_argument('--eager', action='store_true', default=False, - help='enables eager execution') -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - -args = parser.parse_args() -args.cuda = not args.no_cuda - -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -if args.cuda: - config.gpu_options.allow_growth = True - config.gpu_options.visible_device_list = str(hvd.local_rank()) -else: - os.environ["CUDA_VISIBLE_DEVICES"] = "-1" - config.gpu_options.allow_growth = False - config.gpu_options.visible_device_list = '' - -if args.eager: - tf.enable_eager_execution(config) - -# Set up standard model. -model = getattr(applications, args.model)(weights=None) - -opt = tf.train.GradientDescentOptimizer(0.01) - -# Horovod: (optional) compression algorithm. -compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - -# Horovod: wrap optimizer with DistributedOptimizer. 
-opt = hvd.DistributedOptimizer(opt, compression=compression) - -init = tf.global_variables_initializer() -bcast_op = hvd.broadcast_global_variables(0) - -data = tf.random_uniform([args.batch_size, 224, 224, 3]) -target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) - - -def loss_function(): - probs = model(data, training=True) - return tf.losses.sparse_softmax_cross_entropy(target, probs) - - -def log(s, nl=True): - if hvd.rank() != 0: - return - print(s, end='\n' if nl else '') - - -log('Model: %s' % args.model) -log('Batch size: %d' % args.batch_size) -device = 'GPU' if args.cuda else 'CPU' -log('Number of %ss: %d' % (device, hvd.size())) - - -def run(benchmark_step): - # Warm-up - log('Running warmup...') - timeit.timeit(benchmark_step, number=args.num_warmup_batches) - - # Benchmark - log('Running benchmark...') - img_secs = [] - for x in range(args.num_iters): - time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) - img_sec = args.batch_size * args.num_batches_per_iter / time - log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) - img_secs.append(img_sec) - - # Results - img_sec_mean = np.mean(img_secs) - img_sec_conf = 1.96 * np.std(img_secs) - log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) - log('Total img/sec on %d %s(s): %.1f +-%.1f' % - (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) - - -if tf.executing_eagerly(): - with tf.device(device): - run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables)) -else: - with tf.Session(config=config) as session: - init.run() - bcast_op.run() - - loss = loss_function() - train_opt = opt.minimize(loss) - run(lambda: session.run(train_opt)) diff --git a/horovod_data_distributed/README.md b/horovod_data_distributed/README.md deleted file mode 100644 index 3a13e2b..0000000 --- a/horovod_data_distributed/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Introduction - -Please see the main docstring in each program for details. - -# Notes - -On JURECA and JUWELS, the `mnist_data_distributed.py` program requires the [`hpc4neuro.distribution`]( -https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro#1-hpc4neurodistribution) -module for distribution of training data filenames across multiple ranks. On JURON, multiple additional -package are required. Please follow the steps below to setup the environment before submitting the -training job. - -Note that a maximum of eight ranks can be used to run `mnist_data_distributed.py`, as there -are eight training files. - -## JURECA and JUWELS - -1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` -2. Load the system-wide Python module: `module load Python/3.6.8` -3. Install the `hpc4neuro` package: - - `pip install --user git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git` - -4. Submit the job - -## JURON - -1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` -2. Setup a Python virtual environment with the required packages (may take upto 5 minutes): `./setup_juron.sh` -3. Submit the job: `bsub < submit_job_juron.sh` - -**Note:** The setup is required only once. Unless you explicitly remove the virtual environment, the same -setup can be used to run the example multiple times. 
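To make the idea of "distribution of training data filenames across multiple ranks" concrete: each Horovod rank is handed a disjoint subset of the training files, which is also why no more than eight ranks can be used when there are only eight training files. The sketch below illustrates the concept only; it is not the `hpc4neuro.distribution` API used by `mnist_data_distributed.py`, and the file names in it are hypothetical.

```python
# Illustration only -- not the hpc4neuro.distribution API. Shows the concept of
# assigning each Horovod rank a disjoint subset of the training files.
import horovod.tensorflow as hvd

hvd.init()

# Hypothetical training file names; the real sample has eight training files.
filenames = ['train_{}.tfrecord'.format(i) for i in range(8)]

if hvd.size() > len(filenames):
    raise RuntimeError('More ranks than training files; '
                       'some ranks would receive no data.')

# Round-robin assignment: rank r gets files r, r + size, r + 2 * size, ...
my_files = filenames[hvd.rank()::hvd.size()]
print('Rank {} of {} handles: {}'.format(hvd.rank(), hvd.size(), my_files))
```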
diff --git a/horovod_data_distributed/setup_juron.sh b/horovod_data_distributed/setup_juron.sh deleted file mode 100755 index 7fa1a24..0000000 --- a/horovod_data_distributed/setup_juron.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash - -# Load the Python module -module load python/3.6.1 - -# Create a virtual environment -python -m venv venv_dl_hpc4neuro - -# Activate the virtual environment -source venv_dl_hpc4neuro/bin/activate - -# Upgrade pip and setuptools -pip install -U pip setuptools - -# Install mpi4py -env MPICC=/gpfs/software/opt/openmpi/3.1.2-gcc_5.4.0-cuda_10.0.130/bin/mpicc pip install mpi4py - -# Install six -pip install six - -# Install hpc4neuro -pip install git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git - -printf "%s\n\n" "Setup complete." diff --git a/horovod_data_distributed/submit_job_jureca.sh b/horovod_data_distributed/submit_job_jureca.sh deleted file mode 100755 index eedbaca..0000000 --- a/horovod_data_distributed/submit_job_jureca.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HVD_DATA_DIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load mpi4py/3.0.1-Python-3.6.8 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/submit_job_juron.sh b/horovod_data_distributed/submit_job_juron.sh deleted file mode 100755 index a71bc47..0000000 --- a/horovod_data_distributed/submit_job_juron.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 4 -#BSUB -R "span[ptile=4]" -#BSUB -gpu "num=4" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J HVD_DATA_DIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module load horovod/0.15.2 - -# Activate the virtual environment -source venv_dl_hpc4neuro/bin/activate - -# Run the program -mpirun -bind-to none \ - -map-by slot \ - -x NCCL_DEBUG=INFO \ - -x LD_LIBRARY_PATH \ - -x PATH \ - -mca pml ob1 \ - -mca btl ^openib \ - python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/submit_job_juwels.sh b/horovod_data_distributed/submit_job_juwels.sh deleted file mode 100755 index eedbaca..0000000 --- a/horovod_data_distributed/submit_job_juwels.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HVD_DATA_DIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load mpi4py/3.0.1-Python-3.6.8 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist_data_distributed.py diff --git a/keras/README.md b/keras/README.md deleted file mode 100644 index 4e8462d..0000000 --- a/keras/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Notes - -The `mnist.py` sample is a slightly modified version of `mnist_cnn.py` -available in the Keras examples repository 
-[here](https://github.com/keras-team/keras/tree/master/examples) -(last checked: September 02, 2019). Our changes are -limited to, - -* The data loading mechanism -* A bit of code cleanup -* A few additional comments pertaining to our custom data loading mechanism - -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. \ No newline at end of file diff --git a/keras/mnist.py b/keras/mnist.py deleted file mode 100644 index 9fc93f2..0000000 --- a/keras/mnist.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py, -# which is also licensed under The MIT License (see the NOTICE file for details). - - -"""Trains a simple convnet on the MNIST dataset. - -Gets to 99.25% test accuracy after 12 epochs -(there is still a lot of margin for parameter tuning). -16 seconds per epoch on a GRID K520 GPU. -""" - -from __future__ import print_function -import os -import sys -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -batch_size = 128 -num_classes = 10 -epochs = 12 - -# input image dimensions -img_rows, img_cols = 28, 28 - -# [HPCNS] Load MNIST dataset -# the data, split between train and test sets -(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) - -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=keras.optimizers.Adadelta(), - metrics=['accuracy']) - -model.fit(x_train, y_train, - batch_size=batch_size, - epochs=epochs, - verbose=1, - validation_data=(x_test, y_test)) -score = model.evaluate(x_test, y_test, verbose=0) 
-print('Test loss:', score[0]) -print('Test accuracy:', score[1]) diff --git a/keras/run_on_localMachine.sh b/keras/run_on_localMachine.sh deleted file mode 100644 index 1895ec1..0000000 --- a/keras/run_on_localMachine.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -python -u mnist.py diff --git a/keras/submit_job_jureca.sh b/keras/submit_job_jureca.sh deleted file mode 100755 index 55feebb..0000000 --- a/keras/submit_job_jureca.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=KERAS_MNIST_CNN -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCCcore/.8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/keras/submit_job_juron.sh b/keras/submit_job_juron.sh deleted file mode 100644 index 7927b03..0000000 --- a/keras/submit_job_juron.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J KERAS_MNIST_CNN - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module load keras/2.2.4 - -# Run the program -python -u mnist.py diff --git a/keras/submit_job_juwels.sh b/keras/submit_job_juwels.sh deleted file mode 100755 index 429c440..0000000 --- a/keras/submit_job_juwels.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=KERAS_MNIST -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/tensorflow/README.md b/tensorflow/README.md index 3bf439c..a35d643 100644 --- a/tensorflow/README.md +++ b/tensorflow/README.md @@ -1,13 +1,22 @@ # Notes -The `mnist.py` sample is a slightly modified version of `convolutional.py` -available in the Tensorflow models repository -[here](https://github.com/tensorflow/models/blob/master/tutorials/image/mnist) -(last checked: September 02, 2019). Our changes are -limited to, +All source code samples were taken from the Horovod examples repository +[here](https://github.com/horovod/horovod/tree/master/examples/tensorflow2) +(last checked: April 26, 2021). The samples have been slightly modified. Our +changes are limited to, * The data loading mechanism * A bit of code cleanup * A few additional comments pertaining to our custom data loading mechanism -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. \ No newline at end of file +**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. All +statements that demonstrate the use of Horovod follow a comment beginning with +`[Horovod]` (as added by Horovod developers). + +The following samples are included: + +1. `keras_mnist.py`: A simple training program for an MNIST classifier that + uses the Keras API with Horovod. +2. 
`mnist.py`: Also a training program for an MNIST classifier, this sample + demonstrates using Horovod's `DistributedGradientTape` with a custom + training loop. diff --git a/horovod/keras/checkpoints/.gitkeep b/tensorflow/checkpoints/.gitkeep similarity index 100% rename from horovod/keras/checkpoints/.gitkeep rename to tensorflow/checkpoints/.gitkeep diff --git a/tensorflow/jureca_job.sh b/tensorflow/jureca_job.sh new file mode 100755 index 0000000..e818bc0 --- /dev/null +++ b/tensorflow/jureca_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=dc-gpu-devel + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/jusuf_job.sh b/tensorflow/jusuf_job.sh new file mode 100755 index 0000000..24f3c83 --- /dev/null +++ b/tensorflow/jusuf_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:1 +#SBATCH --partition=develgpus + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/juwels_booster_job.sh b/tensorflow/juwels_booster_job.sh new file mode 100755 index 0000000..df9cdef --- /dev/null +++ b/tensorflow/juwels_booster_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=develbooster + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/juwels_job.sh b/tensorflow/juwels_job.sh new file mode 100755 index 0000000..55831d0 --- /dev/null +++ b/tensorflow/juwels_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=develgpus + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow2/keras_mnist.py b/tensorflow/keras_mnist.py similarity index 97% rename from tensorflow2/keras_mnist.py rename to tensorflow/keras_mnist.py index e444560..b07950c 100644 --- 
a/tensorflow2/keras_mnist.py +++ b/tensorflow/keras_mnist.py @@ -104,4 +104,4 @@ verbose = 1 if hvd.rank() == 0 else 0 # Train the model. # Horovod: adjust number of steps based on number of GPUs. -mnist_model.fit(dataset, steps_per_epoch=50 // hvd.size(), callbacks=callbacks, epochs=10, verbose=verbose) \ No newline at end of file +mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(), callbacks=callbacks, epochs=10, verbose=verbose) \ No newline at end of file diff --git a/tensorflow/mnist.py b/tensorflow/mnist.py index 30477e1..7e56a70 100644 --- a/tensorflow/mnist.py +++ b/tensorflow/mnist.py @@ -1,328 +1,109 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== -"""Simple, end-to-end, LeNet-5-like convolutional MNIST model example. - -This should achieve a test error of 0.7%. Please keep this model as simple and -linear as possible, it is meant as a tutorial for simple convolutional models. -Run with --self_test on the command line to execute a short self-test. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import gzip import os import sys -import time -import numpy -from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf +import horovod.tensorflow as hvd # [HPCNS] Import the DataValidator, which can then be used to # validate and load the path to the already downloaded dataset. sys.path.insert(0, '../utils') from data_utils import DataValidator -IMAGE_SIZE = 28 -NUM_CHANNELS = 1 -PIXEL_DEPTH = 255 -NUM_LABELS = 10 -VALIDATION_SIZE = 5000 # Size of the validation set. -SEED = 66478 # Set to None for random seed. -BATCH_SIZE = 64 -NUM_EPOCHS = 10 -EVAL_BATCH_SIZE = 64 -EVAL_FREQUENCY = 100 # Number of steps between evaluations. - -FLAGS = None - - -def data_type(): - """Return the type of the activations, weights, and placeholder variables.""" - if FLAGS.use_fp16: - return tf.float16 - else: - return tf.float32 - - -def extract_data(filename, num_images): - """Extract the images into a 4D tensor [image index, y, x, channels]. - - Values are rescaled from [0, 255] down to [-0.5, 0.5]. 
- """ - print('Extracting', filename) - with gzip.open(filename) as bytestream: - bytestream.read(16) - buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS) - data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32) - data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH - data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS) - return data - - -def extract_labels(filename, num_images): - """Extract the labels into a vector of int64 label IDs.""" - print('Extracting', filename) - with gzip.open(filename) as bytestream: - bytestream.read(8) - buf = bytestream.read(1 * num_images) - labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.int64) - return labels - - -def fake_data(num_images): - """Generate a fake dataset that matches the dimensions of MNIST.""" - data = numpy.ndarray( - shape=(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS), - dtype=numpy.float32) - labels = numpy.zeros(shape=(num_images,), dtype=numpy.int64) - for image in xrange(num_images): - label = image % 2 - data[image, :, :, 0] = label - 0.5 - labels[image] = label - return data, labels - - -def error_rate(predictions, labels): - """Return the error rate based on dense predictions and sparse labels.""" - return 100.0 - ( - 100.0 * - numpy.sum(numpy.argmax(predictions, 1) == labels) / - predictions.shape[0]) - - -def main(_): - if FLAGS.self_test: - print('Running self-test.') - train_data, train_labels = fake_data(256) - validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE) - test_data, test_labels = fake_data(EVAL_BATCH_SIZE) - num_epochs = 1 - else: - # [HPCNS]: Data files relative to the 'datasets' directory - train_data_filename = 'mnist/raw/train-images-idx3-ubyte.gz' - train_labels_filename = 'mnist/raw/train-labels-idx1-ubyte.gz' - test_data_filename = 'mnist/raw/t10k-images-idx3-ubyte.gz' - test_labels_filename = 'mnist/raw/t10k-labels-idx1-ubyte.gz' - - # [HPCNS]: Update data file information with validated and fully qualified filenames - train_data_filename = os.path.join( - DataValidator.validated_data_dir(train_data_filename), train_data_filename) - train_labels_filename = os.path.join( - DataValidator.validated_data_dir(train_labels_filename), train_labels_filename) - test_data_filename = os.path.join( - DataValidator.validated_data_dir(test_data_filename), test_data_filename) - test_labels_filename = os.path.join( - DataValidator.validated_data_dir(test_labels_filename), test_labels_filename) - - # Extract it into numpy arrays. - train_data = extract_data(train_data_filename, 60000) - train_labels = extract_labels(train_labels_filename, 60000) - test_data = extract_data(test_data_filename, 10000) - test_labels = extract_labels(test_labels_filename, 10000) - - # Generate a validation set. - validation_data = train_data[:VALIDATION_SIZE, ...] - validation_labels = train_labels[:VALIDATION_SIZE] - train_data = train_data[VALIDATION_SIZE:, ...] - train_labels = train_labels[VALIDATION_SIZE:] - num_epochs = NUM_EPOCHS - - train_size = train_labels.shape[0] - - # This is where training samples and labels are fed to the graph. - # These placeholder nodes will be fed a batch of training data at each - # training step using the {feed_dict} argument to the Run() call below. 
- train_data_node = tf.placeholder( - data_type(), - shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) - train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,)) - eval_data = tf.placeholder( - data_type(), - shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) - - # The variables below hold all the trainable weights. They are passed an - # initial value which will be assigned when we call: - # {tf.global_variables_initializer().run()} - conv1_weights = tf.Variable( - tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32. - stddev=0.1, - seed=SEED, dtype=data_type())) - conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) - conv2_weights = tf.Variable(tf.truncated_normal( - [5, 5, 32, 64], stddev=0.1, - seed=SEED, dtype=data_type())) - conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type())) - fc1_weights = tf.Variable( # fully connected, depth 512. - tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], - stddev=0.1, - seed=SEED, - dtype=data_type())) - fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type())) - fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], - stddev=0.1, - seed=SEED, - dtype=data_type())) - fc2_biases = tf.Variable(tf.constant( - 0.1, shape=[NUM_LABELS], dtype=data_type())) - - # We will replicate the model structure for the training subgraph, as well - # as the evaluation subgraphs, while sharing the trainable parameters. - def model(data, train=False): - """The Model definition.""" - # 2D convolution, with 'SAME' padding (i.e. the output feature map has - # the same size as the input). Note that {strides} is a 4D array whose - # shape matches the data layout: [image index, y, x, depth]. - conv = tf.nn.conv2d(data, - conv1_weights, - strides=[1, 1, 1, 1], - padding='SAME') - # Bias and rectified linear non-linearity. - relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases)) - # Max pooling. The kernel size spec {ksize} also follows the layout of - # the data. Here we have a pooling window of 2, and a stride of 2. - pool = tf.nn.max_pool(relu, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME') - conv = tf.nn.conv2d(pool, - conv2_weights, - strides=[1, 1, 1, 1], - padding='SAME') - relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases)) - pool = tf.nn.max_pool(relu, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME') - # Reshape the feature map cuboid into a 2D matrix to feed it to the - # fully connected layers. - pool_shape = pool.get_shape().as_list() - reshape = tf.reshape( - pool, - [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]]) - # Fully connected layer. Note that the '+' operation automatically - # broadcasts the biases. - hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases) - # Add a 50% dropout during training only. Dropout also scales - # activations such that no rescaling is needed at evaluation time. - if train: - hidden = tf.nn.dropout(hidden, 0.5, seed=SEED) - return tf.matmul(hidden, fc2_weights) + fc2_biases - - # Training computation: logits + cross-entropy loss. - logits = model(train_data_node, True) - loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=train_labels_node, logits=logits)) - - # L2 regularization for the fully connected parameters. - regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) + - tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases)) - # Add the regularization term to the loss. 
- loss += 5e-4 * regularizers - - # Optimizer: set up a variable that's incremented once per batch and - # controls the learning rate decay. - batch = tf.Variable(0, dtype=data_type()) - # Decay once per epoch, using an exponential schedule starting at 0.01. - learning_rate = tf.train.exponential_decay( - 0.01, # Base learning rate. - batch * BATCH_SIZE, # Current index into the dataset. - train_size, # Decay step. - 0.95, # Decay rate. - staircase=True) - # Use simple momentum for the optimization. - optimizer = tf.train.MomentumOptimizer(learning_rate, - 0.9).minimize(loss, - global_step=batch) - - # Predictions for the current training minibatch. - train_prediction = tf.nn.softmax(logits) - - # Predictions for the test and validation, which we'll compute less often. - eval_prediction = tf.nn.softmax(model(eval_data)) - - # Small utility function to evaluate a dataset by feeding batches of data to - # {eval_data} and pulling the results from {eval_predictions}. - # Saves memory and enables this to run on smaller GPUs. - def eval_in_batches(data, sess): - """Get all predictions for a dataset by running it in small batches.""" - size = data.shape[0] - if size < EVAL_BATCH_SIZE: - raise ValueError("batch size for evals larger than dataset: %d" % size) - predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32) - for begin in xrange(0, size, EVAL_BATCH_SIZE): - end = begin + EVAL_BATCH_SIZE - if end <= size: - predictions[begin:end, :] = sess.run( - eval_prediction, - feed_dict={eval_data: data[begin:end, ...]}) - else: - batch_predictions = sess.run( - eval_prediction, - feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]}) - predictions[begin:, :] = batch_predictions[begin - size:, :] - return predictions - - # Create a local session to run the training. - start_time = time.time() - with tf.Session() as sess: - # Run all the initializers to prepare the trainable parameters. - tf.global_variables_initializer().run() - print('Initialized!') - # Loop through training steps. - for step in xrange(int(num_epochs * train_size) // BATCH_SIZE): - # Compute the offset of the current minibatch in the data. - # Note that we could use better randomization across epochs. - offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE) - batch_data = train_data[offset:(offset + BATCH_SIZE), ...] - batch_labels = train_labels[offset:(offset + BATCH_SIZE)] - # This dictionary maps the batch data (as a numpy array) to the - # node in the graph it should be fed to. - feed_dict = {train_data_node: batch_data, - train_labels_node: batch_labels} - # Run the optimizer to update weights. - sess.run(optimizer, feed_dict=feed_dict) - # print some extra information once reach the evaluation frequency - if step % EVAL_FREQUENCY == 0: - # fetch some extra nodes' data - l, lr, predictions = sess.run([loss, learning_rate, train_prediction], - feed_dict=feed_dict) - elapsed_time = time.time() - start_time - start_time = time.time() - print('Step %d (epoch %.2f), %.1f ms' % - (step, float(step) * BATCH_SIZE / train_size, - 1000 * elapsed_time / EVAL_FREQUENCY)) - print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr)) - print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels)) - print('Validation error: %.1f%%' % error_rate( - eval_in_batches(validation_data, sess), validation_labels)) - sys.stdout.flush() - # Finally print the result! 
- test_error = error_rate(eval_in_batches(test_data, sess), test_labels) - print('Test error: %.1f%%' % test_error) - if FLAGS.self_test: - print('test_error', test_error) - assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % ( - test_error,) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--use_fp16', - default=False, - help='Use half floats instead of full floats if True.', - action='store_true') - parser.add_argument( - '--self_test', - default=False, - action='store_true', - help='True if running a self test.') - - FLAGS, unparsed = parser.parse_known_args() - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +(mnist_images, mnist_labels), _ = \ + tf.keras.datasets.mnist.load_data(dataset_file) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), + tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation='softmax') +]) +loss = tf.losses.SparseCategoricalCrossentropy() + +# Horovod: adjust learning rate based on number of GPUs. +opt = tf.optimizers.Adam(0.001 * hvd.size()) + +checkpoint_dir = 'checkpoints/' +checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) + + +@tf.function +def training_step(images, labels, first_batch): + with tf.GradientTape() as tape: + probs = mnist_model(images, training=True) + loss_value = loss(labels, probs) + + # Horovod: add Horovod Distributed GradientTape. + tape = hvd.DistributedGradientTape(tape) + + grads = tape.gradient(loss_value, mnist_model.trainable_variables) + opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) + + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + # + # Note: broadcast should be done after the first gradient step to ensure optimizer + # initialization. + if first_batch: + hvd.broadcast_variables(mnist_model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) + + return loss_value + + +# Horovod: adjust number of steps based on number of GPUs. 
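+# Note: the dataset is batched in groups of 128 above, and each rank takes
+# 10000 // hvd.size() of those batches, so the total amount of data processed
+# stays roughly constant as workers are added, while the effective batch size
+# per step grows with hvd.size() (hence the scaled learning rate above).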
+for batch, (images, labels) in enumerate(dataset.take(10000 // hvd.size())): + loss_value = training_step(images, labels, batch == 0) + + if batch % 10 == 0 and hvd.local_rank() == 0: + print('Step #%d\tLoss: %.6f' % (batch, loss_value)) + +# Horovod: save checkpoints only on worker 0 to prevent other workers from +# corrupting it. +if hvd.rank() == 0: + checkpoint.save(checkpoint_dir) diff --git a/tensorflow/run_on_localMachine.sh b/tensorflow/run_on_localMachine.sh deleted file mode 100644 index 9c5737c..0000000 --- a/tensorflow/run_on_localMachine.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -python -u mnist.py \ No newline at end of file diff --git a/tensorflow/submit_job_jureca.sh b/tensorflow/submit_job_jureca.sh deleted file mode 100755 index fa294f1..0000000 --- a/tensorflow/submit_job_jureca.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TFLOW_MNIST -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCCcore/.8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/tensorflow/submit_job_juron.sh b/tensorflow/submit_job_juron.sh deleted file mode 100644 index 30fa204..0000000 --- a/tensorflow/submit_job_juron.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J TENSORFLOW_MNIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 - -# Run the program -python -u mnist.py diff --git a/tensorflow/submit_job_juwels.sh b/tensorflow/submit_job_juwels.sh deleted file mode 100755 index fda7d98..0000000 --- a/tensorflow/submit_job_juwels.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TFLOW_MNIST -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/tensorflow2/checkpoints/.gitkeep b/tensorflow2/checkpoints/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tensorflow2/mnist.py b/tensorflow2/mnist.py deleted file mode 100644 index 53cb1da..0000000 --- a/tensorflow2/mnist.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -import os -import sys - -import tensorflow as tf -import horovod.tensorflow as hvd - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -gpus = tf.config.experimental.list_physical_devices('GPU') -for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) -if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -(mnist_images, mnist_labels), _ = \ - tf.keras.datasets.mnist.load_data(dataset_file) - -dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), - tf.cast(mnist_labels, tf.int64)) -) -dataset = dataset.repeat().shuffle(10000).batch(128) - -mnist_model = tf.keras.Sequential([ - tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), - tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Dropout(0.25), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10, activation='softmax') -]) -loss = tf.losses.SparseCategoricalCrossentropy() - -# Horovod: adjust learning rate based on number of GPUs. -opt = tf.optimizers.Adam(0.001 * hvd.size()) - -checkpoint_dir = 'checkpoints/' -checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) - - -@tf.function -def training_step(images, labels, first_batch): - with tf.GradientTape() as tape: - probs = mnist_model(images, training=True) - loss_value = loss(labels, probs) - - # Horovod: add Horovod Distributed GradientTape. - tape = hvd.DistributedGradientTape(tape) - - grads = tape.gradient(loss_value, mnist_model.trainable_variables) - opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) - - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - # - # Note: broadcast should be done after the first gradient step to ensure optimizer - # initialization. - if first_batch: - hvd.broadcast_variables(mnist_model.variables, root_rank=0) - hvd.broadcast_variables(opt.variables(), root_rank=0) - - return loss_value - - -# Horovod: adjust number of steps based on number of GPUs. -for batch, (images, labels) in enumerate(dataset.take(1000 // hvd.size())): - loss_value = training_step(images, labels, batch == 0) - - if batch % 10 == 0 and hvd.local_rank() == 0: - print('Step #%d\tLoss: %.6f' % (batch, loss_value)) - -# Horovod: save checkpoints only on worker 0 to prevent other workers from -# corrupting it. 
-if hvd.rank() == 0: - checkpoint.save(checkpoint_dir) diff --git a/training_data_distribution/README.md b/training_data_distribution/README.md new file mode 100644 index 0000000..6e4028e --- /dev/null +++ b/training_data_distribution/README.md @@ -0,0 +1,27 @@ +# Introduction + +This example distributes the partitioned MNIST data across multiple ranks +for truly data distributed training of a shallow ANN for handwritten digit +classification. + +The Horovod framework is used for seamless distributed training. However, +instead of distributing epochs, this example distributes data amongst the +ranks, so that each rank contributes training based on its local subset of +the training data. + +# Notes + +The `mnist_data_distributed.py` program requires the [`hpc4neuro.distribution`]( +https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro#1-hpc4neurodistribution) +module for distribution of training data filenames across multiple ranks. Please +follow the steps below to install this package before submitting the training +job. + +1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/training_data_distribution` +2. Load the system-wide Python module: `module load Python/3.8.5` +3. Install the `hpc4neuro` package: + + `pip install --user git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git` + +**Note:** A maximum of eight ranks can be used to run `mnist_data_distributed.py`, +as there are eight training files. diff --git a/tensorflow2/juwels_booster_job b/training_data_distribution/jureca_job.sh similarity index 83% rename from tensorflow2/juwels_booster_job rename to training_data_distribution/jureca_job.sh index 625afac..96a239b 100755 --- a/tensorflow2/juwels_booster_job +++ b/training_data_distribution/jureca_job.sh @@ -9,11 +9,12 @@ #SBATCH --time=00:10:00 #SBATCH --job-name=TUTORIAL #SBATCH --gres=gpu:4 -#SBATCH --partition=booster +#SBATCH --partition=dc-gpu-devel # Load the required modules module load GCC/9.3.0 module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 module load TensorFlow/2.3.1-Python-3.8.5 module load Horovod/0.20.3-Python-3.8.5 @@ -24,4 +25,4 @@ export HOROVOD_MPI_THREADS_DISABLE=0 export CUDA_VISIBLE_DEVICES=0,1,2,3 # Run the program -srun python -u mnist.py +srun python -u mnist_data_distributed.py diff --git a/training_data_distribution/jusuf_job.sh b/training_data_distribution/jusuf_job.sh new file mode 100755 index 0000000..95c262d --- /dev/null +++ b/training_data_distribution/jusuf_job.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:1 +#SBATCH --partition=develgpus + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/training_data_distribution/juwels_booster_job.sh b/training_data_distribution/juwels_booster_job.sh new file mode 100755 index 0000000..374f63d --- /dev/null +++ b/training_data_distribution/juwels_booster_job.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH 
--ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=develbooster + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/juwels_booster_job b/training_data_distribution/juwels_job.sh similarity index 95% rename from horovod_data_distributed/juwels_booster_job rename to training_data_distribution/juwels_job.sh index 803e764..b2b7641 100755 --- a/horovod_data_distributed/juwels_booster_job +++ b/training_data_distribution/juwels_job.sh @@ -9,7 +9,7 @@ #SBATCH --time=00:10:00 #SBATCH --job-name=TUTORIAL #SBATCH --gres=gpu:4 -#SBATCH --partition=booster +#SBATCH --partition=develgpus # Load the required modules module load GCC/9.3.0 diff --git a/horovod_data_distributed/mnist_data_distributed.py b/training_data_distribution/mnist_data_distributed.py similarity index 100% rename from horovod_data_distributed/mnist_data_distributed.py rename to training_data_distribution/mnist_data_distributed.py diff --git a/utils/data_utils.py b/utils/data_utils.py index f2d10e4..21a57ad 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -47,19 +47,19 @@ class DataValidator: if not os.path.exists(data_dir): data_dir = os.path.join(os.path.abspath('../../datasets')) - print('Using {} as the data directory.'.format(data_dir)) + print(f'Using {data_dir} as the data directory.') # Check if the directory exists if not os.path.exists(data_dir): raise DatasetNotFoundError( - '{} refers to a non-existing directory. Please either correctly set ' - 'the DL_TEST_DATA_HOME environment variable, or make sure the datasets are ' - 'available in the project root.'.format(data_dir) + f'{data_dir} refers to a non-existing directory. Please either ' + f'correctly set the DL_TEST_DATA_HOME environment variable, or ' + f'make sure the datasets are available in the project root.' ) if not os.path.exists(os.path.join(data_dir, filename)): raise DatasetNotFoundError( - 'Unable to locate {} in {}'.format(filename, data_dir) + f'Unable to locate {filename} in {data_dir}' ) return data_dir -- GitLab From 794b4304d8453b01ad20d31b50a9438a6982f5c0 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Mon, 26 Apr 2021 20:11:26 +0200 Subject: [PATCH 3/8] Minor corrections to the TOC. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7ac59ff..7c86418 100644 --- a/README.md +++ b/README.md @@ -51,8 +51,8 @@ page. 4. [Logging on to the supercomputers](#4-logging-on-to-the-supercomputers) 5. [Cloning the repository](#5-cloning-the-repository) 6. [Running a sample](#6-running-a-sample) -7. [Distributed training](#8-distributed-training) -8. [Credits](#9-credits) +7. [Distributed training](#7-distributed-training) +8. [Credits](#8-credits) <!-- /TOC --> -- GitLab From be3dce52425f9b760bfb691744381c2aee98fa32 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 27 Apr 2021 11:03:13 +0200 Subject: [PATCH 4/8] Added a PyTorch sample. 
The code does work on all machines, but there are a couple of issues. It is
very slow, in fact slower than on a workstation. Also, the job does not end
after training and testing; it continues without any output until the job
time expires.
---
 pytorch/jureca_job.sh         |  25 ++
 pytorch/jusuf_job.sh          |  25 ++
 pytorch/juwels_booster_job.sh |  25 ++
 pytorch/juwels_job.sh         |  25 ++
 pytorch/mnist.py              | 227 ++++++++++++++++++++++
 5 files changed, 327 insertions(+)
 create mode 100755 pytorch/jureca_job.sh
 create mode 100755 pytorch/jusuf_job.sh
 create mode 100755 pytorch/juwels_booster_job.sh
 create mode 100755 pytorch/juwels_job.sh
 create mode 100644 pytorch/mnist.py

diff --git a/pytorch/jureca_job.sh b/pytorch/jureca_job.sh
new file mode 100755
index 0000000..3959b01
--- /dev/null
+++ b/pytorch/jureca_job.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TUTORIAL
+#SBATCH --gres=gpu:4
+#SBATCH --partition=dc-gpu-devel
+
+# Load the required modules
+module load GCC/9.3.0
+module load OpenMPI/4.1.0rc1
+module load PyTorch/1.7.0-Python-3.8.5
+module load torchvision/0.8.1-Python-3.8.5
+module load Horovod/0.20.3-Python-3.8.5
+
+# Make all GPUs visible per node
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+# Run the program
+srun python -u mnist.py
diff --git a/pytorch/jusuf_job.sh b/pytorch/jusuf_job.sh
new file mode 100755
index 0000000..3ac1490
--- /dev/null
+++ b/pytorch/jusuf_job.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=2
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TUTORIAL
+#SBATCH --gres=gpu:1
+#SBATCH --partition=develgpus
+
+# Load the required modules
+module load GCC/9.3.0
+module load OpenMPI/4.1.0rc1
+module load PyTorch/1.7.0-Python-3.8.5
+module load torchvision/0.8.1-Python-3.8.5
+module load Horovod/0.20.3-Python-3.8.5
+
+# Make all GPUs visible per node
+export CUDA_VISIBLE_DEVICES=0
+
+# Run the program
+srun python -u mnist.py
diff --git a/pytorch/juwels_booster_job.sh b/pytorch/juwels_booster_job.sh
new file mode 100755
index 0000000..fd58b1d
--- /dev/null
+++ b/pytorch/juwels_booster_job.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TUTORIAL
+#SBATCH --gres=gpu:4
+#SBATCH --partition=develbooster
+
+# Load the required modules
+module load GCC/9.3.0
+module load OpenMPI/4.1.0rc1
+module load PyTorch/1.7.0-Python-3.8.5
+module load torchvision/0.8.1-Python-3.8.5
+module load Horovod/0.20.3-Python-3.8.5
+
+# Make all GPUs visible per node
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+# Run the program
+srun python -u mnist.py
diff --git a/pytorch/juwels_job.sh b/pytorch/juwels_job.sh
new file mode 100755
index 0000000..b91e237
--- /dev/null
+++ b/pytorch/juwels_job.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TUTORIAL
+#SBATCH --gres=gpu:4
+#SBATCH --partition=develgpus
+
+# Load the required modules
+module load GCC/9.3.0
+module load 
OpenMPI/4.1.0rc1 +module load PyTorch/1.7.0-Python-3.8.5 +module load torchvision/0.8.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/pytorch/mnist.py b/pytorch/mnist.py new file mode 100644 index 0000000..3fa9c44 --- /dev/null +++ b/pytorch/mnist.py @@ -0,0 +1,227 @@ + +import os +import sys +import shutil +import argparse + +import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms +import torch.utils.data.distributed +import horovod.torch as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../utils') +from data_utils import DataValidator + + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') +parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') +parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') +parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') +parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') +parser.add_argument('--use-adasum', action='store_true', default=False, + help='use adasum algorithm to do reduction') +parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, + help='apply gradient predivide factor in optimizer (default: 1.0)') +parser.add_argument('--data-dir', + help='location of the training dataset in the local filesystem (will be downloaded if needed)') + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x) + + +def train(epoch): + model.train() + # Horovod: set epoch to sampler for shuffling. + train_sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + if args.cuda: + data, target = data.cuda(), target.cuda() + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + # Horovod: use train_sampler to determine the number of examples in + # this worker's partition. 
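+            # Note: len(train_sampler) and len(train_loader) refer only to this
+            # rank's shard of the data, so the progress reported here is local
+            # to the worker, not global.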
+ print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_sampler), + 100. * batch_idx / len(train_loader), loss.item())) + + +def metric_average(val, name): + tensor = torch.tensor(val) + avg_tensor = hvd.allreduce(tensor, name=name) + return avg_tensor.item() + + +def test(): + model.eval() + test_loss = 0. + test_accuracy = 0. + for data, target in test_loader: + if args.cuda: + data, target = data.cuda(), target.cuda() + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, size_average=False).item() + # get the index of the max log-probability + pred = output.data.max(1, keepdim=True)[1] + test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() + + # Horovod: use test_sampler to determine the number of examples in + # this worker's partition. + test_loss /= len(test_sampler) + test_accuracy /= len(test_sampler) + + # Horovod: average metric values across workers. + test_loss = metric_average(test_loss, 'avg_loss') + test_accuracy = metric_average(test_accuracy, 'avg_accuracy') + + # Horovod: print output only on first rank. + if hvd.rank() == 0: + print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( + test_loss, 100. * test_accuracy)) + + +if __name__ == '__main__': + args = parser.parse_args() + args.cuda = not args.no_cuda and torch.cuda.is_available() + + # Horovod: initialize library. + hvd.init() + torch.manual_seed(args.seed) + + if args.cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + torch.cuda.manual_seed(args.seed) + + # Horovod: limit # of CPU threads to be used per worker. + torch.set_num_threads(1) + + kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} + # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent + # issues with Infiniband implementations that are not fork-safe + if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and + mp._supports_context and 'forkserver' in mp.get_all_start_methods()): + kwargs['multiprocessing_context'] = 'forkserver' + + # data_dir = args.data_dir or './data' + + # [HPCNS] Name of the dataset file + data_file = 'mnist/pytorch/data' + + # [HPCNS] Path to the directory containing the dataset file + data_dir = DataValidator.validated_data_dir(data_file) + + # [HPCNS] Fully qualified dataset file name + dataset_file = os.path.join(data_dir, data_file) + + # [HPCNS] Dataset filename for this rank + dataset_root_for_rank = f'MNIST-data-{hvd.rank()}' + dataset_for_rank = f'{dataset_root_for_rank}/MNIST' + + # [HPCNS] If the path already exists, remove it + if os.path.exists(dataset_for_rank): + shutil.rmtree(dataset_for_rank) + + # [HPCNS] Make a copy of the dataset for this rank + shutil.copytree(dataset_file, dataset_for_rank) + + train_dataset = \ + datasets.MNIST(dataset_root_for_rank, train=True, download=False, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + + # Horovod: use DistributedSampler to partition the training data. 
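+    # Note: with num_replicas=hvd.size() and rank=hvd.rank(), every worker
+    # draws its own subset of the samples; shuffling is done by the sampler
+    # and re-seeded each epoch via train_sampler.set_epoch() in train().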
+ train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) + + test_dataset = \ + datasets.MNIST(dataset_root_for_rank, train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + # Horovod: use DistributedSampler to partition the test data. + test_sampler = torch.utils.data.distributed.DistributedSampler( + test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) + test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, + sampler=test_sampler, **kwargs) + + model = Net() + + # By default, Adasum doesn't need scaling up learning rate. + lr_scaler = hvd.size() if not args.use_adasum else 1 + + if args.cuda: + # Move model to GPU. + model.cuda() + # If using GPU Adasum allreduce, scale learning rate by local_size. + if args.use_adasum and hvd.nccl_built(): + lr_scaler = hvd.local_size() + + # Horovod: scale learning rate by lr_scaler. + optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, + momentum=args.momentum) + + # Horovod: broadcast parameters & optimizer state. + hvd.broadcast_parameters(model.state_dict(), root_rank=0) + hvd.broadcast_optimizer_state(optimizer, root_rank=0) + + # Horovod: (optional) compression algorithm. + compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none + + # Horovod: wrap optimizer with DistributedOptimizer. + optimizer = hvd.DistributedOptimizer(optimizer, + named_parameters=model.named_parameters(), + compression=compression, + op=hvd.Adasum if args.use_adasum else hvd.Average, + gradient_predivide_factor=args.gradient_predivide_factor) + + for epoch in range(1, args.epochs + 1): + train(epoch) + test() + + # [HPCNS] Remove the copied dataset + shutil.rmtree(dataset_root_for_rank) -- GitLab From cc8ad9a910b9fabc7ef349770ef2887453a80a05 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 27 Apr 2021 11:13:29 +0200 Subject: [PATCH 5/8] Added readme for the PyTorch directory. --- pytorch/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 pytorch/README.md diff --git a/pytorch/README.md b/pytorch/README.md new file mode 100644 index 0000000..def300c --- /dev/null +++ b/pytorch/README.md @@ -0,0 +1,20 @@ +# Notes + +The source code sample was taken from the Horovod examples repository +[here](https://github.com/horovod/horovod/tree/master/examples/pytorch) +(last checked: April 27, 2021). The sample has been slightly modified. Our +changes are limited to, + +* The data loading mechanism. +* Removal of `filelock` to eliminate dependence on a package that is not + available on the supercomputers. +* A few additional comments pertaining to our custom data loading mechanism. + +**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. +All statements that demonstrate the use of Horovod follow a comment beginning +with `[Horovod]` (as added by Horovod developers). + +The following sample is included: + +1. `mnist.py`: A simple training program for an MNIST classifier that + uses Horovod for data distribution. -- GitLab From efc3bd4a90deb407af59a951e44a83f56993f6ed Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 4 May 2021 08:32:43 +0200 Subject: [PATCH 6/8] Updated the course material so that the examples comply with TF2. 
--- .../examples/mnist_epoch_distributed.py | 17 ++++++++--------- course_material/examples/mnist_single_gpu.py | 13 ++++++------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/course_material/examples/mnist_epoch_distributed.py b/course_material/examples/mnist_epoch_distributed.py index 7c9080e..504b2a8 100644 --- a/course_material/examples/mnist_epoch_distributed.py +++ b/course_material/examples/mnist_epoch_distributed.py @@ -4,8 +4,6 @@ # Version 2.0 (see the NOTICE file for details). """ - This program is an adaptation of the following code sample: - https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py. The program creates and trains a shallow ANN for handwritten digit classification using the MNIST dataset. @@ -13,14 +11,14 @@ example epochs are distributed across the Horovod ranks, not data. To run this sample use the following command on your - workstation/laptop equipped with a GPU: + workstation/laptop: - mpirun -np 1 python -u mnist_epoch_distributed.py + mpirun -np 1 python -u mnist_epoch_distributed.py If you have more than one GPU on your system, you can increase the number of ranks accordingly. - The code has been tested with Python 3.7.5, tensorflow-gpu 1.13.1, and + The code has been tested with Python 3.8.7, tensorflow 2.3.1, and horovod 0.16.2. Note: This code will NOT work on the supercomputers. @@ -30,16 +28,17 @@ import math import tensorflow as tf import horovod.tensorflow.keras as hvd -from tensorflow.python.keras import backend as K # Horovod: initialize Horovod. hvd.init() # Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) +gpus = tf.config.experimental.list_physical_devices('GPU') +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) # Reference to the MNIST dataset mnist = tf.keras.datasets.mnist diff --git a/course_material/examples/mnist_single_gpu.py b/course_material/examples/mnist_single_gpu.py index 794150f..2918cd0 100644 --- a/course_material/examples/mnist_single_gpu.py +++ b/course_material/examples/mnist_single_gpu.py @@ -4,17 +4,16 @@ # Version 2.0 (see the NOTICE file for details). """ - This program is an adaptation of the code sample available at - https://www.tensorflow.org/tutorials/. The program creates - and trains a shallow ANN for handwritten digit classification - using the MNIST dataset. + This program is an adaptation of a previously available code sample + at https://www.tensorflow.org/tutorials/. The program creates and trains a + shallow ANN for handwritten digit classification using the MNIST dataset. To run this sample use the following command on your - workstation/laptop equipped with a GPU: + workstation/laptop: - python -u mnist.py + python -u mnist.py - The code has been tested with Python 3.7.5 and tensorflow-gpu 1.13.1. + The code has been tested with Python 3.8.7 and tensorflow 2.3.1 Note: This code will NOT work on the supercomputers. -- GitLab From 66fcb392f79084c45902829e686f9341fd8c59be Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 18 May 2021 09:48:45 +0200 Subject: [PATCH 7/8] Updated 3rd party license information. 
--- NOTICE | 44 +++++++--------------------- training_data_distribution/README.md | 4 +-- 2 files changed, 13 insertions(+), 35 deletions(-) diff --git a/NOTICE b/NOTICE index 22a9d69..11aba54 100644 --- a/NOTICE +++ b/NOTICE @@ -18,7 +18,7 @@ limitations under the License. Tensorflow -Copyright 2016 The TensorFlow Authors. All rights reserved. +Copyright 2019 The TensorFlow Authors. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -34,38 +34,16 @@ limitations under the License. Keras -All contributions by François Chollet: -Copyright (c) 2015 - 2019, François Chollet. -All rights reserved. +Copyright 2015 The TensorFlow Authors. All rights reserved. -All contributions by Google: -Copyright (c) 2015 - 2019, Google, Inc. -All rights reserved. - -All contributions by Microsoft: -Copyright (c) 2017 - 2019, Microsoft, Inc. -All rights reserved. - -All other contributions: -Copyright (c) 2015 - 2019, the respective contributors. -All rights reserved. - -Licensed under The MIT License (MIT) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. + http://www.apache.org/licenses/LICENSE-2.0 -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/training_data_distribution/README.md b/training_data_distribution/README.md index 6e4028e..374f623 100644 --- a/training_data_distribution/README.md +++ b/training_data_distribution/README.md @@ -1,8 +1,8 @@ # Introduction This example distributes the partitioned MNIST data across multiple ranks -for truly data distributed training of a shallow ANN for handwritten digit -classification. +for truly data distributed training of a shallow Artificial Neural Network for +handwritten digit classification. The Horovod framework is used for seamless distributed training. 
However, instead of distributing epochs, this example distributes data amongst the -- GitLab From 912f6813f32aab1d7221aba4523c5e2bf787b894 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 18 May 2021 10:09:54 +0200 Subject: [PATCH 8/8] Removed the pytorch sample, as it has been moved to the tf2_pytorch branch. --- pytorch/README.md | 20 --- pytorch/jureca_job.sh | 25 ---- pytorch/jusuf_job.sh | 25 ---- pytorch/juwels_booster_job.sh | 25 ---- pytorch/juwels_job.sh | 25 ---- pytorch/mnist.py | 227 ---------------------------------- 6 files changed, 347 deletions(-) delete mode 100644 pytorch/README.md delete mode 100755 pytorch/jureca_job.sh delete mode 100755 pytorch/jusuf_job.sh delete mode 100755 pytorch/juwels_booster_job.sh delete mode 100755 pytorch/juwels_job.sh delete mode 100644 pytorch/mnist.py diff --git a/pytorch/README.md b/pytorch/README.md deleted file mode 100644 index def300c..0000000 --- a/pytorch/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Notes - -The source code sample was taken from the Horovod examples repository -[here](https://github.com/horovod/horovod/tree/master/examples/pytorch) -(last checked: April 27, 2021). The sample has been slightly modified. Our -changes are limited to, - -* The data loading mechanism. -* Removal of `filelock` to eliminate dependence on a package that is not - available on the supercomputers. -* A few additional comments pertaining to our custom data loading mechanism. - -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. -All statements that demonstrate the use of Horovod follow a comment beginning -with `[Horovod]` (as added by Horovod developers). - -The following sample is included: - -1. `mnist.py`: A simple training program for an MNIST classifier that - uses Horovod for data distribution. 
diff --git a/pytorch/jureca_job.sh b/pytorch/jureca_job.sh deleted file mode 100755 index 3959b01..0000000 --- a/pytorch/jureca_job.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TUTORIAL -#SBATCH --gres=gpu:4 -#SBATCH --partition=dc-gpu-devel - -# Load the required modules -module load GCC/9.3.0 -module load OpenMPI/4.1.0rc1 -module load PyTorch/1.7.0-Python-3.8.5 -module load torchvision/0.8.1-Python-3.8.5 -module load Horovod/0.20.3-Python-3.8.5 - -# Make all GPUs visible per node -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -# Run the program -srun python -u mnist.py diff --git a/pytorch/jusuf_job.sh b/pytorch/jusuf_job.sh deleted file mode 100755 index 3ac1490..0000000 --- a/pytorch/jusuf_job.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TUTORIAL -#SBATCH --gres=gpu:1 -#SBATCH --partition=develgpus - -# Load the required modules -module load GCC/9.3.0 -module load OpenMPI/4.1.0rc1 -module load PyTorch/1.7.0-Python-3.8.5 -module load torchvision/0.8.1-Python-3.8.5 -module load Horovod/0.20.3-Python-3.8.5 - -# Make all GPUs visible per node -export CUDA_VISIBLE_DEVICES=0 - -# Run the program -srun python -u mnist.py diff --git a/pytorch/juwels_booster_job.sh b/pytorch/juwels_booster_job.sh deleted file mode 100755 index fd58b1d..0000000 --- a/pytorch/juwels_booster_job.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TUTORIAL -#SBATCH --gres=gpu:4 -#SBATCH --partition=develbooster - -# Load the required modules -module load GCC/9.3.0 -module load OpenMPI/4.1.0rc1 -module load PyTorch/1.7.0-Python-3.8.5 -module load torchvision/0.8.1-Python-3.8.5 -module load Horovod/0.20.3-Python-3.8.5 - -# Make all GPUs visible per node -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -# Run the program -srun python -u mnist.py diff --git a/pytorch/juwels_job.sh b/pytorch/juwels_job.sh deleted file mode 100755 index b91e237..0000000 --- a/pytorch/juwels_job.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TUTORIAL -#SBATCH --gres=gpu:4 -#SBATCH --partition=develgpus - -# Load the required modules -module load GCC/9.3.0 -module load OpenMPI/4.1.0rc1 -module load PyTorch/1.7.0-Python-3.8.5 -module load torchvision/0.8.1-Python-3.8.5 -module load Horovod/0.20.3-Python-3.8.5 - -# Make all GPUs visible per node -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -# Run the program -srun python -u mnist.py diff --git a/pytorch/mnist.py b/pytorch/mnist.py deleted file mode 100644 index 3fa9c44..0000000 --- a/pytorch/mnist.py +++ /dev/null @@ -1,227 +0,0 @@ - -import os -import sys -import shutil -import argparse - -import torch.multiprocessing as mp -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from torchvision import datasets, transforms -import 
torch.utils.data.distributed -import horovod.torch as hvd - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../utils') -from data_utils import DataValidator - - -# Training settings -parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') -parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') -parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') -parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') -parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') -parser.add_argument('--seed', type=int, default=42, metavar='S', - help='random seed (default: 42)') -parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') -parser.add_argument('--use-adasum', action='store_true', default=False, - help='use adasum algorithm to do reduction') -parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, - help='apply gradient predivide factor in optimizer (default: 1.0)') -parser.add_argument('--data-dir', - help='location of the training dataset in the local filesystem (will be downloaded if needed)') - - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x) - - -def train(epoch): - model.train() - # Horovod: set epoch to sampler for shuffling. - train_sampler.set_epoch(epoch) - for batch_idx, (data, target) in enumerate(train_loader): - if args.cuda: - data, target = data.cuda(), target.cuda() - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % args.log_interval == 0: - # Horovod: use train_sampler to determine the number of examples in - # this worker's partition. - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_sampler), - 100. * batch_idx / len(train_loader), loss.item())) - - -def metric_average(val, name): - tensor = torch.tensor(val) - avg_tensor = hvd.allreduce(tensor, name=name) - return avg_tensor.item() - - -def test(): - model.eval() - test_loss = 0. - test_accuracy = 0. 
- for data, target in test_loader: - if args.cuda: - data, target = data.cuda(), target.cuda() - output = model(data) - # sum up batch loss - test_loss += F.nll_loss(output, target, size_average=False).item() - # get the index of the max log-probability - pred = output.data.max(1, keepdim=True)[1] - test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() - - # Horovod: use test_sampler to determine the number of examples in - # this worker's partition. - test_loss /= len(test_sampler) - test_accuracy /= len(test_sampler) - - # Horovod: average metric values across workers. - test_loss = metric_average(test_loss, 'avg_loss') - test_accuracy = metric_average(test_accuracy, 'avg_accuracy') - - # Horovod: print output only on first rank. - if hvd.rank() == 0: - print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( - test_loss, 100. * test_accuracy)) - - -if __name__ == '__main__': - args = parser.parse_args() - args.cuda = not args.no_cuda and torch.cuda.is_available() - - # Horovod: initialize library. - hvd.init() - torch.manual_seed(args.seed) - - if args.cuda: - # Horovod: pin GPU to local rank. - torch.cuda.set_device(hvd.local_rank()) - torch.cuda.manual_seed(args.seed) - - # Horovod: limit # of CPU threads to be used per worker. - torch.set_num_threads(1) - - kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} - # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent - # issues with Infiniband implementations that are not fork-safe - if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and - mp._supports_context and 'forkserver' in mp.get_all_start_methods()): - kwargs['multiprocessing_context'] = 'forkserver' - - # data_dir = args.data_dir or './data' - - # [HPCNS] Name of the dataset file - data_file = 'mnist/pytorch/data' - - # [HPCNS] Path to the directory containing the dataset file - data_dir = DataValidator.validated_data_dir(data_file) - - # [HPCNS] Fully qualified dataset file name - dataset_file = os.path.join(data_dir, data_file) - - # [HPCNS] Dataset filename for this rank - dataset_root_for_rank = f'MNIST-data-{hvd.rank()}' - dataset_for_rank = f'{dataset_root_for_rank}/MNIST' - - # [HPCNS] If the path already exists, remove it - if os.path.exists(dataset_for_rank): - shutil.rmtree(dataset_for_rank) - - # [HPCNS] Make a copy of the dataset for this rank - shutil.copytree(dataset_file, dataset_for_rank) - - train_dataset = \ - datasets.MNIST(dataset_root_for_rank, train=True, download=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - - # Horovod: use DistributedSampler to partition the training data. - train_sampler = torch.utils.data.distributed.DistributedSampler( - train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) - - test_dataset = \ - datasets.MNIST(dataset_root_for_rank, train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - # Horovod: use DistributedSampler to partition the test data. 
- test_sampler = torch.utils.data.distributed.DistributedSampler( - test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, - sampler=test_sampler, **kwargs) - - model = Net() - - # By default, Adasum doesn't need scaling up learning rate. - lr_scaler = hvd.size() if not args.use_adasum else 1 - - if args.cuda: - # Move model to GPU. - model.cuda() - # If using GPU Adasum allreduce, scale learning rate by local_size. - if args.use_adasum and hvd.nccl_built(): - lr_scaler = hvd.local_size() - - # Horovod: scale learning rate by lr_scaler. - optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, - momentum=args.momentum) - - # Horovod: broadcast parameters & optimizer state. - hvd.broadcast_parameters(model.state_dict(), root_rank=0) - hvd.broadcast_optimizer_state(optimizer, root_rank=0) - - # Horovod: (optional) compression algorithm. - compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - - # Horovod: wrap optimizer with DistributedOptimizer. - optimizer = hvd.DistributedOptimizer(optimizer, - named_parameters=model.named_parameters(), - compression=compression, - op=hvd.Adasum if args.use_adasum else hvd.Average, - gradient_predivide_factor=args.gradient_predivide_factor) - - for epoch in range(1, args.epochs + 1): - train(epoch) - test() - - # [HPCNS] Remove the copied dataset - shutil.rmtree(dataset_root_for_rank) -- GitLab