diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..775c8feb9d6f8d5925ddcf5dc75f13c612e17a3e
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,11 @@
+datasets/mnist/caffe/mnist_test_lmdb/data.mdb filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/caffe/mnist_test_lmdb/lock.mdb filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/caffe/mnist_train_lmdb/data.mdb filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/caffe/mnist_train_lmdb/lock.mdb filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/keras/mnist.npz filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/pytorch/data/processed/training.pt filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/pytorch/data/processed/test.pt filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/raw/t10k-images-idx3-ubyte.gz filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/raw/t10k-labels-idx1-ubyte.gz filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/raw/train-images-idx3-ubyte.gz filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/raw/train-labels-idx1-ubyte.gz filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..51cee13fcf0d14f8f0314f9878f75a4e92248e14
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,115 @@
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+.static_storage/
+.media/
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+venv3/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# PyCharm
+.idea
+
+keras.json
+
+# Tensorflow/keras Checkpoints
+mnist_convnet_model/
diff --git a/README.md b/README.md
index 3128749d575b9b7c91e337269e46c5af01494f64..a832b01d94925dc39a81ee36a55d8d817b2a3b38 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,190 @@
-# ml_dl_on_supercomputers
+# Getting started with ML/DL on Supercomputers
 
-Samples and documentation for the "Getting started with ML/DL on Supercomputers" tutorial.
\ No newline at end of file
+This repository is intended to serve as a tutorial for anyone interested in utilizing the supercomputers 
+available at the JSC for ML/DL related projects. It is assumed that the reader is proficient in one or 
+more of the following frameworks:
+
+*    [Tensorflow](https://www.tensorflow.org/)
+*    [Keras](https://keras.io/)
+*    [PyTorch](https://pytorch.org/)
+*    [Caffe](http://caffe.berkeleyvision.org/)
+*    [Horovod](https://github.com/horovod/horovod)
+
+**Note:** This tutorial is by no means intended as an introduction to ML/DL, or to any of the
+above-mentioned frameworks. If you are interested in educational resources for beginners, please
+visit [this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/wikis/Education) page.
+
+### A word regarding the code samples
+
+Samples for each framework are available in the correspondingly named directory. Each such 
+directory typically contains at least one code sample, which trains a simple artificial neural 
+network on the canonical MNIST hand-written digit classification task. Moreover, job submission 
+scripts are included for all the supercomputers on which this tutorial has been tested. The job 
+scripts will hopefully make it easier to figure out which modules to load. Finally, 
+a `README.md` file contains further information about the contents of the directory.
+
+**Disclaimer:** The samples are not intended to serve as examples of optimized code, nor do they
+represent programming best practices.
+
+### Changes made to support loading of pre-downloaded datasets
+
+It is worth mentioning that all the code samples were taken from the corresponding framework's 
+official samples/tutorials repository, as practitioners are likely familiar with these (links 
+to the original code samples are included in the directory-local `README.md`). However, the 
+original examples are designed to automatically download the required dataset in a 
+framework-defined directory. This is not a feasible option as compute nodes on the supercomputers 
+do not have access to the Internet. Therefore, the samples have been slightly modified to load data from 
+the `datasets` directory included in this repository; specific code changes, at least for now, 
+have been marked by comments prefixed with the `[HPCNS]` tag. For more information see the `README.md` 
+available in the `datasets` directory.
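+
+As an illustration, the typical pattern looks as follows. This is a minimal sketch based on the Keras
+sample in this repository; the `DataValidator` helper lives in the `utils` directory:
+
+    import os
+    import sys
+    from keras.datasets import mnist
+
+    # [HPCNS] Import the DataValidator, which validates and resolves the path
+    # to the pre-downloaded dataset instead of downloading it from the Internet
+    sys.path.insert(0, '../../utils')
+    from data_utils import DataValidator
+
+    # [HPCNS] Name of the dataset file, and the validated path to the data directory
+    data_file = 'mnist/keras/mnist.npz'
+    data_dir = DataValidator.validated_data_dir(data_file)
+
+    # [HPCNS] Load the MNIST dataset from the fully qualified file name
+    (x_train, y_train), (x_test, y_test) = mnist.load_data(os.path.join(data_dir, data_file))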
+
+## 1. Applying for user accounts on supercomputers
+
+In case you do not already have an account on your supercomputer of interest, please take a look at the 
+instructions provided in the following sub-sections.
+
+### 1.1 JURECA and JUWELS
+
+For more information on getting accounts on JURECA and JUWELS, click 
+[here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/ComputingTime/computingTime_node.html).
+
+### 1.2 JURON
+
+To get a user account on JURON, please follow the steps below:
+
+1.  Write an email to [Dirk Pleiter](http://www.fz-juelich.de/SharedDocs/Personen/IAS/JSC/EN/staff/pleiter_d.html?nn=362224), 
+in which you introduce yourself and mention why you need the account.
+2.  Apply for the account via the [JuDoor](https://dspserv.zam.kfa-juelich.de/judoor/login) portal 
+(more information about JuDoor is available [here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/JuDoor.html?nn=945700)).
+If your work is related to the Human Brain Project (HBP), please join the `PCP0` and `CPCP0` projects. 
+Otherwise please join the `PADC` and `CPADC` projects.
+
+## 2. Logging on to the supercomputers
+
+Assuming JURECA is the target supercomputer, the following steps are required to log in
+(more information [here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/JURECA/UserInfo/QuickIntroduction.html?nn=1803700)).
+
+1.  Use SSH to login:
+    
+    `ssh <username>@jureca.fz-juelich.de`
+2.  Upon successful login, activate your project environment (more information [here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/NewUsageModel_node.html)):
+
+    `jutil env activate -p <project name> -A <accounting project name>`
+3.  Change to the project directory:
+
+    `cd $PROJECT`
+
+You should be in your project directory at this point. If you'd like to clone this repository 
+elsewhere, please change to that directory.
+
+**Note:** The same steps are valid for logging on to JURON, except that the server address in 
+step 1 should be: `juron.fz-juelich.de`
+
+## 3. Cloning the repository
+
+In order to store the datasets within the repository, we use Git LFS. This makes cloning the 
+repository a little bit different. Please find below the instructions on how to clone on different 
+systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-juelich.de/lfs/).
+
+**Note:** During the cloning process you will most likely be prompted for your username and 
+password twice; this is as expected.
+
+### 3.1 JURECA
+
+1.  Load the Git LFS module:
+
+    `module load git-lfs/2.6.1`
+2.  Initialize Git LFS:
+
+    `git lfs install`
+3.  Clone the repository, including the datasets:
+
+    `git lfs clone https://gitlab.version.fz-juelich.de/khalid1/dl_framework_testing.git`
+
+### 3.2 JURON
+
+No additional setup is required on JURON. You can simply clone the repository along with the
+datasets using the following command:
+
+    git lfs clone https://gitlab.version.fz-juelich.de/khalid1/dl_framework_testing.git
+
+## 4. Running a sample
+
+Let us consider a scenario where you would like to run the `mnist.py` sample available in the `keras` 
+directory. This sample trains a CNN on MNIST using Keras on a single GPU. The following sub-sections list 
+the steps required for different supercomputers.
+
+### 4.1 JURECA
+
+1.  Assuming you are in the repository root, change to the keras directory:
+
+    `cd keras`
+2.  Submit the job to run the sample:
+
+    `sbatch submit_job_jureca_python3.sh`
+
+That's it; this is all you need for job submission. If you'd like to receive email notifications 
+regarding the status of the job, add the following statement to the "Slurm job configuration" 
+block in the `submit_job_jureca_python3.sh` script (replace `<your email address here>` with your 
+email address).
+
+    #SBATCH --mail-user=<your email address here>
+
+Output from the job is available in the `error` and `output` files, as specified in the job 
+configuration.
+
+### 4.2 JURON
+
+1.  Assuming you are in the repository root, change to the keras directory:
+
+    `cd keras`
+2.  Submit the job to run the sample:
+
+    `bsub < submit_job_juron_python3.sh`
+
+Please note that unlike JURECA, JURON uses LSF for job submission, which is why a different 
+syntax is required for job configuration and submission. Moreover, email notifications are not 
+supported on JURON. For more information on how to use LSF on JURON, use the following command:
+
+    man 7 juron-lsf
+
+Output from the job is available in the `error` and `output` files, as specified in the job 
+configuration.
+
+## 5. Python 2 support
+
+All the code samples are compatible with both Python 2 and Python 3. However, not all frameworks on all 
+machines are available for Python 2 (yet); in certain cases these are only available for Python 3. We have 
+included separate job submission scripts for Python 2 and Python 3. In cases where Python 2 is not 
+supported, only the job submission script for Python 3 is available. We will try our best to make 
+all frameworks available with Python 2 as well, but this will not be a priority as the official support 
+for Python 2 will be discontinued in the year 2020.
+
+## 6. Distributed training
+
+[Horovod](https://github.com/horovod/horovod) provides a simple and efficient solution for 
+training artificial neural networks on multiple GPUs across multiple nodes in a cluster. It can 
+be used with Tensorflow, Keras, and PyTorch (some other frameworks are supported as well, but 
+not Caffe). In this repository, the `horovod` directory contains a sub-directory for each
+compatible framework that has been tested. For example, the `keras` sub-directory contains
+samples that utilize distributed training with Keras and Horovod (more information is available
+in the directory-local `README.md`).
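+
+The essential Horovod-related additions to a training script are small. Below is a condensed sketch,
+adapted from `horovod/keras/mnist.py` in this repository (model definition, data loading, and the
+`model.fit` call are omitted):
+
+    import keras
+    import tensorflow as tf
+    from keras import backend as K
+    import horovod.keras as hvd
+
+    # Initialize Horovod and pin each process to a single GPU
+    hvd.init()
+    config = tf.ConfigProto()
+    config.gpu_options.visible_device_list = str(hvd.local_rank())
+    K.set_session(tf.Session(config=config))
+
+    # Scale the learning rate by the number of workers and wrap the optimizer
+    opt = hvd.DistributedOptimizer(keras.optimizers.Adadelta(1.0 * hvd.size()))
+
+    # Broadcast initial variable states from rank 0 to all other processes
+    callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]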
+
+Please note that Horovod currently only supports data-parallel training, i.e., a strategy in
+which the entire model is replicated on every GPU and it is the data that is distributed across
+the GPUs. If you are interested
+in model-parallel training, where the model itself can be split and distributed, a different 
+solution is required. We hope to add a sample for model-parallel training at a later time.
+
+Caffe does not support multi-node training. However, it has built-in support for [multi-GPU 
+training](https://github.com/BVLC/caffe/blob/master/docs/multigpu.md) on a single node (only
+via the C/C++ interface). The `mnist_cmd` sample in the `caffe` directory contains the job 
+script that can be used to train the model on multiple GPUs. Please see the 
+directory-local `README.md` for further information.
+
+## Credits
+
+*  **Created by:** Fahad Khalid (SLNS/HPCNS, JSC)
+*  **Installation of modules on JURON:** Andreas Herten (HPCNS, JSC)
+*  **Installation of modules on JURECA:** Damian Alvarez (JSC), Rajalekshmi Deepu (SLNS/HPCNS, JSC)
+*  **Review/suggestions/testing:** Kai Krajsek (SLNS/HPCNS, JSC)
diff --git a/caffe/README.md b/caffe/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..db7976c5db0695311aaced72d53bea908f6481a1
--- /dev/null
+++ b/caffe/README.md
@@ -0,0 +1,38 @@
+# Notes
+
+There are three ways in which Caffe can be used:
+1.  As a command line tool with only built-in layers
+2.  As a library from within a Python program. Either only built-in layers can be used, 
+or one or more custom layers can be written in Python.
+3.  As a command line tool with one or more custom C++ layers.
+
+## Caffe as a command line tool
+
+The `mnist_cmd` sub-directory contains configuration and job scripts for running 
+Caffe as a command line tool with only built-in layers. This example represents use 
+case 1 as described above. The `lenet_solver.prototxt` and `lenet_train_test.prototxt` 
+were taken from the MNIST examples directory in the Caffe repository, available 
+[here](https://github.com/BVLC/caffe/tree/master/examples/mnist). Minor changes have 
+been made so that the path to the input dataset is correct. The `caffe` command 
+in the job submission scripts can be modified as follows to run training on 
+all available GPUs on the node (value for the `-gpu` option has been changed from `0` to `all`):
+
+    caffe train --solver=lenet_solver.prototxt -gpu all
+
+## Using Caffe within a Python program
+
+The `lenet_python` sub-directory contains the required files for an example of 
+using Caffe as a library from within a Python program. This corresponds to use case 
+2 as described above. The `train_lenet.py` file contains source code adapted from 
+the IPython notebook `01-learning-lenet.ipynb` available in the Caffe examples 
+[here](https://github.com/BVLC/caffe/tree/master/examples). Running this example 
+generates a learning curve plot (`learning_curve.png`) in the current directory.
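+
+A condensed sketch of the core pattern used in `train_lenet.py` (the network and solver
+definitions live in the accompanying prototxt files):
+
+    import caffe
+
+    # Use the first GPU for training
+    caffe.set_device(0)
+    caffe.set_mode_gpu()
+
+    # Load the solver (and the train/test nets it references), then run SGD steps
+    solver = caffe.SGDSolver('lenet_auto_solver.prototxt')
+    for it in range(200):
+        solver.step(1)
+        train_loss = solver.net.blobs['loss'].data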
+
+## Caffe with custom C++ layers
+
+Working with custom C++ layers requires recompiling Caffe with the custom code. As 
+this is not possible with a system-wide installation, we have decided not to 
+include an example of this use case. Nevertheless, if you must work with custom 
+C++ layers and require assistance, please send an email to the mailing list 
+(more information [here](https://lists.fz-juelich.de/mailman/listinfo/ml)).
+
diff --git a/caffe/lenet_python/lenet_auto_solver.prototxt b/caffe/lenet_python/lenet_auto_solver.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..44af3ad6cecd7a8090902160666e5453622f8be6
--- /dev/null
+++ b/caffe/lenet_python/lenet_auto_solver.prototxt
@@ -0,0 +1,24 @@
+# The train/test net protocol buffer definition
+train_net: "lenet_auto_train.prototxt"
+test_net: "lenet_auto_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of MNIST, we have test batch size 100 and 100 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 100
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.01
+momentum: 0.9
+weight_decay: 0.0005
+# The learning rate policy
+lr_policy: "inv"
+gamma: 0.0001
+power: 0.75
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 10000
+# snapshot intermediate results
+snapshot: 5000
+snapshot_prefix: "snapshots/lenet"
diff --git a/caffe/lenet_python/snapshots/.gitkeep b/caffe/lenet_python/snapshots/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/caffe/lenet_python/submit_job_jureca_python2.sh b/caffe/lenet_python/submit_job_jureca_python2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..75069256157eb55f4122b0ebc2f390b925f89396
--- /dev/null
+++ b/caffe/lenet_python/submit_job_jureca_python2.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=CAFFE_LENET_PYTHON
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load MVAPICH2/2.3-GDR
+module load Caffe/1.0-Python-2.7.15
+
+# Run the program
+srun python -u train_lenet.py
diff --git a/caffe/lenet_python/submit_job_juron_python2.sh b/caffe/lenet_python/submit_job_juron_python2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2025a389b89bb90c6593b598231f14c8fb1fdcf0
--- /dev/null
+++ b/caffe/lenet_python/submit_job_juron_python2.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 1
+#BSUB -R "span[ptile=1]"
+#BSUB -gpu "num=1"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J CAFFE_LENET_PYTHON
+
+# Load the Python and Caffe modules
+module load python/2.7.14
+module load caffe/1.0-gcc_5.4.0-cuda_10.0.130
+
+# Train LeNet
+python -u train_lenet.py
diff --git a/caffe/lenet_python/submit_job_juron_python3.sh b/caffe/lenet_python/submit_job_juron_python3.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7e737766bcb4ee609fdefab0d52f6adcc95e12e8
--- /dev/null
+++ b/caffe/lenet_python/submit_job_juron_python3.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 1
+#BSUB -R "span[ptile=1]"
+#BSUB -gpu "num=1"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J CAFFE_LENET_PYTHON
+
+# Load the Python and Caffe modules
+module load python/3.6.1
+module load caffe/1.0-gcc_5.4.0-cuda_10.0.130
+
+# Train LeNet
+python -u train_lenet.py
diff --git a/caffe/lenet_python/train_lenet.py b/caffe/lenet_python/train_lenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad5cae3bf4d6a7f1f9a418b802418714efb6ee67
--- /dev/null
+++ b/caffe/lenet_python/train_lenet.py
@@ -0,0 +1,107 @@
+from __future__ import print_function
+import os
+import sys
+import matplotlib
+
+# Force matplotlib to not use any Xwindows backend.
+matplotlib.use('Agg')
+import pylab
+
+import caffe
+from caffe import layers as L, params as P
+
+# Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../../utils')
+from data_utils import DataValidator
+
+
+# Prepares network specification
+def lenet(lmdb, batch_size):
+    # Caffe's version of LeNet: a series of linear and simple nonlinear transformations
+    n = caffe.NetSpec()
+
+    n.data, n.label = L.Data(batch_size=batch_size, backend=P.Data.LMDB, source=lmdb,
+                             transform_param=dict(scale=1. / 255), ntop=2)
+
+    n.conv1 = L.Convolution(n.data, kernel_size=5, num_output=20, weight_filler=dict(type='xavier'))
+    n.pool1 = L.Pooling(n.conv1, kernel_size=2, stride=2, pool=P.Pooling.MAX)
+    n.conv2 = L.Convolution(n.pool1, kernel_size=5, num_output=50, weight_filler=dict(type='xavier'))
+    n.pool2 = L.Pooling(n.conv2, kernel_size=2, stride=2, pool=P.Pooling.MAX)
+    n.fc1 = L.InnerProduct(n.pool2, num_output=500, weight_filler=dict(type='xavier'))
+    n.relu1 = L.ReLU(n.fc1, in_place=True)
+    n.score = L.InnerProduct(n.relu1, num_output=10, weight_filler=dict(type='xavier'))
+    n.loss = L.SoftmaxWithLoss(n.score, n.label)
+
+    return n.to_proto()
+
+
+# Names of the directories containing the LMDB files for TRAIN and TEST phases
+test_dir = 'mnist/caffe/mnist_test_lmdb'
+train_dir = 'mnist/caffe/mnist_train_lmdb'
+
+# Validate the train and test data directories; both share the same data root
+DataValidator.validated_data_dir(train_dir)
+data_dir = DataValidator.validated_data_dir(test_dir)
+
+# Write the prototxt for TRAIN phase
+with open('lenet_auto_train.prototxt', 'w') as f:
+    f.write(str(lenet(os.path.join(data_dir, train_dir), 64)))
+
+# Write the prototxt for TEST phase
+with open('lenet_auto_test.prototxt', 'w') as f:
+    f.write(str(lenet(os.path.join(data_dir, test_dir), 100)))
+
+# Use the GPU for training
+caffe.set_device(0)
+caffe.set_mode_gpu()
+
+# Load the solver and create train and test nets
+solver = None  # ignore this workaround for lmdb data (can't instantiate two solvers on the same data)
+solver = caffe.SGDSolver('lenet_auto_solver.prototxt')
+
+solver.net.forward()  # train net
+solver.test_nets[0].forward()  # test net (there can be more than one)
+
+niter = 200
+test_interval = 25
+# losses will also be stored in the log
+train_loss = pylab.zeros(niter)
+test_acc = pylab.zeros(int(pylab.ceil(niter / test_interval)))
+output = pylab.zeros((niter, 8, 10))
+
+# the main solver loop
+for it in range(niter):
+    solver.step(1)  # SGD by Caffe
+
+    # store the train loss
+    train_loss[it] = solver.net.blobs['loss'].data
+
+    # store the output on the first test batch
+    # (start the forward pass at conv1 to avoid loading new data)
+    solver.test_nets[0].forward(start='conv1')
+    output[it] = solver.test_nets[0].blobs['score'].data[:8]
+
+    # run a full test every so often
+    # (Caffe can also do this for us and write to a log, but we show here
+    #  how to do it directly in Python, where more complicated things are easier.)
+    if it % test_interval == 0:
+        print('Iteration', it, 'testing...')
+        correct = 0
+        for test_it in range(100):
+            solver.test_nets[0].forward()
+            correct += sum(solver.test_nets[0].blobs['score'].data.argmax(1)
+                           == solver.test_nets[0].blobs['label'].data)
+        test_acc[it // test_interval] = correct / 1e4
+
+# Plot the training curve
+_, ax1 = pylab.subplots()
+ax2 = ax1.twinx()
+ax1.plot(pylab.arange(niter), train_loss)
+ax2.plot(test_interval * pylab.arange(len(test_acc)), test_acc, 'r')
+ax1.set_xlabel('iteration')
+ax1.set_ylabel('train loss')
+ax2.set_ylabel('test accuracy')
+ax2.set_title('Test Accuracy: {:.2f}'.format(test_acc[-1]))
+
+# Save the plot to file. Use "bbox_inches='tight'" to remove surrounding whitespace
+pylab.savefig('learning_curve.png', bbox_inches='tight')
diff --git a/caffe/mnist_cmd/lenet_solver.prototxt b/caffe/mnist_cmd/lenet_solver.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..103b2e757061c84e3bb00a83a54f55606b3ce64b
--- /dev/null
+++ b/caffe/mnist_cmd/lenet_solver.prototxt
@@ -0,0 +1,25 @@
+# The train/test net protocol buffer definition
+net: "lenet_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of MNIST, we have test batch size 100 and 100 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 100
+# Carry out testing every 500 training iterations.
+test_interval: 500
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.01
+momentum: 0.9
+weight_decay: 0.0005
+# The learning rate policy
+lr_policy: "inv"
+gamma: 0.0001
+power: 0.75
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 10000
+# snapshot intermediate results
+snapshot: 5000
+snapshot_prefix: "snapshots/lenet"
+# solver mode: CPU or GPU
+solver_mode: GPU
diff --git a/caffe/mnist_cmd/lenet_train_test.prototxt b/caffe/mnist_cmd/lenet_train_test.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..f34ab716ec5467584ac059af3bd5d087a9d2fb34
--- /dev/null
+++ b/caffe/mnist_cmd/lenet_train_test.prototxt
@@ -0,0 +1,168 @@
+name: "LeNet"
+layer {
+  name: "mnist"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  data_param {
+    source: "../../datasets/mnist/caffe/mnist_train_lmdb"
+    batch_size: 64
+    backend: LMDB
+  }
+}
+layer {
+  name: "mnist"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    scale: 0.00390625
+  }
+  data_param {
+    source: "../../datasets/mnist/caffe/mnist_test_lmdb"
+    batch_size: 100
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 20
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 50
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "conv2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 2
+    stride: 2
+  }
+}
+layer {
+  name: "ip1"
+  type: "InnerProduct"
+  bottom: "pool2"
+  top: "ip1"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 500
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "ip1"
+  top: "ip1"
+}
+layer {
+  name: "ip2"
+  type: "InnerProduct"
+  bottom: "ip1"
+  top: "ip2"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "xavier"
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "ip2"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "ip2"
+  bottom: "label"
+  top: "loss"
+}
diff --git a/caffe/mnist_cmd/snapshots/.gitkeep b/caffe/mnist_cmd/snapshots/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/caffe/mnist_cmd/submit_job_jureca_python2.sh b/caffe/mnist_cmd/submit_job_jureca_python2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..029520e3308a4e322cfd14c3d863e982fb5ac02e
--- /dev/null
+++ b/caffe/mnist_cmd/submit_job_jureca_python2.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=CAFFE_MNIST_CMD
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load MVAPICH2/2.3-GDR
+module load Caffe/1.0-Python-2.7.15
+
+# Train the model using the 'caffe' binary
+srun caffe train --solver=lenet_solver.prototxt -gpu 0
\ No newline at end of file
diff --git a/caffe/mnist_cmd/submit_job_juron_python2.sh b/caffe/mnist_cmd/submit_job_juron_python2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b5ee63c60aa1dddad9708367d6623deccc57022f
--- /dev/null
+++ b/caffe/mnist_cmd/submit_job_juron_python2.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 1
+#BSUB -R "span[ptile=1]"
+#BSUB -gpu "num=1"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J CAFFE_MNIST_CMD
+
+# Load the Python and Caffe modules
+module load python/2.7.14
+module load caffe/1.0-gcc_5.4.0-cuda_10.0.130
+
+# Train a model for MNIST
+caffe train --solver=lenet_solver.prototxt -gpu 0
\ No newline at end of file
diff --git a/caffe/mnist_cmd/submit_job_juron_python3.sh b/caffe/mnist_cmd/submit_job_juron_python3.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bdac4a2aef6d670bff2fcf4a928bf3586df3781b
--- /dev/null
+++ b/caffe/mnist_cmd/submit_job_juron_python3.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 1
+#BSUB -R "span[ptile=1]"
+#BSUB -gpu "num=1"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J CAFFE_MNIST_CMD
+
+# Load the Python and Caffe modules
+module load python/3.6.1
+module load caffe/1.0-gcc_5.4.0-cuda_10.0.130
+
+# Train a model for MNIST
+caffe train --solver=lenet_solver.prototxt -gpu 0
diff --git a/datasets/README.md b/datasets/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..19e9a405851d6d230941357bb39546e4d31284e0
--- /dev/null
+++ b/datasets/README.md
@@ -0,0 +1,19 @@
+# Notes
+
+To keep the code samples as simple as possible, all examples use the 
+[MNIST](http://yann.lecun.com/exdb/mnist/) dataset for training a Convolutional 
+Neural Network on the hand-written digit classification problem. Furthermore, we 
+decided to take code samples from the official models/examples repositories 
+maintained by the respective framework developers, as these are the same samples one 
+uses when getting started with the framework. 
+
+However, the original examples are designed to automatically download the required 
+dataset in a framework-defined directory. This is not a feasible option as compute 
+nodes on the supercomputers do not have access to the Internet. Therefore, the samples 
+have been slightly modified to load data from this `datasets` directory. It contains 
+the MNIST dataset in several formats, as each framework's sample expects the dataset in a 
+different format.
+
+It is possible to set the `DL_TEST_DATA_HOME` environment variable to point to a 
+different directory; however, that directory must contain a 
+recursive copy of the `mnist` sub-directory as available here.
\ No newline at end of file
diff --git a/datasets/mnist/caffe/mnist_test_lmdb/data.mdb b/datasets/mnist/caffe/mnist_test_lmdb/data.mdb
new file mode 100644
index 0000000000000000000000000000000000000000..760ab4233ddcb5b432bac7ad418179c380c18127
--- /dev/null
+++ b/datasets/mnist/caffe/mnist_test_lmdb/data.mdb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a70974534a27eaa5dc42638940ad311981b0259f1f089ea46c695bfd9c1862da
+size 8749056
diff --git a/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb b/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb
new file mode 100644
index 0000000000000000000000000000000000000000..eda8c00824c606c2c5eb4d5db6ccbbfb85da9a01
--- /dev/null
+++ b/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0667461174c505913de02429312bcbd9c6cab774b4495c7a2bbe7061ce3ccea
+size 8192
diff --git a/datasets/mnist/caffe/mnist_train_lmdb/data.mdb b/datasets/mnist/caffe/mnist_train_lmdb/data.mdb
new file mode 100644
index 0000000000000000000000000000000000000000..4432b2e157c90b01c117caabfd241e9e54e46bee
--- /dev/null
+++ b/datasets/mnist/caffe/mnist_train_lmdb/data.mdb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3eea94f5e1ea128f16ff0e18f9e287cc2676a54a3218105c525e602f375666c1
+size 50757632
diff --git a/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb b/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb
new file mode 100644
index 0000000000000000000000000000000000000000..d961b47989b1ea9cda34eb5a19ed516938c40482
--- /dev/null
+++ b/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33569d983c9d6d527cd7d3202c31a2a7395b254fb8076f59b84ecaecb9207906
+size 8192
diff --git a/datasets/mnist/keras/mnist.npz b/datasets/mnist/keras/mnist.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c0329306fa8ab17b093038c6fc3033f6a5314f61
--- /dev/null
+++ b/datasets/mnist/keras/mnist.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1
+size 11490434
diff --git a/datasets/mnist/pytorch/data/processed/test.pt b/datasets/mnist/pytorch/data/processed/test.pt
new file mode 100644
index 0000000000000000000000000000000000000000..94b65e861140519fae72363621049cc6a0c231c7
--- /dev/null
+++ b/datasets/mnist/pytorch/data/processed/test.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:351753ceb47ffe74395c0689c44f4e5f3eacd8f8c9d9382531d0e1b86a72eb82
+size 7920442
diff --git a/datasets/mnist/pytorch/data/processed/training.pt b/datasets/mnist/pytorch/data/processed/training.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4f292961bf2fbad4e68d0b39fba704b3f0df41cc
--- /dev/null
+++ b/datasets/mnist/pytorch/data/processed/training.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ca0471d295e5146aed72b4bc651509d4aa83210a8b23cab01e6472152c825ed
+size 47520442
diff --git a/datasets/mnist/raw/t10k-images-idx3-ubyte.gz b/datasets/mnist/raw/t10k-images-idx3-ubyte.gz
new file mode 100644
index 0000000000000000000000000000000000000000..aa17dfe485689242a90be276702dcadd17d406f4
--- /dev/null
+++ b/datasets/mnist/raw/t10k-images-idx3-ubyte.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d422c7b0a1c1c79245a5bcf07fe86e33eeafee792b84584aec276f5a2dbc4e6
+size 1648877
diff --git a/datasets/mnist/raw/t10k-labels-idx1-ubyte.gz b/datasets/mnist/raw/t10k-labels-idx1-ubyte.gz
new file mode 100644
index 0000000000000000000000000000000000000000..d1995bebe8e5b3faeaae99149ce4eb7a68c5764d
--- /dev/null
+++ b/datasets/mnist/raw/t10k-labels-idx1-ubyte.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7ae60f92e00ec6debd23a6088c31dbd2371eca3ffa0defaefb259924204aec6
+size 4542
diff --git a/datasets/mnist/raw/train-images-idx3-ubyte.gz b/datasets/mnist/raw/train-images-idx3-ubyte.gz
new file mode 100644
index 0000000000000000000000000000000000000000..9e9852c14333d6b633709fec2c6df84941243c9d
--- /dev/null
+++ b/datasets/mnist/raw/train-images-idx3-ubyte.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609
+size 9912422
diff --git a/datasets/mnist/raw/train-labels-idx1-ubyte.gz b/datasets/mnist/raw/train-labels-idx1-ubyte.gz
new file mode 100644
index 0000000000000000000000000000000000000000..a7ebf9b5b685e9014530844158807071ae717f7f
--- /dev/null
+++ b/datasets/mnist/raw/train-labels-idx1-ubyte.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c
+size 28881
diff --git a/horovod/README.md b/horovod/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..35499659b5f85b8b1de15b0ea310bd0a1b84cba2
--- /dev/null
+++ b/horovod/README.md
@@ -0,0 +1,53 @@
+# Notes
+
+All source code samples were taken from the Horovod examples repository 
+[here](https://github.com/uber/horovod/tree/master/examples) 
+(last checked: February 19, 2019). The samples that work with MNIST data have been 
+slightly modified. Our changes are limited to:
+
+*  The data loading mechanism
+*  A bit of code cleanup
+*  A few additional comments pertaining to our custom data loading mechanism
+
+**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. All 
+statements that demonstrate the use of Horovod follow a comment beginning with 
+`[Horovod]` (as added by Horovod developers).
+
+**Caution:** Where job submission scripts are available for both Python 2 and Python 3, please 
+do not submit both Python 2 and Python 3 jobs simultaneously, as one of the jobs might fail. If 
+you would like to try both, please run them one after the other.
+
+## Keras samples
+
+The following Keras samples are included:
+
+1.  `mnist.py`: A simple MNIST processing example with only the essential Horovod code 
+for distributed training.
+2.  `mnist_advanced.py`: This sample is largely the same as `mnist.py`, but a few more 
+advanced Horovod features are used (see the sketch below).
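+
+A condensed look at the additional Horovod callbacks used in `mnist_advanced.py` (excerpted from
+that sample; `keras` and `horovod.keras` are imported as in `mnist.py`):
+
+    import keras
+    import horovod.keras as hvd
+
+    callbacks = [
+        # Broadcast initial variable states from rank 0 to all other processes
+        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
+
+        # Average metrics among workers at the end of every epoch
+        hvd.callbacks.MetricAverageCallback(),
+
+        # Gradually scale the learning rate up to lr * hvd.size() during the first five epochs
+        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
+
+        # Reduce the learning rate if training plateaues
+        keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
+    ]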
+
+## PyTorch samples
+
+The following PyTorch samples are included:
+
+1.  `mnist.py`: Demonstrates distributed training using Horovod with PyTorch. A 
+simple convolutional neural network is trained on the MNIST dataset (see the sketch below).
+2.  `synthetic_benchmark.py`: A benchmark that can be used to measure performance 
+of PyTorch with Horovod without using any external dataset.
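+
+A condensed sketch of the Horovod-related additions in the PyTorch `mnist.py` sample (the
+dataset, model, and base optimizer definitions are omitted):
+
+    import torch
+    import torch.utils.data.distributed
+    import horovod.torch as hvd
+
+    # Initialize Horovod and pin each process to a single GPU
+    hvd.init()
+    torch.cuda.set_device(hvd.local_rank())
+
+    # Partition the training data across the workers
+    train_sampler = torch.utils.data.distributed.DistributedSampler(
+        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
+
+    # Broadcast initial parameters from rank 0 and wrap the optimizer
+    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
+    optimizer = hvd.DistributedOptimizer(optimizer,
+                                         named_parameters=model.named_parameters())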
+
+**Note:** The job scripts for JURECA are prefixed with `.` for these samples, so that 
+these scripts do not appear in the directory listing; our testing revealed issues with 
+multi-node training for these samples. As soon as the issue has been resolved, we'll 
+make the scripts available.
+
+## Tensorflow samples
+
+The following Tensorflow samples are included:
+
+1.  `mnist.py`: Demonstrates distributed training using Horovod with the low-level
+Tensorflow API. A simple convolutional neural network is trained on the MNIST dataset.
+2.  `mnist_estimator.py`: Demonstrates distributed training using Horovod with the
+high-level Estimator API in Tensorflow. A simple convolutional neural network is 
+trained on the MNIST dataset.
+3.  `synthetic_benchmark.py`: A simple benchmark that can be used to measure performance 
+of Tensorflow with Horovod without using any external dataset.
diff --git a/horovod/keras/checkpoints/.gitkeep b/horovod/keras/checkpoints/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/horovod/keras/mnist.py b/horovod/keras/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..b098f2602a721c5e5f1089aca9abc352b21645f1
--- /dev/null
+++ b/horovod/keras/mnist.py
@@ -0,0 +1,111 @@
+from __future__ import print_function
+import os
+import sys
+import keras
+from keras.datasets import mnist
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Flatten
+from keras.layers import Conv2D, MaxPooling2D
+from keras import backend as K
+import math
+import tensorflow as tf
+import horovod.keras as hvd
+
+# [HPCNS] Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../../utils')
+from data_utils import DataValidator
+
+# [HPCNS] Name of the dataset file
+data_file = 'mnist/keras/mnist.npz'
+
+# [HPCNS] Path to the directory containing the dataset file
+data_dir = DataValidator.validated_data_dir(data_file)
+
+# Horovod: initialize Horovod.
+hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+config = tf.ConfigProto()
+config.gpu_options.allow_growth = True
+config.gpu_options.visible_device_list = str(hvd.local_rank())
+K.set_session(tf.Session(config=config))
+
+batch_size = 128
+num_classes = 10
+
+# Horovod: adjust number of epochs based on number of GPUs.
+epochs = int(math.ceil(12.0 / hvd.size()))
+
+# Input image dimensions
+img_rows, img_cols = 28, 28
+
+# [HPCNS] Fully qualified dataset file name
+dataset_file = os.path.join(data_dir, data_file)
+
+# [HPCNS] Load MNIST dataset
+(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file)
+
+if K.image_data_format() == 'channels_first':
+    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
+    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
+    input_shape = (1, img_rows, img_cols)
+else:
+    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
+    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
+    input_shape = (img_rows, img_cols, 1)
+
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+print('x_train shape:', x_train.shape)
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
+
+# Convert class vectors to binary class matrices
+y_train = keras.utils.to_categorical(y_train, num_classes)
+y_test = keras.utils.to_categorical(y_test, num_classes)
+
+model = Sequential()
+model.add(Conv2D(32, kernel_size=(3, 3),
+                 activation='relu',
+                 input_shape=input_shape))
+model.add(Conv2D(64, (3, 3), activation='relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Dropout(0.25))
+model.add(Flatten())
+model.add(Dense(128, activation='relu'))
+model.add(Dropout(0.5))
+model.add(Dense(num_classes, activation='softmax'))
+
+# Horovod: adjust learning rate based on number of GPUs.
+opt = keras.optimizers.Adadelta(1.0 * hvd.size())
+
+# Horovod: add Horovod Distributed Optimizer.
+opt = hvd.DistributedOptimizer(opt)
+
+model.compile(loss=keras.losses.categorical_crossentropy,
+              optimizer=opt,
+              metrics=['accuracy'])
+
+callbacks = [
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
+]
+
+# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
+if hvd.rank() == 0:
+    callbacks.append(keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5'))
+
+model.fit(x_train, y_train,
+          batch_size=batch_size,
+          callbacks=callbacks,
+          epochs=epochs,
+          verbose=1,
+          validation_data=(x_test, y_test))
+score = model.evaluate(x_test, y_test, verbose=0)
+print('Test loss:', score[0])
+print('Test accuracy:', score[1])
diff --git a/horovod/keras/mnist_advanced.py b/horovod/keras/mnist_advanced.py
new file mode 100644
index 0000000000000000000000000000000000000000..9337026bc92b2bd2b4b570976381f8c01f2f87b8
--- /dev/null
+++ b/horovod/keras/mnist_advanced.py
@@ -0,0 +1,143 @@
+from __future__ import print_function
+import os
+import sys
+import keras
+from keras.datasets import mnist
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Flatten
+from keras.layers import Conv2D, MaxPooling2D
+from keras.preprocessing.image import ImageDataGenerator
+from keras import backend as K
+import tensorflow as tf
+import horovod.keras as hvd
+
+# [HPCNS] Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../../utils')
+from data_utils import DataValidator
+
+# [HPCNS] Name of the dataset file
+data_file = 'mnist/keras/mnist.npz'
+
+# [HPCNS] Path to the directory containing the dataset file
+data_dir = DataValidator.validated_data_dir(data_file)
+
+# Horovod: initialize Horovod.
+hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+config = tf.ConfigProto()
+config.gpu_options.allow_growth = True
+config.gpu_options.visible_device_list = str(hvd.local_rank())
+K.set_session(tf.Session(config=config))
+
+batch_size = 128
+num_classes = 10
+
+# Enough epochs to demonstrate learning rate warmup and the reduction of
+# learning rate when training plateaues.
+epochs = 12
+
+# Input image dimensions
+img_rows, img_cols = 28, 28
+
+# [HPCNS] Fully qualified dataset file name
+dataset_file = os.path.join(data_dir, data_file)
+
+# [HPCNS] Load MNIST dataset.
+(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file)
+
+# Determine how many batches are there in train and test sets
+train_batches = len(x_train) // batch_size
+test_batches = len(x_test) // batch_size
+
+if K.image_data_format() == 'channels_first':
+    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
+    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
+    input_shape = (1, img_rows, img_cols)
+else:
+    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
+    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
+    input_shape = (img_rows, img_cols, 1)
+
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+print('x_train shape:', x_train.shape)
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
+
+# Convert class vectors to binary class matrices
+y_train = keras.utils.to_categorical(y_train, num_classes)
+y_test = keras.utils.to_categorical(y_test, num_classes)
+
+model = Sequential()
+model.add(Conv2D(32, kernel_size=(3, 3),
+                 activation='relu',
+                 input_shape=input_shape))
+model.add(Conv2D(64, (3, 3), activation='relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Dropout(0.25))
+model.add(Flatten())
+model.add(Dense(128, activation='relu'))
+model.add(Dropout(0.5))
+model.add(Dense(num_classes, activation='softmax'))
+
+# Horovod: adjust learning rate based on number of GPUs.
+opt = keras.optimizers.Adadelta(lr=1.0 * hvd.size())
+
+# Horovod: add Horovod Distributed Optimizer.
+opt = hvd.DistributedOptimizer(opt)
+
+model.compile(loss=keras.losses.categorical_crossentropy,
+              optimizer=opt,
+              metrics=['accuracy'])
+
+callbacks = [
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
+
+    # Horovod: average metrics among workers at the end of every epoch.
+    #
+    # Note: This callback must be in the list before the ReduceLROnPlateau,
+    # TensorBoard or other metrics-based callbacks.
+    hvd.callbacks.MetricAverageCallback(),
+
+    # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
+    # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
+    # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
+    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
+
+    # Reduce the learning rate if training plateaues.
+    keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
+]
+
+# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
+if hvd.rank() == 0:
+    callbacks.append(keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5'))
+
+# Set up ImageDataGenerators to do data augmentation for the training images.
+train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3,
+                               height_shift_range=0.08, zoom_range=0.08)
+test_gen = ImageDataGenerator()
+
+# Train the model.
+# Horovod: the training will randomly sample 1 / N batches of training data and
+# 3 / N batches of validation data on every worker, where N is the number of workers.
+# Over-sampling of validation data helps to increase probability that every validation
+# example will be evaluated.
+model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size),
+                    steps_per_epoch=train_batches // hvd.size(),
+                    callbacks=callbacks,
+                    epochs=epochs,
+                    verbose=1,
+                    validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size),
+                    validation_steps=3 * test_batches // hvd.size())
+
+# Evaluate the model on the full data set.
+score = model.evaluate(x_test, y_test, verbose=0)
+print('Test loss:', score[0])
+print('Test accuracy:', score[1])
diff --git a/horovod/keras/run_on_localMachine.sh b/horovod/keras/run_on_localMachine.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9c9afb4b58ee9f4a42480997dd298b6e33c71a35
--- /dev/null
+++ b/horovod/keras/run_on_localMachine.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+# Run the program
+mpirun -np 1 -H localhost:1 \
+    -bind-to none -map-by slot \
+    -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \
+    -mca pml ob1 -mca btl ^openib \
+    python -u mnist.py
diff --git a/horovod/keras/submit_job_jureca_python2.sh b/horovod/keras/submit_job_jureca_python2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d3f39c54154eec58a32d10ecc61f44516af76301
--- /dev/null
+++ b/horovod/keras/submit_job_jureca_python2.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=2
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_KERAS_MNIST
+#SBATCH --gres=gpu:2 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load MVAPICH2/2.3-GDR
+module load TensorFlow/1.12.0-GPU-Python-2.7.15
+module load Keras/2.2.4-GPU-Python-2.7.15
+module load Horovod/0.15.2-GPU-Python-2.7.15
+
+# Run the program
+srun python -u mnist.py
diff --git a/horovod/keras/submit_job_jureca_python3.sh b/horovod/keras/submit_job_jureca_python3.sh
new file mode 100755
index 0000000000000000000000000000000000000000..33ba711d3a5f77acc09241b76dc82b404cb48220
--- /dev/null
+++ b/horovod/keras/submit_job_jureca_python3.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=2
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_KERAS_MNIST
+#SBATCH --gres=gpu:2 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load MVAPICH2/2.3-GDR
+module load TensorFlow/1.12.0-GPU-Python-3.6.6
+module load Keras/2.2.4-GPU-Python-3.6.6
+module load Horovod/0.15.2-GPU-Python-3.6.6
+
+# Run the program
+srun python -u mnist.py
diff --git a/horovod/keras/submit_job_juron_python2.sh b/horovod/keras/submit_job_juron_python2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cd5f8dd051c5b46502ac9b3256a7ae0e01dc3572
--- /dev/null
+++ b/horovod/keras/submit_job_juron_python2.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 4
+#BSUB -R "span[ptile=2]"
+#BSUB -gpu "num=2"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J HOROVOD_KERAS_MNIST
+
+# Load the required modules
+module load python/2.7.14
+module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
+module load horovod/0.15.2
+module load keras/2.2.4
+
+# Run the program
+mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \
+        -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py
diff --git a/horovod/keras/submit_job_juron_python3.sh b/horovod/keras/submit_job_juron_python3.sh
new file mode 100755
index 0000000000000000000000000000000000000000..03182786d1f52c2cb8cacd9e8c709f1c9d93cc40
--- /dev/null
+++ b/horovod/keras/submit_job_juron_python3.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 4
+#BSUB -R "span[ptile=2]"
+#BSUB -gpu "num=2"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J HOROVOD_KERAS_MNIST
+
+# Load the required modules
+module load python/3.6.1
+module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
+module load horovod/0.15.2
+module load keras/2.2.4
+
+# Run the program
+mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \
+        -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py
diff --git a/horovod/pytorch/.submit_job_jureca_python2.sh b/horovod/pytorch/.submit_job_jureca_python2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..885763864fc144947211309994ec8eb5bf539291
--- /dev/null
+++ b/horovod/pytorch/.submit_job_jureca_python2.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=2
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_PYTORCH_MNIST
+#SBATCH --gres=gpu:2 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load MVAPICH2/2.3-GDR
+module load PyTorch/1.0.0-GPU-Python-2.7.15
+module load torchvision/0.2.1-GPU-Python-2.7.15
+module load Horovod/0.15.2-GPU-Python-2.7.15
+
+# Run the program
+srun python -u mnist.py
diff --git a/horovod/pytorch/.submit_job_jureca_python3.sh b/horovod/pytorch/.submit_job_jureca_python3.sh
new file mode 100755
index 0000000000000000000000000000000000000000..41628882e0e202fb5fea56afabb1c0e3e2dc2a3b
--- /dev/null
+++ b/horovod/pytorch/.submit_job_jureca_python3.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=2
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_PYTORCH_MNIST
+#SBATCH --gres=gpu:2 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load MVAPICH2/2.3-GDR
+module load PyTorch/1.0.0-GPU-Python-3.6.6
+module load torchvision/0.2.1-GPU-Python-3.6.6
+module load Horovod/0.15.2-GPU-Python-3.6.6
+
+# Run the program
+srun python -u mnist.py
diff --git a/horovod/pytorch/mnist.py b/horovod/pytorch/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d1b9c584ab4079dfddc9fe5f6633ad9ab2145b4
--- /dev/null
+++ b/horovod/pytorch/mnist.py
@@ -0,0 +1,195 @@
+from __future__ import print_function
+import os
+import sys
+import shutil
+import argparse
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision import datasets, transforms
+import torch.utils.data.distributed
+import horovod.torch as hvd
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+parser.add_argument('--batch-size', type=int, default=64, metavar='N',
+                    help='input batch size for training (default: 64)')
+parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
+                    help='input batch size for testing (default: 1000)')
+parser.add_argument('--epochs', type=int, default=10, metavar='N',
+                    help='number of epochs to train (default: 10)')
+parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
+                    help='learning rate (default: 0.01)')
+parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
+                    help='SGD momentum (default: 0.5)')
+parser.add_argument('--no-cuda', action='store_true', default=False,
+                    help='disables CUDA training')
+parser.add_argument('--seed', type=int, default=42, metavar='S',
+                    help='random seed (default: 42)')
+parser.add_argument('--log-interval', type=int, default=10, metavar='N',
+                    help='how many batches to wait before logging training status')
+parser.add_argument('--fp16-allreduce', action='store_true', default=False,
+                    help='use fp16 compression during allreduce')
+args = parser.parse_args()
+args.cuda = not args.no_cuda and torch.cuda.is_available()
+
+# [HPCNS] Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../../utils')
+from data_utils import DataValidator
+
+# [HPCNS] Name of the dataset file
+data_file = 'mnist/pytorch/data'
+
+# [HPCNS] Path to the directory containing the dataset file
+data_dir = DataValidator.validated_data_dir(data_file)
+
+# Horovod: initialize library.
+hvd.init()
+torch.manual_seed(args.seed)
+
+if args.cuda:
+    # Horovod: pin GPU to local rank.
+    torch.cuda.set_device(hvd.local_rank())
+    torch.cuda.manual_seed(args.seed)
+
+# [HPCNS] Fully qualified dataset file name
+dataset_file = os.path.join(data_dir, data_file)
+
+# [HPCNS] Dataset filename for this rank
+dataset_for_rank = 'MNIST-data-%d' % hvd.rank()
+
+# [HPCNS] If the path already exists, remove it
+if os.path.exists(dataset_for_rank):
+    shutil.rmtree(dataset_for_rank)
+
+# [HPCNS] Make a copy of the dataset for this rank
+shutil.copytree(dataset_file, dataset_for_rank)
+
+kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
+train_dataset = \
+    datasets.MNIST(dataset_for_rank, train=True, download=False,
+                   transform=transforms.Compose([
+                       transforms.ToTensor(),
+                       transforms.Normalize((0.1307,), (0.3081,))
+                   ]))
+# Horovod: use DistributedSampler to partition the training data.
+train_sampler = torch.utils.data.distributed.DistributedSampler(
+    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
+train_loader = torch.utils.data.DataLoader(
+    train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)
+
+test_dataset = \
+    datasets.MNIST(dataset_for_rank, train=False, download=False, transform=transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.1307,), (0.3081,))
+    ]))
+# Horovod: use DistributedSampler to partition the test data.
+test_sampler = torch.utils.data.distributed.DistributedSampler(
+    test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
+test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size,
+                                          sampler=test_sampler, **kwargs)
+
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
+        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
+        self.conv2_drop = nn.Dropout2d()
+        self.fc1 = nn.Linear(320, 50)
+        self.fc2 = nn.Linear(50, 10)
+
+    def forward(self, x):
+        x = F.relu(F.max_pool2d(self.conv1(x), 2))
+        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
+        x = x.view(-1, 320)
+        x = F.relu(self.fc1(x))
+        x = F.dropout(x, training=self.training)
+        x = self.fc2(x)
+        return F.log_softmax(x, dim=1)
+
+
+model = Net()
+
+if args.cuda:
+    # Move model to GPU.
+    model.cuda()
+
+# Horovod: broadcast parameters.
+hvd.broadcast_parameters(model.state_dict(), root_rank=0)
+
+# Horovod: scale learning rate by the number of GPUs.
+optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(),
+                      momentum=args.momentum)
+
+# Horovod: (optional) compression algorithm.
+compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
+
+# Horovod: wrap optimizer with DistributedOptimizer.
+optimizer = hvd.DistributedOptimizer(optimizer,
+                                     named_parameters=model.named_parameters(),
+                                     compression=compression)
+
+
+def train(epoch):
+    model.train()
+    # Horovod: set epoch to sampler for shuffling.
+    train_sampler.set_epoch(epoch)
+    for batch_idx, (data, target) in enumerate(train_loader):
+        if args.cuda:
+            data, target = data.cuda(), target.cuda()
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % args.log_interval == 0:
+            # Horovod: use train_sampler to determine the number of examples in
+            # this worker's partition.
+            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                epoch, batch_idx * len(data), len(train_sampler),
+                100. * batch_idx / len(train_loader), loss.item()))
+
+
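+# Horovod: compute the mean of a scalar metric across all workers via allreduce.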
+def metric_average(val, name):
+    tensor = torch.tensor(val)
+    avg_tensor = hvd.allreduce(tensor, name=name)
+    return avg_tensor.item()
+
+
+def test():
+    model.eval()
+    test_loss = 0.
+    test_accuracy = 0.
+    for data, target in test_loader:
+        if args.cuda:
+            data, target = data.cuda(), target.cuda()
+        output = model(data)
+        # sum up batch loss
+        test_loss += F.nll_loss(output, target, reduction='sum').item()
+        # get the index of the max log-probability
+        pred = output.data.max(1, keepdim=True)[1]
+        test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum()
+
+    # Horovod: use test_sampler to determine the number of examples in
+    # this worker's partition.
+    test_loss /= len(test_sampler)
+    test_accuracy /= len(test_sampler)
+
+    # Horovod: average metric values across workers.
+    test_loss = metric_average(test_loss, 'avg_loss')
+    test_accuracy = metric_average(test_accuracy, 'avg_accuracy')
+
+    # Horovod: print output only on first rank.
+    if hvd.rank() == 0:
+        print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
+            test_loss, 100. * test_accuracy))
+
+
+for epoch in range(1, args.epochs + 1):
+    train(epoch)
+    test()
+
+# [HPCNS] Remove the copied dataset
+shutil.rmtree(dataset_for_rank)
diff --git a/horovod/pytorch/run_on_localMachine.sh b/horovod/pytorch/run_on_localMachine.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9c9afb4b58ee9f4a42480997dd298b6e33c71a35
--- /dev/null
+++ b/horovod/pytorch/run_on_localMachine.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+# Run the program
+mpirun -np 1 -H localhost:1 \
+    -bind-to none -map-by slot \
+    -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \
+    -mca pml ob1 -mca btl ^openib \
+    python -u mnist.py
diff --git a/horovod/pytorch/submit_job_juron_python3.sh b/horovod/pytorch/submit_job_juron_python3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..126c939b04c3f0cf8b3180e251b009c03ad69d0e
--- /dev/null
+++ b/horovod/pytorch/submit_job_juron_python3.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 4
+#BSUB -R "span[ptile=2]"
+#BSUB -gpu "num=2"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J PYTORCH_HOROVOD_MNIST
+
+# Load the required modules
+module load python/3.6.1
+module load pytorch/1.0.1-gcc_5.4.0-cuda_10.0.130
+module load torchvision/0.2.1
+module load horovod/0.15.2
+
+# Run the program
+mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \
+        -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py
diff --git a/horovod/pytorch/synthetic_benchmark.py b/horovod/pytorch/synthetic_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7a177f8b4e8583cb8169d308660fec8b7fc1664
--- /dev/null
+++ b/horovod/pytorch/synthetic_benchmark.py
@@ -0,0 +1,110 @@
+from __future__ import print_function
+
+import argparse
+import torch
+import torch.backends.cudnn as cudnn
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.utils.data.distributed
+from torchvision import models
+import horovod.torch as hvd
+import timeit
+import numpy as np
+
+# Benchmark settings
+parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark',
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--fp16-allreduce', action='store_true', default=False,
+                    help='use fp16 compression during allreduce')
+
+parser.add_argument('--model', type=str, default='resnet50',
+                    help='model to benchmark')
+parser.add_argument('--batch-size', type=int, default=32,
+                    help='input batch size')
+
+parser.add_argument('--num-warmup-batches', type=int, default=10,
+                    help='number of warm-up batches that don\'t count towards benchmark')
+parser.add_argument('--num-batches-per-iter', type=int, default=10,
+                    help='number of batches per benchmark iteration')
+parser.add_argument('--num-iters', type=int, default=10,
+                    help='number of benchmark iterations')
+
+parser.add_argument('--no-cuda', action='store_true', default=False,
+                    help='disables CUDA training')
+
+args = parser.parse_args()
+args.cuda = not args.no_cuda and torch.cuda.is_available()
+
+hvd.init()
+
+if args.cuda:
+    # Horovod: pin GPU to local rank.
+    torch.cuda.set_device(hvd.local_rank())
+
+cudnn.benchmark = True
+
+# Set up standard model.
+model = getattr(models, args.model)()
+
+if args.cuda:
+    # Move model to GPU.
+    model.cuda()
+
+optimizer = optim.SGD(model.parameters(), lr=0.01)
+
+# Horovod: (optional) compression algorithm.
+compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
+
+# Horovod: wrap optimizer with DistributedOptimizer.
+optimizer = hvd.DistributedOptimizer(optimizer,
+                                     named_parameters=model.named_parameters(),
+                                     compression=compression)
+
+# Horovod: broadcast parameters & optimizer state.
+hvd.broadcast_parameters(model.state_dict(), root_rank=0)
+hvd.broadcast_optimizer_state(optimizer, root_rank=0)
+
+# Set up fixed fake data
+data = torch.randn(args.batch_size, 3, 224, 224)
+target = torch.LongTensor(args.batch_size).random_() % 1000
+if args.cuda:
+    data, target = data.cuda(), target.cuda()
+
+
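+# Run a single training step (forward pass, loss, backward pass, update) on the synthetic batch.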
+def benchmark_step():
+    optimizer.zero_grad()
+    output = model(data)
+    loss = F.cross_entropy(output, target)
+    loss.backward()
+    optimizer.step()
+
+
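+# Print only on the first rank to avoid duplicated output from all workers.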
+def log(s, nl=True):
+    if hvd.rank() != 0:
+        return
+    print(s, end='\n' if nl else '')
+
+
+log('Model: %s' % args.model)
+log('Batch size: %d' % args.batch_size)
+device = 'GPU' if args.cuda else 'CPU'
+log('Number of %ss: %d' % (device, hvd.size()))
+
+# Warm-up
+log('Running warmup...')
+timeit.timeit(benchmark_step, number=args.num_warmup_batches)
+
+# Benchmark
+log('Running benchmark...')
+img_secs = []
+for x in range(args.num_iters):
+    time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
+    img_sec = args.batch_size * args.num_batches_per_iter / time
+    log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
+    img_secs.append(img_sec)
+
+# Results
+img_sec_mean = np.mean(img_secs)
+img_sec_conf = 1.96 * np.std(img_secs)
+log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
+log('Total img/sec on %d %s(s): %.1f +-%.1f' %
+    (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))
diff --git a/horovod/tensorflow/checkpoints/.gitkeep b/horovod/tensorflow/checkpoints/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/horovod/tensorflow/mnist.py b/horovod/tensorflow/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..da37944b01335cb3d78b20e5245d9518fae8779e
--- /dev/null
+++ b/horovod/tensorflow/mnist.py
@@ -0,0 +1,169 @@
+# Copyright 2017 Uber Technologies, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import sys
+import tensorflow as tf
+import horovod.tensorflow as hvd
+import numpy as np
+import shutil
+
+from tensorflow import keras
+
+layers = tf.layers
+
+tf.logging.set_verbosity(tf.logging.INFO)
+
+# [HPCNS] Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../../utils')
+from data_utils import DataValidator
+
+# [HPCNS] Name of the dataset file
+data_file = 'mnist/keras/mnist.npz'
+
+# [HPCNS] Path to the directory containing the dataset file
+data_dir = DataValidator.validated_data_dir(data_file)
+
+
+def conv_model(feature, target, mode):
+    """2-layer convolution model."""
+    # Convert the target to a one-hot tensor of shape (batch_size, 10) and
+    # with an on-value of 1 for each one-hot vector of length 10.
+    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)
+
+    # Reshape feature to a 4d tensor with the 2nd and 3rd dimensions being the
+    # image width and height, and the final dimension being the number of color channels.
+    feature = tf.reshape(feature, [-1, 28, 28, 1])
+
+    # First conv layer will compute 32 features for each 5x5 patch
+    with tf.variable_scope('conv_layer1'):
+        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
+                                activation=tf.nn.relu, padding="SAME")
+        h_pool1 = tf.nn.max_pool(
+            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
+
+    # Second conv layer will compute 64 features for each 5x5 patch.
+    with tf.variable_scope('conv_layer2'):
+        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
+                                activation=tf.nn.relu, padding="SAME")
+        h_pool2 = tf.nn.max_pool(
+            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
+        # reshape tensor into a batch of vectors
+        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
+
+    # Densely connected layer with 1024 neurons.
+    h_fc1 = layers.dropout(
+        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
+        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)
+
+    # Compute logits (1 per class) and compute loss.
+    logits = layers.dense(h_fc1, 10, activation=None)
+    loss = tf.losses.softmax_cross_entropy(target, logits)
+
+    return tf.argmax(logits, 1), loss
+
+
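+# Shuffle the training data and yield it in mini-batches indefinitely.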
+def train_input_generator(x_train, y_train, batch_size=64):
+    assert len(x_train) == len(y_train)
+    while True:
+        p = np.random.permutation(len(x_train))
+        x_train, y_train = x_train[p], y_train[p]
+        index = 0
+        while index <= len(x_train) - batch_size:
+            yield x_train[index:index + batch_size], \
+                  y_train[index:index + batch_size],
+            index += batch_size
+
+
+def main(_):
+    # Horovod: initialize Horovod.
+    hvd.init()
+
+    # [HPCNS] Fully qualified dataset file name
+    dataset_file = os.path.join(data_dir, data_file)
+
+    # [HPCNS] Dataset filename for this rank
+    dataset_for_rank = os.path.join(data_dir, 'MNIST-data-%d' % hvd.rank())
+
+    # [HPCNS] Make a copy of the dataset for this rank
+    shutil.copyfile(dataset_file, dataset_for_rank)
+
+    # [HPCNS] Load MNIST dataset
+    (x_train, y_train), (x_test, y_test) = \
+        keras.datasets.mnist.load_data(dataset_for_rank)
+
+    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
+    # into (-1, 784) to feed into our network. Also, need to normalize the
+    # features between 0 and 1.
+    x_train = np.reshape(x_train, (-1, 784)) / 255.0
+    x_test = np.reshape(x_test, (-1, 784)) / 255.0
+
+    # Build model...
+    with tf.name_scope('input'):
+        image = tf.placeholder(tf.float32, [None, 784], name='image')
+        label = tf.placeholder(tf.float32, [None], name='label')
+    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)
+
+    # Horovod: adjust learning rate based on number of GPUs.
+    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())
+
+    # Horovod: add Horovod Distributed Optimizer.
+    opt = hvd.DistributedOptimizer(opt)
+
+    global_step = tf.train.get_or_create_global_step()
+    train_op = opt.minimize(loss, global_step=global_step)
+
+    hooks = [
+        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
+        # from rank 0 to all other processes. This is necessary to ensure consistent
+        # initialization of all workers when training is started with random weights
+        # or restored from a checkpoint.
+        hvd.BroadcastGlobalVariablesHook(0),
+
+        # Horovod: adjust number of steps based on number of GPUs.
+        tf.train.StopAtStepHook(last_step=20000 // hvd.size()),
+
+        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
+                                   every_n_iter=10),
+    ]
+
+    # Horovod: pin GPU to be used to process local rank (one GPU per process)
+    config = tf.ConfigProto()
+    config.gpu_options.allow_growth = True
+    config.gpu_options.visible_device_list = str(hvd.local_rank())
+
+    # Horovod: save checkpoints only on worker 0 to prevent other workers from
+    # corrupting them.
+    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
+    training_batch_generator = train_input_generator(x_train,
+                                                     y_train, batch_size=100)
+    # The MonitoredTrainingSession takes care of session initialization,
+    # restoring from a checkpoint, saving to a checkpoint, and closing when done
+    # or an error occurs.
+    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
+                                           hooks=hooks,
+                                           config=config) as mon_sess:
+        while not mon_sess.should_stop():
+            # Run a training step synchronously.
+            image_, label_ = next(training_batch_generator)
+            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
+
+    # [HPCNS] Remove the copied dataset
+    os.remove(dataset_for_rank)
+
+
+if __name__ == "__main__":
+    tf.app.run()
diff --git a/horovod/tensorflow/mnist_estimator.py b/horovod/tensorflow/mnist_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..861de50549b470685462a688643dbf3cd8e86288
--- /dev/null
+++ b/horovod/tensorflow/mnist_estimator.py
@@ -0,0 +1,223 @@
+#  Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
+#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""Convolutional Neural Network Estimator for MNIST, built with tf.layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import shutil
+import numpy as np
+import tensorflow as tf
+import horovod.tensorflow as hvd
+
+from tensorflow import keras
+
+tf.logging.set_verbosity(tf.logging.INFO)
+
+# [HPCNS] Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../../utils')
+from data_utils import DataValidator
+
+# [HPCNS] Name of the dataset file
+data_file = 'mnist/keras/mnist.npz'
+
+# [HPCNS] Path to the directory containing the dataset file
+data_dir = DataValidator.validated_data_dir(data_file)
+
+
+def cnn_model_fn(features, labels, mode):
+    """Model function for CNN."""
+    # Input Layer
+    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
+    # MNIST images are 28x28 pixels, and have one color channel
+    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
+
+    # Convolutional Layer #1
+    # Computes 32 features using a 5x5 filter with ReLU activation.
+    # Padding is added to preserve width and height.
+    # Input Tensor Shape: [batch_size, 28, 28, 1]
+    # Output Tensor Shape: [batch_size, 28, 28, 32]
+    conv1 = tf.layers.conv2d(
+        inputs=input_layer,
+        filters=32,
+        kernel_size=[5, 5],
+        padding="same",
+        activation=tf.nn.relu)
+
+    # Pooling Layer #1
+    # First max pooling layer with a 2x2 filter and stride of 2
+    # Input Tensor Shape: [batch_size, 28, 28, 32]
+    # Output Tensor Shape: [batch_size, 14, 14, 32]
+    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
+
+    # Convolutional Layer #2
+    # Computes 64 features using a 5x5 filter.
+    # Padding is added to preserve width and height.
+    # Input Tensor Shape: [batch_size, 14, 14, 32]
+    # Output Tensor Shape: [batch_size, 14, 14, 64]
+    conv2 = tf.layers.conv2d(
+        inputs=pool1,
+        filters=64,
+        kernel_size=[5, 5],
+        padding="same",
+        activation=tf.nn.relu)
+
+    # Pooling Layer #2
+    # Second max pooling layer with a 2x2 filter and stride of 2
+    # Input Tensor Shape: [batch_size, 14, 14, 64]
+    # Output Tensor Shape: [batch_size, 7, 7, 64]
+    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
+
+    # Flatten tensor into a batch of vectors
+    # Input Tensor Shape: [batch_size, 7, 7, 64]
+    # Output Tensor Shape: [batch_size, 7 * 7 * 64]
+    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
+
+    # Dense Layer
+    # Densely connected layer with 1024 neurons
+    # Input Tensor Shape: [batch_size, 7 * 7 * 64]
+    # Output Tensor Shape: [batch_size, 1024]
+    dense = tf.layers.dense(inputs=pool2_flat, units=1024,
+                            activation=tf.nn.relu)
+
+    # Add dropout operation; 0.6 probability that element will be kept
+    dropout = tf.layers.dropout(
+        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
+
+    # Logits layer
+    # Input Tensor Shape: [batch_size, 1024]
+    # Output Tensor Shape: [batch_size, 10]
+    logits = tf.layers.dense(inputs=dropout, units=10)
+
+    predictions = {
+        # Generate predictions (for PREDICT and EVAL mode)
+        "classes": tf.argmax(input=logits, axis=1),
+        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
+        # `logging_hook`.
+        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
+    }
+    if mode == tf.estimator.ModeKeys.PREDICT:
+        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
+
+    # Calculate Loss (for both TRAIN and EVAL modes)
+    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
+    loss = tf.losses.softmax_cross_entropy(
+        onehot_labels=onehot_labels, logits=logits)
+
+    # Configure the Training Op (for TRAIN mode)
+    if mode == tf.estimator.ModeKeys.TRAIN:
+        # Horovod: scale learning rate by the number of workers.
+        optimizer = tf.train.MomentumOptimizer(
+            learning_rate=0.001 * hvd.size(), momentum=0.9)
+
+        # Horovod: add Horovod Distributed Optimizer.
+        optimizer = hvd.DistributedOptimizer(optimizer)
+
+        train_op = optimizer.minimize(
+            loss=loss,
+            global_step=tf.train.get_global_step())
+        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
+                                          train_op=train_op)
+
+    # Add evaluation metrics (for EVAL mode)
+    eval_metric_ops = {
+        "accuracy": tf.metrics.accuracy(
+            labels=labels, predictions=predictions["classes"])}
+    return tf.estimator.EstimatorSpec(
+        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
+
+
+def main(unused_argv):
+    # Horovod: initialize Horovod.
+    hvd.init()
+
+    # [HPCNS] Fully qualified dataset file name
+    dataset_file = os.path.join(data_dir, data_file)
+
+    # [HPCNS] Dataset filename for this rank
+    dataset_for_rank = os.path.join(data_dir, 'MNIST-data-%d' % hvd.rank())
+
+    # [HPCNS] Make a copy of the dataset for this rank
+    shutil.copyfile(dataset_file, dataset_for_rank)
+
+    # [HPCNS] Load MNIST dataset
+    (train_data, train_labels), (eval_data, eval_labels) = \
+        keras.datasets.mnist.load_data(dataset_for_rank)
+
+    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
+    # into (-1, 784) to feed into our network. Also, need to normalize the
+    # features between 0 and 1.
+    train_data = np.reshape(train_data, (-1, 784)) / 255.0
+    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0
+
+    # Horovod: pin GPU to be used to process local rank (one GPU per process)
+    config = tf.ConfigProto()
+    config.gpu_options.allow_growth = True
+    config.gpu_options.visible_device_list = str(hvd.local_rank())
+
+    # Horovod: save checkpoints only on worker 0 to prevent other workers from
+    # corrupting them.
+    model_dir = 'checkpoints/mnist_convnet_model' if hvd.rank() == 0 else None
+
+    # Create the Estimator
+    mnist_classifier = tf.estimator.Estimator(
+        model_fn=cnn_model_fn, model_dir=model_dir,
+        config=tf.estimator.RunConfig(session_config=config))
+
+    # Set up logging for predictions
+    # Log the values in the "Softmax" tensor with label "probabilities"
+    tensors_to_log = {"probabilities": "softmax_tensor"}
+    logging_hook = tf.train.LoggingTensorHook(
+        tensors=tensors_to_log, every_n_iter=500)
+
+    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
+    # rank 0 to all other processes. This is necessary to ensure consistent
+    # initialization of all workers when training is started with random weights or
+    # restored from a checkpoint.
+    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
+
+    # Train the model
+    train_input_fn = tf.estimator.inputs.numpy_input_fn(
+        x={"x": train_data},
+        y=train_labels,
+        batch_size=100,
+        num_epochs=None,
+        shuffle=True)
+
+    # Horovod: adjust number of steps based on number of GPUs.
+    mnist_classifier.train(
+        input_fn=train_input_fn,
+        steps=500 // hvd.size(),
+        hooks=[logging_hook, bcast_hook])
+
+    # Evaluate the model and print results
+    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+        x={"x": eval_data},
+        y=eval_labels,
+        num_epochs=1,
+        shuffle=False)
+    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
+    print(eval_results)
+
+    # [HPCNS] Remove the copied dataset
+    os.remove(dataset_for_rank)
+
+
+if __name__ == "__main__":
+    tf.app.run()
diff --git a/horovod/tensorflow/run_on_localMachine.sh b/horovod/tensorflow/run_on_localMachine.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9c9afb4b58ee9f4a42480997dd298b6e33c71a35
--- /dev/null
+++ b/horovod/tensorflow/run_on_localMachine.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+# Run the program
+mpirun -np 1 -H localhost:1 \
+    -bind-to none -map-by slot \
+    -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \
+    -mca pml ob1 -mca btl ^openib \
+    python -u mnist.py
diff --git a/horovod/tensorflow/submit_job_jureca_python2.sh b/horovod/tensorflow/submit_job_jureca_python2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c9a386afeee610280ab4a6c51610f4261ec0ea11
--- /dev/null
+++ b/horovod/tensorflow/submit_job_jureca_python2.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=2
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_TFLOW_MNIST
+#SBATCH --gres=gpu:2 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load MVAPICH2/2.3-GDR
+module load TensorFlow/1.12.0-GPU-Python-2.7.15
+module load Horovod/0.15.2-GPU-Python-2.7.15
+
+# Run the program
+srun python -u mnist.py
diff --git a/horovod/tensorflow/submit_job_jureca_python3.sh b/horovod/tensorflow/submit_job_jureca_python3.sh
new file mode 100755
index 0000000000000000000000000000000000000000..60122fe53c43e4635a16fdf626d5cb183a73d52d
--- /dev/null
+++ b/horovod/tensorflow/submit_job_jureca_python3.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=2
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HOROVOD_TFLOW_MNIST
+#SBATCH --gres=gpu:2 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load MVAPICH2/2.3-GDR
+module load TensorFlow/1.12.0-GPU-Python-3.6.6
+module load Horovod/0.15.2-GPU-Python-3.6.6
+
+# Run the program
+srun python -u mnist.py
diff --git a/horovod/tensorflow/submit_job_juron_python2.sh b/horovod/tensorflow/submit_job_juron_python2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..85b2ee684ae7732ea48124530e5f3c4416eea69c
--- /dev/null
+++ b/horovod/tensorflow/submit_job_juron_python2.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 4
+#BSUB -R "span[ptile=2]"
+#BSUB -gpu "num=2"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J HOROVOD_TFLOW_MNIST
+
+# Load the required modules
+module load python/2.7.14
+module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
+module load horovod/0.15.2
+
+# Run the program
+mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \
+        -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py
diff --git a/horovod/tensorflow/submit_job_juron_python3.sh b/horovod/tensorflow/submit_job_juron_python3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..01075474bae35cafb29c70239f29214de904a6ca
--- /dev/null
+++ b/horovod/tensorflow/submit_job_juron_python3.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 4
+#BSUB -R "span[ptile=2]"
+#BSUB -gpu "num=2"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J HOROVOD_TFLOW_MNIST
+
+# Load the required modules
+module load python/3.6.1
+module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
+module load horovod/0.15.2
+
+# Run the program
+mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \
+        -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py
diff --git a/horovod/tensorflow/synthetic_benchmark.py b/horovod/tensorflow/synthetic_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..abbdd20fdb933dbde47f7d92f644da2454dbd8e7
--- /dev/null
+++ b/horovod/tensorflow/synthetic_benchmark.py
@@ -0,0 +1,120 @@
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import os
+import numpy as np
+import timeit
+
+import tensorflow as tf
+import horovod.tensorflow as hvd
+from tensorflow.keras import applications
+
+# Benchmark settings
+parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark',
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--fp16-allreduce', action='store_true', default=False,
+                    help='use fp16 compression during allreduce')
+
+parser.add_argument('--model', type=str, default='ResNet50',
+                    help='model to benchmark')
+parser.add_argument('--batch-size', type=int, default=32,
+                    help='input batch size')
+
+parser.add_argument('--num-warmup-batches', type=int, default=10,
+                    help='number of warm-up batches that don\'t count towards benchmark')
+parser.add_argument('--num-batches-per-iter', type=int, default=10,
+                    help='number of batches per benchmark iteration')
+parser.add_argument('--num-iters', type=int, default=10,
+                    help='number of benchmark iterations')
+
+parser.add_argument('--eager', action='store_true', default=False,
+                    help='enables eager execution')
+parser.add_argument('--no-cuda', action='store_true', default=False,
+                    help='disables CUDA training')
+
+args = parser.parse_args()
+args.cuda = not args.no_cuda
+
+hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+config = tf.ConfigProto()
+if args.cuda:
+    config.gpu_options.allow_growth = True
+    config.gpu_options.visible_device_list = str(hvd.local_rank())
+else:
+    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+    config.gpu_options.allow_growth = False
+    config.gpu_options.visible_device_list = ''
+
+if args.eager:
+    tf.enable_eager_execution(config)
+
+# Set up standard model.
+model = getattr(applications, args.model)(weights=None)
+
+opt = tf.train.GradientDescentOptimizer(0.01)
+
+# Horovod: (optional) compression algorithm.
+compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
+
+# Horovod: wrap optimizer with DistributedOptimizer.
+opt = hvd.DistributedOptimizer(opt, compression=compression)
+
+init = tf.global_variables_initializer()
+bcast_op = hvd.broadcast_global_variables(0)
+
+data = tf.random_uniform([args.batch_size, 224, 224, 3])
+target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)
+
+
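+# Forward pass on the synthetic batch followed by sparse softmax cross-entropy loss.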
+def loss_function():
+    logits = model(data, training=True)
+    return tf.losses.sparse_softmax_cross_entropy(target, logits)
+
+
+def log(s, nl=True):
+    if hvd.rank() != 0:
+        return
+    print(s, end='\n' if nl else '')
+
+
+log('Model: %s' % args.model)
+log('Batch size: %d' % args.batch_size)
+device = 'GPU' if args.cuda else 'CPU'
+log('Number of %ss: %d' % (device, hvd.size()))
+
+
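+# Time the given step function: run warm-up batches first, then measure throughput in images per second.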
+def run(benchmark_step):
+    # Warm-up
+    log('Running warmup...')
+    timeit.timeit(benchmark_step, number=args.num_warmup_batches)
+
+    # Benchmark
+    log('Running benchmark...')
+    img_secs = []
+    for x in range(args.num_iters):
+        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
+        img_sec = args.batch_size * args.num_batches_per_iter / time
+        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
+        img_secs.append(img_sec)
+
+    # Results
+    img_sec_mean = np.mean(img_secs)
+    img_sec_conf = 1.96 * np.std(img_secs)
+    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
+    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
+        (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))
+
+
+if tf.executing_eagerly():
+    with tf.device(device):
+        run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables))
+else:
+    with tf.Session(config=config) as session:
+        init.run()
+        bcast_op.run()
+
+        loss = loss_function()
+        train_opt = opt.minimize(loss)
+        run(lambda: session.run(train_opt))
diff --git a/keras/README.md b/keras/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..598f4e1f95aca48216c4d10b1e48c18ef7466363
--- /dev/null
+++ b/keras/README.md
@@ -0,0 +1,13 @@
+# Notes
+
+The `mnist.py` sample is a slightly modified version of `mnist_cnn.py`
+available in the Keras examples repository 
+[here](https://github.com/keras-team/keras/tree/master/examples) 
+(last checked: February 19, 2019). Our changes are 
+limited to:
+
+*  The data loading mechanism
+*  A bit of code cleanup
+*  A few additional comments pertaining to our custom data loading mechanism
+
+**Note:** All newly added statements follow a comment beginning with `[HPCNS]`.
\ No newline at end of file
diff --git a/keras/mnist.py b/keras/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1831694e02ccc3d1546fb8955f5798474c870e6
--- /dev/null
+++ b/keras/mnist.py
@@ -0,0 +1,87 @@
+"""Trains a simple convnet on the MNIST dataset.
+
+Gets to 99.25% test accuracy after 12 epochs
+(there is still a lot of margin for parameter tuning).
+16 seconds per epoch on a GRID K520 GPU.
+"""
+
+from __future__ import print_function
+import os
+import sys
+import keras
+from keras.datasets import mnist
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Flatten
+from keras.layers import Conv2D, MaxPooling2D
+from keras import backend as K
+
+# [HPCNS] Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../utils')
+from data_utils import DataValidator
+
+# [HPCNS] Name of the dataset file
+data_file = 'mnist/keras/mnist.npz'
+
+# [HPCNS] Path to the directory containing the dataset file
+data_dir = DataValidator.validated_data_dir(data_file)
+
+# [HPCNS] Fully qualified dataset file name
+dataset_file = os.path.join(data_dir, data_file)
+
+batch_size = 128
+num_classes = 10
+epochs = 12
+
+# input image dimensions
+img_rows, img_cols = 28, 28
+
+# [HPCNS] Load MNIST dataset
+# the data, split between train and test sets
+(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file)
+
+if K.image_data_format() == 'channels_first':
+    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
+    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
+    input_shape = (1, img_rows, img_cols)
+else:
+    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
+    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
+    input_shape = (img_rows, img_cols, 1)
+
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+print('x_train shape:', x_train.shape)
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
+
+# convert class vectors to binary class matrices
+y_train = keras.utils.to_categorical(y_train, num_classes)
+y_test = keras.utils.to_categorical(y_test, num_classes)
+
+model = Sequential()
+model.add(Conv2D(32, kernel_size=(3, 3),
+                 activation='relu',
+                 input_shape=input_shape))
+model.add(Conv2D(64, (3, 3), activation='relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Dropout(0.25))
+model.add(Flatten())
+model.add(Dense(128, activation='relu'))
+model.add(Dropout(0.5))
+model.add(Dense(num_classes, activation='softmax'))
+
+model.compile(loss=keras.losses.categorical_crossentropy,
+              optimizer=keras.optimizers.Adadelta(),
+              metrics=['accuracy'])
+
+model.fit(x_train, y_train,
+          batch_size=batch_size,
+          epochs=epochs,
+          verbose=1,
+          validation_data=(x_test, y_test))
+score = model.evaluate(x_test, y_test, verbose=0)
+print('Test loss:', score[0])
+print('Test accuracy:', score[1])
diff --git a/keras/run_on_localMachine.sh b/keras/run_on_localMachine.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9dade0afcb3dbdad0e3570d1643511cc4bf206bb
--- /dev/null
+++ b/keras/run_on_localMachine.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+# Run the program
+python -u mnist.py
diff --git a/keras/submit_job_jureca_python2.sh b/keras/submit_job_jureca_python2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..59cfe31442312248018eedae4fb7ec7f14655875
--- /dev/null
+++ b/keras/submit_job_jureca_python2.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=KERAS_MNIST_CNN
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load TensorFlow/1.12.0-GPU-Python-2.7.15
+module load Keras/2.2.4-GPU-Python-2.7.15
+
+# Run the program
+srun python -u mnist.py
diff --git a/keras/submit_job_jureca_python3.sh b/keras/submit_job_jureca_python3.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5057614a50135d9693248abe7ff7a70d44131d6b
--- /dev/null
+++ b/keras/submit_job_jureca_python3.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=KERAS_MNIST_CNN
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load TensorFlow/1.12.0-GPU-Python-3.6.6
+module load Keras/2.2.4-GPU-Python-3.6.6
+
+# Run the program
+srun python -u mnist.py
diff --git a/keras/submit_job_juron_python2.sh b/keras/submit_job_juron_python2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..91ae8c778668e2dd852fd75d59f00ad14d1a78d0
--- /dev/null
+++ b/keras/submit_job_juron_python2.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 1
+#BSUB -R "span[ptile=1]"
+#BSUB -gpu "num=1"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J KERAS_MNIST_CNN
+
+# Load the required modules
+module load python/2.7.14
+module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
+module load keras/2.2.4
+
+# Run the program
+python -u mnist.py
diff --git a/keras/submit_job_juron_python3.sh b/keras/submit_job_juron_python3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7927b03679f2f4b515c90bcbc564447a23433e08
--- /dev/null
+++ b/keras/submit_job_juron_python3.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 1
+#BSUB -R "span[ptile=1]"
+#BSUB -gpu "num=1"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J KERAS_MNIST_CNN
+
+# Load the required modules
+module load python/3.6.1
+module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
+module load keras/2.2.4
+
+# Run the program
+python -u mnist.py
diff --git a/pytorch/README.md b/pytorch/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac1ac2f2d168d7843d479c27a82d288faf10176a
--- /dev/null
+++ b/pytorch/README.md
@@ -0,0 +1,13 @@
+# Notes
+
+The `mnist.py` sample is a slightly modified version of `main.py`
+available in the PyTorch examples repository 
+[here](https://github.com/pytorch/examples/tree/master/mnist) 
+(last checked: February 19, 2019). Our changes are 
+limited to:
+
+*  The data loading mechanism
+*  A bit of code cleanup
+*  A few additional comments pertaining to our custom data loading mechanism
+
+**Note:** All newly added statements follow a comment beginning with `[HPCNS]`.
\ No newline at end of file
diff --git a/pytorch/mnist.py b/pytorch/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4092b614e9cc2045952884199c63eafef5f7e5b
--- /dev/null
+++ b/pytorch/mnist.py
@@ -0,0 +1,151 @@
+from __future__ import print_function
+
+import os
+import sys
+import shutil
+import argparse
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision import datasets, transforms
+
+# [HPCNS] Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../utils')
+from data_utils import DataValidator
+
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 20, 5, 1)
+        self.conv2 = nn.Conv2d(20, 50, 5, 1)
+        self.fc1 = nn.Linear(4 * 4 * 50, 500)
+        self.fc2 = nn.Linear(500, 10)
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = F.relu(self.conv2(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = x.view(-1, 4 * 4 * 50)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        return F.log_softmax(x, dim=1)
+
+
+def train(args, model, device, train_loader, optimizer, epoch):
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % args.log_interval == 0:
+            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                epoch, batch_idx * len(data), len(train_loader.dataset),
+                       100. * batch_idx / len(train_loader), loss.item()))
+
+
+def test(args, model, device, test_loader):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
+            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
+            correct += pred.eq(target.view_as(pred)).sum().item()
+
+    test_loss /= len(test_loader.dataset)
+
+    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+        test_loss, correct, len(test_loader.dataset),
+        100. * correct / len(test_loader.dataset)))
+
+
+def main():
+    # Training settings
+    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
+                        help='input batch size for training (default: 64)')
+    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
+                        help='input batch size for testing (default: 1000)')
+    parser.add_argument('--epochs', type=int, default=10, metavar='N',
+                        help='number of epochs to train (default: 10)')
+    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
+                        help='learning rate (default: 0.01)')
+    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
+                        help='SGD momentum (default: 0.5)')
+    parser.add_argument('--no-cuda', action='store_true', default=False,
+                        help='disables CUDA training')
+    parser.add_argument('--seed', type=int, default=1, metavar='S',
+                        help='random seed (default: 1)')
+    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
+                        help='how many batches to wait before logging training status')
+
+    parser.add_argument('--save-model', action='store_true', default=False,
+                        help='For Saving the current Model')
+    args = parser.parse_args()
+    use_cuda = not args.no_cuda and torch.cuda.is_available()
+
+    torch.manual_seed(args.seed)
+
+    device = torch.device("cuda" if use_cuda else "cpu")
+
+    # [HPCNS] Name of the dataset file
+    data_file = 'mnist/pytorch/data'
+
+    # [HPCNS] Path to the directory containing the dataset file
+    data_dir = DataValidator.validated_data_dir(data_file)
+
+    # [HPCNS] Fully qualified dataset file name
+    dataset_file = os.path.join(data_dir, data_file)
+
+    # [HPCNS] A copy of the dataset in the current directory
+    dataset_copy = 'MNIST-data'
+
+    # [HPCNS] If the path already exists, remove it
+    if os.path.exists(dataset_copy):
+        shutil.rmtree(dataset_copy)
+
+    # [HPCNS] Make a copy of the dataset, as the torch data loader used
+    # below expects the dataset in the current directory
+    shutil.copytree(dataset_file, dataset_copy)
+
+    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
+    train_loader = torch.utils.data.DataLoader(
+        datasets.MNIST(dataset_copy, train=True, download=False,
+                       transform=transforms.Compose([
+                           transforms.ToTensor(),
+                           transforms.Normalize((0.1307,), (0.3081,))
+                       ])),
+        batch_size=args.batch_size, shuffle=True, **kwargs)
+    test_loader = torch.utils.data.DataLoader(
+        datasets.MNIST(dataset_copy, train=False, download=False, transform=transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize((0.1307,), (0.3081,))
+        ])),
+        batch_size=args.test_batch_size, shuffle=True, **kwargs)
+
+    model = Net().to(device)
+    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
+
+    for epoch in range(1, args.epochs + 1):
+        train(args, model, device, train_loader, optimizer, epoch)
+        test(args, model, device, test_loader)
+
+    if (args.save_model):
+        torch.save(model.state_dict(), "mnist_cnn.pt")
+
+    # [HPCNS] Remove the copied dataset
+    shutil.rmtree(dataset_copy)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pytorch/run_on_localMachine.sh b/pytorch/run_on_localMachine.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9c5737c9fc9d6bca93e25fca9f785e52320131fc
--- /dev/null
+++ b/pytorch/run_on_localMachine.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+# Run the program
+python -u mnist.py
\ No newline at end of file
diff --git a/pytorch/submit_job_jureca_python2.sh b/pytorch/submit_job_jureca_python2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f757354a7784027bb813d98bef11ce4002a5480a
--- /dev/null
+++ b/pytorch/submit_job_jureca_python2.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=PYTORCH_MNIST
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load MVAPICH2/2.3-GDR
+module load PyTorch/1.0.0-GPU-Python-2.7.15
+module load torchvision/0.2.1-GPU-Python-2.7.15
+
+# Run the program
+srun python -u mnist.py
diff --git a/pytorch/submit_job_jureca_python3.sh b/pytorch/submit_job_jureca_python3.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0f66a30a0c87d45da544ed74025df0ee428933f7
--- /dev/null
+++ b/pytorch/submit_job_jureca_python3.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=PYTORCH_MNIST
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load MVAPICH2/2.3-GDR
+module load PyTorch/1.0.0-GPU-Python-3.6.6
+module load torchvision/0.2.1-GPU-Python-3.6.6
+
+# Run the program
+srun python -u mnist.py
diff --git a/pytorch/submit_job_juron_python3.sh b/pytorch/submit_job_juron_python3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..061139f19cf8f9cdc03e8d4ced3d1c15f66ae49c
--- /dev/null
+++ b/pytorch/submit_job_juron_python3.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 1
+#BSUB -R "span[ptile=1]"
+#BSUB -gpu "num=1"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J PYTORCH_MNIST
+
+# Load the required modules
+module load python/3.6.1
+module load pytorch/1.0.1-gcc_5.4.0-cuda_10.0.130
+module load torchvision/0.2.1
+
+# Run the program
+python -u mnist.py
\ No newline at end of file
diff --git a/tensorflow/README.md b/tensorflow/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cbf485424ae35ac8a1e8fcdd4650ffa8a08114df
--- /dev/null
+++ b/tensorflow/README.md
@@ -0,0 +1,13 @@
+# Notes
+
+The `mnist.py` sample is a slightly modified version of `convolutional.py`
+available in the Tensorflow models repository 
+[here](https://github.com/tensorflow/models/blob/master/tutorials/image/mnist) 
+(last checked: February 19, 2019). Our changes are 
+limited to:
+
+*  The data loading mechanism
+*  A bit of code cleanup
+*  A few additional comments pertaining to our custom data loading mechanism
+
+**Note:** All newly added statements follow a comment beginning with `[HPCNS]`.
\ No newline at end of file
diff --git a/tensorflow/mnist.py b/tensorflow/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ba4bdc5fb1b25bc0744308a26ad22856f729c26
--- /dev/null
+++ b/tensorflow/mnist.py
@@ -0,0 +1,338 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Simple, end-to-end, LeNet-5-like convolutional MNIST model example.
+
+This should achieve a test error of 0.7%. Please keep this model as simple and
+linear as possible; it is meant as a tutorial for simple convolutional models.
+Run with --self_test on the command line to execute a short self-test.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import gzip
+import os
+import sys
+import time
+
+import numpy
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+# [HPCNS] Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../utils')
+from data_utils import DataValidator
+
+IMAGE_SIZE = 28
+NUM_CHANNELS = 1
+PIXEL_DEPTH = 255
+NUM_LABELS = 10
+VALIDATION_SIZE = 5000  # Size of the validation set.
+SEED = 66478  # Set to None for random seed.
+BATCH_SIZE = 64
+NUM_EPOCHS = 10
+EVAL_BATCH_SIZE = 64
+EVAL_FREQUENCY = 100  # Number of steps between evaluations.
+
+FLAGS = None
+
+
+def data_type():
+    """Return the type of the activations, weights, and placeholder variables."""
+    if FLAGS.use_fp16:
+        return tf.float16
+    else:
+        return tf.float32
+
+
+def extract_data(filename, num_images):
+    """Extract the images into a 4D tensor [image index, y, x, channels].
+
+  Values are rescaled from [0, 255] down to [-0.5, 0.5].
+  """
+    print('Extracting', filename)
+    with gzip.open(filename) as bytestream:
+        bytestream.read(16)
+        buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS)
+        data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32)
+        data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH
+        data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)
+        return data
+
+
+def extract_labels(filename, num_images):
+    """Extract the labels into a vector of int64 label IDs."""
+    print('Extracting', filename)
+    with gzip.open(filename) as bytestream:
+        bytestream.read(8)
+        buf = bytestream.read(1 * num_images)
+        labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.int64)
+    return labels
+
+
+def fake_data(num_images):
+    """Generate a fake dataset that matches the dimensions of MNIST."""
+    data = numpy.ndarray(
+        shape=(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS),
+        dtype=numpy.float32)
+    labels = numpy.zeros(shape=(num_images,), dtype=numpy.int64)
+    for image in xrange(num_images):
+        label = image % 2
+        data[image, :, :, 0] = label - 0.5
+        labels[image] = label
+    return data, labels
+
+
+def error_rate(predictions, labels):
+    """Return the error rate based on dense predictions and sparse labels."""
+    return 100.0 - (
+            100.0 *
+            numpy.sum(numpy.argmax(predictions, 1) == labels) /
+            predictions.shape[0])
+
+
+def main(_):
+    if FLAGS.self_test:
+        print('Running self-test.')
+        train_data, train_labels = fake_data(256)
+        validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE)
+        test_data, test_labels = fake_data(EVAL_BATCH_SIZE)
+        num_epochs = 1
+    else:
+        # [HPCNS]: Data files relative to the 'datasets' directory
+        train_data_filename = 'mnist/raw/train-images-idx3-ubyte.gz'
+        train_labels_filename = 'mnist/raw/train-labels-idx1-ubyte.gz'
+        test_data_filename = 'mnist/raw/t10k-images-idx3-ubyte.gz'
+        test_labels_filename = 'mnist/raw/t10k-labels-idx1-ubyte.gz'
+
+        # [HPCNS]: Update data file information with validated and fully qualified filenames
+        train_data_filename = os.path.join(
+            DataValidator.validated_data_dir(train_data_filename), train_data_filename)
+        train_labels_filename = os.path.join(
+            DataValidator.validated_data_dir(train_labels_filename), train_labels_filename)
+        test_data_filename = os.path.join(
+            DataValidator.validated_data_dir(test_data_filename), test_data_filename)
+        test_labels_filename = os.path.join(
+            DataValidator.validated_data_dir(test_labels_filename), test_labels_filename)
+
+        # Extract the images and labels into numpy arrays.
+        train_data = extract_data(train_data_filename, 60000)
+        train_labels = extract_labels(train_labels_filename, 60000)
+        test_data = extract_data(test_data_filename, 10000)
+        test_labels = extract_labels(test_labels_filename, 10000)
+
+        # Generate a validation set.
+        validation_data = train_data[:VALIDATION_SIZE, ...]
+        validation_labels = train_labels[:VALIDATION_SIZE]
+        train_data = train_data[VALIDATION_SIZE:, ...]
+        train_labels = train_labels[VALIDATION_SIZE:]
+        num_epochs = NUM_EPOCHS
+
+    train_size = train_labels.shape[0]
+
+    # This is where training samples and labels are fed to the graph.
+    # These placeholder nodes will be fed a batch of training data at each
+    # training step using the {feed_dict} argument to the Run() call below.
+    train_data_node = tf.placeholder(
+        data_type(),
+        shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
+    train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,))
+    eval_data = tf.placeholder(
+        data_type(),
+        shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
+
+    # The variables below hold all the trainable weights. They are passed an
+    # initial value which will be assigned when we call:
+    # {tf.global_variables_initializer().run()}
+    conv1_weights = tf.Variable(
+        tf.truncated_normal([5, 5, NUM_CHANNELS, 32],  # 5x5 filter, depth 32.
+                            stddev=0.1,
+                            seed=SEED, dtype=data_type()))
+    conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type()))
+    conv2_weights = tf.Variable(tf.truncated_normal(
+        [5, 5, 32, 64], stddev=0.1,
+        seed=SEED, dtype=data_type()))
+    conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type()))
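+    # After two 2x2 max-pooling layers the feature map is
+    # (IMAGE_SIZE // 4) x (IMAGE_SIZE // 4) x 64, which determines the fc1 input size.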
+    fc1_weights = tf.Variable(  # fully connected, depth 512.
+        tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
+                            stddev=0.1,
+                            seed=SEED,
+                            dtype=data_type()))
+    fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type()))
+    fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS],
+                                                  stddev=0.1,
+                                                  seed=SEED,
+                                                  dtype=data_type()))
+    fc2_biases = tf.Variable(tf.constant(
+        0.1, shape=[NUM_LABELS], dtype=data_type()))
+
+    # We will replicate the model structure for the training subgraph, as well
+    # as the evaluation subgraphs, while sharing the trainable parameters.
+    def model(data, train=False):
+        """The Model definition."""
+        # 2D convolution, with 'SAME' padding (i.e. the output feature map has
+        # the same size as the input). Note that {strides} is a 4D array whose
+        # shape matches the data layout: [image index, y, x, depth].
+        conv = tf.nn.conv2d(data,
+                            conv1_weights,
+                            strides=[1, 1, 1, 1],
+                            padding='SAME')
+        # Bias and rectified linear non-linearity.
+        relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
+        # Max pooling. The kernel size spec {ksize} also follows the layout of
+        # the data. Here we have a pooling window of 2, and a stride of 2.
+        pool = tf.nn.max_pool(relu,
+                              ksize=[1, 2, 2, 1],
+                              strides=[1, 2, 2, 1],
+                              padding='SAME')
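+        # Second convolution and pooling block: same structure as above, depth 64.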
+        conv = tf.nn.conv2d(pool,
+                            conv2_weights,
+                            strides=[1, 1, 1, 1],
+                            padding='SAME')
+        relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
+        pool = tf.nn.max_pool(relu,
+                              ksize=[1, 2, 2, 1],
+                              strides=[1, 2, 2, 1],
+                              padding='SAME')
+        # Reshape the feature map cuboid into a 2D matrix to feed it to the
+        # fully connected layers.
+        pool_shape = pool.get_shape().as_list()
+        reshape = tf.reshape(
+            pool,
+            [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
+        # Fully connected layer. Note that the '+' operation automatically
+        # broadcasts the biases.
+        hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
+        # Add a 50% dropout during training only. Dropout also scales
+        # activations such that no rescaling is needed at evaluation time.
+        if train:
+            hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
+        return tf.matmul(hidden, fc2_weights) + fc2_biases
+
+    # Training computation: logits + cross-entropy loss.
+    logits = model(train_data_node, True)
+    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=train_labels_node, logits=logits))
+
+    # L2 regularization for the fully connected parameters.
+    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
+                    tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
+    # Add the regularization term to the loss.
+    loss += 5e-4 * regularizers
+
+    # Optimizer: set up a variable that's incremented once per batch and
+    # controls the learning rate decay.
+    batch = tf.Variable(0, dtype=data_type())
+    # Decay once per epoch, using an exponential schedule starting at 0.01.
+    learning_rate = tf.train.exponential_decay(
+        0.01,  # Base learning rate.
+        batch * BATCH_SIZE,  # Current index into the dataset.
+        train_size,  # Decay step.
+        0.95,  # Decay rate.
+        staircase=True)
+    # Use simple momentum for the optimization.
+    optimizer = tf.train.MomentumOptimizer(learning_rate,
+                                           0.9).minimize(loss,
+                                                         global_step=batch)
+
+    # Predictions for the current training minibatch.
+    train_prediction = tf.nn.softmax(logits)
+
+    # Predictions for the test and validation, which we'll compute less often.
+    eval_prediction = tf.nn.softmax(model(eval_data))
+
+    # Small utility function to evaluate a dataset by feeding batches of data to
+    # {eval_data} and pulling the results from {eval_prediction}.
+    # This saves memory and enables the evaluation to run on smaller GPUs.
+    def eval_in_batches(data, sess):
+        """Get all predictions for a dataset by running it in small batches."""
+        size = data.shape[0]
+        if size < EVAL_BATCH_SIZE:
+            raise ValueError("batch size for evals larger than dataset: %d" % size)
+        predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32)
+        for begin in xrange(0, size, EVAL_BATCH_SIZE):
+            end = begin + EVAL_BATCH_SIZE
+            if end <= size:
+                predictions[begin:end, :] = sess.run(
+                    eval_prediction,
+                    feed_dict={eval_data: data[begin:end, ...]})
+            else:
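+                # The final batch may be smaller than EVAL_BATCH_SIZE: evaluate the
+                # last EVAL_BATCH_SIZE samples and keep only the rows not yet filled.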
+                batch_predictions = sess.run(
+                    eval_prediction,
+                    feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
+                predictions[begin:, :] = batch_predictions[begin - size:, :]
+        return predictions
+
+    # Create a local session to run the training.
+    start_time = time.time()
+    with tf.Session() as sess:
+        # Run all the initializers to prepare the trainable parameters.
+        tf.global_variables_initializer().run()
+        print('Initialized!')
+        # Loop through training steps.
+        for step in xrange(int(num_epochs * train_size) // BATCH_SIZE):
+            # Compute the offset of the current minibatch in the data.
+            # Note that we could use better randomization across epochs.
+            offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
+            batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
+            batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
+            # This dictionary maps the batch data (as a numpy array) to the
+            # node in the graph it should be fed to.
+            feed_dict = {train_data_node: batch_data,
+                         train_labels_node: batch_labels}
+            # Run the optimizer to update weights.
+            sess.run(optimizer, feed_dict=feed_dict)
+            # Print some extra information once we reach the evaluation frequency.
+            if step % EVAL_FREQUENCY == 0:
+                # Fetch some extra nodes' data.
+                l, lr, predictions = sess.run([loss, learning_rate, train_prediction],
+                                              feed_dict=feed_dict)
+                elapsed_time = time.time() - start_time
+                start_time = time.time()
+                print('Step %d (epoch %.2f), %.1f ms' %
+                      (step, float(step) * BATCH_SIZE / train_size,
+                       1000 * elapsed_time / EVAL_FREQUENCY))
+                print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
+                print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels))
+                print('Validation error: %.1f%%' % error_rate(
+                    eval_in_batches(validation_data, sess), validation_labels))
+                sys.stdout.flush()
+        # Finally print the result!
+        test_error = error_rate(eval_in_batches(test_data, sess), test_labels)
+        print('Test error: %.1f%%' % test_error)
+        if FLAGS.self_test:
+            print('test_error', test_error)
+            assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % (
+                test_error,)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--use_fp16',
+        default=False,
+        help='Use half-precision floats instead of full-precision floats.',
+        action='store_true')
+    parser.add_argument(
+        '--self_test',
+        default=False,
+        action='store_true',
+        help='Run a short self-test on fake data instead of training on MNIST.')
+
+    FLAGS, unparsed = parser.parse_known_args()
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/run_on_localMachine.sh b/tensorflow/run_on_localMachine.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9c5737c9fc9d6bca93e25fca9f785e52320131fc
--- /dev/null
+++ b/tensorflow/run_on_localMachine.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+# Run the program
+python -u mnist.py
\ No newline at end of file
diff --git a/tensorflow/submit_job_jureca_python2.sh b/tensorflow/submit_job_jureca_python2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6672f8c3d71e4a80774914590cbde11325459273
--- /dev/null
+++ b/tensorflow/submit_job_jureca_python2.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TENSORFLOW_MNIST
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load TensorFlow/1.12.0-GPU-Python-2.7.15
+
+# Run the program
+srun python -u mnist.py
diff --git a/tensorflow/submit_job_jureca_python3.sh b/tensorflow/submit_job_jureca_python3.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c0831c9213d937a41e6d22dc4a0b5c8e07b2e745
--- /dev/null
+++ b/tensorflow/submit_job_jureca_python3.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TENSORFLOW_MNIST
+#SBATCH --gres=gpu:1 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module use /usr/local/software/jureca/OtherStages
+module load Stages/Devel-2018b
+module load GCC/7.3.0
+module load TensorFlow/1.12.0-GPU-Python-3.6.6
+
+# Run the program
+srun python -u mnist.py
diff --git a/tensorflow/submit_job_juron_python2.sh b/tensorflow/submit_job_juron_python2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6270cd2bbe665e5d405f36d96a8cf22ca62f07d6
--- /dev/null
+++ b/tensorflow/submit_job_juron_python2.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 1
+#BSUB -R "span[ptile=1]"
+#BSUB -gpu "num=1"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J TENSORFLOW_MNIST
+
+# Load the required modules
+module load python/2.7.14
+module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
+
+# Run the program
+python -u mnist.py
diff --git a/tensorflow/submit_job_juron_python3.sh b/tensorflow/submit_job_juron_python3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..30fa2043f2059fc8d4d6ac673f52ba0bebb3ac2d
--- /dev/null
+++ b/tensorflow/submit_job_juron_python3.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+#BSUB -q normal
+#BSUB -W 10
+#BSUB -n 1
+#BSUB -R "span[ptile=1]"
+#BSUB -gpu "num=1"
+#BSUB -e "error.%J.er"
+#BSUB -o "output_%J.out"
+#BSUB -J TENSORFLOW_MNIST
+
+# Load the required modules
+module load python/3.6.1
+module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130
+
+# Run the program
+python -u mnist.py
diff --git a/utils/data_utils.py b/utils/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..df15e9aa067b6e3b1ae20047ebd651c1cbd4c533
--- /dev/null
+++ b/utils/data_utils.py
@@ -0,0 +1,63 @@
+"""
+    A collection of utilities for data manipulation.
+
+    It was created to simplify the process of working with pre-downloaded
+    datasets.
+
+"""
+
+import os
+
+
+class DataValidator:
+    """
+    This class provides functions for validation of input data.
+
+    """
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def validated_data_dir(filename):
+        """
+        Checks whether the given 'filename' is available in any of the
+        recognized input data directory locations. If the check passes,
+        returns the fully qualified path to the input data directory.
+
+        Parameters
+        ----------
+        filename:
+            Name of the data file to be checked
+
+        Returns
+        -------
+        string:
+            Fully qualified path to the input data directory
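+
+        Example
+        -------
+        Assuming the MNIST raw data is available in the data directory::
+
+            data_dir = DataValidator.validated_data_dir('mnist/raw/train-images-idx3-ubyte.gz')
+            image_file = os.path.join(data_dir, 'mnist/raw/train-images-idx3-ubyte.gz')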
+
+        """
+
+        # Check the environment variable
+        if 'DL_TEST_DATA_HOME' in os.environ:
+            # Read the data directory path from the environment variable
+            data_dir = os.environ.get('DL_TEST_DATA_HOME')
+        else:
+            # Set path to the 'datasets' directory in the project root
+            data_dir = os.path.join(os.path.abspath('../datasets'))
+
+            # We are two levels deep when executing Horovod samples
+            if not os.path.exists(data_dir):
+                data_dir = os.path.join(os.path.abspath('../../datasets'))
+
+            print('Using %s as the data directory.' % data_dir)
+
+        # Check if the directory exists
+        assert os.path.exists(data_dir), \
+            data_dir + ' refers to a non-existing directory. ' \
+            'Please either correctly set the DL_TEST_DATA_HOME environment variable, ' \
+            'or make sure the datasets are available in the project root.'
+
+        assert os.path.exists(os.path.join(data_dir, filename)), \
+            'Unable to locate ' + filename + ' in ' + data_dir
+
+        return data_dir