diff --git a/caffe/README.md b/caffe/README.md
deleted file mode 100644
index 1804dceeab230c4754b7616d3c111da8a51873cb..0000000000000000000000000000000000000000
--- a/caffe/README.md
+++ /dev/null
@@ -1,43 +0,0 @@
-**Caution:** Caffe is no longer under active development, which is why we do not intend to
-support it as a system-wide module on the supercomputers for much longer. For this reason,
-Caffe is available with Python 2 support only on JURECA, and is not supported at all on
-JUWELS. Users are advised to switch to other frameworks such as TensorFlow/Keras and PyTorch.
-
-# Notes
-
-There are three ways in which Caffe can be used:
-1.  As a command line tool with only built-in layers.
-2.  As a library from within a Python program, using either only built-in layers
-or one or more custom layers written in Python.
-3.  As a command line tool with one or more custom C++ layers.
-
-## Caffe as a command line tool
-
-The `mnist_cmd` sub-directory contains configuration and job scripts for running 
-Caffe as a command line tool with only built-in layers. This example represents use 
-case 1 as described above. The `lenet_solver.prototxt` and `lenet_train_test.prototxt` 
-were taken from the MNIST examples directory available in the Caffe repository 
-[here](https://github.com/BVLC/caffe/tree/master/examples/mnist). Minor changes have
-been made only to correct the path to the input dataset. The `caffe` command
-in the job submission scripts can be modified as follows to run training on 
-all available GPUs on the node (value for the `-gpu` option has been changed from `0` to `all`):
-
-    caffe train --solver=lenet_solver.prototxt -gpu all
-
-## Using Caffe within a Python program
-
-The `lenet_python` sub-directory contains the required files for an example of 
-using Caffe as a library from within a Python program. This corresponds to use case 
-2 as described above. The `train_lenet.py` file contains source code adapted from 
-the IPython notebook `01-learning-lenet.ipynb` available in the Caffe examples 
-[here](https://github.com/BVLC/caffe/tree/master/examples). Running this example
-generates a learning curve plot in the current directory.
-
-## Caffe with custom C++ layers
-
-Working with custom C++ layers requires recompiling Caffe with the custom code. As 
-this is not possible with a system-wide installation, we have decided not to 
-include an example of this use case. Nevertheless, if you must work with custom 
-C++ layers and require assistance, please send an email to the JULAIN mailing list 
-(more information [here](https://lists.fz-juelich.de/mailman/listinfo/ml)).
-
diff --git a/caffe/lenet_python/.submit_job_jureca_python2.sh b/caffe/lenet_python/.submit_job_jureca_python2.sh
deleted file mode 100755
index 75069256157eb55f4122b0ebc2f390b925f89396..0000000000000000000000000000000000000000
--- a/caffe/lenet_python/.submit_job_jureca_python2.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-
-# Slurm job configuration
-#SBATCH --nodes=1
-#SBATCH --ntasks=1
-#SBATCH --ntasks-per-node=1
-#SBATCH --output=output_%j.out
-#SBATCH --error=error_%j.er
-#SBATCH --time=00:10:00
-#SBATCH --job-name=CAFFE_LENET_PYTHON
-#SBATCH --gres=gpu:1 --partition=develgpus
-#SBATCH --mail-type=ALL
-
-# Load the required modules
-module use /usr/local/software/jureca/OtherStages
-module load Stages/Devel-2018b
-module load GCC/7.3.0
-module load MVAPICH2/2.3-GDR
-module load Caffe/1.0-Python-2.7.15
-
-# Run the program
-srun python -u train_lenet.py
diff --git a/caffe/lenet_python/lenet_auto_solver.prototxt b/caffe/lenet_python/lenet_auto_solver.prototxt
deleted file mode 100644
index 44af3ad6cecd7a8090902160666e5453622f8be6..0000000000000000000000000000000000000000
--- a/caffe/lenet_python/lenet_auto_solver.prototxt
+++ /dev/null
@@ -1,24 +0,0 @@
-# The train/test net protocol buffer definition
-train_net: "lenet_auto_train.prototxt"
-test_net: "lenet_auto_test.prototxt"
-# test_iter specifies how many forward passes the test should carry out.
-# In the case of MNIST, we have test batch size 100 and 100 test iterations,
-# covering the full 10,000 testing images.
-test_iter: 100
-# Carry out testing every 500 training iterations.
-test_interval: 500
-# The base learning rate, momentum and the weight decay of the network.
-base_lr: 0.01
-momentum: 0.9
-weight_decay: 0.0005
-# The learning rate policy
-lr_policy: "inv"
-gamma: 0.0001
-power: 0.75
-# Display every 100 iterations
-display: 100
-# The maximum number of iterations
-max_iter: 10000
-# snapshot intermediate results
-snapshot: 5000
-snapshot_prefix: "snapshots/lenet"
diff --git a/caffe/lenet_python/submit_job_juron_python2.sh b/caffe/lenet_python/submit_job_juron_python2.sh
deleted file mode 100755
index 2025a389b89bb90c6593b598231f14c8fb1fdcf0..0000000000000000000000000000000000000000
--- a/caffe/lenet_python/submit_job_juron_python2.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env bash
-
-#BSUB -q normal
-#BSUB -W 10
-#BSUB -n 1
-#BSUB -R "span[ptile=1]"
-#BSUB -gpu "num=1"
-#BSUB -e "error.%J.er"
-#BSUB -o "output_%J.out"
-#BSUB -J CAFFE_LENET_PYTHON
-
-# Load the Python and Caffe modules
-module load python/2.7.14
-module load caffe/1.0-gcc_5.4.0-cuda_10.0.130
-
-# Train LeNet
-python -u train_lenet.py
diff --git a/caffe/lenet_python/submit_job_juron_python3.sh b/caffe/lenet_python/submit_job_juron_python3.sh
deleted file mode 100755
index 7e737766bcb4ee609fdefab0d52f6adcc95e12e8..0000000000000000000000000000000000000000
--- a/caffe/lenet_python/submit_job_juron_python3.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env bash
-
-#BSUB -q normal
-#BSUB -W 10
-#BSUB -n 1
-#BSUB -R "span[ptile=1]"
-#BSUB -gpu "num=1"
-#BSUB -e "error.%J.er"
-#BSUB -o "output_%J.out"
-#BSUB -J CAFFE_LENET_PYTHON
-
-# Load the Python and Caffe modules
-module load python/3.6.1
-module load caffe/1.0-gcc_5.4.0-cuda_10.0.130
-
-# Train LeNet
-python -u train_lenet.py
diff --git a/caffe/lenet_python/train_lenet.py b/caffe/lenet_python/train_lenet.py
deleted file mode 100644
index ad5cae3bf4d6a7f1f9a418b802418714efb6ee67..0000000000000000000000000000000000000000
--- a/caffe/lenet_python/train_lenet.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import os
-import sys
-import matplotlib
-
-# Force matplotlib to not use any Xwindows backend.
-matplotlib.use('Agg')
-import pylab
-
-import caffe
-from caffe import layers as L, params as P
-
-# Import the DataValidator, which can then be used to
-# validate and load the path to the already downloaded dataset.
-sys.path.insert(0, '../../utils')
-from data_utils import DataValidator
-
-
-# Prepares network specification
-def lenet(lmdb, batch_size):
-    # Caffe's version of LeNet: a series of linear and simple nonlinear transformations
-    n = caffe.NetSpec()
-
-    n.data, n.label = L.Data(batch_size=batch_size, backend=P.Data.LMDB, source=lmdb,
-                             transform_param=dict(scale=1. / 255), ntop=2)
-
-    n.conv1 = L.Convolution(n.data, kernel_size=5, num_output=20, weight_filler=dict(type='xavier'))
-    n.pool1 = L.Pooling(n.conv1, kernel_size=2, stride=2, pool=P.Pooling.MAX)
-    n.conv2 = L.Convolution(n.pool1, kernel_size=5, num_output=50, weight_filler=dict(type='xavier'))
-    n.pool2 = L.Pooling(n.conv2, kernel_size=2, stride=2, pool=P.Pooling.MAX)
-    n.fc1 = L.InnerProduct(n.pool2, num_output=500, weight_filler=dict(type='xavier'))
-    n.relu1 = L.ReLU(n.fc1, in_place=True)
-    n.score = L.InnerProduct(n.relu1, num_output=10, weight_filler=dict(type='xavier'))
-    n.loss = L.SoftmaxWithLoss(n.score, n.label)
-
-    return n.to_proto()
-
-
-# Names of the directories containing the LMDB files for TRAIN and TEST phases
-test_dir = 'mnist/caffe/mnist_test_lmdb'
-train_dir = 'mnist/caffe/mnist_train_lmdb'
-
-# Validated path to the data root
-DataValidator.validated_data_dir(train_dir)
-data_dir = DataValidator.validated_data_dir(test_dir)
-
-# Write the prototxt for TRAIN phase
-with open('lenet_auto_train.prototxt', 'w') as f:
-    f.write(str(lenet(os.path.join(data_dir, train_dir), 64)))
-
-# Write the prototxt for TEST phase
-with open('lenet_auto_test.prototxt', 'w') as f:
-    f.write(str(lenet(os.path.join(data_dir, test_dir), 100)))
-
-# Use the GPU for training
-caffe.set_device(0)
-caffe.set_mode_gpu()
-
-# Load the solver and create train and test nets
-solver = None  # ignore this workaround for lmdb data (can't instantiate two solvers on the same data)
-solver = caffe.SGDSolver('lenet_auto_solver.prototxt')
-
-solver.net.forward()  # train net
-solver.test_nets[0].forward()  # test net (there can be more than one)
-
-niter = 200
-test_interval = 25
-# losses will also be stored in the log
-train_loss = pylab.zeros(niter)
-test_acc = pylab.zeros(int(pylab.ceil(niter / test_interval)))
-output = pylab.zeros((niter, 8, 10))
-
-# the main solver loop
-for it in range(niter):
-    solver.step(1)  # SGD by Caffe
-
-    # store the train loss
-    train_loss[it] = solver.net.blobs['loss'].data
-
-    # store the output on the first test batch
-    # (start the forward pass at conv1 to avoid loading new data)
-    solver.test_nets[0].forward(start='conv1')
-    output[it] = solver.test_nets[0].blobs['score'].data[:8]
-
-    # run a full test every so often
-    # (Caffe can also do this for us and write to a log, but we show here
-    #  how to do it directly in Python, where more complicated things are easier.)
-    if it % test_interval == 0:
-        print('Iteration', it, 'testing...')
-        correct = 0
-        for test_it in range(100):
-            solver.test_nets[0].forward()
-            correct += sum(solver.test_nets[0].blobs['score'].data.argmax(1)
-                           == solver.test_nets[0].blobs['label'].data)
-        test_acc[it // test_interval] = correct / 1e4
-
-# Plot the training curve
-_, ax1 = pylab.subplots()
-ax2 = ax1.twinx()
-ax1.plot(pylab.arange(niter), train_loss)
-ax2.plot(test_interval * pylab.arange(len(test_acc)), test_acc, 'r')
-ax1.set_xlabel('iteration')
-ax1.set_ylabel('train loss')
-ax2.set_ylabel('test accuracy')
-ax2.set_title('Test Accuracy: {:.2f}'.format(test_acc[-1]))
-
-# Save the plot to file. Use "bbox_inches='tight'" to remove surrounding whitespace
-pylab.savefig('learning_curve.png', bbox_inches='tight')
diff --git a/caffe/mnist_cmd/.submit_job_jureca_python2.sh b/caffe/mnist_cmd/.submit_job_jureca_python2.sh
deleted file mode 100755
index 029520e3308a4e322cfd14c3d863e982fb5ac02e..0000000000000000000000000000000000000000
--- a/caffe/mnist_cmd/.submit_job_jureca_python2.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-
-# Slurm job configuration
-#SBATCH --nodes=1
-#SBATCH --ntasks=1
-#SBATCH --ntasks-per-node=1
-#SBATCH --output=output_%j.out
-#SBATCH --error=error_%j.er
-#SBATCH --time=00:10:00
-#SBATCH --job-name=CAFFE_MNIST_CMD
-#SBATCH --gres=gpu:1 --partition=develgpus
-#SBATCH --mail-type=ALL
-
-# Load the required modules
-module use /usr/local/software/jureca/OtherStages
-module load Stages/Devel-2018b
-module load GCC/7.3.0
-module load MVAPICH2/2.3-GDR
-module load Caffe/1.0-Python-2.7.15
-
-# Train the model using the 'caffe' binary
-srun caffe train --solver=lenet_solver.prototxt -gpu 0
\ No newline at end of file
diff --git a/caffe/mnist_cmd/lenet_solver.prototxt b/caffe/mnist_cmd/lenet_solver.prototxt
deleted file mode 100644
index 103b2e757061c84e3bb00a83a54f55606b3ce64b..0000000000000000000000000000000000000000
--- a/caffe/mnist_cmd/lenet_solver.prototxt
+++ /dev/null
@@ -1,25 +0,0 @@
-# The train/test net protocol buffer definition
-net: "lenet_train_test.prototxt"
-# test_iter specifies how many forward passes the test should carry out.
-# In the case of MNIST, we have test batch size 100 and 100 test iterations,
-# covering the full 10,000 testing images.
-test_iter: 100
-# Carry out testing every 500 training iterations.
-test_interval: 500
-# The base learning rate, momentum and the weight decay of the network.
-base_lr: 0.01
-momentum: 0.9
-weight_decay: 0.0005
-# The learning rate policy
-lr_policy: "inv"
-gamma: 0.0001
-power: 0.75
-# Display every 100 iterations
-display: 100
-# The maximum number of iterations
-max_iter: 10000
-# snapshot intermediate results
-snapshot: 5000
-snapshot_prefix: "snapshots/lenet"
-# solver mode: CPU or GPU
-solver_mode: GPU
diff --git a/caffe/mnist_cmd/lenet_train_test.prototxt b/caffe/mnist_cmd/lenet_train_test.prototxt
deleted file mode 100644
index f34ab716ec5467584ac059af3bd5d087a9d2fb34..0000000000000000000000000000000000000000
--- a/caffe/mnist_cmd/lenet_train_test.prototxt
+++ /dev/null
@@ -1,168 +0,0 @@
-name: "LeNet"
-layer {
-  name: "mnist"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TRAIN
-  }
-  transform_param {
-    scale: 0.00390625
-  }
-  data_param {
-    source: "../../datasets/mnist/caffe/mnist_train_lmdb"
-    batch_size: 64
-    backend: LMDB
-  }
-}
-layer {
-  name: "mnist"
-  type: "Data"
-  top: "data"
-  top: "label"
-  include {
-    phase: TEST
-  }
-  transform_param {
-    scale: 0.00390625
-  }
-  data_param {
-    source: "../../datasets/mnist/caffe/mnist_test_lmdb"
-    batch_size: 100
-    backend: LMDB
-  }
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv1"
-  param {
-    lr_mult: 1
-  }
-  param {
-    lr_mult: 2
-  }
-  convolution_param {
-    num_output: 20
-    kernel_size: 5
-    stride: 1
-    weight_filler {
-      type: "xavier"
-    }
-    bias_filler {
-      type: "constant"
-    }
-  }
-}
-layer {
-  name: "pool1"
-  type: "Pooling"
-  bottom: "conv1"
-  top: "pool1"
-  pooling_param {
-    pool: MAX
-    kernel_size: 2
-    stride: 2
-  }
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "pool1"
-  top: "conv2"
-  param {
-    lr_mult: 1
-  }
-  param {
-    lr_mult: 2
-  }
-  convolution_param {
-    num_output: 50
-    kernel_size: 5
-    stride: 1
-    weight_filler {
-      type: "xavier"
-    }
-    bias_filler {
-      type: "constant"
-    }
-  }
-}
-layer {
-  name: "pool2"
-  type: "Pooling"
-  bottom: "conv2"
-  top: "pool2"
-  pooling_param {
-    pool: MAX
-    kernel_size: 2
-    stride: 2
-  }
-}
-layer {
-  name: "ip1"
-  type: "InnerProduct"
-  bottom: "pool2"
-  top: "ip1"
-  param {
-    lr_mult: 1
-  }
-  param {
-    lr_mult: 2
-  }
-  inner_product_param {
-    num_output: 500
-    weight_filler {
-      type: "xavier"
-    }
-    bias_filler {
-      type: "constant"
-    }
-  }
-}
-layer {
-  name: "relu1"
-  type: "ReLU"
-  bottom: "ip1"
-  top: "ip1"
-}
-layer {
-  name: "ip2"
-  type: "InnerProduct"
-  bottom: "ip1"
-  top: "ip2"
-  param {
-    lr_mult: 1
-  }
-  param {
-    lr_mult: 2
-  }
-  inner_product_param {
-    num_output: 10
-    weight_filler {
-      type: "xavier"
-    }
-    bias_filler {
-      type: "constant"
-    }
-  }
-}
-layer {
-  name: "accuracy"
-  type: "Accuracy"
-  bottom: "ip2"
-  bottom: "label"
-  top: "accuracy"
-  include {
-    phase: TEST
-  }
-}
-layer {
-  name: "loss"
-  type: "SoftmaxWithLoss"
-  bottom: "ip2"
-  bottom: "label"
-  top: "loss"
-}
diff --git a/caffe/mnist_cmd/snapshots/.gitkeep b/caffe/mnist_cmd/snapshots/.gitkeep
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/caffe/mnist_cmd/submit_job_juron_python2.sh b/caffe/mnist_cmd/submit_job_juron_python2.sh
deleted file mode 100755
index b5ee63c60aa1dddad9708367d6623deccc57022f..0000000000000000000000000000000000000000
--- a/caffe/mnist_cmd/submit_job_juron_python2.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env bash
-
-#BSUB -q normal
-#BSUB -W 10
-#BSUB -n 1
-#BSUB -R "span[ptile=1]"
-#BSUB -gpu "num=1"
-#BSUB -e "error.%J.er"
-#BSUB -o "output_%J.out"
-#BSUB -J CAFFE_MNIST_CMD
-
-# Load the Python and Caffe modules
-module load python/2.7.14
-module load caffe/1.0-gcc_5.4.0-cuda_10.0.130
-
-# Train a model for MNIST
-caffe train --solver=lenet_solver.prototxt -gpu 0
\ No newline at end of file
diff --git a/caffe/mnist_cmd/submit_job_juron_python3.sh b/caffe/mnist_cmd/submit_job_juron_python3.sh
deleted file mode 100755
index bdac4a2aef6d670bff2fcf4a928bf3586df3781b..0000000000000000000000000000000000000000
--- a/caffe/mnist_cmd/submit_job_juron_python3.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env bash
-
-#BSUB -q normal
-#BSUB -W 10
-#BSUB -n 1
-#BSUB -R "span[ptile=1]"
-#BSUB -gpu "num=1"
-#BSUB -e "error.%J.er"
-#BSUB -o "output_%J.out"
-#BSUB -J CAFFE_MNIST_CMD
-
-# Load the Python and Caffe modules
-module load python/3.6.1
-module load caffe/1.0-gcc_5.4.0-cuda_10.0.130
-
-# Train a model for MNIST
-caffe train --solver=lenet_solver.prototxt -gpu 0
diff --git a/horovod_data_distributed/juwels_booster_job b/horovod_data_distributed/juwels_booster_job
new file mode 100755
index 0000000000000000000000000000000000000000..803e7648c08c91b5a40f30dde6ee758bbf0e688a
--- /dev/null
+++ b/horovod_data_distributed/juwels_booster_job
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TUTORIAL
+#SBATCH --gres=gpu:4
+#SBATCH --partition=booster
+
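+# Note: --ntasks=4 with --gres=gpu:4 on one node launches one Horovod rank
+# per GPU; each rank then binds itself to a single GPU in the Python script
+# via hvd.local_rank().
+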
+# Load the required modules
+module load GCC/9.3.0
+module load OpenMPI/4.1.0rc1
+module load mpi4py/3.0.3-Python-3.8.5
+module load TensorFlow/2.3.1-Python-3.8.5
+module load Horovod/0.20.3-Python-3.8.5
+
+# Enable MPI multi-threading for Horovod
+export HOROVOD_MPI_THREADS_DISABLE=0
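+# (Setting HOROVOD_MPI_THREADS_DISABLE=1 would disable MPI multi-threading,
+# which mpi4py requires; it is therefore explicitly kept enabled here.)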
+
+# Make all four GPUs on the node visible
+export CUDA_VISIBLE_DEVICES=0,1,2,3
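+# (Each rank then picks exactly one of these GPUs based on its local rank;
+# see initialize_hvd_and_mpi() in mnist_data_distributed.py.)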
+
+# Run the program
+srun python -u mnist_data_distributed.py
diff --git a/horovod_data_distributed/mnist_data_distributed.py b/horovod_data_distributed/mnist_data_distributed.py
index d4c68c19174058a41d0198b322a0f4035ef22419..b2335a83ed979ee77d2d30c2dc67d541c87ba2e4 100644
--- a/horovod_data_distributed/mnist_data_distributed.py
+++ b/horovod_data_distributed/mnist_data_distributed.py
@@ -20,7 +20,6 @@ import mpi4py
 import numpy as np
 import tensorflow as tf
 import horovod.tensorflow.keras as hvd
-from tensorflow.python.keras import backend as K
 
 from hpc4neuro.errors import MpiInitError
 from hpc4neuro.distribution import DataDistributor
@@ -102,10 +101,14 @@ def initialize_hvd_and_mpi():
 
     # Bind the local rank to a specific GPU, so that each rank uses
     # a different GPU
-    tf_config = tf.ConfigProto()
-    tf_config.gpu_options.allow_growth = True
-    tf_config.gpu_options.visible_device_list = str(hvd.local_rank())
-    K.set_session(tf.Session(config=tf_config))
+    gpus = tf.config.experimental.list_physical_devices('GPU')
+    for gpu in gpus:
+        tf.config.experimental.set_memory_growth(gpu, True)
+    if gpus:
+        tf.config.experimental.set_visible_devices(
+            gpus[hvd.local_rank()],
+            'GPU'
+        )
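+    # After this call each rank sees only its assigned GPU, which
+    # TensorFlow then addresses as the single logical device '/GPU:0'.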
 
     # Verify that MPI multi-threading is supported. Horovod cannot work
     # with mpi4py (or any other MPI library) otherwise.
@@ -113,8 +116,9 @@ def initialize_hvd_and_mpi():
     # https://www.mcs.anl.gov/research/projects/mpi/mpi-standard/mpi-report-2.0/node163.htm#Node163
     if not hvd.mpi_threads_supported():
         raise MpiInitError(
-            'MPI multi-threading is not supported. Horovod cannot work with mpi4py'
-            'in this case. Please enable MPI multi-threading and try again.'
+            'MPI multi-threading is not supported. Horovod cannot work with '
+            'mpi4py in this case. Please enable MPI multi-threading and try '
+            'again.'
         )
 
     # Disable automatic MPI initialization on importing mpi4py.MPI,
diff --git a/requirements.txt b/requirements.txt
index 79144dccd44dd967fb51438abbcd9589c6d81937..6a4def7c590fc30601ff616009ff388d1198e244 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,24 +1,41 @@
-absl-py==0.8.0
-astor==0.8.0
-cffi==1.12.3
-cloudpickle==1.2.1
-gast==0.3.1
-grpcio==1.23.0
+absl-py==0.12.0
+astunparse==1.6.3
+cachetools==4.2.1
+certifi==2020.12.5
+cffi==1.14.5
+chardet==4.0.0
+cloudpickle==1.6.0
+gast==0.3.3
+google-auth==1.29.0
+google-auth-oauthlib==0.4.4
+google-pasta==0.2.0
+grpcio==1.37.0
 h5py==2.10.0
-Markdown==3.1.1
-mock==3.0.5
-mpi4py==3.0.2
-numpy==1.17.2
-protobuf==3.9.1
-psutil==5.6.3
-pycparser==2.19
-six==1.12.0
-Werkzeug==0.15.6
-Keras-Applications==1.0.8
-Keras-Preprocessing==1.1.0
-tensorboard==1.13.1
-tensorflow-estimator==1.13.0
-tensorflow-gpu==1.13.1
+horovod==0.20.3
+hpc4neuro @ git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git@57a560b4085dba2ba3262d4d3238ef70991be877
+idna==2.10
+Keras-Preprocessing==1.1.2
+Markdown==3.3.4
+mpi4py==3.0.3
+numpy==1.18.5
+oauthlib==3.1.0
+opt-einsum==3.3.0
+protobuf==3.15.8
+psutil==5.8.0
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycparser==2.20
+PyYAML==5.4.1
+requests==2.25.1
+requests-oauthlib==1.3.0
+rsa==4.7.2
+six==1.15.0
+tensorboard==2.5.0
+tensorboard-data-server==0.6.0
+tensorboard-plugin-wit==1.8.0
+tensorflow==2.3.1
+tensorflow-estimator==2.3.0
 termcolor==1.1.0
-keras==2.3.1
-horovod==0.16.2
\ No newline at end of file
+urllib3==1.26.4
+Werkzeug==1.0.1
+wrapt==1.12.1
diff --git a/caffe/lenet_python/snapshots/.gitkeep b/tensorflow2/checkpoints/.gitkeep
similarity index 100%
rename from caffe/lenet_python/snapshots/.gitkeep
rename to tensorflow2/checkpoints/.gitkeep
diff --git a/tensorflow2/juwels_booster_job b/tensorflow2/juwels_booster_job
new file mode 100755
index 0000000000000000000000000000000000000000..625afac97d8e8a59df4cb5afdf7ec8233466752f
--- /dev/null
+++ b/tensorflow2/juwels_booster_job
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TUTORIAL
+#SBATCH --gres=gpu:4
+#SBATCH --partition=booster
+
+# Load the required modules
+module load GCC/9.3.0
+module load OpenMPI/4.1.0rc1
+module load TensorFlow/2.3.1-Python-3.8.5
+module load Horovod/0.20.3-Python-3.8.5
+
+# Enable MPI multi-threading for Horovod
+export HOROVOD_MPI_THREADS_DISABLE=0
+
+# Make all four GPUs on the node visible
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+# Run the program
+srun python -u mnist.py
diff --git a/tensorflow2/keras_mnist.py b/tensorflow2/keras_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..e44456030f75c420cfc49110292afcf338c62983
--- /dev/null
+++ b/tensorflow2/keras_mnist.py
@@ -0,0 +1,107 @@
+# Copyright 2019 Uber Technologies, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import sys
+
+import tensorflow as tf
+import horovod.tensorflow.keras as hvd
+
+# [HPCNS] Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../utils')
+from data_utils import DataValidator
+
+# [HPCNS] Name of the dataset file
+data_file = 'mnist/keras/mnist.npz'
+
+# [HPCNS] Path to the directory containing the dataset file
+data_dir = DataValidator.validated_data_dir(data_file)
+
+# Horovod: initialize Horovod.
+hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+gpus = tf.config.experimental.list_physical_devices('GPU')
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+if gpus:
+    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
+
+# [HPCNS] Fully qualified dataset file name
+dataset_file = os.path.join(data_dir, data_file)
+
+(mnist_images, mnist_labels), _ = \
+    tf.keras.datasets.mnist.load_data(dataset_file)
+
+dataset = tf.data.Dataset.from_tensor_slices(
+    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
+     tf.cast(mnist_labels, tf.int64))
+)
+dataset = dataset.repeat().shuffle(10000).batch(128)
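+# (Note: the dataset is not explicitly sharded across ranks; every worker
+# shuffles and draws its own batches, and since repeat() makes the stream
+# infinite, the epoch length is set below via steps_per_epoch.)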
+
+mnist_model = tf.keras.Sequential([
+    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
+    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
+    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
+    tf.keras.layers.Dropout(0.25),
+    tf.keras.layers.Flatten(),
+    tf.keras.layers.Dense(128, activation='relu'),
+    tf.keras.layers.Dropout(0.5),
+    tf.keras.layers.Dense(10, activation='softmax')
+])
+
+# Horovod: adjust learning rate based on number of GPUs.
+scaled_lr = 0.001 * hvd.size()
+opt = tf.optimizers.Adam(scaled_lr)
+
+# Horovod: add Horovod DistributedOptimizer.
+opt = hvd.DistributedOptimizer(opt)
+
+# Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
+# uses hvd.DistributedOptimizer() to compute gradients.
+mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
+                    optimizer=opt,
+                    metrics=['accuracy'],
+                    experimental_run_tf_function=False)
+
+callbacks = [
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
+
+    # Horovod: average metrics among workers at the end of every epoch.
+    #
+    # Note: This callback must be in the list before the ReduceLROnPlateau,
+    # TensorBoard or other metrics-based callbacks.
+    hvd.callbacks.MetricAverageCallback(),
+
+    # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
+    # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
+    # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
+    hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
+]
+
+# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
+if hvd.rank() == 0:
+    callbacks.append(tf.keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5'))
+
+# Horovod: write logs on worker 0.
+verbose = 1 if hvd.rank() == 0 else 0
+
+# Train the model.
+# Horovod: adjust number of steps based on number of GPUs.
+mnist_model.fit(dataset, steps_per_epoch=50 // hvd.size(), callbacks=callbacks, epochs=10, verbose=verbose)
\ No newline at end of file
diff --git a/tensorflow2/mnist.py b/tensorflow2/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..53cb1dad8d0c5705954e39df48ea895edd02fb62
--- /dev/null
+++ b/tensorflow2/mnist.py
@@ -0,0 +1,109 @@
+# Copyright 2019 Uber Technologies, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import sys
+
+import tensorflow as tf
+import horovod.tensorflow as hvd
+
+# [HPCNS] Import the DataValidator, which can then be used to
+# validate and load the path to the already downloaded dataset.
+sys.path.insert(0, '../utils')
+from data_utils import DataValidator
+
+# [HPCNS] Name of the dataset file
+data_file = 'mnist/keras/mnist.npz'
+
+# [HPCNS] Path to the directory containing the dataset file
+data_dir = DataValidator.validated_data_dir(data_file)
+
+# Horovod: initialize Horovod.
+hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+gpus = tf.config.experimental.list_physical_devices('GPU')
+for gpu in gpus:
+    tf.config.experimental.set_memory_growth(gpu, True)
+if gpus:
+    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
+
+# [HPCNS] Fully qualified dataset file name
+dataset_file = os.path.join(data_dir, data_file)
+
+(mnist_images, mnist_labels), _ = \
+    tf.keras.datasets.mnist.load_data(dataset_file)
+
+dataset = tf.data.Dataset.from_tensor_slices(
+    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
+     tf.cast(mnist_labels, tf.int64))
+)
+dataset = dataset.repeat().shuffle(10000).batch(128)
+
+mnist_model = tf.keras.Sequential([
+    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
+    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
+    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
+    tf.keras.layers.Dropout(0.25),
+    tf.keras.layers.Flatten(),
+    tf.keras.layers.Dense(128, activation='relu'),
+    tf.keras.layers.Dropout(0.5),
+    tf.keras.layers.Dense(10, activation='softmax')
+])
+loss = tf.losses.SparseCategoricalCrossentropy()
+
+# Horovod: adjust learning rate based on number of GPUs.
+opt = tf.optimizers.Adam(0.001 * hvd.size())
+
+checkpoint_dir = 'checkpoints/'
+checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)
+
+
+@tf.function
+def training_step(images, labels, first_batch):
+    with tf.GradientTape() as tape:
+        probs = mnist_model(images, training=True)
+        loss_value = loss(labels, probs)
+
+    # Horovod: add Horovod Distributed GradientTape.
+    tape = hvd.DistributedGradientTape(tape)
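+    # (The wrapped tape averages the gradients across all ranks with an
+    # allreduce when tape.gradient() is called below.)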
+
+    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
+    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
+
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    #
+    # Note: broadcast should be done after the first gradient step to ensure optimizer
+    # initialization.
+    if first_batch:
+        hvd.broadcast_variables(mnist_model.variables, root_rank=0)
+        hvd.broadcast_variables(opt.variables(), root_rank=0)
+
+    return loss_value
+
+
+# Horovod: adjust number of steps based on number of GPUs.
+for batch, (images, labels) in enumerate(dataset.take(1000 // hvd.size())):
+    loss_value = training_step(images, labels, batch == 0)
+
+    if batch % 10 == 0 and hvd.local_rank() == 0:
+        print('Step #%d\tLoss: %.6f' % (batch, loss_value))
+
+# Horovod: save checkpoints only on worker 0 to prevent other workers from
+# corrupting it.
+if hvd.rank() == 0:
+    checkpoint.save(checkpoint_dir)
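+
+    # (For example, a later run could resume from the latest snapshot with:
+    # checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)))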