From 456e3c270916d7a1d64f615498f65f5f1e7cd191 Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Sun, 17 Nov 2019 14:54:40 +0100
Subject: [PATCH 1/3] Added license information.

---
 .gitignore                            |  4 +-
 LICENSE                               | 26 ++++++++++
 NOTICE                                | 71 +++++++++++++++++++++++++++
 datasets/mnist/LICENSE                |  3 ++
 datasets/mnist/NOTICE                 | 12 +++++
 horovod/keras/mnist.py                |  5 ++
 horovod/keras/mnist_advanced.py       |  6 +++
 horovod/tensorflow/mnist.py           | 18 ++-----
 horovod/tensorflow/mnist_estimator.py | 19 ++-----
 keras/mnist.py                        |  6 +++
 requirements.txt                      | 24 +++++++++
 tensorflow/mnist.py                   | 18 ++-----
 utils/data_utils.py                   |  3 ++
 13 files changed, 172 insertions(+), 43 deletions(-)
 create mode 100644 LICENSE
 create mode 100644 NOTICE
 create mode 100644 datasets/mnist/LICENSE
 create mode 100644 datasets/mnist/NOTICE
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index 7f5ba6e..340d044 100644
--- a/.gitignore
+++ b/.gitignore
@@ -117,4 +117,6 @@ mnist_convnet_model/
 
 # Error and output files from the supercomputers
 *.er
-*.out
\ No newline at end of file
+*.out
+
+horovod/keras/mnist_data_distributed.py
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..78d9698
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,26 @@
+All contents of this work, except for the contents of the "datasets/mnist"
+sub-directory are licensed under The MIT License (see license details below).
+Contents of the "datasets/mnist" sub-directory are licensed under the Creative
+Commons Attribution-ShareAlike 3.0 Unported License (see "datasets/mnist/LICENSE").
+
+MIT License
+
+Copyright (c) 2019 Forschungszentrum Juelich GmbH
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..22a9d69
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,71 @@
+This project includes derived work from the following:
+
+
+Horovod
+Copyright 2018 Uber Technologies, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+Tensorflow
+Copyright 2016 The TensorFlow Authors.  All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+Keras
+All contributions by François Chollet:
+Copyright (c) 2015 - 2019, François Chollet.
+All rights reserved.
+
+All contributions by Google:
+Copyright (c) 2015 - 2019, Google, Inc.
+All rights reserved.
+
+All contributions by Microsoft:
+Copyright (c) 2017 - 2019, Microsoft, Inc.
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2015 - 2019, the respective contributors.
+All rights reserved.
+
+Licensed under The MIT License (MIT)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/datasets/mnist/LICENSE b/datasets/mnist/LICENSE
new file mode 100644
index 0000000..65f46d5
--- /dev/null
+++ b/datasets/mnist/LICENSE
@@ -0,0 +1,3 @@
+The mnist directory is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License.
+To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/ or send
+a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
diff --git a/datasets/mnist/NOTICE b/datasets/mnist/NOTICE
new file mode 100644
index 0000000..ff1a1c3
--- /dev/null
+++ b/datasets/mnist/NOTICE
@@ -0,0 +1,12 @@
+The contents of the mnist directory are derived from the MNIST dataset:
+
+Yann LeCun (Courant Institute, NYU) and Corinna Cortes (Google Labs, New York)
+hold the copyright of MNIST dataset (http://yann.lecun.com/exdb/mnist), which is
+a derivative work from original NIST datasets. MNIST dataset is made available
+under the terms of the Creative Commons Attribution-Share Alike 3.0 license. The
+license details are available via the following URL:
+
+http://creativecommons.org/licenses/by-sa/3.0/
+
+Individual images and labels have not been changed in this work. The only changes
+made are to the dataset format.
diff --git a/horovod/keras/mnist.py b/horovod/keras/mnist.py
index e31aa8a..0c46a77 100644
--- a/horovod/keras/mnist.py
+++ b/horovod/keras/mnist.py
@@ -1,3 +1,8 @@
+# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
+# This code is licensed under MIT license (see the LICENSE file for details).
+# This code is derived from https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py,
+# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details).
+
 from __future__ import print_function
 import os
 import sys
diff --git a/horovod/keras/mnist_advanced.py b/horovod/keras/mnist_advanced.py
index bf52fdd..ba60b6d 100644
--- a/horovod/keras/mnist_advanced.py
+++ b/horovod/keras/mnist_advanced.py
@@ -1,3 +1,9 @@
+# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
+# This code is licensed under MIT license (see the LICENSE file for details).
+# This code is derived from https://github.com/horovod/horovod/blob/master/examples/keras_mnist_advanced.py,
+# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details).
+
+
 from __future__ import print_function
 import os
 import sys
diff --git a/horovod/tensorflow/mnist.py b/horovod/tensorflow/mnist.py
index 8099f1c..3c780ac 100644
--- a/horovod/tensorflow/mnist.py
+++ b/horovod/tensorflow/mnist.py
@@ -1,17 +1,7 @@
-# Copyright 2017 Uber Technologies, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
+# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
+# This code is licensed under MIT license (see the LICENSE file for details).
+# This code is derived from https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist.py,
+# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details).
 
 import os
 import sys
diff --git a/horovod/tensorflow/mnist_estimator.py b/horovod/tensorflow/mnist_estimator.py
index 861de50..792c057 100644
--- a/horovod/tensorflow/mnist_estimator.py
+++ b/horovod/tensorflow/mnist_estimator.py
@@ -1,17 +1,8 @@
-#  Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
+# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
+# This code is licensed under MIT license (see the LICENSE file for details).
+# This code is derived from https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist_estimator.py,
+# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details).
+
 """Convolutional Neural Network Estimator for MNIST, built with tf.layers."""
 
 from __future__ import absolute_import
diff --git a/keras/mnist.py b/keras/mnist.py
index c183169..9fc93f2 100644
--- a/keras/mnist.py
+++ b/keras/mnist.py
@@ -1,3 +1,9 @@
+# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
+# This code is licensed under MIT license (see the LICENSE file for details).
+# This code is derived from https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py,
+# which is also licensed under The MIT License (see the NOTICE file for details).
+
+
 """Trains a simple convnet on the MNIST dataset.
 
 Gets to 99.25% test accuracy after 12 epochs
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..79144dc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,24 @@
+absl-py==0.8.0
+astor==0.8.0
+cffi==1.12.3
+cloudpickle==1.2.1
+gast==0.3.1
+grpcio==1.23.0
+h5py==2.10.0
+Markdown==3.1.1
+mock==3.0.5
+mpi4py==3.0.2
+numpy==1.17.2
+protobuf==3.9.1
+psutil==5.6.3
+pycparser==2.19
+six==1.12.0
+Werkzeug==0.15.6
+Keras-Applications==1.0.8
+Keras-Preprocessing==1.1.0
+tensorboard==1.13.1
+tensorflow-estimator==1.13.0
+tensorflow-gpu==1.13.1
+termcolor==1.1.0
+keras==2.3.1
+horovod==0.16.2
\ No newline at end of file
diff --git a/tensorflow/mnist.py b/tensorflow/mnist.py
index 7ba4bdc..30477e1 100644
--- a/tensorflow/mnist.py
+++ b/tensorflow/mnist.py
@@ -1,17 +1,7 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
+# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
+# This code is licensed under MIT license (see the LICENSE file for details).
+# This code is derived from https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py,
+# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details).
 
 """Simple, end-to-end, LeNet-5-like convolutional MNIST model example.
 
diff --git a/utils/data_utils.py b/utils/data_utils.py
index 0488856..bab6e03 100644
--- a/utils/data_utils.py
+++ b/utils/data_utils.py
@@ -1,3 +1,6 @@
+# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
+# This code is licensed under MIT license (see the LICENSE file for details).
+
 """
     A collections of utilities for data manipulation.
 
-- 
GitLab


From e1965bc6d99d2654266a662f085a746f06bc9f09 Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Sun, 17 Nov 2019 16:44:33 +0100
Subject: [PATCH 2/3] Added the data distributed MNIST example, along with the
 partitioned MNIST dataset.

---
 .gitattributes                           |   4 +
 .gitignore                               |   2 -
 datasets/mnist/partitioned/test/x/0.npy  |   3 +
 datasets/mnist/partitioned/test/x/1.npy  |   3 +
 datasets/mnist/partitioned/test/x/2.npy  |   3 +
 datasets/mnist/partitioned/test/x/3.npy  |   3 +
 datasets/mnist/partitioned/test/x/4.npy  |   3 +
 datasets/mnist/partitioned/test/x/5.npy  |   3 +
 datasets/mnist/partitioned/test/x/6.npy  |   3 +
 datasets/mnist/partitioned/test/x/7.npy  |   3 +
 datasets/mnist/partitioned/test/y/0.npy  |   3 +
 datasets/mnist/partitioned/test/y/1.npy  |   3 +
 datasets/mnist/partitioned/test/y/2.npy  |   3 +
 datasets/mnist/partitioned/test/y/3.npy  |   3 +
 datasets/mnist/partitioned/test/y/4.npy  |   3 +
 datasets/mnist/partitioned/test/y/5.npy  |   3 +
 datasets/mnist/partitioned/test/y/6.npy  |   3 +
 datasets/mnist/partitioned/test/y/7.npy  |   3 +
 datasets/mnist/partitioned/train/x/0.npy |   3 +
 datasets/mnist/partitioned/train/x/1.npy |   3 +
 datasets/mnist/partitioned/train/x/2.npy |   3 +
 datasets/mnist/partitioned/train/x/3.npy |   3 +
 datasets/mnist/partitioned/train/x/4.npy |   3 +
 datasets/mnist/partitioned/train/x/5.npy |   3 +
 datasets/mnist/partitioned/train/x/6.npy |   3 +
 datasets/mnist/partitioned/train/x/7.npy |   3 +
 datasets/mnist/partitioned/train/y/0.npy |   3 +
 datasets/mnist/partitioned/train/y/1.npy |   3 +
 datasets/mnist/partitioned/train/y/2.npy |   3 +
 datasets/mnist/partitioned/train/y/3.npy |   3 +
 datasets/mnist/partitioned/train/y/4.npy |   3 +
 datasets/mnist/partitioned/train/y/5.npy |   3 +
 datasets/mnist/partitioned/train/y/6.npy |   3 +
 datasets/mnist/partitioned/train/y/7.npy |   3 +
 horovod/keras/.run_mnist_data_dist       |   3 +
 horovod/keras/mnist_data_distributed.py  | 216 +++++++++++++++++++++++
 36 files changed, 319 insertions(+), 2 deletions(-)
 create mode 100644 datasets/mnist/partitioned/test/x/0.npy
 create mode 100644 datasets/mnist/partitioned/test/x/1.npy
 create mode 100644 datasets/mnist/partitioned/test/x/2.npy
 create mode 100644 datasets/mnist/partitioned/test/x/3.npy
 create mode 100644 datasets/mnist/partitioned/test/x/4.npy
 create mode 100644 datasets/mnist/partitioned/test/x/5.npy
 create mode 100644 datasets/mnist/partitioned/test/x/6.npy
 create mode 100644 datasets/mnist/partitioned/test/x/7.npy
 create mode 100644 datasets/mnist/partitioned/test/y/0.npy
 create mode 100644 datasets/mnist/partitioned/test/y/1.npy
 create mode 100644 datasets/mnist/partitioned/test/y/2.npy
 create mode 100644 datasets/mnist/partitioned/test/y/3.npy
 create mode 100644 datasets/mnist/partitioned/test/y/4.npy
 create mode 100644 datasets/mnist/partitioned/test/y/5.npy
 create mode 100644 datasets/mnist/partitioned/test/y/6.npy
 create mode 100644 datasets/mnist/partitioned/test/y/7.npy
 create mode 100644 datasets/mnist/partitioned/train/x/0.npy
 create mode 100644 datasets/mnist/partitioned/train/x/1.npy
 create mode 100644 datasets/mnist/partitioned/train/x/2.npy
 create mode 100644 datasets/mnist/partitioned/train/x/3.npy
 create mode 100644 datasets/mnist/partitioned/train/x/4.npy
 create mode 100644 datasets/mnist/partitioned/train/x/5.npy
 create mode 100644 datasets/mnist/partitioned/train/x/6.npy
 create mode 100644 datasets/mnist/partitioned/train/x/7.npy
 create mode 100644 datasets/mnist/partitioned/train/y/0.npy
 create mode 100644 datasets/mnist/partitioned/train/y/1.npy
 create mode 100644 datasets/mnist/partitioned/train/y/2.npy
 create mode 100644 datasets/mnist/partitioned/train/y/3.npy
 create mode 100644 datasets/mnist/partitioned/train/y/4.npy
 create mode 100644 datasets/mnist/partitioned/train/y/5.npy
 create mode 100644 datasets/mnist/partitioned/train/y/6.npy
 create mode 100644 datasets/mnist/partitioned/train/y/7.npy
 create mode 100755 horovod/keras/.run_mnist_data_dist
 create mode 100644 horovod/keras/mnist_data_distributed.py

diff --git a/.gitattributes b/.gitattributes
index 775c8fe..dbf6f0e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -9,3 +9,7 @@ datasets/mnist/raw/t10k-images-idx3-ubyte.gz filter=lfs diff=lfs merge=lfs -text
 datasets/mnist/raw/t10k-labels-idx1-ubyte.gz filter=lfs diff=lfs merge=lfs -text
 datasets/mnist/raw/train-images-idx3-ubyte.gz filter=lfs diff=lfs merge=lfs -text
 datasets/mnist/raw/train-labels-idx1-ubyte.gz filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/partitioned/train/x/*.npy filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/partitioned/train/y/*.npy filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/partitioned/test/x/*.npy filter=lfs diff=lfs merge=lfs -text
+datasets/mnist/partitioned/test/y/*.npy filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index 340d044..9c4d6d5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -118,5 +118,3 @@ mnist_convnet_model/
 # Error and output files from the supercomputers
 *.er
 *.out
-
-horovod/keras/mnist_data_distributed.py
diff --git a/datasets/mnist/partitioned/test/x/0.npy b/datasets/mnist/partitioned/test/x/0.npy
new file mode 100644
index 0000000..23c2d04
--- /dev/null
+++ b/datasets/mnist/partitioned/test/x/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d337b85aa5761f401dd8ef5485ec8365a0254febdf02dc75abd97e852e46672b
+size 980128
diff --git a/datasets/mnist/partitioned/test/x/1.npy b/datasets/mnist/partitioned/test/x/1.npy
new file mode 100644
index 0000000..a24cb47
--- /dev/null
+++ b/datasets/mnist/partitioned/test/x/1.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b13a84dd3c642af5a36cd64deda794dec5afe57985db5ae1ef70b2ab9c2c3da
+size 980128
diff --git a/datasets/mnist/partitioned/test/x/2.npy b/datasets/mnist/partitioned/test/x/2.npy
new file mode 100644
index 0000000..a4261b3
--- /dev/null
+++ b/datasets/mnist/partitioned/test/x/2.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce66171e3c983e0a7ac3bfd1e526fa311e1b635d24f3aaffbe9409037e369d88
+size 980128
diff --git a/datasets/mnist/partitioned/test/x/3.npy b/datasets/mnist/partitioned/test/x/3.npy
new file mode 100644
index 0000000..726e572
--- /dev/null
+++ b/datasets/mnist/partitioned/test/x/3.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfbc845e3668427f859c6dd687e1c440ede252bdb3f51349bb0250a7fc920c6a
+size 980128
diff --git a/datasets/mnist/partitioned/test/x/4.npy b/datasets/mnist/partitioned/test/x/4.npy
new file mode 100644
index 0000000..47dc5ed
--- /dev/null
+++ b/datasets/mnist/partitioned/test/x/4.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3842852bf7abd261b394b32432c97c922c2d4e566ae8a824bec762e96576f974
+size 980128
diff --git a/datasets/mnist/partitioned/test/x/5.npy b/datasets/mnist/partitioned/test/x/5.npy
new file mode 100644
index 0000000..6c18938
--- /dev/null
+++ b/datasets/mnist/partitioned/test/x/5.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56c0f2d2e2e6b4aae57a7f7bfd9cf4a35c81640095ca96c00f0ba6ea487dccb5
+size 980128
diff --git a/datasets/mnist/partitioned/test/x/6.npy b/datasets/mnist/partitioned/test/x/6.npy
new file mode 100644
index 0000000..2e08250
--- /dev/null
+++ b/datasets/mnist/partitioned/test/x/6.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2abcd772fa7008092d33452de27d91ee6865f4eedf876e5ad07265a3ec6a33ee
+size 980128
diff --git a/datasets/mnist/partitioned/test/x/7.npy b/datasets/mnist/partitioned/test/x/7.npy
new file mode 100644
index 0000000..7c3fea6
--- /dev/null
+++ b/datasets/mnist/partitioned/test/x/7.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7241c5639af8ad3c0197f8f6da20a6a6bea510e5fe8dc340e44a270f6f1efae
+size 980128
diff --git a/datasets/mnist/partitioned/test/y/0.npy b/datasets/mnist/partitioned/test/y/0.npy
new file mode 100644
index 0000000..e73ef7c
--- /dev/null
+++ b/datasets/mnist/partitioned/test/y/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:292a20ed0440011df9e0057018d8b0e2712963f206b2356f72294dc56d2cd305
+size 1378
diff --git a/datasets/mnist/partitioned/test/y/1.npy b/datasets/mnist/partitioned/test/y/1.npy
new file mode 100644
index 0000000..1bdbf1b
--- /dev/null
+++ b/datasets/mnist/partitioned/test/y/1.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:003428c745e9bf48a3c297dfce0465a2524557a13f4e8b9cc0c3c52c85b87ce0
+size 1378
diff --git a/datasets/mnist/partitioned/test/y/2.npy b/datasets/mnist/partitioned/test/y/2.npy
new file mode 100644
index 0000000..53ae68c
--- /dev/null
+++ b/datasets/mnist/partitioned/test/y/2.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d836ac22edbec15473d650cbe049473cc287d31d2b5e229a0b6ddb66edd20057
+size 1378
diff --git a/datasets/mnist/partitioned/test/y/3.npy b/datasets/mnist/partitioned/test/y/3.npy
new file mode 100644
index 0000000..fe6e890
--- /dev/null
+++ b/datasets/mnist/partitioned/test/y/3.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3074d0129a4571fd780fc94e17925122408aed7fcb5bb5425be00c2c26c7b1e1
+size 1378
diff --git a/datasets/mnist/partitioned/test/y/4.npy b/datasets/mnist/partitioned/test/y/4.npy
new file mode 100644
index 0000000..21cdec7
--- /dev/null
+++ b/datasets/mnist/partitioned/test/y/4.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16323c30f940623002003fd2ee96029b6c2b1fbb09583408499ada4748a0c537
+size 1378
diff --git a/datasets/mnist/partitioned/test/y/5.npy b/datasets/mnist/partitioned/test/y/5.npy
new file mode 100644
index 0000000..01f1a07
--- /dev/null
+++ b/datasets/mnist/partitioned/test/y/5.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:381c928395e7b1e17d537d4f4d42d752d18e5371a511a69d2038ab4f8d828aa3
+size 1378
diff --git a/datasets/mnist/partitioned/test/y/6.npy b/datasets/mnist/partitioned/test/y/6.npy
new file mode 100644
index 0000000..9d5e67f
--- /dev/null
+++ b/datasets/mnist/partitioned/test/y/6.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b6692f8f3e1623f9db7f1f0badda307d121538090418b83ba32a04b666809dc
+size 1378
diff --git a/datasets/mnist/partitioned/test/y/7.npy b/datasets/mnist/partitioned/test/y/7.npy
new file mode 100644
index 0000000..61a67b2
--- /dev/null
+++ b/datasets/mnist/partitioned/test/y/7.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d535f9dce9161fd74c357de58abc7c4a1ce6364c553837a4351d56f97f26ca53
+size 1378
diff --git a/datasets/mnist/partitioned/train/x/0.npy b/datasets/mnist/partitioned/train/x/0.npy
new file mode 100644
index 0000000..f897477
--- /dev/null
+++ b/datasets/mnist/partitioned/train/x/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76ab9b1dd7a661bea99a3b0dd91e81904b03855ea725e274fe8bd041780cf18f
+size 5880128
diff --git a/datasets/mnist/partitioned/train/x/1.npy b/datasets/mnist/partitioned/train/x/1.npy
new file mode 100644
index 0000000..26f3410
--- /dev/null
+++ b/datasets/mnist/partitioned/train/x/1.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4cbee1c3c137ba8fe6033d11a5f097eba3d581b14f813e57d5bb1a39be03b2c
+size 5880128
diff --git a/datasets/mnist/partitioned/train/x/2.npy b/datasets/mnist/partitioned/train/x/2.npy
new file mode 100644
index 0000000..a6a8225
--- /dev/null
+++ b/datasets/mnist/partitioned/train/x/2.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:507c0aa68fb2e56ef97b87f4d4033a51a05657980d1c721fd3247ba6ab78ecdd
+size 5880128
diff --git a/datasets/mnist/partitioned/train/x/3.npy b/datasets/mnist/partitioned/train/x/3.npy
new file mode 100644
index 0000000..603237a
--- /dev/null
+++ b/datasets/mnist/partitioned/train/x/3.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f31fb1045ed250ac6d0f1db02181821232ecf9fde11186ffdaffb41aa57f422
+size 5880128
diff --git a/datasets/mnist/partitioned/train/x/4.npy b/datasets/mnist/partitioned/train/x/4.npy
new file mode 100644
index 0000000..bdb707e
--- /dev/null
+++ b/datasets/mnist/partitioned/train/x/4.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2368ef45033da7492e3b5c5819d534158ca7b776cf4388bc8250bdcefd64bff5
+size 5880128
diff --git a/datasets/mnist/partitioned/train/x/5.npy b/datasets/mnist/partitioned/train/x/5.npy
new file mode 100644
index 0000000..3a7a11b
--- /dev/null
+++ b/datasets/mnist/partitioned/train/x/5.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:208beab7854df575b67b40191720c52866893779ea145e13a6da3e38b8fe7352
+size 5880128
diff --git a/datasets/mnist/partitioned/train/x/6.npy b/datasets/mnist/partitioned/train/x/6.npy
new file mode 100644
index 0000000..f598c3e
--- /dev/null
+++ b/datasets/mnist/partitioned/train/x/6.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e5313f1e80e0270b61982dddecaf68dd06cef978b4314af165cdcc39970e164
+size 5880128
diff --git a/datasets/mnist/partitioned/train/x/7.npy b/datasets/mnist/partitioned/train/x/7.npy
new file mode 100644
index 0000000..3a7db3d
--- /dev/null
+++ b/datasets/mnist/partitioned/train/x/7.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffe736405b00cef238fd06ffae326aba880400a1cf2e953c8f5ef5543f4e7c06
+size 5880128
diff --git a/datasets/mnist/partitioned/train/y/0.npy b/datasets/mnist/partitioned/train/y/0.npy
new file mode 100644
index 0000000..1fe9885
--- /dev/null
+++ b/datasets/mnist/partitioned/train/y/0.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:581d31f1482fd9ae2ad93bf018a31e24dd0b87c5e8299fa062b5b955ffff7f5e
+size 7628
diff --git a/datasets/mnist/partitioned/train/y/1.npy b/datasets/mnist/partitioned/train/y/1.npy
new file mode 100644
index 0000000..046dacf
--- /dev/null
+++ b/datasets/mnist/partitioned/train/y/1.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a14441423517041fb8fe61312b60754cd0d4eeb8956ef1060a7f57780a768367
+size 7628
diff --git a/datasets/mnist/partitioned/train/y/2.npy b/datasets/mnist/partitioned/train/y/2.npy
new file mode 100644
index 0000000..b257e23
--- /dev/null
+++ b/datasets/mnist/partitioned/train/y/2.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0234fa84a17b72bdd6ae29c87b4d2c1efccaec8f29e79adf9445a9cd008cd12
+size 7628
diff --git a/datasets/mnist/partitioned/train/y/3.npy b/datasets/mnist/partitioned/train/y/3.npy
new file mode 100644
index 0000000..659e670
--- /dev/null
+++ b/datasets/mnist/partitioned/train/y/3.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83c8af4ccb59b3a0343fd0f43db4b3b6796962fb871fa4a1e3470a7e506469c0
+size 7628
diff --git a/datasets/mnist/partitioned/train/y/4.npy b/datasets/mnist/partitioned/train/y/4.npy
new file mode 100644
index 0000000..a5c22bd
--- /dev/null
+++ b/datasets/mnist/partitioned/train/y/4.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:810b5e113d4913a7414b38294c8508e5607e0ff4a22c1e6c3c6fa7221ac37e40
+size 7628
diff --git a/datasets/mnist/partitioned/train/y/5.npy b/datasets/mnist/partitioned/train/y/5.npy
new file mode 100644
index 0000000..512d28d
--- /dev/null
+++ b/datasets/mnist/partitioned/train/y/5.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64c9e94f8b223ad58e2b19341d2b2f2d69823703a2bb664ae8a55565157136fc
+size 7628
diff --git a/datasets/mnist/partitioned/train/y/6.npy b/datasets/mnist/partitioned/train/y/6.npy
new file mode 100644
index 0000000..f7ad45e
--- /dev/null
+++ b/datasets/mnist/partitioned/train/y/6.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ff847487ba5695f4d39241efdb985aeb4e8e7cc70df5bac309bbe9f5025b292
+size 7628
diff --git a/datasets/mnist/partitioned/train/y/7.npy b/datasets/mnist/partitioned/train/y/7.npy
new file mode 100644
index 0000000..18a15d6
--- /dev/null
+++ b/datasets/mnist/partitioned/train/y/7.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c47569491fc4db006a4b173b2e4dc6f866390598342fa41b0b57e5b5e5f03ff0
+size 7628
diff --git a/horovod/keras/.run_mnist_data_dist b/horovod/keras/.run_mnist_data_dist
new file mode 100755
index 0000000..b9b19fa
--- /dev/null
+++ b/horovod/keras/.run_mnist_data_dist
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+PYTHONHASHSEED=0 mpirun -np 1 python -u mnist_data_distributed.py
diff --git a/horovod/keras/mnist_data_distributed.py b/horovod/keras/mnist_data_distributed.py
new file mode 100644
index 0000000..c1632cd
--- /dev/null
+++ b/horovod/keras/mnist_data_distributed.py
@@ -0,0 +1,216 @@
+# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
+# This code is licensed under MIT license (see the LICENSE file for details).
+
+"""
+    This program  program distributes the partitioned MNIST data across
+    multiple MPI ranks for truly data distributed training of a shallow ANN
+    for handwritten digit classification.
+
+    The Horovod framework is used for seamless distributed training. Instead
+    of distributing epochs, this program distributes data amongst the ranks,
+    so that each rank contributes training based on its local subset of the
+    training data.
+
+"""
+
+import os
+import sys
+
+import mpi4py
+import numpy as np
+import tensorflow as tf
+import horovod.tensorflow.keras as hvd
+from tensorflow.python.keras import backend as K
+
+from slns.errors import MpiInitError
+from slns.distribution import DataDistributor
+
+sys.path.insert(0, '../../utils')
+from data_utils import DataValidator
+
+
+def get_filenames(path):
+    """
+    Returns a list of names of files available on the given path.
+
+    :param path: str. Valid path to an existing directory.
+
+    :return: list. A list of filenames, where each filename is
+                   of type str.
+    """
+
+    absolute_path = os.path.join(os.path.abspath(f'{path}/x'))
+
+    return os.listdir(absolute_path)
+
+
+def get_concatenated_data(path, filenames):
+    """
+    Loads all files with the given filenames from the given path,
+    and concatenates all the loaded tensors into one large
+    tensor.
+
+    :param path: str. Valid path to an existing directory.
+    :param filenames: list. A list of filenames, where each filename is
+                   of type str.
+
+    :return: np.ndarray. A tensor with all the loaded content.
+    """
+
+    arrays = [
+        np.load(os.path.join(path, f)) for f in filenames
+    ]
+
+    return np.concatenate(arrays)
+
+
+def load_dataset(path, filenames):
+    """
+    Loads the input data and the corresponding labels as
+    two np.ndarray types, and returns these as a tuple.
+
+    :param path: str. Valid path to an existing directory.
+    :param filenames: list. A list of filenames, where each filename is
+                   of type str.
+
+    :return: Tuple consisting two np.ndarray types. The value at
+             the first tuple index is the input tensor, while the
+             other value is the corresponding array of labels.
+    """
+
+    x_dir = os.path.join(os.path.abspath(f'{path}/x'))
+    y_dir = os.path.join(os.path.abspath(f'{path}/y'))
+
+    x = get_concatenated_data(x_dir, filenames)
+    y = get_concatenated_data(y_dir, filenames)
+
+    return x, y
+
+
+def initialize_hvd_and_mpi():
+    """
+    Configure and initialize Horovod and MPI. Also, make sure there
+    are no conflicts between Horovod and mpi4py communicator
+    initialization.
+
+    :exception: hpcns.errors.MpiInitError is raised in the case
+                of initialization failure.
+    """
+
+    # Initialize Horovod.
+    hvd.init()
+
+    # Pin the GPU to be used to process local rank (one GPU per process)
+    tf_config = tf.ConfigProto()
+    tf_config.gpu_options.allow_growth = True
+    tf_config.gpu_options.visible_device_list = str(hvd.local_rank())
+    K.set_session(tf.Session(config=tf_config))
+
+    # Verify that MPI multi-threading is supported. Horovod cannot work
+    # with mpi4py (or any other MPI library) otherwise.
+    # More info on MPI multi-threading:
+    # https://www.mcs.anl.gov/research/projects/mpi/mpi-standard/mpi-report-2.0/node163.htm#Node163
+    if not hvd.mpi_threads_supported():
+        raise MpiInitError(
+            'MPI multi-threading is not supported. Horovod cannot work with mpi4py'
+            'in this case. Please enable MPI multi-threading and try again.'
+        )
+
+    # Disable automatic MPI initialization on importing mpi4py.MPI,
+    # as we are relying on Horovod to take care of the initialization.
+    mpi4py.rc.initialize = False
+
+    # Verify that Horovod and mpi4py are using the same number of ranks
+    from mpi4py import MPI
+    if hvd.size() != MPI.COMM_WORLD.Get_size():
+        raise MpiInitError(
+            'Mismatch in hvd.size() and MPI.COMM_WORLD size.'
+            f' No. of ranks in Horovod: {hvd.size()}.'
+            f' No. of ranks in mpi4py: {MPI.COMM_WORLD.Get_size()}'
+        )
+
+
+def main():
+    """ Orchestrates the distributed training program. """
+
+    # Configure and initialize Horovod and mpi4py
+    initialize_hvd_and_mpi()
+
+    # Flag to indicate whether this is the MPI root
+    is_root = hvd.rank() == 0
+
+    dist_decorator = DataDistributor(
+        mpi_comm=mpi4py.MPI.COMM_WORLD, shutdown_on_error=True
+    )
+    get_rank_local_filenames = dist_decorator(get_filenames)
+
+    data_sub_dir = 'mnist/partitioned'
+    data_dir = DataValidator.validated_data_dir(data_sub_dir)
+
+    train_filenames = get_rank_local_filenames(
+        f'{os.path.join(data_dir, data_sub_dir)}/train')
+    x_train, y_train = load_dataset(
+        f'{os.path.join(data_dir, data_sub_dir)}/train', train_filenames)
+
+    # Normalize input samples
+    x_train = x_train / 255.0
+
+    if is_root:
+        test_filenames = get_filenames(
+            f'{os.path.join(data_dir, data_sub_dir)}/test')
+        x_test, y_test = load_dataset(
+            f'{os.path.join(data_dir, data_sub_dir)}/test', test_filenames)
+        x_test = x_test / 255.0
+    else:
+        x_test, y_test = None, None
+
+    # Define the model, i.e., the network
+    model = tf.keras.models.Sequential([
+        tf.keras.layers.Flatten(),
+        tf.keras.layers.Dense(512, activation=tf.nn.relu),
+        tf.keras.layers.Dense(10, activation=tf.nn.softmax)
+    ])
+
+    # Optimizer
+    optimizer = tf.keras.optimizers.Adam()
+
+    # Horovod: add Horovod Distributed Optimizer.
+    optimizer = hvd.DistributedOptimizer(optimizer)
+
+    # Compile the model
+    model.compile(
+        optimizer=optimizer,
+        loss='sparse_categorical_crossentropy',
+        metrics=['accuracy']
+    )
+
+    # Fixed No. of epochs
+    epochs = 24
+
+    # Training callbacks
+    callbacks = [
+        # Horovod: broadcast initial variable states from rank 0 to all other processes.
+        # This is necessary to ensure consistent initialization of all workers when
+        # training is started with random weights or restored from a checkpoint.
+        hvd.callbacks.BroadcastGlobalVariablesCallback(0)
+    ]
+
+    # Train the model using the training set
+    model.fit(
+        x=x_train,
+        y=y_train,
+        batch_size=32,
+        epochs=epochs,
+        verbose=1 if is_root else 0,
+        callbacks=callbacks
+    )
+
+    if is_root:
+        # Test the model on the test set
+        score = model.evaluate(x=x_test, y=y_test, verbose=0)
+        print('Test loss:', score[0])
+        print('Test accuracy:', score[1])
+
+
+if __name__ == '__main__':
+    main()
-- 
GitLab


From 627d346af39e84a055bb37f2f3c18068b4a54d42 Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Mon, 18 Nov 2019 08:38:21 +0100
Subject: [PATCH 3/3] The code sample for proper data-distributed training has
 been moved to a separate directory. A directory-local README.md file contains
 setup instructions. An announcement has been added to the main README.md.

---
 README.md                                     | 11 +++++--
 horovod/keras/.run_mnist_data_dist            |  3 --
 horovod_data_distributed/README.md            | 24 +++++++++++++++
 .../mnist_data_distributed.py                 | 30 +++++++++++--------
 horovod_data_distributed/submit_job_juwels.sh | 24 +++++++++++++++
 5 files changed, 73 insertions(+), 19 deletions(-)
 delete mode 100755 horovod/keras/.run_mnist_data_dist
 create mode 100644 horovod_data_distributed/README.md
 rename {horovod/keras => horovod_data_distributed}/mnist_data_distributed.py (87%)
 create mode 100755 horovod_data_distributed/submit_job_juwels.sh

diff --git a/README.md b/README.md
index a36ef62..dc59596 100644
--- a/README.md
+++ b/README.md
@@ -15,12 +15,17 @@ visit [this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/wikis/E
 
 ### Announcements
 
-*  Tensorflow and Keras examples (with and without Horovod) are now fully functional on JUWELS as well.
-*  Python 2 support has been removed from the tutorial for all frameworks except Caffe.
-*  Even though PyTorch is available as as system-wide module on the JSC supercomputers, all PyTorch 
+*  **November 18, 2019:** The `horovod_data_distributed` directory has been added that contains code 
+samples to illustrate proper data-distributed training with Horovod, i.e., a distribution mechanism 
+where the training data is distributed instead of epochs. Further information is available in the 
+directory-local `README.md`.
+*  **September 02, 2019:** Even though PyTorch is available as a system-wide module on the JSC supercomputers, all PyTorch 
 examples have been removed from this tutorial. This is due to the fact that the tutorial
 developers are not currently working with PyTorch, and are therefore not in a position to provide
 support for PyTorch related issues.
+*  **August 23, 2019:**
+   *  Tensorflow and Keras examples (with and without Horovod) are now fully functional on JUWELS as well.
+   *  Python 2 support has been removed from the tutorial for all frameworks except Caffe.
 
 # Table of contents
 <!-- TOC -->
diff --git a/horovod/keras/.run_mnist_data_dist b/horovod/keras/.run_mnist_data_dist
deleted file mode 100755
index b9b19fa..0000000
--- a/horovod/keras/.run_mnist_data_dist
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/bin/env bash
-
-PYTHONHASHSEED=0 mpirun -np 1 python -u mnist_data_distributed.py
diff --git a/horovod_data_distributed/README.md b/horovod_data_distributed/README.md
new file mode 100644
index 0000000..c0934fc
--- /dev/null
+++ b/horovod_data_distributed/README.md
@@ -0,0 +1,24 @@
+# Introduction
+
+Please see the main docstring in each program for details.
+
+# Notes
+
+The `mnist_data_distributed.py` program requires the [`slns.distribution`](
+https://gitlab.version.fz-juelich.de/hpc4ns/slns_utils#1-slnsdistribution)
+module for distribution of training data filenames across multiple ranks. 
+Please follow the steps below to install the required package.
+
+1.  Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` 
+2.  Load the system-wide Python module.
+    *  On JURECA and JUWELS: `module load Python/3.6.8`
+    *  On JURON: `module load Python/3.6.1`
+3.  Create a Python virtual environment: `python -m venv venv_dl_slns`
+4.  Activate the virtual environment: `source venv_dl_slns/bin/activate`
+5.  Install the `slns` package: `python -m pip install git+https://gitlab.version.fz-juelich.de/hpc4ns/slns_utils.git`
+6.  Open the job submission script you intend to use, and make sure the path to the virtual environment is correct
+
+Once all the above steps are completed, the job can be submitted.
+
+**Note:** A maximum of eight ranks can be used to run `mnist_data_distributed.py`, as there are
+only eight training files.
\ No newline at end of file
diff --git a/horovod/keras/mnist_data_distributed.py b/horovod_data_distributed/mnist_data_distributed.py
similarity index 87%
rename from horovod/keras/mnist_data_distributed.py
rename to horovod_data_distributed/mnist_data_distributed.py
index c1632cd..b6add7e 100644
--- a/horovod/keras/mnist_data_distributed.py
+++ b/horovod_data_distributed/mnist_data_distributed.py
@@ -2,14 +2,14 @@
 # This code is licensed under MIT license (see the LICENSE file for details).
 
 """
-    This program  program distributes the partitioned MNIST data across
-    multiple MPI ranks for truly data distributed training of a shallow ANN
-    for handwritten digit classification.
+    This program distributes the partitioned MNIST data across multiple ranks
+    for truly data distributed training of a shallow ANN for handwritten digit
+    classification.
 
-    The Horovod framework is used for seamless distributed training. Instead
-    of distributing epochs, this program distributes data amongst the ranks,
-    so that each rank contributes training based on its local subset of the
-    training data.
+    The Horovod framework is used for seamless distributed training. However,
+    instead of distributing epochs, this program distributes data amongst the
+    ranks, so that each rank contributes training based on its local subset of
+    the training data.
 
 """
 
@@ -93,14 +93,15 @@ def initialize_hvd_and_mpi():
     are no conflicts between Horovod and mpi4py communicator
     initialization.
 
-    :exception: hpcns.errors.MpiInitError is raised in the case
+    :exception: slns.errors.MpiInitError is raised in the case
                 of initialization failure.
     """
 
     # Initialize Horovod.
     hvd.init()
 
-    # Pin the GPU to be used to process local rank (one GPU per process)
+    # Bind the local rank to a specific GPU, so that each rank uses
+    # a different GPU
     tf_config = tf.ConfigProto()
     tf_config.gpu_options.allow_growth = True
     tf_config.gpu_options.visible_device_list = str(hvd.local_rank())
@@ -139,14 +140,19 @@ def main():
     # Flag to indicate whether this is the MPI root
     is_root = hvd.rank() == 0
 
+    # Decorate the get_filenames function so that instead of returning
+    # a list of all filenames, it returns a list of the subset of
+    # filenames that are to be processed by the local rank.
     dist_decorator = DataDistributor(
         mpi_comm=mpi4py.MPI.COMM_WORLD, shutdown_on_error=True
     )
     get_rank_local_filenames = dist_decorator(get_filenames)
 
+    # Data directory paths
     data_sub_dir = 'mnist/partitioned'
     data_dir = DataValidator.validated_data_dir(data_sub_dir)
 
+    # Prepare training data
     train_filenames = get_rank_local_filenames(
         f'{os.path.join(data_dir, data_sub_dir)}/train')
     x_train, y_train = load_dataset(
@@ -156,6 +162,7 @@ def main():
     x_train = x_train / 255.0
 
     if is_root:
+        # Prepare test data
         test_filenames = get_filenames(
             f'{os.path.join(data_dir, data_sub_dir)}/test')
         x_test, y_test = load_dataset(
@@ -174,7 +181,7 @@ def main():
     # Optimizer
     optimizer = tf.keras.optimizers.Adam()
 
-    # Horovod: add Horovod Distributed Optimizer.
+    # Decorate the optimizer with the Horovod Distributed Optimizer
     optimizer = hvd.DistributedOptimizer(optimizer)
 
     # Compile the model
@@ -189,9 +196,6 @@ def main():
 
     # Training callbacks
     callbacks = [
-        # Horovod: broadcast initial variable states from rank 0 to all other processes.
-        # This is necessary to ensure consistent initialization of all workers when
-        # training is started with random weights or restored from a checkpoint.
         hvd.callbacks.BroadcastGlobalVariablesCallback(0)
     ]
 
diff --git a/horovod_data_distributed/submit_job_juwels.sh b/horovod_data_distributed/submit_job_juwels.sh
new file mode 100755
index 0000000..24f50d8
--- /dev/null
+++ b/horovod_data_distributed/submit_job_juwels.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=8
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=HVD_DATA_DIST
+#SBATCH --gres=gpu:4 --partition=develgpus
+#SBATCH --mail-type=ALL
+
+# Load the required modules
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
+
+# Source the virtual environment
+source venv_dl_slns/bin/activate
+
+# Run the program
+srun python -u mnist_data_distributed.py
-- 
GitLab