From 5729edf271194caebc77d3b14c9f92b7a69724ea Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Wed, 27 Nov 2019 09:34:09 +0100
Subject: [PATCH] Added code samples presented on slides as part of the intro
 to SC usage course.

---
 .../examples/mnist_epoch_distributed.py       | 99 +++++++++++++++++++
 course_material/examples/mnist_single_gpu.py  | 67 +++++++++++++
 course_material/slides/.gitkeep               |  0
 utils/data_utils.py                           | 12 +--
 4 files changed, 169 insertions(+), 9 deletions(-)
 create mode 100644 course_material/examples/mnist_epoch_distributed.py
 create mode 100644 course_material/examples/mnist_single_gpu.py
 create mode 100644 course_material/slides/.gitkeep

diff --git a/course_material/examples/mnist_epoch_distributed.py b/course_material/examples/mnist_epoch_distributed.py
new file mode 100644
index 0000000..7c9080e
--- /dev/null
+++ b/course_material/examples/mnist_epoch_distributed.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
+# This code is licensed under MIT license (see the LICENSE file for details).
+# This code is derived from Horovod, which is licensed under the Apache License,
+# Version 2.0 (see the NOTICE file for details).
+
+"""
+    This program is an adaptation of the following code sample:
+    https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py.
+    The program creates and trains a shallow ANN for handwritten digit
+    classification using the MNIST dataset.
+
+    The Horovod framework is used for seamless distributed training. In this
+    example, the epochs (not the data) are split across the Horovod ranks.
+
+    To run this sample use the following command on your
+    workstation/laptop equipped with a GPU:
+
+    mpirun -np 1 python -u mnist_epoch_distributed.py
+
+    If you have more than one GPU on your system, you can increase the
+    number of ranks accordingly.
+
+    The code has been tested with Python 3.7.5, tensorflow-gpu 1.13.1, and
+    horovod 0.16.2.
+
+    Note: This code will NOT work on the supercomputers.
+
+"""
+
+import math
+import tensorflow as tf
+import horovod.tensorflow.keras as hvd
+from tensorflow.python.keras import backend as K
+
+
+# Horovod: initialize Horovod.
+hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+config = tf.ConfigProto()
+config.gpu_options.visible_device_list = str(hvd.local_rank())
+K.set_session(tf.Session(config=config))
+
+# Reference to the MNIST dataset
+mnist = tf.keras.datasets.mnist
+
+# Load the MNIST dataset, split between train and test sets
+(x_train, y_train), (x_test, y_test) = mnist.load_data()
+
+# Normalize input samples
+x_train, x_test = x_train / 255.0, x_test / 255.0
+
+# Define the model, i.e., the network
+model = tf.keras.models.Sequential([
+    tf.keras.layers.Flatten(),
+    tf.keras.layers.Dense(512, activation=tf.nn.relu),
+    tf.keras.layers.Dense(10, activation=tf.nn.softmax)
+])
+
+# Optimizer
+optimizer = tf.keras.optimizers.Adam()
+
+# Decorate the optimizer with Horovod's distributed optimizer
+optimizer = hvd.DistributedOptimizer(optimizer)
+
+# Horovod: adjust number of epochs based on number of GPUs.
+epochs = int(math.ceil(4.0 / hvd.size()))
+
+# Compile the model
+model.compile(
+    optimizer=optimizer,
+    loss='sparse_categorical_crossentropy',
+    metrics=['accuracy']
+)
+
+# Training callbacks
+callbacks = [
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    hvd.callbacks.BroadcastGlobalVariablesCallback(0)
+]
+
+# Train the model using the training set
+model.fit(
+    x=x_train,
+    y=y_train,
+    batch_size=32,
+    epochs=epochs,
+    verbose=1 if hvd.rank() == 0 else 0,
+    callbacks=callbacks
+)
+
+# Run the test on the root rank only
+if hvd.rank() == 0:
+    # Test the model on the test set
+    score = model.evaluate(x=x_test, y=y_test, verbose=0)
+    print(f'Test loss:  {score[0]}')
+    print(f'Test accuracy: {score[1]}')
diff --git a/course_material/examples/mnist_single_gpu.py b/course_material/examples/mnist_single_gpu.py
new file mode 100644
index 0000000..794150f
--- /dev/null
+++ b/course_material/examples/mnist_single_gpu.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
+# This code is licensed under MIT license (see the LICENSE file for details).
+# This code is derived from TensorFlow tutorials, which are licensed under the Apache License,
+# Version 2.0 (see the NOTICE file for details).
+
+"""
+    This program is an adaptation of the code sample available at
+    https://www.tensorflow.org/tutorials/. The program creates
+    and trains a shallow ANN for handwritten digit classification
+    using the MNIST dataset.
+
+    To run this sample use the following command on your
+    workstation/laptop equipped with a GPU:
+
+    python -u mnist_single_gpu.py
+
+    The code has been tested with Python 3.7.5 and tensorflow-gpu 1.13.1.
+
+    Note: This code will NOT work on the supercomputers.
+
+"""
+
+import tensorflow as tf
+
+
+# Reference to the MNIST data object
+mnist = tf.keras.datasets.mnist
+
+# Load the MNIST dataset, split between train and test sets
+(x_train, y_train), (x_test, y_test) = mnist.load_data()
+
+# Normalize input samples
+x_train, x_test = x_train / 255.0, x_test / 255.0
+
+# Define the model, i.e., the network
+model = tf.keras.models.Sequential([
+    tf.keras.layers.Flatten(),
+    tf.keras.layers.Dense(512, activation=tf.nn.relu),
+    tf.keras.layers.Dense(10, activation=tf.nn.softmax)
+])
+
+# Optimizer
+optimizer = tf.keras.optimizers.Adam()
+
+# No. of epochs
+epochs = 4
+
+# Compile the model
+model.compile(
+    optimizer=optimizer,
+    loss='sparse_categorical_crossentropy',
+    metrics=['accuracy']
+)
+
+# Train the model using the training set
+model.fit(
+    x=x_train,
+    y=y_train,
+    batch_size=32,
+    epochs=epochs,
+    verbose=1
+)
+
+# Test the model using the test set
+score = model.evaluate(x=x_test, y=y_test, verbose=0)
+print(f'Test loss:  {score[0]}')
+print(f'Test accuracy: {score[1]}')
diff --git a/course_material/slides/.gitkeep b/course_material/slides/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/utils/data_utils.py b/utils/data_utils.py
index bab6e03..090337b 100644
--- a/utils/data_utils.py
+++ b/utils/data_utils.py
@@ -32,15 +32,9 @@ class DataValidator:
         recognized input data directory locations. If the check is passed,
         returns the fully qualified path to the input data directory.
 
-        Parameters
-        ----------
-        filename:
-            Name of the data file to be checked
-
-        Returns
-        -------
-        string:
-            Fully qualified path to the input data directory
+        :param filename: Name of the data file to be checked.
+
+        :return: str. Fully qualified path to the input data directory.
 
         """
 
-- 
GitLab