diff --git a/README.md b/README.md index 5af3ac4bacc3d563dab2dd9133dcc3ea5aadf1f9..a1e6ecb62ef15ba36a668a85a9248d621bd84fc9 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,9 @@ visit [this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/wikis/E ### Announcements +* **November 28, 2019:** Slides and code samples for the "Deep Learning on Supercomputers" talk given +as part of the [Introduction to the programming and usage of the supercomputer resources at Jülich](https://www.fz-juelich.de/SharedDocs/Termine/IAS/JSC/EN/courses/2019/supercomputer-2019-11.html?nn=944302) +course are now available in the `course_material` directory. * **November 22, 2019:** Samples for Caffe are no longer supported on JURECA due to system-wide MVAPICH2 module changes. * **November 18, 2019:** The `horovod_data_distributed` directory has been added that contains code diff --git a/course_material/README.md b/course_material/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e8bb64436ffaa398a40f6db8b549bbcaedec90a4 --- /dev/null +++ b/course_material/README.md @@ -0,0 +1,9 @@ +# Slides and code samples + +The slides and code samples in the sub-directories correspond to the introductory examples presented during the +"Deep Learning on Supercomputers" talk, which is given as +part of the [Introduction to the programming and usage of the supercomputer resources at Jülich](https://www.fz-juelich.de/SharedDocs/Termine/IAS/JSC/EN/courses/2019/supercomputer-2019-11.html?nn=944302) +course. + +**Note:** These code samples are NOT designed to work on our supercomputers. To see why, read `datasets/README.md`. +To run code samples on the supercomputers, please follow the main tutorial. 
\ No newline at end of file diff --git a/course_material/examples/mnist_epoch_distributed.py b/course_material/examples/mnist_epoch_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..7c9080e63af23eabeb6a6b47c8e89edf26e7190f --- /dev/null +++ b/course_material/examples/mnist_epoch_distributed.py @@ -0,0 +1,99 @@ +# Copyright (c) 2019 Forschungszentrum Juelich GmbH. +# This code is licensed under MIT license (see the LICENSE file for details). +# This code is derived from Horovod, which is licensed under the Apache License, +# Version 2.0 (see the NOTICE file for details). + +""" + This program is an adaptation of the following code sample: + https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py. + The program creates and trains a shallow ANN for handwritten digit + classification using the MNIST dataset. + + The Horovod framework is used for seamless distributed training. In this + example epochs are distributed across the Horovod ranks, not data. + + To run this sample use the following command on your + workstation/laptop equipped with a GPU: + + mpirun -np 1 python -u mnist_epoch_distributed.py + + If you have more than one GPU on your system, you can increase the + number of ranks accordingly. + + The code has been tested with Python 3.7.5, tensorflow-gpu 1.13.1, and + horovod 0.16.2. + + Note: This code will NOT work on the supercomputers. + +""" + +import math +import tensorflow as tf +import horovod.tensorflow.keras as hvd +from tensorflow.python.keras import backend as K + + +# Horovod: initialize Horovod. 
+hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +config = tf.ConfigProto() +config.gpu_options.visible_device_list = str(hvd.local_rank()) +K.set_session(tf.Session(config=config)) + +# Reference to the MNIST dataset +mnist = tf.keras.datasets.mnist + +# Load the MNIST dataset, split between train and test sets +(x_train, y_train), (x_test, y_test) = mnist.load_data() + +# Normalize input samples +x_train, x_test = x_train / 255.0, x_test / 255.0 + +# Define the model, i.e., the network +model = tf.keras.models.Sequential([ + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(512, activation=tf.nn.relu), + tf.keras.layers.Dense(10, activation=tf.nn.softmax) +]) + +# Optimizer +optimizer = tf.keras.optimizers.Adam() + +# Decorate the optimizer with Horovod's distributed optimizer +optimizer = hvd.DistributedOptimizer(optimizer) + +# Horovod: adjust number of epochs based on number of GPUs. +epochs = int(math.ceil(4.0 / hvd.size())) + +# Compile the model +model.compile( + optimizer=optimizer, + loss='sparse_categorical_crossentropy', + metrics=['accuracy'] +) + +# Training callbacks +callbacks = [ + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. 
+ hvd.callbacks.BroadcastGlobalVariablesCallback(0) +] + +# Train the model using the training set +model.fit( + x=x_train, + y=y_train, + batch_size=32, + epochs=epochs, + verbose=1 if hvd.rank() == 0 else 0, + callbacks=callbacks +) + +# Run the test on the root rank only +if hvd.rank() == 0: + # Test the model on the test set + score = model.evaluate(x=x_test, y=y_test, verbose=0) + print(f'Test loss: {score[0]}') + print(f'Test accuracy: {score[1]}') diff --git a/course_material/examples/mnist_single_gpu.py b/course_material/examples/mnist_single_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..794150fe230348b0001d86158d32a9a9e5e52cbd --- /dev/null +++ b/course_material/examples/mnist_single_gpu.py @@ -0,0 +1,67 @@ +# Copyright (c) 2019 Forschungszentrum Juelich GmbH. +# This code is licensed under MIT license (see the LICENSE file for details). +# This code is derived from Tensorflow tutorials, which is licensed under the Apache License, +# Version 2.0 (see the NOTICE file for details). + +""" + This program is an adaptation of the code sample available at + https://www.tensorflow.org/tutorials/. The program creates + and trains a shallow ANN for handwritten digit classification + using the MNIST dataset. + + To run this sample use the following command on your + workstation/laptop equipped with a GPU: + + python -u mnist_single_gpu.py + + The code has been tested with Python 3.7.5 and tensorflow-gpu 1.13.1. + + Note: This code will NOT work on the supercomputers. 
+ +""" + +import tensorflow as tf + + +# Reference to the MNIST data object +mnist = tf.keras.datasets.mnist + +# Load the MNIST dataset, split between train and test sets +(x_train, y_train), (x_test, y_test) = mnist.load_data() + +# Normalize input samples +x_train, x_test = x_train / 255.0, x_test / 255.0 + +# Define the model, i.e., the network +model = tf.keras.models.Sequential([ + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(512, activation=tf.nn.relu), + tf.keras.layers.Dense(10, activation=tf.nn.softmax) +]) + +# Optimizer +optimizer = tf.keras.optimizers.Adam() + +# No. of epochs +epochs = 4 + +# Compile the model +model.compile( + optimizer=optimizer, + loss='sparse_categorical_crossentropy', + metrics=['accuracy'] +) + +# Train the model using the training set +model.fit( + x=x_train, + y=y_train, + batch_size=32, + epochs=epochs, + verbose=1 +) + +# Test the model using the test set +score = model.evaluate(x=x_test, y=y_test, verbose=0) +print(f'Test loss: {score[0]}') +print(f'Test accuracy: {score[1]}') diff --git a/course_material/slides/.gitkeep b/course_material/slides/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/course_material/slides/fahad_DL_on_SCs_November_2019.pdf b/course_material/slides/fahad_DL_on_SCs_November_2019.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d0ddcb130c4bda3a571935530cf2ce7b19488bab Binary files /dev/null and b/course_material/slides/fahad_DL_on_SCs_November_2019.pdf differ diff --git a/utils/data_utils.py b/utils/data_utils.py index bab6e035ea82e22108998da561519a70eea94eac..f2d10e4111fe70fdc465287a3b5f18bff8d6981f 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -22,9 +22,6 @@ class DataValidator: """ - def __init__(self): - """ No-op constructor. """ - @staticmethod def validated_data_dir(filename): """ @@ -32,15 +29,9 @@ class DataValidator: recognized input data directory locations. 
If the check is passed, returns the fully qualified path to the input data directory. - Parameters - ---------- - filename: - Name of the data file to be checked + :param filename: Name of the data file to be checked. - Returns - ------- - string: - Fully qualified path to the input data directory + :return: str. Fully qualified path to the input data directory. """