Issue 3
Merged: Ghost User requested to merge issue_3 into master (7 files changed, +180 −11)
 
# Copyright (c) 2019 Forschungszentrum Juelich GmbH.
# This code is licensed under MIT license (see the LICENSE file for details).
# This code is derived from Horovod, which is licensed under the Apache License,
# Version 2.0 (see the NOTICE file for details).
 
 
"""
 
This program is an adaptation of the following code sample:
 
https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py.
 
The program creates and trains a shallow ANN for handwritten digit
 
classification using the MNIST dataset.
 
 
The Horovod framework is used for seamless distributed training. In this
 
example epochs are distributed across the Horovod ranks, not data.
 
 
To run this sample use the following command on your
 
workstation/laptop equipped with a GPU:
 
 
mpirun -np 1 python -u mnist_epoch_distributed.py
 
 
If you have more than one GPU on your system, you can increase the
 
number of ranks accordingly.
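
For example, on a machine with four GPUs you could launch four ranks:

    mpirun -np 4 python -u mnist_epoch_distributed.py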
 
 
The code has been tested with Python 3.7.5, tensorflow-gpu 1.13.1, and
horovod 0.16.2.

Note: This code will NOT work on the supercomputers.
"""
 
 
import math

import tensorflow as tf
import horovod.tensorflow.keras as hvd
from tensorflow.python.keras import backend as K
 
 
# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin the GPU used by each process to its local rank (one GPU per
# process).
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))
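# Note: TensorFlow renumbers the visible devices, so after this pinning each
# process addresses its assigned GPU as /device:GPU:0.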
 
 
# Reference to the MNIST dataset
mnist = tf.keras.datasets.mnist

# Load the MNIST dataset, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Normalize input samples
x_train, x_test = x_train / 255.0, x_test / 255.0
 
 
# Define the model, i.e., the network
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation=tf.nn.relu),
    tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])
 
 
# Optimizer
optimizer = tf.keras.optimizers.Adam()

# Decorate the optimizer with Horovod's distributed optimizer
optimizer = hvd.DistributedOptimizer(optimizer)
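# The decorated optimizer averages the gradients from all ranks with an
# allreduce before each weight update, keeping the model replicas in sync.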
 
 
# Horovod: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(4.0 / hvd.size()))
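# e.g., hvd.size() == 1 gives ceil(4/1) = 4 epochs on a single rank, while
# hvd.size() == 4 gives ceil(4/4) = 1 epoch per rank, so four ranks together
# still perform roughly four epochs' worth of training.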
 
 
# Compile the model
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
 
 
# Training callbacks
callbacks = [
    # Horovod: broadcast initial variable states from rank 0 to all other
    # processes. This is necessary to ensure consistent initialization of all
    # workers when training is started with random weights or restored from a
    # checkpoint.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0)
]
 
 
# Train the model using the training set
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    # Show progress output on rank 0 only, to avoid duplicated logs
    verbose=1 if hvd.rank() == 0 else 0,
    callbacks=callbacks
)
 
 
# Run the test on the root rank only
if hvd.rank() == 0:
    # Test the model on the test set
    score = model.evaluate(x=x_test, y=y_test, verbose=0)
    print(f'Test loss: {score[0]}')
    print(f'Test accuracy: {score[1]}')