From ba420e69e7e11e20692dd299574fe4d35baa680a Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Mon, 26 Apr 2021 15:19:58 +0200 Subject: [PATCH 1/8] Added and tested TF2 code on JUWEL Booster. --- caffe/README.md | 43 ----- .../.submit_job_jureca_python2.sh | 22 --- caffe/lenet_python/lenet_auto_solver.prototxt | 24 --- .../lenet_python/submit_job_juron_python2.sh | 17 -- .../lenet_python/submit_job_juron_python3.sh | 17 -- caffe/lenet_python/train_lenet.py | 107 ----------- caffe/mnist_cmd/.submit_job_jureca_python2.sh | 22 --- caffe/mnist_cmd/lenet_solver.prototxt | 25 --- caffe/mnist_cmd/lenet_train_test.prototxt | 168 ------------------ caffe/mnist_cmd/snapshots/.gitkeep | 0 caffe/mnist_cmd/submit_job_juron_python2.sh | 17 -- caffe/mnist_cmd/submit_job_juron_python3.sh | 17 -- horovod_data_distributed/juwels_booster_job | 28 +++ .../mnist_data_distributed.py | 18 +- requirements.txt | 61 ++++--- .../checkpoints}/.gitkeep | 0 tensorflow2/juwels_booster_job | 27 +++ tensorflow2/keras_mnist.py | 107 +++++++++++ tensorflow2/mnist.py | 109 ++++++++++++ 19 files changed, 321 insertions(+), 508 deletions(-) delete mode 100644 caffe/README.md delete mode 100755 caffe/lenet_python/.submit_job_jureca_python2.sh delete mode 100644 caffe/lenet_python/lenet_auto_solver.prototxt delete mode 100755 caffe/lenet_python/submit_job_juron_python2.sh delete mode 100755 caffe/lenet_python/submit_job_juron_python3.sh delete mode 100644 caffe/lenet_python/train_lenet.py delete mode 100755 caffe/mnist_cmd/.submit_job_jureca_python2.sh delete mode 100644 caffe/mnist_cmd/lenet_solver.prototxt delete mode 100644 caffe/mnist_cmd/lenet_train_test.prototxt delete mode 100644 caffe/mnist_cmd/snapshots/.gitkeep delete mode 100755 caffe/mnist_cmd/submit_job_juron_python2.sh delete mode 100755 caffe/mnist_cmd/submit_job_juron_python3.sh create mode 100755 horovod_data_distributed/juwels_booster_job rename {caffe/lenet_python/snapshots => tensorflow2/checkpoints}/.gitkeep (100%) create mode 100755 tensorflow2/juwels_booster_job create mode 100644 tensorflow2/keras_mnist.py create mode 100644 tensorflow2/mnist.py diff --git a/caffe/README.md b/caffe/README.md deleted file mode 100644 index 1804dce..0000000 --- a/caffe/README.md +++ /dev/null @@ -1,43 +0,0 @@ -**Caution:** Caffe is no longer being actively developed, which is why we prefer not to support -it as a system-wide module on the supercomputers for long. This is why Caffe is available with -Python 2 support only on JURECA, while it is not at all supported on JUWELS. The users are advised -to switch to other frameworks such as Tensorflow/Keras and PyTorch. - -# Notes - -There are three ways in which Caffe can be used, -1. As a command line tool with only built-in layers -2. As a library from within a Python program. Either only built-in layers can be used, -or one or more custom layers can be written in Python. -3. As a command line tool with one or more custom C++ layers. - -## Caffe as a command line tool - -The `mnist_cmd` sub-directory contains configuration and job scripts for running -Caffe as a command line tool with only built-in layers. This example represents use -case 1 as described above. The `lenet_solver.prototxt` and `lenet_train_test.prototxt` -were taken from the MNIST examples directory available in the Caffe repository -[here](https://github.com/BVLC/caffe/tree/master/examples/mnist). Minor changes have -been made just so the path to the input dataset is correct. 
The `caffe` command -in the job submission scripts can be modified as follows to run training on -all available GPUs on the node (value for the `-gpu` option has been changed from `0` to `all`): - - caffe train --solver=lenet_solver.prototxt -gpu all - -## Using Caffe within a Python program - -The `lenet_python` sub-directory contains the required files for an example of -using Caffe as a library from within a Python program. This corresponds to use case -2 as described above. The `train_lenet.py` file contains source code adapted from -the IPython notebook `01-learning-lenet.ipynb` available in the Caffe examples -[here](https://github.com/BVLC/caffe/tree/master/examples). Running this example -results in the generation of a learning curve plot in the current directory. - -## Caffe with custom C++ layers - -Working with custom C++ layers requires recompiling Caffe with the custom code. As -this is not possible with a system-wide installation, we have decided not to -include an example of this use case. Nevertheless, if you must work with custom -C++ layers and require assistance, please send an email to the JULAIN mailing list -(more information [here](https://lists.fz-juelich.de/mailman/listinfo/ml)). - diff --git a/caffe/lenet_python/.submit_job_jureca_python2.sh b/caffe/lenet_python/.submit_job_jureca_python2.sh deleted file mode 100755 index 7506925..0000000 --- a/caffe/lenet_python/.submit_job_jureca_python2.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=CAFFE_LENET_PYTHON -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/Devel-2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load Caffe/1.0-Python-2.7.15 - -# Run the program -srun python -u train_lenet.py diff --git a/caffe/lenet_python/lenet_auto_solver.prototxt b/caffe/lenet_python/lenet_auto_solver.prototxt deleted file mode 100644 index 44af3ad..0000000 --- a/caffe/lenet_python/lenet_auto_solver.prototxt +++ /dev/null @@ -1,24 +0,0 @@ -# The train/test net protocol buffer definition -train_net: "lenet_auto_train.prototxt" -test_net: "lenet_auto_test.prototxt" -# test_iter specifies how many forward passes the test should carry out. -# In the case of MNIST, we have test batch size 100 and 100 test iterations, -# covering the full 10,000 testing images. -test_iter: 100 -# Carry out testing every 500 training iterations. -test_interval: 500 -# The base learning rate, momentum and the weight decay of the network. 
-base_lr: 0.01 -momentum: 0.9 -weight_decay: 0.0005 -# The learning rate policy -lr_policy: "inv" -gamma: 0.0001 -power: 0.75 -# Display every 100 iterations -display: 100 -# The maximum number of iterations -max_iter: 10000 -# snapshot intermediate results -snapshot: 5000 -snapshot_prefix: "snapshots/lenet" diff --git a/caffe/lenet_python/submit_job_juron_python2.sh b/caffe/lenet_python/submit_job_juron_python2.sh deleted file mode 100755 index 2025a38..0000000 --- a/caffe/lenet_python/submit_job_juron_python2.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_LENET_PYTHON - -# Load the Python and Caffe modules -module load python/2.7.14 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train LeNet -python -u train_lenet.py diff --git a/caffe/lenet_python/submit_job_juron_python3.sh b/caffe/lenet_python/submit_job_juron_python3.sh deleted file mode 100755 index 7e73776..0000000 --- a/caffe/lenet_python/submit_job_juron_python3.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_LENET_PYTHON - -# Load the Python and Caffe modules -module load python/3.6.1 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train LeNet -python -u train_lenet.py diff --git a/caffe/lenet_python/train_lenet.py b/caffe/lenet_python/train_lenet.py deleted file mode 100644 index ad5cae3..0000000 --- a/caffe/lenet_python/train_lenet.py +++ /dev/null @@ -1,107 +0,0 @@ -import os -import sys -import matplotlib - -# Force matplotlib to not use any Xwindows backend. -matplotlib.use('Agg') -import pylab - -import caffe -from caffe import layers as L, params as P - -# Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - - -# Prepares network specification -def lenet(lmdb, batch_size): - # Caffe's version of LeNet: a series of linear and simple nonlinear transformations - n = caffe.NetSpec() - - n.data, n.label = L.Data(batch_size=batch_size, backend=P.Data.LMDB, source=lmdb, - transform_param=dict(scale=1. 
/ 255), ntop=2) - - n.conv1 = L.Convolution(n.data, kernel_size=5, num_output=20, weight_filler=dict(type='xavier')) - n.pool1 = L.Pooling(n.conv1, kernel_size=2, stride=2, pool=P.Pooling.MAX) - n.conv2 = L.Convolution(n.pool1, kernel_size=5, num_output=50, weight_filler=dict(type='xavier')) - n.pool2 = L.Pooling(n.conv2, kernel_size=2, stride=2, pool=P.Pooling.MAX) - n.fc1 = L.InnerProduct(n.pool2, num_output=500, weight_filler=dict(type='xavier')) - n.relu1 = L.ReLU(n.fc1, in_place=True) - n.score = L.InnerProduct(n.relu1, num_output=10, weight_filler=dict(type='xavier')) - n.loss = L.SoftmaxWithLoss(n.score, n.label) - - return n.to_proto() - - -# Names of the directories containing the LMDB files for TRAIN and TEST phases -test_dir = 'mnist/caffe/mnist_test_lmdb' -train_dir = 'mnist/caffe/mnist_train_lmdb' - -# Validated path to the data root -DataValidator.validated_data_dir(train_dir) -data_dir = DataValidator.validated_data_dir(test_dir) - -# Write the prototxt for TRAIN phase -with open('lenet_auto_train.prototxt', 'w') as f: - f.write(str(lenet(os.path.join(data_dir, train_dir), 64))) - -# Write the prototxt for TEST phase -with open('lenet_auto_test.prototxt', 'w') as f: - f.write(str(lenet(os.path.join(data_dir, test_dir), 100))) - -# Use the GPU for training -caffe.set_device(0) -caffe.set_mode_gpu() - -# Load the solver and create train and test nets -solver = None # ignore this workaround for lmdb data (can't instantiate two solvers on the same data) -solver = caffe.SGDSolver('lenet_auto_solver.prototxt') - -solver.net.forward() # train net -solver.test_nets[0].forward() # test net (there can be more than one) - -niter = 200 -test_interval = 25 -# losses will also be stored in the log -train_loss = pylab.zeros(niter) -test_acc = pylab.zeros(int(pylab.ceil(niter / test_interval))) -output = pylab.zeros((niter, 8, 10)) - -# the main solver loop -for it in range(niter): - solver.step(1) # SGD by Caffe - - # store the train loss - train_loss[it] = solver.net.blobs['loss'].data - - # store the output on the first test batch - # (start the forward pass at conv1 to avoid loading new data) - solver.test_nets[0].forward(start='conv1') - output[it] = solver.test_nets[0].blobs['score'].data[:8] - - # run a full test every so often - # (Caffe can also do this for us and write to a log, but we show here - # how to do it directly in Python, where more complicated things are easier.) - if it % test_interval == 0: - print('Iteration', it, 'testing...') - correct = 0 - for test_it in range(100): - solver.test_nets[0].forward() - correct += sum(solver.test_nets[0].blobs['score'].data.argmax(1) - == solver.test_nets[0].blobs['label'].data) - test_acc[it // test_interval] = correct / 1e4 - -# Plot the training curve -_, ax1 = pylab.subplots() -ax2 = ax1.twinx() -ax1.plot(pylab.arange(niter), train_loss) -ax2.plot(test_interval * pylab.arange(len(test_acc)), test_acc, 'r') -ax1.set_xlabel('iteration') -ax1.set_ylabel('train loss') -ax2.set_ylabel('test accuracy') -ax2.set_title('Test Accuracy: {:.2f}'.format(test_acc[-1])) - -# Save the plot to file. 
Use "bbox_inches='tight'" to remove surrounding whitespace -pylab.savefig('learning_curve.png', bbox_inches='tight') diff --git a/caffe/mnist_cmd/.submit_job_jureca_python2.sh b/caffe/mnist_cmd/.submit_job_jureca_python2.sh deleted file mode 100755 index 029520e..0000000 --- a/caffe/mnist_cmd/.submit_job_jureca_python2.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=CAFFE_MNIST_CMD -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/Devel-2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load Caffe/1.0-Python-2.7.15 - -# Train the model using the 'caffe' binary -srun caffe train --solver=lenet_solver.prototxt -gpu 0 \ No newline at end of file diff --git a/caffe/mnist_cmd/lenet_solver.prototxt b/caffe/mnist_cmd/lenet_solver.prototxt deleted file mode 100644 index 103b2e7..0000000 --- a/caffe/mnist_cmd/lenet_solver.prototxt +++ /dev/null @@ -1,25 +0,0 @@ -# The train/test net protocol buffer definition -net: "lenet_train_test.prototxt" -# test_iter specifies how many forward passes the test should carry out. -# In the case of MNIST, we have test batch size 100 and 100 test iterations, -# covering the full 10,000 testing images. -test_iter: 100 -# Carry out testing every 500 training iterations. -test_interval: 500 -# The base learning rate, momentum and the weight decay of the network. -base_lr: 0.01 -momentum: 0.9 -weight_decay: 0.0005 -# The learning rate policy -lr_policy: "inv" -gamma: 0.0001 -power: 0.75 -# Display every 100 iterations -display: 100 -# The maximum number of iterations -max_iter: 10000 -# snapshot intermediate results -snapshot: 5000 -snapshot_prefix: "snapshots/lenet" -# solver mode: CPU or GPU -solver_mode: GPU diff --git a/caffe/mnist_cmd/lenet_train_test.prototxt b/caffe/mnist_cmd/lenet_train_test.prototxt deleted file mode 100644 index f34ab71..0000000 --- a/caffe/mnist_cmd/lenet_train_test.prototxt +++ /dev/null @@ -1,168 +0,0 @@ -name: "LeNet" -layer { - name: "mnist" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - scale: 0.00390625 - } - data_param { - source: "../../datasets/mnist/caffe/mnist_train_lmdb" - batch_size: 64 - backend: LMDB - } -} -layer { - name: "mnist" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - scale: 0.00390625 - } - data_param { - source: "../../datasets/mnist/caffe/mnist_test_lmdb" - batch_size: 100 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 20 - kernel_size: 5 - stride: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "conv1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 2 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 50 - kernel_size: 5 - stride: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "conv2" - top: "pool2" - 
pooling_param { - pool: MAX - kernel_size: 2 - stride: 2 - } -} -layer { - name: "ip1" - type: "InnerProduct" - bottom: "pool2" - top: "ip1" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - inner_product_param { - num_output: 500 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "ip1" - top: "ip1" -} -layer { - name: "ip2" - type: "InnerProduct" - bottom: "ip1" - top: "ip2" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - inner_product_param { - num_output: 10 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "accuracy" - type: "Accuracy" - bottom: "ip2" - bottom: "label" - top: "accuracy" - include { - phase: TEST - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "ip2" - bottom: "label" - top: "loss" -} diff --git a/caffe/mnist_cmd/snapshots/.gitkeep b/caffe/mnist_cmd/snapshots/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/caffe/mnist_cmd/submit_job_juron_python2.sh b/caffe/mnist_cmd/submit_job_juron_python2.sh deleted file mode 100755 index b5ee63c..0000000 --- a/caffe/mnist_cmd/submit_job_juron_python2.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_MNIST_CMD - -# Load the Python and Caffe modules -module load python/2.7.14 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train a model for MNIST -caffe train --solver=lenet_solver.prototxt -gpu 0 \ No newline at end of file diff --git a/caffe/mnist_cmd/submit_job_juron_python3.sh b/caffe/mnist_cmd/submit_job_juron_python3.sh deleted file mode 100755 index bdac4a2..0000000 --- a/caffe/mnist_cmd/submit_job_juron_python3.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_MNIST_CMD - -# Load the Python and Caffe modules -module load python/3.6.1 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train a model for MNIST -caffe train --solver=lenet_solver.prototxt -gpu 0 diff --git a/horovod_data_distributed/juwels_booster_job b/horovod_data_distributed/juwels_booster_job new file mode 100755 index 0000000..803e764 --- /dev/null +++ b/horovod_data_distributed/juwels_booster_job @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=booster + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/mnist_data_distributed.py b/horovod_data_distributed/mnist_data_distributed.py index d4c68c1..b2335a8 100644 --- a/horovod_data_distributed/mnist_data_distributed.py +++ b/horovod_data_distributed/mnist_data_distributed.py @@ -20,7 +20,6 @@ import mpi4py import numpy as np import tensorflow as tf import 
horovod.tensorflow.keras as hvd -from tensorflow.python.keras import backend as K from hpc4neuro.errors import MpiInitError from hpc4neuro.distribution import DataDistributor @@ -102,10 +101,14 @@ def initialize_hvd_and_mpi(): # Bind the local rank to a specific GPU, so that each rank uses # a different GPU - tf_config = tf.ConfigProto() - tf_config.gpu_options.allow_growth = True - tf_config.gpu_options.visible_device_list = str(hvd.local_rank()) - K.set_session(tf.Session(config=tf_config)) + gpus = tf.config.experimental.list_physical_devices('GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + if gpus: + tf.config.experimental.set_visible_devices( + gpus[hvd.local_rank()], + 'GPU' + ) # Verify that MPI multi-threading is supported. Horovod cannot work # with mpi4py (or any other MPI library) otherwise. @@ -113,8 +116,9 @@ def initialize_hvd_and_mpi(): # https://www.mcs.anl.gov/research/projects/mpi/mpi-standard/mpi-report-2.0/node163.htm#Node163 if not hvd.mpi_threads_supported(): raise MpiInitError( - 'MPI multi-threading is not supported. Horovod cannot work with mpi4py' - 'in this case. Please enable MPI multi-threading and try again.' + 'MPI multi-threading is not supported. Horovod cannot work with ' + 'mpi4py in this case. Please enable MPI multi-threading and try ' + 'again.' ) # Disable automatic MPI initialization on importing mpi4py.MPI, diff --git a/requirements.txt b/requirements.txt index 79144dc..6a4def7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,24 +1,41 @@ -absl-py==0.8.0 -astor==0.8.0 -cffi==1.12.3 -cloudpickle==1.2.1 -gast==0.3.1 -grpcio==1.23.0 +absl-py==0.12.0 +astunparse==1.6.3 +cachetools==4.2.1 +certifi==2020.12.5 +cffi==1.14.5 +chardet==4.0.0 +cloudpickle==1.6.0 +gast==0.3.3 +google-auth==1.29.0 +google-auth-oauthlib==0.4.4 +google-pasta==0.2.0 +grpcio==1.37.0 h5py==2.10.0 -Markdown==3.1.1 -mock==3.0.5 -mpi4py==3.0.2 -numpy==1.17.2 -protobuf==3.9.1 -psutil==5.6.3 -pycparser==2.19 -six==1.12.0 -Werkzeug==0.15.6 -Keras-Applications==1.0.8 -Keras-Preprocessing==1.1.0 -tensorboard==1.13.1 -tensorflow-estimator==1.13.0 -tensorflow-gpu==1.13.1 +horovod==0.20.3 +hpc4neuro @ git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git@57a560b4085dba2ba3262d4d3238ef70991be877 +idna==2.10 +Keras-Preprocessing==1.1.2 +Markdown==3.3.4 +mpi4py==3.0.3 +numpy==1.18.5 +oauthlib==3.1.0 +opt-einsum==3.3.0 +protobuf==3.15.8 +psutil==5.8.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycparser==2.20 +PyYAML==5.4.1 +requests==2.25.1 +requests-oauthlib==1.3.0 +rsa==4.7.2 +six==1.15.0 +tensorboard==2.5.0 +tensorboard-data-server==0.6.0 +tensorboard-plugin-wit==1.8.0 +tensorflow==2.3.1 +tensorflow-estimator==2.3.0 termcolor==1.1.0 -keras==2.3.1 -horovod==0.16.2 \ No newline at end of file +urllib3==1.26.4 +Werkzeug==1.0.1 +wrapt==1.12.1 diff --git a/caffe/lenet_python/snapshots/.gitkeep b/tensorflow2/checkpoints/.gitkeep similarity index 100% rename from caffe/lenet_python/snapshots/.gitkeep rename to tensorflow2/checkpoints/.gitkeep diff --git a/tensorflow2/juwels_booster_job b/tensorflow2/juwels_booster_job new file mode 100755 index 0000000..625afac --- /dev/null +++ b/tensorflow2/juwels_booster_job @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=booster + +# Load the required modules +module load 
GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow2/keras_mnist.py b/tensorflow2/keras_mnist.py new file mode 100644 index 0000000..e444560 --- /dev/null +++ b/tensorflow2/keras_mnist.py @@ -0,0 +1,107 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import sys + +import tensorflow as tf +import horovod.tensorflow.keras as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +(mnist_images, mnist_labels), _ = \ + tf.keras.datasets.mnist.load_data(dataset_file) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), + tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation='softmax') +]) + +# Horovod: adjust learning rate based on number of GPUs. +scaled_lr = 0.001 * hvd.size() +opt = tf.optimizers.Adam(scaled_lr) + +# Horovod: add Horovod DistributedOptimizer. +opt = hvd.DistributedOptimizer(opt) + +# Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow +# uses hvd.DistributedOptimizer() to compute gradients. +mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(), + optimizer=opt, + metrics=['accuracy'], + experimental_run_tf_function=False) + +callbacks = [ + # Horovod: broadcast initial variable states from rank 0 to all other processes. 
+ # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + hvd.callbacks.BroadcastGlobalVariablesCallback(0), + + # Horovod: average metrics among workers at the end of every epoch. + # + # Note: This callback must be in the list before the ReduceLROnPlateau, + # TensorBoard or other metrics-based callbacks. + hvd.callbacks.MetricAverageCallback(), + + # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final + # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during + # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. + hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1), +] + +# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. +if hvd.rank() == 0: + callbacks.append(tf.keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) + +# Horovod: write logs on worker 0. +verbose = 1 if hvd.rank() == 0 else 0 + +# Train the model. +# Horovod: adjust number of steps based on number of GPUs. +mnist_model.fit(dataset, steps_per_epoch=50 // hvd.size(), callbacks=callbacks, epochs=10, verbose=verbose) \ No newline at end of file diff --git a/tensorflow2/mnist.py b/tensorflow2/mnist.py new file mode 100644 index 0000000..53cb1da --- /dev/null +++ b/tensorflow2/mnist.py @@ -0,0 +1,109 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import sys + +import tensorflow as tf +import horovod.tensorflow as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. 
+hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +(mnist_images, mnist_labels), _ = \ + tf.keras.datasets.mnist.load_data(dataset_file) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), + tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation='softmax') +]) +loss = tf.losses.SparseCategoricalCrossentropy() + +# Horovod: adjust learning rate based on number of GPUs. +opt = tf.optimizers.Adam(0.001 * hvd.size()) + +checkpoint_dir = 'checkpoints/' +checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) + + +@tf.function +def training_step(images, labels, first_batch): + with tf.GradientTape() as tape: + probs = mnist_model(images, training=True) + loss_value = loss(labels, probs) + + # Horovod: add Horovod Distributed GradientTape. + tape = hvd.DistributedGradientTape(tape) + + grads = tape.gradient(loss_value, mnist_model.trainable_variables) + opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) + + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + # + # Note: broadcast should be done after the first gradient step to ensure optimizer + # initialization. + if first_batch: + hvd.broadcast_variables(mnist_model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) + + return loss_value + + +# Horovod: adjust number of steps based on number of GPUs. +for batch, (images, labels) in enumerate(dataset.take(1000 // hvd.size())): + loss_value = training_step(images, labels, batch == 0) + + if batch % 10 == 0 and hvd.local_rank() == 0: + print('Step #%d\tLoss: %.6f' % (batch, loss_value)) + +# Horovod: save checkpoints only on worker 0 to prevent other workers from +# corrupting it. +if hvd.rank() == 0: + checkpoint.save(checkpoint_dir) -- GitLab From 315aca15e05fbd95df2a1c66dab09e0089f7c016 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Mon, 26 Apr 2021 19:53:57 +0200 Subject: [PATCH 2/8] The first complete tutorial overhaul with Tensorflow2. 
--- .gitattributes | 4 - .gitignore | 3 + README.md | 335 ++++++-------- datasets/mnist/caffe/mnist_test_lmdb/data.mdb | 3 - datasets/mnist/caffe/mnist_test_lmdb/lock.mdb | 3 - .../mnist/caffe/mnist_train_lmdb/data.mdb | 3 - .../mnist/caffe/mnist_train_lmdb/lock.mdb | 3 - horovod/README.md | 35 -- horovod/keras/mnist.py | 116 ----- horovod/keras/mnist_advanced.py | 149 ------- horovod/keras/run_on_localMachine.sh | 8 - horovod/keras/submit_job_jureca.sh | 22 - horovod/keras/submit_job_juron.sh | 20 - horovod/keras/submit_job_juwels.sh | 22 - horovod/tensorflow/checkpoints/.gitkeep | 0 horovod/tensorflow/mnist.py | 159 ------- horovod/tensorflow/mnist_estimator.py | 214 --------- horovod/tensorflow/run_on_localMachine.sh | 8 - horovod/tensorflow/submit_job_jureca.sh | 22 - horovod/tensorflow/submit_job_juron.sh | 19 - horovod/tensorflow/submit_job_juwels.sh | 22 - horovod/tensorflow/synthetic_benchmark.py | 120 ----- horovod_data_distributed/README.md | 33 -- horovod_data_distributed/setup_juron.sh | 24 - horovod_data_distributed/submit_job_jureca.sh | 22 - horovod_data_distributed/submit_job_juron.sh | 28 -- horovod_data_distributed/submit_job_juwels.sh | 22 - keras/README.md | 13 - keras/mnist.py | 93 ---- keras/run_on_localMachine.sh | 4 - keras/submit_job_jureca.sh | 20 - keras/submit_job_juron.sh | 18 - keras/submit_job_juwels.sh | 20 - tensorflow/README.md | 21 +- .../keras => tensorflow}/checkpoints/.gitkeep | 0 tensorflow/jureca_job.sh | 24 + tensorflow/jusuf_job.sh | 24 + tensorflow/juwels_booster_job.sh | 24 + tensorflow/juwels_job.sh | 24 + {tensorflow2 => tensorflow}/keras_mnist.py | 2 +- tensorflow/mnist.py | 415 +++++------------- tensorflow/run_on_localMachine.sh | 4 - tensorflow/submit_job_jureca.sh | 19 - tensorflow/submit_job_juron.sh | 17 - tensorflow/submit_job_juwels.sh | 19 - tensorflow2/checkpoints/.gitkeep | 0 tensorflow2/mnist.py | 109 ----- training_data_distribution/README.md | 27 ++ .../jureca_job.sh | 5 +- training_data_distribution/jusuf_job.sh | 28 ++ .../juwels_booster_job.sh | 28 ++ .../juwels_job.sh | 2 +- .../mnist_data_distributed.py | 0 utils/data_utils.py | 10 +- 54 files changed, 439 insertions(+), 1950 deletions(-) delete mode 100644 datasets/mnist/caffe/mnist_test_lmdb/data.mdb delete mode 100644 datasets/mnist/caffe/mnist_test_lmdb/lock.mdb delete mode 100644 datasets/mnist/caffe/mnist_train_lmdb/data.mdb delete mode 100644 datasets/mnist/caffe/mnist_train_lmdb/lock.mdb delete mode 100644 horovod/README.md delete mode 100644 horovod/keras/mnist.py delete mode 100644 horovod/keras/mnist_advanced.py delete mode 100644 horovod/keras/run_on_localMachine.sh delete mode 100755 horovod/keras/submit_job_jureca.sh delete mode 100755 horovod/keras/submit_job_juron.sh delete mode 100755 horovod/keras/submit_job_juwels.sh delete mode 100644 horovod/tensorflow/checkpoints/.gitkeep delete mode 100644 horovod/tensorflow/mnist.py delete mode 100644 horovod/tensorflow/mnist_estimator.py delete mode 100644 horovod/tensorflow/run_on_localMachine.sh delete mode 100755 horovod/tensorflow/submit_job_jureca.sh delete mode 100644 horovod/tensorflow/submit_job_juron.sh delete mode 100755 horovod/tensorflow/submit_job_juwels.sh delete mode 100644 horovod/tensorflow/synthetic_benchmark.py delete mode 100644 horovod_data_distributed/README.md delete mode 100755 horovod_data_distributed/setup_juron.sh delete mode 100755 horovod_data_distributed/submit_job_jureca.sh delete mode 100755 horovod_data_distributed/submit_job_juron.sh delete mode 100755 
horovod_data_distributed/submit_job_juwels.sh delete mode 100644 keras/README.md delete mode 100644 keras/mnist.py delete mode 100644 keras/run_on_localMachine.sh delete mode 100755 keras/submit_job_jureca.sh delete mode 100644 keras/submit_job_juron.sh delete mode 100755 keras/submit_job_juwels.sh rename {horovod/keras => tensorflow}/checkpoints/.gitkeep (100%) create mode 100755 tensorflow/jureca_job.sh create mode 100755 tensorflow/jusuf_job.sh create mode 100755 tensorflow/juwels_booster_job.sh create mode 100755 tensorflow/juwels_job.sh rename {tensorflow2 => tensorflow}/keras_mnist.py (97%) delete mode 100644 tensorflow/run_on_localMachine.sh delete mode 100755 tensorflow/submit_job_jureca.sh delete mode 100644 tensorflow/submit_job_juron.sh delete mode 100755 tensorflow/submit_job_juwels.sh delete mode 100644 tensorflow2/checkpoints/.gitkeep delete mode 100644 tensorflow2/mnist.py create mode 100644 training_data_distribution/README.md rename tensorflow2/juwels_booster_job => training_data_distribution/jureca_job.sh (83%) create mode 100755 training_data_distribution/jusuf_job.sh create mode 100755 training_data_distribution/juwels_booster_job.sh rename horovod_data_distributed/juwels_booster_job => training_data_distribution/juwels_job.sh (95%) rename {horovod_data_distributed => training_data_distribution}/mnist_data_distributed.py (100%) diff --git a/.gitattributes b/.gitattributes index dbf6f0e..36df28f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,7 +1,3 @@ -datasets/mnist/caffe/mnist_test_lmdb/data.mdb filter=lfs diff=lfs merge=lfs -text -datasets/mnist/caffe/mnist_test_lmdb/lock.mdb filter=lfs diff=lfs merge=lfs -text -datasets/mnist/caffe/mnist_train_lmdb/data.mdb filter=lfs diff=lfs merge=lfs -text -datasets/mnist/caffe/mnist_train_lmdb/lock.mdb filter=lfs diff=lfs merge=lfs -text datasets/mnist/keras/mnist.npz filter=lfs diff=lfs merge=lfs -text datasets/mnist/pytorch/data/processed/training.pt filter=lfs diff=lfs merge=lfs -text datasets/mnist/pytorch/data/processed/test.pt filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 9c4d6d5..05043c3 100644 --- a/.gitignore +++ b/.gitignore @@ -118,3 +118,6 @@ mnist_convnet_model/ # Error and output files from the supercomputers *.er *.out + +# MacOS +.DS_Store \ No newline at end of file diff --git a/README.md b/README.md index 0990dfd..7ac59ff 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,46 @@ # Getting started with Deep Learning on Supercomputers -This repository is intended to serve as a tutorial for anyone interested in utilizing the supercomputers -available at the Jülich Supercomputing Center (JSC) for deep learning based projects. It is assumed that -the reader is proficient in one or more of the following frameworks: +This repository is intended to serve as a tutorial for anyone interested in +utilizing the supercomputers available at the Jülich Supercomputing Center (JSC) +for deep learning based projects. It is assumed that the reader is proficient in +the following frameworks: * [Tensorflow](https://www.tensorflow.org/) -* [Keras](https://keras.io/) * [Horovod](https://github.com/horovod/horovod) -* [Caffe](http://caffe.berkeleyvision.org/) (limited support) -**Note:** This tutorial is by no means intended as an introduction to deep learning, or to any of the -above mentioned frameworks. If you are interested in educational resources for beginners, please -visit [this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/wikis/Education) page. 
+**Note:** This tutorial is by no means intended as an introduction to deep +learning, or to any of the above mentioned frameworks. If you are interested in +educational resources for beginners, please visit +[this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/-/wikis/home) +page. ### Announcements -* **November 28, 2019:** Slides and code samples for the "Deep Learning on Supercomputers" talk given -as part of the [Introduction to the programming and usage of the supercomputer resources at Jülich](https://www.fz-juelich.de/SharedDocs/Termine/IAS/JSC/EN/courses/2019/supercomputer-2019-11.html?nn=944302) -course are now available in the `course_material` directory. -* **November 22, 2019:** Samples for Caffe are no longer supported on JURECA due to system-wide -MVAPICH2 module changes. -* **November 18, 2019:** The `horovod_data_distributed` directory has been added that contains code -samples to illustrate proper data-distributed training with Horovod, i.e., a distribution mechanism -where the training data is distributed instead of epochs. Further information is available in the -directory-local `README.md`. -* **September 02, 2019:** Even though PyTorch is available as a system-wide module on the JSC supercomputers, all PyTorch -examples have been removed from this tutorial. This is due to the fact that the tutorial -developers are not currently working with PyTorch, and are therefore not in a position to provide -support for PyTorch related issues. +* **April 26, 2021:** The tutorial has been updated to use Tensorflow2. Also, + code samples and datasets that are no longer relevant, e.g., those for Caffe, + have been removed. +* **November 28, 2019:** Slides and code samples for the "Deep Learning on + Supercomputers" talk given as part of the [Introduction to the programming + and usage of the supercomputer resources at Jülich]( + https://www.fz-juelich.de/SharedDocs/Termine/IAS/JSC/EN/courses/2019/supercomputer-2019-11.html?nn=944302) + course are now available in the `course_material` directory. +* **November 22, 2019:** Samples for Caffe are no longer supported on JURECA + due to system-wide MVAPICH2 module changes. +* **November 18, 2019:** The `horovod_data_distributed` directory has been + added that contains code samples to illustrate proper data-distributed + training with Horovod, i.e., a distribution mechanism where the training data + is distributed instead of epochs. Further information is available in the + directory-local `README.md`. +* **September 02, 2019:** Even though PyTorch is available as a system-wide + module on the JSC supercomputers, all PyTorch examples have been removed from + this tutorial. This is due to the fact that the tutorial developers are not + currently working with PyTorch, and are therefore not in a position to + provide support for PyTorch related issues. * **August 23, 2019:** - * Tensorflow and Keras examples (with and without Horovod) are now fully functional on JUWELS as well. - * Python 2 support has been removed from the tutorial for all frameworks except Caffe. + * Tensorflow and Keras examples (with and without Horovod) are now fully + functional on JUWELS as well. + * Python 2 support has been removed from the tutorial for all frameworks + except Caffe. # Table of contents <!-- TOC --> @@ -38,133 +48,97 @@ support for PyTorch related issues. 1. [A word regarding the code samples](#1-a-word-regarding-the-code-samples) 2. 
[Changes made to support loading of pre-downloaded datasets](#2-changes-made-to-support-loading-of-pre-downloaded-datasets) 3. [Applying for user accounts on supercomputers](#3-applying-for-user-accounts-on-supercomputers) - * [3.1. JURECA and JUWELS](#31-jureca-and-juwels) - * [3.2. JURON](#32-juron) 4. [Logging on to the supercomputers](#4-logging-on-to-the-supercomputers) - * [4.1. JURECA and JUWELS](#41-jureca-and-juwels) - * [4.2. JURON](#42-juron) 5. [Cloning the repository](#5-cloning-the-repository) - * [5.1. JURECA and JUWELS](#51-jureca-and-juwels) - * [5.2. JURON](#52-juron) 6. [Running a sample](#6-running-a-sample) - * [6.1. JURECA and JUWELS](#61-jureca-and-juwels) - * [6.2. JURON](#62-juron) -7. [Python 2 support](#7-python-2-support) -8. [Distributed training](#8-distributed-training) -9. [Credits](#9-credits) +7. [Distributed training](#8-distributed-training) +8. [Credits](#9-credits) <!-- /TOC --> ## 1. A word regarding the code samples -Samples for each framework are available in the correspondingly named directory. Each such -directory typically contains at least one code sample, which trains a simple artificial neural -network on the canonical MNIST hand-written digit classification task. Moreover, job submission -scripts are included for all the supercomputers on which this tutorial has been tested. The job -scripts will hopefully make it easier to figure out which modules to load. Finally, -a `README.md` file contains further information about the contents of the directory. +Samples for each framework are available in the correspondingly named directory. +Each such directory typically contains at least one code sample, which trains a +simple artificial neural network on the canonical MNIST hand-written digit +classification task. Moreover, job submission scripts are included for all the +supercomputers on which this tutorial has been tested. The job scripts will +hopefully make it easier to figure out which modules to load. Finally, a +`README.md` file contains further information about the contents of the +directory. -**Disclaimer:** Neither are the samples intended to serve as examples of optimized code, nor do these -represent programming best practices. +**Disclaimer:** Neither are the samples intended to serve as examples of +optimized code, nor do these represent programming best practices. ## 2. Changes made to support loading of pre-downloaded datasets -It is worth mentioning that all the code samples were taken from the corresponding framework's -official samples/tutorials repository, as practitioners are likely familiar with these (links -to the original code samples are included in the directory-local `README.md`). However, the -original examples are designed to automatically download the required dataset in a -framework-defined directory. This is not a feasible option while working with supercomputers as compute nodes -do not have access to the Internet. Therefore, the samples have been slightly modified to load data from -the `datasets` directory included in this repository; specific code changes, at least for now, -have been marked by comments prefixed with the `[HPCNS]` tag. For more information see the `README.md` -available in the `datasets` directory. +It is worth mentioning that all the code samples were taken from the +corresponding framework's official samples/tutorials repository, as +practitioners are likely familiar with these (links to the original code samples +are included in the directory-local `README.md`). 
However, the original examples +are designed to automatically download the required dataset in a +framework-defined directory. This is not a feasible option while working with +supercomputers as compute nodes do not have access to the Internet. Therefore, +the samples have been slightly modified to load data from the `datasets` +directory included in this repository; specific code changes, at least for now, +have been marked by comments prefixed with the `[HPCNS]` tag. For more +information see the `README.md` available in the `datasets` directory. ## 3. Applying for user accounts on supercomputers -In case you do not already have an account on your supercomputer of interest, please take a look at the -instructions provided in the following sub-sections. - -### 3.1 JURECA and JUWELS - -For more information on getting accounts on JURECA and JUWELS, click -[here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/ComputingTime/computingTime_node.html). - -### 3.2 JURON - -To get a user account on JURON, please follow the steps below: - -1. Write an email to [Dirk Pleiter](http://www.fz-juelich.de/SharedDocs/Personen/IAS/JSC/EN/staff/pleiter_d.html?nn=362224), -in which please introduce yourself and mention why you need the account. -2. Apply for the account via the [JuDoor](https://dspserv.zam.kfa-juelich.de/judoor/login) portal -(more information about JuDoor is available [here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/JuDoor.html?nn=945700)). -If your work is related to the Human Brain Project (HBP), please join the `PCP0` and `CPCP0` projects. -Otherwise please join the `PADC` and `CPADC` projects. +In case you do not already have an account on your supercomputer of interest, +please refer to the instructions available [here]( +http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/ComputingTime/computingTime_node.html), +as you will need to apply for computing time before an account is created for you. ## 4. Logging on to the supercomputers -**Note:** From here on it is assumed that you already have an account on your required supercomputer. +**Note:** From here on it is assumed that you already have an account on your +required supercomputer. -### 4.1 JURECA and JUWELS +**Note:** This tutorial is supported for the following supercomputers: JURECA, +JUWELS, JUWELS Booster, and JUSUF. Following are the steps required to login (more information: [JURECA](https://apps.fz-juelich.de/jsc/hps/jureca/access.html#access), -[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/access.html#access)). +[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/access.html#access), +[JUSUF](https://apps.fz-juelich.de/jsc/hps/jusuf/cluster/access.html)). + +For the purpose of this tutorial, we will assume that our system of interest is +JURECA. If you intend to use a different system, you can simply replace the +system name in the commands below; the procedure is precisely the same for all +machines. -1. Use SSH to login. Use one of the following commands, depending on your target system: +1. Use SSH to login: - `ssh <username>@jureca.fz-juelich.de` or `ssh <username>@juwels.fz-juelich.de` + `ssh -i ~/.ssh/<keyfile> <username>@jureca.fz-juelich.de` 2. Upon successful login, activate your project environment: `jutil env activate -p <name of compute project> -A <name of budget>` - **Note:** To view a list of all project and budget names available to you, please use the following command: - `jutil user projects -o columns`. 
Each name under the column titled "project" has a corresponding type under the - column titled "project-type". All projects with "project-type" "C" are compute projects, and - can be used in the `<name of compute project>` field for the command above. The `<name of budget>` field should then - contain the corresponding name under the "budgets" column. Please click [here]( + **Note:** To view a list of all project and budget names available to you, + please use the following command: `jutil user projects -o columns`. Each + name under the column titled "project" has a corresponding type under the + column titled "project-type". All projects with "project-type" "C" are + compute projects, and can be used in the `<name of compute project>` field + for the command above. The `<name of budget>` field should then contain the + corresponding name under the "budgets" column. Please click [here]( http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/NewUsageModel_node.html) for more information. 3. Change to the project directory: `cd $PROJECT` -You should be in your project directory at this point. As the project directory is shared with other project -members, it is recommended to create a new directory with your username, and change to that directory. If -you'd like to clone this repository elsewhere, please change to the directory of your choice. - -### 4.2 JURON - -Following are the steps required to login. - -1. Use SSH to login: - - `ssh <username>@juron.fz-juelich.de` -2. Upon successful login, activate your project environment (more information -[here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/NewUsageModel_node.html)). - - `jutil env activate -p <name of compute project>` - - The `<name of compute project>` can be either `CPCP0` or `CPADC`, depending on whether you are a member - of `CPCP0` or `CPADC` (to view a list of all project names available to you, please use the following - command: `jutil user projects -o columns`). Note that as opposed to the corresponding section on JURECA, - the `<name of budget>` is not included. This is because the `CPCP0` and `CPADC` projects do not support - accounting. -3. Change to the project directory: - - `cd $PROJECT` - -You should be in your project directory at this point. As the `CPCP0` and `CPADC` project directories -are shared amongst many users from different institutes and organizations, it is recommended to create -a personal directory (named after your username) withing the project directory. You can then use your -personal directory for all your work, including cloning this tutorial. +You should be in your project directory at this point. As the project directory +is shared with other project members, it is recommended to create a new +directory with your username, and change to that directory. If you'd like to +clone this repository elsewhere, please change to the directory of your choice. ## 5. Cloning the repository -In order to store the datasets within the repository, we use Git LFS. This makes cloning the -repository a little bit different. Please find below the instructions on how to clone on different -systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-juelich.de/lfs/). - -### 5.1 JURECA and JUWELS +In order to store the datasets within the repository, we use Git LFS. This makes +cloning the repository slightly different. Please find below the instructions +on how to clone the repository. 
To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-juelich.de/lfs/). 1. Load the Git LFS module: @@ -176,107 +150,66 @@ systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-ju `git lfs clone https://gitlab.version.fz-juelich.de/hpc4ns/dl_on_supercomputers.git` -### 5.2 JURON - -The process is simpler on JURON. You can simply clone the repository along with the datasets using -the following command: - - git lfs clone https://gitlab.version.fz-juelich.de/hpc4ns/dl_on_supercomputers.git - ## 6. Running a sample -Let us consider a scenario where you would like to run the `mnist.py` sample available in the `keras` -directory. This sample trains a CNN on MNIST using Keras on a single GPU. The following sub-sections list -the steps required for different supercomputers. - -### 6.1 JURECA and JUWELS +Let us consider a scenario where you would like to run the `keras_mnist.py` +sample available in the `tensorflow` directory. This sample trains a CNN on +MNIST using Tensorflow's Keras API. Following steps can be used to run the +sample: 1. Change directory to the repository root: `cd dl_on_supercomputers` -2. Change to the keras sub-directory: +2. Change to the tensorflow sub-directory: - `cd keras` + `cd tensorflow` 3. Submit the job to run the sample: - `sbatch submit_job_jureca.sh` or `sbatch submit_job_juwels.sh` + `sbatch jureca_job.sh` -That's it; this is all you need for job submission. If you'd like to receive email notifications -regarding the status of the job, add the following statement to the "SLURM job configuration" -block in the `submit_job_jureca.sh` (or `submit_job_juwels.sh`) script (replace `<your email address here>` with your -email address). +That's it; this is all you need for job submission. If you'd like to receive +email notifications regarding the status of the job, add the following statement +to the "SLURM job configuration" block in the `jureca_job.sh`script (replace +`<your email address here>` with your email address). #SBATCH --mail-user=<your email address here> -Output from the job is available in the `error` and `output` files, as specified in the job -configuration. - -**Note:** In the job submission scripts, the `--partition` value is set to `develgpus`, as jobs -are often (but not always) scheduled faster on this partition than the `gpus` partition. However, -resources in `develgpus` are limited -(as described in: [JURECA](https://apps.fz-juelich.de/jsc/hps/jureca/quickintro.html#available-partitions), -[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/quickintro.html#available-partitions)). Therefore, -it is highly recommended that users familiarize themselves with the limitations, and use the `gpus` -partition for all production use, as well as when developing/testing with more resources than are -available on the `develgpus` partition. - -### 6.2 JURON - -1. Change directory to the repository root: - - `cd dl_on_supercomputers` -2. Change to the keras sub-directory: - - `cd keras` -3. Submit the job to run the sample: - - `bsub < submit_job_juron.sh` - -Please note that unlike JURECA and JUWELS, JURON uses LSF for job submission, which is why a different -syntax is required for job configuration and submission. Moreover, email notifications are not -supported on JURON. For more information on how to use LSF on JURON, use the following command: - - man 7 juron-lsf - -Output from the job is available in the `error` and `output` files, as specified in the job -configuration. - -## 7. 
Python 2 support - -As the official support for Python 2 will be be discontinued in 2020, we decided to encourage our -users to make the switch to Python 3 already. This also enables us to provide better support for -Python 3 based modules, as we no longer have to spend time maintaining Python 2 modules. - -The only exception is Caffe, as on JURECA it is available with Python 2 only. Please note however that -other than on JURON, Caffe is only available in the JURECA Stage 2018b, i.e., one of the previous stages. -We do not intend to provide support for Caffe from Stage 2019a and onward. This is due to the fact that -Caffe is no longer being developed. - -## 8. Distributed training - -[Horovod](https://github.com/horovod/horovod) provides a simple and efficient solution for -training artificial neural networks on multiple GPUs across multiple nodes in a cluster. It can -be used with Tensorflow and Keras (some other frameworks are supported as well, but -not Caffe). In this repository, the `horovod` directory contains further sub-directories; one -for each compatible framework that has been tested. E.g., there is a `keras` sub-directory that -contains samples that utilize distributed training with Keras and Horovod (more information is available -in the directory-local `README.md`). - -Please note that Horovod currently only supports a distribution strategy where the entire model is -replicated on every GPU. It is the data that is distributed across the GPUs. If you are interested -in model-parallel training, where the model itself can be split and distributed, a different -solution is required. We hope to add a sample for model-parallel training at a later time. - -Caffe does not support multi-node training. However, it has built-in support for [multi-GPU -training](https://github.com/BVLC/caffe/blob/master/docs/multigpu.md) on a single node (only -via the C/C++ interface). The `mnist_cmd` sample in the `caffe` directory contains the job -script that can be used to train the model on multiple GPUs. Please see the -directory-local `README.md` for further information. - -## 9. Credits +Output from the job is available in the `error` and `output` files, as specified +in the job configuration. + +**Note:** Please note that the job scripts for all systems are almost exactly +the same, except for the `--partition` value. This is because partition names +vary from system to system. Nevertheless, for each system, this tutorial uses +the corresponding development partition, e.g., `dc-gpu-devel` on JURECA. This is +because jobs are often (but not always) scheduled faster on this partition than +the production partition. However, resources in the development partitions are +limited (as described in: [JURECA](https://apps.fz-juelich.de/jsc/hps/jureca/quickintro.html#available-partitions), +[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/quickintro.html#available-partitions), +and [JUSUF](https://apps.fz-juelich.de/jsc/hps/jusuf/cluster/quickintro.html#quick-avail-partitions)). +Therefore, it is highly recommended that users familiarize themselves with the +limitations, and use the production partition for all production use, as well as +when developing/testing with more resources than are available on the +development partition. + +## 7. Distributed training + +[Horovod](https://github.com/horovod/horovod) provides a simple and efficient +solution for training artificial neural networks on multiple GPUs across +multiple nodes in a cluster. 
It can be used with Tensorflow (some +other frameworks are supported as well). Since this tutorial primarily concerns +distributed training, only code samples that utilize Horovod are included. + +Please note that Horovod currently only supports a distribution strategy where +the entire model is replicated on every GPU. It is the data that is distributed +across the GPUs. If you are interested in model-parallel training, where the +model itself can be split and distributed, a different solution is required. We +hope to add a sample for model-parallel training at a later time. + +## 8. Credits * **Created by:** Fahad Khalid (SLNS/HPCNS, JSC) * **Installation of modules on JURON:** Andreas Herten (HPCNS, JSC) -* **Installation of modules on JURECA:** Damian Alvarez (JSC), Rajalekshmi Deepu (SLNS/HPCNS, JSC) -* **Review/suggestions/testing:** Kai Krajsek (SLNS/HPCNS, JSC), Tabea Kirchner (SLNS/HPCNS, JSC), -Susanne Wenzel (INM-1) +* **Installation of modules on JURECA:** Damian Alvarez (JSC), Rajalekshmi + Deepu (SLNS/HPCNS, JSC) +* **Review/suggestions/testing:** Kai Krajsek (SLNS/HPCNS, JSC), Tabea + Kirchner (SLNS/HPCNS, JSC), Susanne Wenzel (INM-1) diff --git a/datasets/mnist/caffe/mnist_test_lmdb/data.mdb b/datasets/mnist/caffe/mnist_test_lmdb/data.mdb deleted file mode 100644 index 760ab42..0000000 --- a/datasets/mnist/caffe/mnist_test_lmdb/data.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a70974534a27eaa5dc42638940ad311981b0259f1f089ea46c695bfd9c1862da -size 8749056 diff --git a/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb b/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb deleted file mode 100644 index eda8c00..0000000 --- a/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e0667461174c505913de02429312bcbd9c6cab774b4495c7a2bbe7061ce3ccea -size 8192 diff --git a/datasets/mnist/caffe/mnist_train_lmdb/data.mdb b/datasets/mnist/caffe/mnist_train_lmdb/data.mdb deleted file mode 100644 index 4432b2e..0000000 --- a/datasets/mnist/caffe/mnist_train_lmdb/data.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3eea94f5e1ea128f16ff0e18f9e287cc2676a54a3218105c525e602f375666c1 -size 50757632 diff --git a/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb b/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb deleted file mode 100644 index d961b47..0000000 --- a/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33569d983c9d6d527cd7d3202c31a2a7395b254fb8076f59b84ecaecb9207906 -size 8192 diff --git a/horovod/README.md b/horovod/README.md deleted file mode 100644 index 3d63a23..0000000 --- a/horovod/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Notes - -All source code samples were taken from the Horovod examples repository -[here](https://github.com/uber/horovod/tree/master/examples) -(last checked: September 02, 2019). The samples that work with MNIST data have been -slightly modified. Our changes are limited to, - -* The data loading mechanism -* A bit of code cleanup -* A few additional comments pertaining to our custom data loading mechanism - -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. All -statements that demonstrate the use of Horovod follow a comment beginning with -`[Horovod]` (as added by Horovod developers). - -## Keras samples - -The following Keras samples are included: - -1. 
`mnist.py`: A simple MNIST processing example with only the essential Horovod code -for distributed training. -2. `mnist_advanced.py`: This sample is primarily the same as `mnist.py`. However, a -few more advanced Horovod features are used. - -## Tensorflow samples - -The following Tensorflow samples are included: - -1. `mnist.py`: Demonstrates distributed training using Horovod with the low-level -Tensorflow API. A simple convolutional neural network is trained on the MNIST dataset. -2. `mnist_estimator.py`: Demonstrates distributed training using Horovod with the -high-level Estimator API in Tensorflow. A simple convolutional neural network is -trained on the MNIST dataset. -3. `synthetic_benchmark.py`: A simple benchmark that can be used to measure performance -of Tensorflow with Horovod without using any external dataset. diff --git a/horovod/keras/mnist.py b/horovod/keras/mnist.py deleted file mode 100644 index 0c46a77..0000000 --- a/horovod/keras/mnist.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). - -from __future__ import print_function -import os -import sys -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K -import math -import tensorflow as tf -import horovod.keras as hvd - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -config.gpu_options.allow_growth = True -config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) - -batch_size = 128 -num_classes = 10 - -# Horovod: adjust number of epochs based on number of GPUs. 
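# Each worker iterates over the full (unsharded) training set every epoch and
# gradients are averaged across workers, so dividing the 16 epochs by
# hvd.size() keeps the total amount of training roughly equivalent to 16
# single-GPU epochs.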
-epochs = int(math.ceil(16.0 / hvd.size())) - -# Input image dimensions -img_rows, img_cols = 28, 28 - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -# [HPCNS] Load MNIST dataset -(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# Convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) - -# Horovod: adjust learning rate based on number of GPUs. -opt = keras.optimizers.Adadelta(1.0 * hvd.size()) - -# Horovod: add Horovod Distributed Optimizer. -opt = hvd.DistributedOptimizer(opt) - -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=opt, - metrics=['accuracy']) - -callbacks = [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - hvd.callbacks.BroadcastGlobalVariablesCallback(0), -] - -# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. -if hvd.rank() == 0: - callbacks.append(keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) - -model.fit(x_train, y_train, - batch_size=batch_size, - callbacks=callbacks, - epochs=epochs, - verbose=1 if hvd.rank() == 0 else 0, - validation_data=(x_test, y_test)) -score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) diff --git a/horovod/keras/mnist_advanced.py b/horovod/keras/mnist_advanced.py deleted file mode 100644 index ba60b6d..0000000 --- a/horovod/keras/mnist_advanced.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/keras_mnist_advanced.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). 
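# Overview: compared to the basic Keras sample above, this variant additionally
# demonstrates averaging metrics across workers (MetricAverageCallback),
# learning-rate warmup over the first epochs (LearningRateWarmupCallback),
# ReduceLROnPlateau, and training via fit_generator on augmented data, with
# each worker randomly sampling its own share of the batches.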
- - -from __future__ import print_function -import os -import sys -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras.preprocessing.image import ImageDataGenerator -from keras import backend as K -import tensorflow as tf -import horovod.keras as hvd - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -config.gpu_options.allow_growth = True -config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) - -batch_size = 128 -num_classes = 10 - -# Enough epochs to demonstrate learning rate warmup and the reduction of -# learning rate when training plateaues. -epochs = 16 - -# Input image dimensions -img_rows, img_cols = 28, 28 - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -# [HPCNS] Load MNIST dataset. -(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) - -# Determine how many batches are there in train and test sets -train_batches = len(x_train) // batch_size -test_batches = len(x_test) // batch_size - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# Convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) - -# Horovod: adjust learning rate based on number of GPUs. -opt = keras.optimizers.Adadelta(lr=1.0 * hvd.size()) - -# Horovod: add Horovod Distributed Optimizer. -opt = hvd.DistributedOptimizer(opt) - -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=opt, - metrics=['accuracy']) - -callbacks = [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - hvd.callbacks.BroadcastGlobalVariablesCallback(0), - - # Horovod: average metrics among workers at the end of every epoch. 
- # - # Note: This callback must be in the list before the ReduceLROnPlateau, - # TensorBoard or other metrics-based callbacks. - hvd.callbacks.MetricAverageCallback(), - - # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final - # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during - # the first five epochs. See https://arxiv.org/abs/1706.02677 for details. - hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1), - - # Reduce the learning rate if training plateaues. - keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1), -] - -# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. -if hvd.rank() == 0: - callbacks.append(keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) - -# Set up ImageDataGenerators to do data augmentation for the training images. -train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3, - height_shift_range=0.08, zoom_range=0.08) -test_gen = ImageDataGenerator() - -# Train the model. -# Horovod: the training will randomly sample 1 / N batches of training data and -# 3 / N batches of validation data on every worker, where N is the number of workers. -# Over-sampling of validation data helps to increase probability that every validation -# example will be evaluated. -model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size), - steps_per_epoch=train_batches // hvd.size(), - callbacks=callbacks, - epochs=epochs, - verbose=1, - validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size), - validation_steps=3 * test_batches // hvd.size()) - -# Evaluate the model on the full data set. -score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) diff --git a/horovod/keras/run_on_localMachine.sh b/horovod/keras/run_on_localMachine.sh deleted file mode 100644 index 9c9afb4..0000000 --- a/horovod/keras/run_on_localMachine.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -mpirun -np 1 -H localhost:1 \ - -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ - -mca pml ob1 -mca btl ^openib \ - python -u mnist.py diff --git a/horovod/keras/submit_job_jureca.sh b/horovod/keras/submit_job_jureca.sh deleted file mode 100755 index 3591bba..0000000 --- a/horovod/keras/submit_job_jureca.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_KERAS_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/keras/submit_job_juron.sh b/horovod/keras/submit_job_juron.sh deleted file mode 100755 index 0318278..0000000 --- a/horovod/keras/submit_job_juron.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 4 -#BSUB -R "span[ptile=2]" -#BSUB -gpu "num=2" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J HOROVOD_KERAS_MNIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module 
load horovod/0.15.2 -module load keras/2.2.4 - -# Run the program -mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \ - -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/keras/submit_job_juwels.sh b/horovod/keras/submit_job_juwels.sh deleted file mode 100755 index 3591bba..0000000 --- a/horovod/keras/submit_job_juwels.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_KERAS_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/tensorflow/checkpoints/.gitkeep b/horovod/tensorflow/checkpoints/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/horovod/tensorflow/mnist.py b/horovod/tensorflow/mnist.py deleted file mode 100644 index 3c780ac..0000000 --- a/horovod/tensorflow/mnist.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). - -import os -import sys -import tensorflow as tf -import horovod.tensorflow as hvd -import numpy as np -import shutil - -from tensorflow import keras - -layers = tf.layers - -tf.logging.set_verbosity(tf.logging.INFO) - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - - -def conv_model(feature, target, mode): - """2-layer convolution model.""" - # Convert the target to a one-hot tensor of shape (batch_size, 10) and - # with a on-value of 1 for each one-hot vector of length 10. - target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0) - - # Reshape feature to 4d tensor with 2nd and 3rd dimensions being - # image width and height final dimension being the number of color channels. - feature = tf.reshape(feature, [-1, 28, 28, 1]) - - # First conv layer will compute 32 features for each 5x5 patch - with tf.variable_scope('conv_layer1'): - h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5], - activation=tf.nn.relu, padding="SAME") - h_pool1 = tf.nn.max_pool( - h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') - - # Second conv layer will compute 64 features for each 5x5 patch. - with tf.variable_scope('conv_layer2'): - h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5], - activation=tf.nn.relu, padding="SAME") - h_pool2 = tf.nn.max_pool( - h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') - # reshape tensor into a batch of vectors - h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) - - # Densely connected layer with 1024 neurons. 
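# (The flattened size of 7 * 7 * 64 comes from two 2x2/stride-2 max-pools
# reducing the 28x28 input to 7x7, with 64 feature maps produced by the second
# convolution; the dropout below is active only in TRAIN mode.)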
- h_fc1 = layers.dropout( - layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu), - rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN) - - # Compute logits (1 per class) and compute loss. - logits = layers.dense(h_fc1, 10, activation=None) - loss = tf.losses.softmax_cross_entropy(target, logits) - - return tf.argmax(logits, 1), loss - - -def train_input_generator(x_train, y_train, batch_size=64): - assert len(x_train) == len(y_train) - while True: - p = np.random.permutation(len(x_train)) - x_train, y_train = x_train[p], y_train[p] - index = 0 - while index <= len(x_train) - batch_size: - yield x_train[index:index + batch_size], \ - y_train[index:index + batch_size], - index += batch_size - - -def main(_): - # Horovod: initialize Horovod. - hvd.init() - - # [HPCNS] Fully qualified dataset file name - dataset_file = os.path.join(data_dir, data_file) - - # [HPCNS] Dataset filename for this rank - dataset_for_rank = os.path.join(data_dir, 'MNIST-data-%d' % hvd.rank()) - - # [HPCNS] Make a copy of the dataset for this rank - shutil.copyfile(dataset_file, dataset_for_rank) - - # [HPCNS] Load MNIST dataset - (x_train, y_train), (x_test, y_test) = \ - keras.datasets.mnist.load_data(dataset_for_rank) - - # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it - # into (-1, 784) to feed into our network. Also, need to normalize the - # features between 0 and 1. - x_train = np.reshape(x_train, (-1, 784)) / 255.0 - x_test = np.reshape(x_test, (-1, 784)) / 255.0 - - # Build model... - with tf.name_scope('input'): - image = tf.placeholder(tf.float32, [None, 784], name='image') - label = tf.placeholder(tf.float32, [None], name='label') - predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN) - - # Horovod: adjust learning rate based on number of GPUs. - opt = tf.train.AdamOptimizer(0.001 * hvd.size()) - - # Horovod: add Horovod Distributed Optimizer. - opt = hvd.DistributedOptimizer(opt) - - global_step = tf.train.get_or_create_global_step() - train_op = opt.minimize(loss, global_step=global_step) - - hooks = [ - # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states - # from rank 0 to all other processes. This is necessary to ensure consistent - # initialization of all workers when training is started with random weights - # or restored from a checkpoint. - hvd.BroadcastGlobalVariablesHook(0), - - # Horovod: adjust number of steps based on number of GPUs. - tf.train.StopAtStepHook(last_step=20000 // hvd.size()), - - tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss}, - every_n_iter=10), - ] - - # Horovod: pin GPU to be used to process local rank (one GPU per process) - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - config.gpu_options.visible_device_list = str(hvd.local_rank()) - - # Horovod: save checkpoints only on worker 0 to prevent other workers from - # corrupting them. - checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None - training_batch_generator = train_input_generator(x_train, - y_train, batch_size=100) - # The MonitoredTrainingSession takes care of session initialization, - # restoring from a checkpoint, saving to a checkpoint, and closing when done - # or an error occurs. - with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, - hooks=hooks, - config=config) as mon_sess: - while not mon_sess.should_stop(): - # Run a training step synchronously. 
- image_, label_ = next(training_batch_generator) - mon_sess.run(train_op, feed_dict={image: image_, label: label_}) - - # [HPCNS] Remove the copied dataset - os.remove(dataset_for_rank) - - -if __name__ == "__main__": - tf.app.run() diff --git a/horovod/tensorflow/mnist_estimator.py b/horovod/tensorflow/mnist_estimator.py deleted file mode 100644 index 792c057..0000000 --- a/horovod/tensorflow/mnist_estimator.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist_estimator.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). - -"""Convolutional Neural Network Estimator for MNIST, built with tf.layers.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import shutil -import numpy as np -import tensorflow as tf -import horovod.tensorflow as hvd - -from tensorflow import keras - -tf.logging.set_verbosity(tf.logging.INFO) - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - - -def cnn_model_fn(features, labels, mode): - """Model function for CNN.""" - # Input Layer - # Reshape X to 4-D tensor: [batch_size, width, height, channels] - # MNIST images are 28x28 pixels, and have one color channel - input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) - - # Convolutional Layer #1 - # Computes 32 features using a 5x5 filter with ReLU activation. - # Padding is added to preserve width and height. - # Input Tensor Shape: [batch_size, 28, 28, 1] - # Output Tensor Shape: [batch_size, 28, 28, 32] - conv1 = tf.layers.conv2d( - inputs=input_layer, - filters=32, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #1 - # First max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 28, 28, 32] - # Output Tensor Shape: [batch_size, 14, 14, 32] - pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) - - # Convolutional Layer #2 - # Computes 64 features using a 5x5 filter. - # Padding is added to preserve width and height. 
- # Input Tensor Shape: [batch_size, 14, 14, 32] - # Output Tensor Shape: [batch_size, 14, 14, 64] - conv2 = tf.layers.conv2d( - inputs=pool1, - filters=64, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #2 - # Second max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 14, 14, 64] - # Output Tensor Shape: [batch_size, 7, 7, 64] - pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) - - # Flatten tensor into a batch of vectors - # Input Tensor Shape: [batch_size, 7, 7, 64] - # Output Tensor Shape: [batch_size, 7 * 7 * 64] - pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) - - # Dense Layer - # Densely connected layer with 1024 neurons - # Input Tensor Shape: [batch_size, 7 * 7 * 64] - # Output Tensor Shape: [batch_size, 1024] - dense = tf.layers.dense(inputs=pool2_flat, units=1024, - activation=tf.nn.relu) - - # Add dropout operation; 0.6 probability that element will be kept - dropout = tf.layers.dropout( - inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) - - # Logits layer - # Input Tensor Shape: [batch_size, 1024] - # Output Tensor Shape: [batch_size, 10] - logits = tf.layers.dense(inputs=dropout, units=10) - - predictions = { - # Generate predictions (for PREDICT and EVAL mode) - "classes": tf.argmax(input=logits, axis=1), - # Add `softmax_tensor` to the graph. It is used for PREDICT and by the - # `logging_hook`. - "probabilities": tf.nn.softmax(logits, name="softmax_tensor") - } - if mode == tf.estimator.ModeKeys.PREDICT: - return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) - - # Calculate Loss (for both TRAIN and EVAL modes) - onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10) - loss = tf.losses.softmax_cross_entropy( - onehot_labels=onehot_labels, logits=logits) - - # Configure the Training Op (for TRAIN mode) - if mode == tf.estimator.ModeKeys.TRAIN: - # Horovod: scale learning rate by the number of workers. - optimizer = tf.train.MomentumOptimizer( - learning_rate=0.001 * hvd.size(), momentum=0.9) - - # Horovod: add Horovod Distributed Optimizer. - optimizer = hvd.DistributedOptimizer(optimizer) - - train_op = optimizer.minimize( - loss=loss, - global_step=tf.train.get_global_step()) - return tf.estimator.EstimatorSpec(mode=mode, loss=loss, - train_op=train_op) - - # Add evaluation metrics (for EVAL mode) - eval_metric_ops = { - "accuracy": tf.metrics.accuracy( - labels=labels, predictions=predictions["classes"])} - return tf.estimator.EstimatorSpec( - mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) - - -def main(unused_argv): - # Horovod: initialize Horovod. - hvd.init() - - # [HPCNS] Fully qualified dataset file name - dataset_file = os.path.join(data_dir, data_file) - - # [HPCNS] Dataset filename for this rank - dataset_for_rank = os.path.join(data_dir, 'MNIST-data-%d' % hvd.rank()) - - # [HPCNS] Make a copy of the dataset for this rank - shutil.copyfile(dataset_file, dataset_for_rank) - - # [HPCNS] Load MNIST dataset - (train_data, train_labels), (eval_data, eval_labels) = \ - keras.datasets.mnist.load_data(dataset_for_rank) - - # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it - # into (-1, 784) to feed into our network. Also, need to normalize the - # features between 0 and 1. 
- train_data = np.reshape(train_data, (-1, 784)) / 255.0 - eval_data = np.reshape(eval_data, (-1, 784)) / 255.0 - - # Horovod: pin GPU to be used to process local rank (one GPU per process) - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - config.gpu_options.visible_device_list = str(hvd.local_rank()) - - # Horovod: save checkpoints only on worker 0 to prevent other workers from - # corrupting them. - model_dir = 'checkpoints/mnist_convnet_model' if hvd.rank() == 0 else None - - # Create the Estimator - mnist_classifier = tf.estimator.Estimator( - model_fn=cnn_model_fn, model_dir=model_dir, - config=tf.estimator.RunConfig(session_config=config)) - - # Set up logging for predictions - # Log the values in the "Softmax" tensor with label "probabilities" - tensors_to_log = {"probabilities": "softmax_tensor"} - logging_hook = tf.train.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=500) - - # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from - # rank 0 to all other processes. This is necessary to ensure consistent - # initialization of all workers when training is started with random weights or - # restored from a checkpoint. - bcast_hook = hvd.BroadcastGlobalVariablesHook(0) - - # Train the model - train_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": train_data}, - y=train_labels, - batch_size=100, - num_epochs=None, - shuffle=True) - - # Horovod: adjust number of steps based on number of GPUs. - mnist_classifier.train( - input_fn=train_input_fn, - steps=500 // hvd.size(), - hooks=[logging_hook, bcast_hook]) - - # Evaluate the model and print results - eval_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": eval_data}, - y=eval_labels, - num_epochs=1, - shuffle=False) - eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) - print(eval_results) - - # [HPCNS] Remove the copied dataset - os.remove(dataset_for_rank) - - -if __name__ == "__main__": - tf.app.run() diff --git a/horovod/tensorflow/run_on_localMachine.sh b/horovod/tensorflow/run_on_localMachine.sh deleted file mode 100644 index 9c9afb4..0000000 --- a/horovod/tensorflow/run_on_localMachine.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -mpirun -np 1 -H localhost:1 \ - -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ - -mca pml ob1 -mca btl ^openib \ - python -u mnist.py diff --git a/horovod/tensorflow/submit_job_jureca.sh b/horovod/tensorflow/submit_job_jureca.sh deleted file mode 100755 index fd12487..0000000 --- a/horovod/tensorflow/submit_job_jureca.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_TFLOW_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/tensorflow/submit_job_juron.sh b/horovod/tensorflow/submit_job_juron.sh deleted file mode 100644 index 0107547..0000000 --- a/horovod/tensorflow/submit_job_juron.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 4 -#BSUB -R "span[ptile=2]" -#BSUB -gpu "num=2" -#BSUB -e 
"error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J HOROVOD_TFLOW_MNIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module load horovod/0.15.2 - -# Run the program -mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \ - -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/tensorflow/submit_job_juwels.sh b/horovod/tensorflow/submit_job_juwels.sh deleted file mode 100755 index fd12487..0000000 --- a/horovod/tensorflow/submit_job_juwels.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_TFLOW_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/tensorflow/synthetic_benchmark.py b/horovod/tensorflow/synthetic_benchmark.py deleted file mode 100644 index ee401a5..0000000 --- a/horovod/tensorflow/synthetic_benchmark.py +++ /dev/null @@ -1,120 +0,0 @@ -from __future__ import absolute_import, division, print_function - -import argparse -import os -import numpy as np -import timeit - -import tensorflow as tf -import horovod.tensorflow as hvd -from tensorflow.keras import applications - -# Benchmark settings -parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') - -parser.add_argument('--model', type=str, default='ResNet50', - help='model to benchmark') -parser.add_argument('--batch-size', type=int, default=32, - help='input batch size') - -parser.add_argument('--num-warmup-batches', type=int, default=10, - help='number of warm-up batches that don\'t count towards benchmark') -parser.add_argument('--num-batches-per-iter', type=int, default=10, - help='number of batches per benchmark iteration') -parser.add_argument('--num-iters', type=int, default=10, - help='number of benchmark iterations') - -parser.add_argument('--eager', action='store_true', default=False, - help='enables eager execution') -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - -args = parser.parse_args() -args.cuda = not args.no_cuda - -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -if args.cuda: - config.gpu_options.allow_growth = True - config.gpu_options.visible_device_list = str(hvd.local_rank()) -else: - os.environ["CUDA_VISIBLE_DEVICES"] = "-1" - config.gpu_options.allow_growth = False - config.gpu_options.visible_device_list = '' - -if args.eager: - tf.enable_eager_execution(config) - -# Set up standard model. -model = getattr(applications, args.model)(weights=None) - -opt = tf.train.GradientDescentOptimizer(0.01) - -# Horovod: (optional) compression algorithm. -compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - -# Horovod: wrap optimizer with DistributedOptimizer. 
-opt = hvd.DistributedOptimizer(opt, compression=compression) - -init = tf.global_variables_initializer() -bcast_op = hvd.broadcast_global_variables(0) - -data = tf.random_uniform([args.batch_size, 224, 224, 3]) -target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) - - -def loss_function(): - probs = model(data, training=True) - return tf.losses.sparse_softmax_cross_entropy(target, probs) - - -def log(s, nl=True): - if hvd.rank() != 0: - return - print(s, end='\n' if nl else '') - - -log('Model: %s' % args.model) -log('Batch size: %d' % args.batch_size) -device = 'GPU' if args.cuda else 'CPU' -log('Number of %ss: %d' % (device, hvd.size())) - - -def run(benchmark_step): - # Warm-up - log('Running warmup...') - timeit.timeit(benchmark_step, number=args.num_warmup_batches) - - # Benchmark - log('Running benchmark...') - img_secs = [] - for x in range(args.num_iters): - time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) - img_sec = args.batch_size * args.num_batches_per_iter / time - log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) - img_secs.append(img_sec) - - # Results - img_sec_mean = np.mean(img_secs) - img_sec_conf = 1.96 * np.std(img_secs) - log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) - log('Total img/sec on %d %s(s): %.1f +-%.1f' % - (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) - - -if tf.executing_eagerly(): - with tf.device(device): - run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables)) -else: - with tf.Session(config=config) as session: - init.run() - bcast_op.run() - - loss = loss_function() - train_opt = opt.minimize(loss) - run(lambda: session.run(train_opt)) diff --git a/horovod_data_distributed/README.md b/horovod_data_distributed/README.md deleted file mode 100644 index 3a13e2b..0000000 --- a/horovod_data_distributed/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Introduction - -Please see the main docstring in each program for details. - -# Notes - -On JURECA and JUWELS, the `mnist_data_distributed.py` program requires the [`hpc4neuro.distribution`]( -https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro#1-hpc4neurodistribution) -module for distribution of training data filenames across multiple ranks. On JURON, multiple additional -package are required. Please follow the steps below to setup the environment before submitting the -training job. - -Note that a maximum of eight ranks can be used to run `mnist_data_distributed.py`, as there -are eight training files. - -## JURECA and JUWELS - -1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` -2. Load the system-wide Python module: `module load Python/3.6.8` -3. Install the `hpc4neuro` package: - - `pip install --user git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git` - -4. Submit the job - -## JURON - -1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` -2. Setup a Python virtual environment with the required packages (may take upto 5 minutes): `./setup_juron.sh` -3. Submit the job: `bsub < submit_job_juron.sh` - -**Note:** The setup is required only once. Unless you explicitly remove the virtual environment, the same -setup can be used to run the example multiple times. 
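To make the idea of "distribution of training data filenames across multiple ranks" concrete: each Horovod rank is handed a disjoint subset of the training files, which is also why no more than eight ranks can be used when there are only eight training files. The sketch below illustrates the concept only; it is not the `hpc4neuro.distribution` API used by `mnist_data_distributed.py`, and the file names in it are hypothetical.

```python
# Illustration only -- not the hpc4neuro.distribution API. Shows the concept of
# assigning each Horovod rank a disjoint subset of the training files.
import horovod.tensorflow as hvd

hvd.init()

# Hypothetical training file names; the real sample has eight training files.
filenames = ['train_{}.tfrecord'.format(i) for i in range(8)]

if hvd.size() > len(filenames):
    raise RuntimeError('More ranks than training files; '
                       'some ranks would receive no data.')

# Round-robin assignment: rank r gets files r, r + size, r + 2 * size, ...
my_files = filenames[hvd.rank()::hvd.size()]
print('Rank {} of {} handles: {}'.format(hvd.rank(), hvd.size(), my_files))
```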
diff --git a/horovod_data_distributed/setup_juron.sh b/horovod_data_distributed/setup_juron.sh deleted file mode 100755 index 7fa1a24..0000000 --- a/horovod_data_distributed/setup_juron.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash - -# Load the Python module -module load python/3.6.1 - -# Create a virtual environment -python -m venv venv_dl_hpc4neuro - -# Activate the virtual environment -source venv_dl_hpc4neuro/bin/activate - -# Upgrade pip and setuptools -pip install -U pip setuptools - -# Install mpi4py -env MPICC=/gpfs/software/opt/openmpi/3.1.2-gcc_5.4.0-cuda_10.0.130/bin/mpicc pip install mpi4py - -# Install six -pip install six - -# Install hpc4neuro -pip install git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git - -printf "%s\n\n" "Setup complete." diff --git a/horovod_data_distributed/submit_job_jureca.sh b/horovod_data_distributed/submit_job_jureca.sh deleted file mode 100755 index eedbaca..0000000 --- a/horovod_data_distributed/submit_job_jureca.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HVD_DATA_DIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load mpi4py/3.0.1-Python-3.6.8 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/submit_job_juron.sh b/horovod_data_distributed/submit_job_juron.sh deleted file mode 100755 index a71bc47..0000000 --- a/horovod_data_distributed/submit_job_juron.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 4 -#BSUB -R "span[ptile=4]" -#BSUB -gpu "num=4" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J HVD_DATA_DIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module load horovod/0.15.2 - -# Activate the virtual environment -source venv_dl_hpc4neuro/bin/activate - -# Run the program -mpirun -bind-to none \ - -map-by slot \ - -x NCCL_DEBUG=INFO \ - -x LD_LIBRARY_PATH \ - -x PATH \ - -mca pml ob1 \ - -mca btl ^openib \ - python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/submit_job_juwels.sh b/horovod_data_distributed/submit_job_juwels.sh deleted file mode 100755 index eedbaca..0000000 --- a/horovod_data_distributed/submit_job_juwels.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HVD_DATA_DIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load mpi4py/3.0.1-Python-3.6.8 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist_data_distributed.py diff --git a/keras/README.md b/keras/README.md deleted file mode 100644 index 4e8462d..0000000 --- a/keras/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Notes - -The `mnist.py` sample is a slightly modified version of `mnist_cnn.py` -available in the Keras examples repository 
-[here](https://github.com/keras-team/keras/tree/master/examples) -(last checked: September 02, 2019). Our changes are -limited to, - -* The data loading mechanism -* A bit of code cleanup -* A few additional comments pertaining to our custom data loading mechanism - -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. \ No newline at end of file diff --git a/keras/mnist.py b/keras/mnist.py deleted file mode 100644 index 9fc93f2..0000000 --- a/keras/mnist.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py, -# which is also licensed under The MIT License (see the NOTICE file for details). - - -"""Trains a simple convnet on the MNIST dataset. - -Gets to 99.25% test accuracy after 12 epochs -(there is still a lot of margin for parameter tuning). -16 seconds per epoch on a GRID K520 GPU. -""" - -from __future__ import print_function -import os -import sys -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -batch_size = 128 -num_classes = 10 -epochs = 12 - -# input image dimensions -img_rows, img_cols = 28, 28 - -# [HPCNS] Load MNIST dataset -# the data, split between train and test sets -(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) - -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=keras.optimizers.Adadelta(), - metrics=['accuracy']) - -model.fit(x_train, y_train, - batch_size=batch_size, - epochs=epochs, - verbose=1, - validation_data=(x_test, y_test)) -score = model.evaluate(x_test, y_test, verbose=0) 
-print('Test loss:', score[0]) -print('Test accuracy:', score[1]) diff --git a/keras/run_on_localMachine.sh b/keras/run_on_localMachine.sh deleted file mode 100644 index 1895ec1..0000000 --- a/keras/run_on_localMachine.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -python -u mnist.py diff --git a/keras/submit_job_jureca.sh b/keras/submit_job_jureca.sh deleted file mode 100755 index 55feebb..0000000 --- a/keras/submit_job_jureca.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=KERAS_MNIST_CNN -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCCcore/.8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/keras/submit_job_juron.sh b/keras/submit_job_juron.sh deleted file mode 100644 index 7927b03..0000000 --- a/keras/submit_job_juron.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J KERAS_MNIST_CNN - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module load keras/2.2.4 - -# Run the program -python -u mnist.py diff --git a/keras/submit_job_juwels.sh b/keras/submit_job_juwels.sh deleted file mode 100755 index 429c440..0000000 --- a/keras/submit_job_juwels.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=KERAS_MNIST -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/tensorflow/README.md b/tensorflow/README.md index 3bf439c..a35d643 100644 --- a/tensorflow/README.md +++ b/tensorflow/README.md @@ -1,13 +1,22 @@ # Notes -The `mnist.py` sample is a slightly modified version of `convolutional.py` -available in the Tensorflow models repository -[here](https://github.com/tensorflow/models/blob/master/tutorials/image/mnist) -(last checked: September 02, 2019). Our changes are -limited to, +All source code samples were taken from the Horovod examples repository +[here](https://github.com/horovod/horovod/tree/master/examples/tensorflow2) +(last checked: April 26, 2021). The samples have been slightly modified. Our +changes are limited to, * The data loading mechanism * A bit of code cleanup * A few additional comments pertaining to our custom data loading mechanism -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. \ No newline at end of file +**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. All +statements that demonstrate the use of Horovod follow a comment beginning with +`[Horovod]` (as added by Horovod developers). + +The following samples are included: + +1. `keras_mnist.py`: A simple training program for an MNIST classifier that + uses the Keras API with Horovod. +2. 
`mnist.py`: Also a training program for an MNIST classifier, this sample + demonstrates using Horovod's `DistributedGradientTape` with a custom + training loop. diff --git a/horovod/keras/checkpoints/.gitkeep b/tensorflow/checkpoints/.gitkeep similarity index 100% rename from horovod/keras/checkpoints/.gitkeep rename to tensorflow/checkpoints/.gitkeep diff --git a/tensorflow/jureca_job.sh b/tensorflow/jureca_job.sh new file mode 100755 index 0000000..e818bc0 --- /dev/null +++ b/tensorflow/jureca_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=dc-gpu-devel + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/jusuf_job.sh b/tensorflow/jusuf_job.sh new file mode 100755 index 0000000..24f3c83 --- /dev/null +++ b/tensorflow/jusuf_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:1 +#SBATCH --partition=develgpus + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/juwels_booster_job.sh b/tensorflow/juwels_booster_job.sh new file mode 100755 index 0000000..df9cdef --- /dev/null +++ b/tensorflow/juwels_booster_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=develbooster + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/juwels_job.sh b/tensorflow/juwels_job.sh new file mode 100755 index 0000000..55831d0 --- /dev/null +++ b/tensorflow/juwels_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=develgpus + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow2/keras_mnist.py b/tensorflow/keras_mnist.py similarity index 97% rename from tensorflow2/keras_mnist.py rename to tensorflow/keras_mnist.py index e444560..b07950c 100644 --- 
a/tensorflow2/keras_mnist.py +++ b/tensorflow/keras_mnist.py @@ -104,4 +104,4 @@ verbose = 1 if hvd.rank() == 0 else 0 # Train the model. # Horovod: adjust number of steps based on number of GPUs. -mnist_model.fit(dataset, steps_per_epoch=50 // hvd.size(), callbacks=callbacks, epochs=10, verbose=verbose) \ No newline at end of file +mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(), callbacks=callbacks, epochs=10, verbose=verbose) \ No newline at end of file diff --git a/tensorflow/mnist.py b/tensorflow/mnist.py index 30477e1..7e56a70 100644 --- a/tensorflow/mnist.py +++ b/tensorflow/mnist.py @@ -1,328 +1,109 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== -"""Simple, end-to-end, LeNet-5-like convolutional MNIST model example. - -This should achieve a test error of 0.7%. Please keep this model as simple and -linear as possible, it is meant as a tutorial for simple convolutional models. -Run with --self_test on the command line to execute a short self-test. -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import gzip import os import sys -import time -import numpy -from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf +import horovod.tensorflow as hvd # [HPCNS] Import the DataValidator, which can then be used to # validate and load the path to the already downloaded dataset. sys.path.insert(0, '../utils') from data_utils import DataValidator -IMAGE_SIZE = 28 -NUM_CHANNELS = 1 -PIXEL_DEPTH = 255 -NUM_LABELS = 10 -VALIDATION_SIZE = 5000 # Size of the validation set. -SEED = 66478 # Set to None for random seed. -BATCH_SIZE = 64 -NUM_EPOCHS = 10 -EVAL_BATCH_SIZE = 64 -EVAL_FREQUENCY = 100 # Number of steps between evaluations. - -FLAGS = None - - -def data_type(): - """Return the type of the activations, weights, and placeholder variables.""" - if FLAGS.use_fp16: - return tf.float16 - else: - return tf.float32 - - -def extract_data(filename, num_images): - """Extract the images into a 4D tensor [image index, y, x, channels]. - - Values are rescaled from [0, 255] down to [-0.5, 0.5]. 
- """ - print('Extracting', filename) - with gzip.open(filename) as bytestream: - bytestream.read(16) - buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS) - data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32) - data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH - data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS) - return data - - -def extract_labels(filename, num_images): - """Extract the labels into a vector of int64 label IDs.""" - print('Extracting', filename) - with gzip.open(filename) as bytestream: - bytestream.read(8) - buf = bytestream.read(1 * num_images) - labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.int64) - return labels - - -def fake_data(num_images): - """Generate a fake dataset that matches the dimensions of MNIST.""" - data = numpy.ndarray( - shape=(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS), - dtype=numpy.float32) - labels = numpy.zeros(shape=(num_images,), dtype=numpy.int64) - for image in xrange(num_images): - label = image % 2 - data[image, :, :, 0] = label - 0.5 - labels[image] = label - return data, labels - - -def error_rate(predictions, labels): - """Return the error rate based on dense predictions and sparse labels.""" - return 100.0 - ( - 100.0 * - numpy.sum(numpy.argmax(predictions, 1) == labels) / - predictions.shape[0]) - - -def main(_): - if FLAGS.self_test: - print('Running self-test.') - train_data, train_labels = fake_data(256) - validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE) - test_data, test_labels = fake_data(EVAL_BATCH_SIZE) - num_epochs = 1 - else: - # [HPCNS]: Data files relative to the 'datasets' directory - train_data_filename = 'mnist/raw/train-images-idx3-ubyte.gz' - train_labels_filename = 'mnist/raw/train-labels-idx1-ubyte.gz' - test_data_filename = 'mnist/raw/t10k-images-idx3-ubyte.gz' - test_labels_filename = 'mnist/raw/t10k-labels-idx1-ubyte.gz' - - # [HPCNS]: Update data file information with validated and fully qualified filenames - train_data_filename = os.path.join( - DataValidator.validated_data_dir(train_data_filename), train_data_filename) - train_labels_filename = os.path.join( - DataValidator.validated_data_dir(train_labels_filename), train_labels_filename) - test_data_filename = os.path.join( - DataValidator.validated_data_dir(test_data_filename), test_data_filename) - test_labels_filename = os.path.join( - DataValidator.validated_data_dir(test_labels_filename), test_labels_filename) - - # Extract it into numpy arrays. - train_data = extract_data(train_data_filename, 60000) - train_labels = extract_labels(train_labels_filename, 60000) - test_data = extract_data(test_data_filename, 10000) - test_labels = extract_labels(test_labels_filename, 10000) - - # Generate a validation set. - validation_data = train_data[:VALIDATION_SIZE, ...] - validation_labels = train_labels[:VALIDATION_SIZE] - train_data = train_data[VALIDATION_SIZE:, ...] - train_labels = train_labels[VALIDATION_SIZE:] - num_epochs = NUM_EPOCHS - - train_size = train_labels.shape[0] - - # This is where training samples and labels are fed to the graph. - # These placeholder nodes will be fed a batch of training data at each - # training step using the {feed_dict} argument to the Run() call below. 
- train_data_node = tf.placeholder( - data_type(), - shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) - train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,)) - eval_data = tf.placeholder( - data_type(), - shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) - - # The variables below hold all the trainable weights. They are passed an - # initial value which will be assigned when we call: - # {tf.global_variables_initializer().run()} - conv1_weights = tf.Variable( - tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32. - stddev=0.1, - seed=SEED, dtype=data_type())) - conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) - conv2_weights = tf.Variable(tf.truncated_normal( - [5, 5, 32, 64], stddev=0.1, - seed=SEED, dtype=data_type())) - conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type())) - fc1_weights = tf.Variable( # fully connected, depth 512. - tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], - stddev=0.1, - seed=SEED, - dtype=data_type())) - fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type())) - fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], - stddev=0.1, - seed=SEED, - dtype=data_type())) - fc2_biases = tf.Variable(tf.constant( - 0.1, shape=[NUM_LABELS], dtype=data_type())) - - # We will replicate the model structure for the training subgraph, as well - # as the evaluation subgraphs, while sharing the trainable parameters. - def model(data, train=False): - """The Model definition.""" - # 2D convolution, with 'SAME' padding (i.e. the output feature map has - # the same size as the input). Note that {strides} is a 4D array whose - # shape matches the data layout: [image index, y, x, depth]. - conv = tf.nn.conv2d(data, - conv1_weights, - strides=[1, 1, 1, 1], - padding='SAME') - # Bias and rectified linear non-linearity. - relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases)) - # Max pooling. The kernel size spec {ksize} also follows the layout of - # the data. Here we have a pooling window of 2, and a stride of 2. - pool = tf.nn.max_pool(relu, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME') - conv = tf.nn.conv2d(pool, - conv2_weights, - strides=[1, 1, 1, 1], - padding='SAME') - relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases)) - pool = tf.nn.max_pool(relu, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME') - # Reshape the feature map cuboid into a 2D matrix to feed it to the - # fully connected layers. - pool_shape = pool.get_shape().as_list() - reshape = tf.reshape( - pool, - [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]]) - # Fully connected layer. Note that the '+' operation automatically - # broadcasts the biases. - hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases) - # Add a 50% dropout during training only. Dropout also scales - # activations such that no rescaling is needed at evaluation time. - if train: - hidden = tf.nn.dropout(hidden, 0.5, seed=SEED) - return tf.matmul(hidden, fc2_weights) + fc2_biases - - # Training computation: logits + cross-entropy loss. - logits = model(train_data_node, True) - loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=train_labels_node, logits=logits)) - - # L2 regularization for the fully connected parameters. - regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) + - tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases)) - # Add the regularization term to the loss. 
- loss += 5e-4 * regularizers - - # Optimizer: set up a variable that's incremented once per batch and - # controls the learning rate decay. - batch = tf.Variable(0, dtype=data_type()) - # Decay once per epoch, using an exponential schedule starting at 0.01. - learning_rate = tf.train.exponential_decay( - 0.01, # Base learning rate. - batch * BATCH_SIZE, # Current index into the dataset. - train_size, # Decay step. - 0.95, # Decay rate. - staircase=True) - # Use simple momentum for the optimization. - optimizer = tf.train.MomentumOptimizer(learning_rate, - 0.9).minimize(loss, - global_step=batch) - - # Predictions for the current training minibatch. - train_prediction = tf.nn.softmax(logits) - - # Predictions for the test and validation, which we'll compute less often. - eval_prediction = tf.nn.softmax(model(eval_data)) - - # Small utility function to evaluate a dataset by feeding batches of data to - # {eval_data} and pulling the results from {eval_predictions}. - # Saves memory and enables this to run on smaller GPUs. - def eval_in_batches(data, sess): - """Get all predictions for a dataset by running it in small batches.""" - size = data.shape[0] - if size < EVAL_BATCH_SIZE: - raise ValueError("batch size for evals larger than dataset: %d" % size) - predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32) - for begin in xrange(0, size, EVAL_BATCH_SIZE): - end = begin + EVAL_BATCH_SIZE - if end <= size: - predictions[begin:end, :] = sess.run( - eval_prediction, - feed_dict={eval_data: data[begin:end, ...]}) - else: - batch_predictions = sess.run( - eval_prediction, - feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]}) - predictions[begin:, :] = batch_predictions[begin - size:, :] - return predictions - - # Create a local session to run the training. - start_time = time.time() - with tf.Session() as sess: - # Run all the initializers to prepare the trainable parameters. - tf.global_variables_initializer().run() - print('Initialized!') - # Loop through training steps. - for step in xrange(int(num_epochs * train_size) // BATCH_SIZE): - # Compute the offset of the current minibatch in the data. - # Note that we could use better randomization across epochs. - offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE) - batch_data = train_data[offset:(offset + BATCH_SIZE), ...] - batch_labels = train_labels[offset:(offset + BATCH_SIZE)] - # This dictionary maps the batch data (as a numpy array) to the - # node in the graph it should be fed to. - feed_dict = {train_data_node: batch_data, - train_labels_node: batch_labels} - # Run the optimizer to update weights. - sess.run(optimizer, feed_dict=feed_dict) - # print some extra information once reach the evaluation frequency - if step % EVAL_FREQUENCY == 0: - # fetch some extra nodes' data - l, lr, predictions = sess.run([loss, learning_rate, train_prediction], - feed_dict=feed_dict) - elapsed_time = time.time() - start_time - start_time = time.time() - print('Step %d (epoch %.2f), %.1f ms' % - (step, float(step) * BATCH_SIZE / train_size, - 1000 * elapsed_time / EVAL_FREQUENCY)) - print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr)) - print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels)) - print('Validation error: %.1f%%' % error_rate( - eval_in_batches(validation_data, sess), validation_labels)) - sys.stdout.flush() - # Finally print the result! 
- test_error = error_rate(eval_in_batches(test_data, sess), test_labels) - print('Test error: %.1f%%' % test_error) - if FLAGS.self_test: - print('test_error', test_error) - assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % ( - test_error,) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--use_fp16', - default=False, - help='Use half floats instead of full floats if True.', - action='store_true') - parser.add_argument( - '--self_test', - default=False, - action='store_true', - help='True if running a self test.') - - FLAGS, unparsed = parser.parse_known_args() - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +(mnist_images, mnist_labels), _ = \ + tf.keras.datasets.mnist.load_data(dataset_file) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), + tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation='softmax') +]) +loss = tf.losses.SparseCategoricalCrossentropy() + +# Horovod: adjust learning rate based on number of GPUs. +opt = tf.optimizers.Adam(0.001 * hvd.size()) + +checkpoint_dir = 'checkpoints/' +checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) + + +@tf.function +def training_step(images, labels, first_batch): + with tf.GradientTape() as tape: + probs = mnist_model(images, training=True) + loss_value = loss(labels, probs) + + # Horovod: add Horovod Distributed GradientTape. + tape = hvd.DistributedGradientTape(tape) + + grads = tape.gradient(loss_value, mnist_model.trainable_variables) + opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) + + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + # + # Note: broadcast should be done after the first gradient step to ensure optimizer + # initialization. + if first_batch: + hvd.broadcast_variables(mnist_model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) + + return loss_value + + +# Horovod: adjust number of steps based on number of GPUs. 
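+# Note: the dataset is batched in groups of 128 above, and each rank takes
+# 10000 // hvd.size() of those batches, so the total amount of data processed
+# stays roughly constant as workers are added, while the effective batch size
+# per step grows with hvd.size() (hence the scaled learning rate above).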
+for batch, (images, labels) in enumerate(dataset.take(10000 // hvd.size())): + loss_value = training_step(images, labels, batch == 0) + + if batch % 10 == 0 and hvd.local_rank() == 0: + print('Step #%d\tLoss: %.6f' % (batch, loss_value)) + +# Horovod: save checkpoints only on worker 0 to prevent other workers from +# corrupting it. +if hvd.rank() == 0: + checkpoint.save(checkpoint_dir) diff --git a/tensorflow/run_on_localMachine.sh b/tensorflow/run_on_localMachine.sh deleted file mode 100644 index 9c5737c..0000000 --- a/tensorflow/run_on_localMachine.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -python -u mnist.py \ No newline at end of file diff --git a/tensorflow/submit_job_jureca.sh b/tensorflow/submit_job_jureca.sh deleted file mode 100755 index fa294f1..0000000 --- a/tensorflow/submit_job_jureca.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TFLOW_MNIST -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCCcore/.8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/tensorflow/submit_job_juron.sh b/tensorflow/submit_job_juron.sh deleted file mode 100644 index 30fa204..0000000 --- a/tensorflow/submit_job_juron.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J TENSORFLOW_MNIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 - -# Run the program -python -u mnist.py diff --git a/tensorflow/submit_job_juwels.sh b/tensorflow/submit_job_juwels.sh deleted file mode 100755 index fda7d98..0000000 --- a/tensorflow/submit_job_juwels.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TFLOW_MNIST -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/tensorflow2/checkpoints/.gitkeep b/tensorflow2/checkpoints/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tensorflow2/mnist.py b/tensorflow2/mnist.py deleted file mode 100644 index 53cb1da..0000000 --- a/tensorflow2/mnist.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -import os -import sys - -import tensorflow as tf -import horovod.tensorflow as hvd - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -gpus = tf.config.experimental.list_physical_devices('GPU') -for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) -if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -(mnist_images, mnist_labels), _ = \ - tf.keras.datasets.mnist.load_data(dataset_file) - -dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), - tf.cast(mnist_labels, tf.int64)) -) -dataset = dataset.repeat().shuffle(10000).batch(128) - -mnist_model = tf.keras.Sequential([ - tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), - tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Dropout(0.25), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10, activation='softmax') -]) -loss = tf.losses.SparseCategoricalCrossentropy() - -# Horovod: adjust learning rate based on number of GPUs. -opt = tf.optimizers.Adam(0.001 * hvd.size()) - -checkpoint_dir = 'checkpoints/' -checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) - - -@tf.function -def training_step(images, labels, first_batch): - with tf.GradientTape() as tape: - probs = mnist_model(images, training=True) - loss_value = loss(labels, probs) - - # Horovod: add Horovod Distributed GradientTape. - tape = hvd.DistributedGradientTape(tape) - - grads = tape.gradient(loss_value, mnist_model.trainable_variables) - opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) - - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - # - # Note: broadcast should be done after the first gradient step to ensure optimizer - # initialization. - if first_batch: - hvd.broadcast_variables(mnist_model.variables, root_rank=0) - hvd.broadcast_variables(opt.variables(), root_rank=0) - - return loss_value - - -# Horovod: adjust number of steps based on number of GPUs. -for batch, (images, labels) in enumerate(dataset.take(1000 // hvd.size())): - loss_value = training_step(images, labels, batch == 0) - - if batch % 10 == 0 and hvd.local_rank() == 0: - print('Step #%d\tLoss: %.6f' % (batch, loss_value)) - -# Horovod: save checkpoints only on worker 0 to prevent other workers from -# corrupting it. 
-if hvd.rank() == 0: - checkpoint.save(checkpoint_dir) diff --git a/training_data_distribution/README.md b/training_data_distribution/README.md new file mode 100644 index 0000000..6e4028e --- /dev/null +++ b/training_data_distribution/README.md @@ -0,0 +1,27 @@ +# Introduction + +This example distributes the partitioned MNIST data across multiple ranks +for truly data distributed training of a shallow ANN for handwritten digit +classification. + +The Horovod framework is used for seamless distributed training. However, +instead of distributing epochs, this example distributes data amongst the +ranks, so that each rank contributes training based on its local subset of +the training data. + +# Notes + +The `mnist_data_distributed.py` program requires the [`hpc4neuro.distribution`]( +https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro#1-hpc4neurodistribution) +module for distribution of training data filenames across multiple ranks. Please +follow the steps below to install this package before submitting the training +job. + +1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/training_data_distribution` +2. Load the system-wide Python module: `module load Python/3.8.5` +3. Install the `hpc4neuro` package: + + `pip install --user git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git` + +**Note:** A maximum of eight ranks can be used to run `mnist_data_distributed.py`, +as there are eight training files. diff --git a/tensorflow2/juwels_booster_job b/training_data_distribution/jureca_job.sh similarity index 83% rename from tensorflow2/juwels_booster_job rename to training_data_distribution/jureca_job.sh index 625afac..96a239b 100755 --- a/tensorflow2/juwels_booster_job +++ b/training_data_distribution/jureca_job.sh @@ -9,11 +9,12 @@ #SBATCH --time=00:10:00 #SBATCH --job-name=TUTORIAL #SBATCH --gres=gpu:4 -#SBATCH --partition=booster +#SBATCH --partition=dc-gpu-devel # Load the required modules module load GCC/9.3.0 module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 module load TensorFlow/2.3.1-Python-3.8.5 module load Horovod/0.20.3-Python-3.8.5 @@ -24,4 +25,4 @@ export HOROVOD_MPI_THREADS_DISABLE=0 export CUDA_VISIBLE_DEVICES=0,1,2,3 # Run the program -srun python -u mnist.py +srun python -u mnist_data_distributed.py diff --git a/training_data_distribution/jusuf_job.sh b/training_data_distribution/jusuf_job.sh new file mode 100755 index 0000000..95c262d --- /dev/null +++ b/training_data_distribution/jusuf_job.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:1 +#SBATCH --partition=develgpus + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/training_data_distribution/juwels_booster_job.sh b/training_data_distribution/juwels_booster_job.sh new file mode 100755 index 0000000..374f63d --- /dev/null +++ b/training_data_distribution/juwels_booster_job.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH 
--ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=develbooster + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/juwels_booster_job b/training_data_distribution/juwels_job.sh similarity index 95% rename from horovod_data_distributed/juwels_booster_job rename to training_data_distribution/juwels_job.sh index 803e764..b2b7641 100755 --- a/horovod_data_distributed/juwels_booster_job +++ b/training_data_distribution/juwels_job.sh @@ -9,7 +9,7 @@ #SBATCH --time=00:10:00 #SBATCH --job-name=TUTORIAL #SBATCH --gres=gpu:4 -#SBATCH --partition=booster +#SBATCH --partition=develgpus # Load the required modules module load GCC/9.3.0 diff --git a/horovod_data_distributed/mnist_data_distributed.py b/training_data_distribution/mnist_data_distributed.py similarity index 100% rename from horovod_data_distributed/mnist_data_distributed.py rename to training_data_distribution/mnist_data_distributed.py diff --git a/utils/data_utils.py b/utils/data_utils.py index f2d10e4..21a57ad 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -47,19 +47,19 @@ class DataValidator: if not os.path.exists(data_dir): data_dir = os.path.join(os.path.abspath('../../datasets')) - print('Using {} as the data directory.'.format(data_dir)) + print(f'Using {data_dir} as the data directory.') # Check if the directory exists if not os.path.exists(data_dir): raise DatasetNotFoundError( - '{} refers to a non-existing directory. Please either correctly set ' - 'the DL_TEST_DATA_HOME environment variable, or make sure the datasets are ' - 'available in the project root.'.format(data_dir) + f'{data_dir} refers to a non-existing directory. Please either ' + f'correctly set the DL_TEST_DATA_HOME environment variable, or ' + f'make sure the datasets are available in the project root.' ) if not os.path.exists(os.path.join(data_dir, filename)): raise DatasetNotFoundError( - 'Unable to locate {} in {}'.format(filename, data_dir) + f'Unable to locate {filename} in {data_dir}' ) return data_dir -- GitLab From 794b4304d8453b01ad20d31b50a9438a6982f5c0 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Mon, 26 Apr 2021 20:11:26 +0200 Subject: [PATCH 3/8] Minor corrections to the TOC. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7ac59ff..7c86418 100644 --- a/README.md +++ b/README.md @@ -51,8 +51,8 @@ page. 4. [Logging on to the supercomputers](#4-logging-on-to-the-supercomputers) 5. [Cloning the repository](#5-cloning-the-repository) 6. [Running a sample](#6-running-a-sample) -7. [Distributed training](#8-distributed-training) -8. [Credits](#9-credits) +7. [Distributed training](#7-distributed-training) +8. [Credits](#8-credits) <!-- /TOC --> -- GitLab From be3dce52425f9b760bfb691744381c2aee98fa32 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 27 Apr 2021 11:03:13 +0200 Subject: [PATCH 4/8] Added a PyTorch sample. 
The code does work on all machines, but there are a couple of issues. It is
very slow, in fact slower than on a workstation. Also, the job does not end
after training and testing; it continues without any output until the job
time expires.
---
 pytorch/jureca_job.sh         |  25 ++
 pytorch/jusuf_job.sh          |  25 ++
 pytorch/juwels_booster_job.sh |  25 ++
 pytorch/juwels_job.sh         |  25 ++
 pytorch/mnist.py              | 227 ++++++++++++++++++++++
 5 files changed, 327 insertions(+)
 create mode 100755 pytorch/jureca_job.sh
 create mode 100755 pytorch/jusuf_job.sh
 create mode 100755 pytorch/juwels_booster_job.sh
 create mode 100755 pytorch/juwels_job.sh
 create mode 100644 pytorch/mnist.py

diff --git a/pytorch/jureca_job.sh b/pytorch/jureca_job.sh
new file mode 100755
index 0000000..3959b01
--- /dev/null
+++ b/pytorch/jureca_job.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TUTORIAL
+#SBATCH --gres=gpu:4
+#SBATCH --partition=dc-gpu-devel
+
+# Load the required modules
+module load GCC/9.3.0
+module load OpenMPI/4.1.0rc1
+module load PyTorch/1.7.0-Python-3.8.5
+module load torchvision/0.8.1-Python-3.8.5
+module load Horovod/0.20.3-Python-3.8.5
+
+# Make all GPUs visible per node
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+# Run the program
+srun python -u mnist.py
diff --git a/pytorch/jusuf_job.sh b/pytorch/jusuf_job.sh
new file mode 100755
index 0000000..3ac1490
--- /dev/null
+++ b/pytorch/jusuf_job.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=2
+#SBATCH --ntasks=2
+#SBATCH --ntasks-per-node=1
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TUTORIAL
+#SBATCH --gres=gpu:1
+#SBATCH --partition=develgpus
+
+# Load the required modules
+module load GCC/9.3.0
+module load OpenMPI/4.1.0rc1
+module load PyTorch/1.7.0-Python-3.8.5
+module load torchvision/0.8.1-Python-3.8.5
+module load Horovod/0.20.3-Python-3.8.5
+
+# Make all GPUs visible per node
+export CUDA_VISIBLE_DEVICES=0
+
+# Run the program
+srun python -u mnist.py
diff --git a/pytorch/juwels_booster_job.sh b/pytorch/juwels_booster_job.sh
new file mode 100755
index 0000000..fd58b1d
--- /dev/null
+++ b/pytorch/juwels_booster_job.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TUTORIAL
+#SBATCH --gres=gpu:4
+#SBATCH --partition=develbooster
+
+# Load the required modules
+module load GCC/9.3.0
+module load OpenMPI/4.1.0rc1
+module load PyTorch/1.7.0-Python-3.8.5
+module load torchvision/0.8.1-Python-3.8.5
+module load Horovod/0.20.3-Python-3.8.5
+
+# Make all GPUs visible per node
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+# Run the program
+srun python -u mnist.py
diff --git a/pytorch/juwels_job.sh b/pytorch/juwels_job.sh
new file mode 100755
index 0000000..b91e237
--- /dev/null
+++ b/pytorch/juwels_job.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# Slurm job configuration
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --output=output_%j.out
+#SBATCH --error=error_%j.er
+#SBATCH --time=00:10:00
+#SBATCH --job-name=TUTORIAL
+#SBATCH --gres=gpu:4
+#SBATCH --partition=develgpus
+
+# Load the required modules
+module load GCC/9.3.0
+module load 
OpenMPI/4.1.0rc1 +module load PyTorch/1.7.0-Python-3.8.5 +module load torchvision/0.8.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/pytorch/mnist.py b/pytorch/mnist.py new file mode 100644 index 0000000..3fa9c44 --- /dev/null +++ b/pytorch/mnist.py @@ -0,0 +1,227 @@ + +import os +import sys +import shutil +import argparse + +import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms +import torch.utils.data.distributed +import horovod.torch as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../utils') +from data_utils import DataValidator + + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') +parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') +parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') +parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') +parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') +parser.add_argument('--use-adasum', action='store_true', default=False, + help='use adasum algorithm to do reduction') +parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, + help='apply gradient predivide factor in optimizer (default: 1.0)') +parser.add_argument('--data-dir', + help='location of the training dataset in the local filesystem (will be downloaded if needed)') + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x) + + +def train(epoch): + model.train() + # Horovod: set epoch to sampler for shuffling. + train_sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + if args.cuda: + data, target = data.cuda(), target.cuda() + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + # Horovod: use train_sampler to determine the number of examples in + # this worker's partition. 
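+            # Note: len(train_sampler) and len(train_loader) refer only to this
+            # rank's shard of the data, so the progress reported here is local
+            # to the worker, not global.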
+ print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_sampler), + 100. * batch_idx / len(train_loader), loss.item())) + + +def metric_average(val, name): + tensor = torch.tensor(val) + avg_tensor = hvd.allreduce(tensor, name=name) + return avg_tensor.item() + + +def test(): + model.eval() + test_loss = 0. + test_accuracy = 0. + for data, target in test_loader: + if args.cuda: + data, target = data.cuda(), target.cuda() + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, size_average=False).item() + # get the index of the max log-probability + pred = output.data.max(1, keepdim=True)[1] + test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() + + # Horovod: use test_sampler to determine the number of examples in + # this worker's partition. + test_loss /= len(test_sampler) + test_accuracy /= len(test_sampler) + + # Horovod: average metric values across workers. + test_loss = metric_average(test_loss, 'avg_loss') + test_accuracy = metric_average(test_accuracy, 'avg_accuracy') + + # Horovod: print output only on first rank. + if hvd.rank() == 0: + print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( + test_loss, 100. * test_accuracy)) + + +if __name__ == '__main__': + args = parser.parse_args() + args.cuda = not args.no_cuda and torch.cuda.is_available() + + # Horovod: initialize library. + hvd.init() + torch.manual_seed(args.seed) + + if args.cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + torch.cuda.manual_seed(args.seed) + + # Horovod: limit # of CPU threads to be used per worker. + torch.set_num_threads(1) + + kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} + # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent + # issues with Infiniband implementations that are not fork-safe + if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and + mp._supports_context and 'forkserver' in mp.get_all_start_methods()): + kwargs['multiprocessing_context'] = 'forkserver' + + # data_dir = args.data_dir or './data' + + # [HPCNS] Name of the dataset file + data_file = 'mnist/pytorch/data' + + # [HPCNS] Path to the directory containing the dataset file + data_dir = DataValidator.validated_data_dir(data_file) + + # [HPCNS] Fully qualified dataset file name + dataset_file = os.path.join(data_dir, data_file) + + # [HPCNS] Dataset filename for this rank + dataset_root_for_rank = f'MNIST-data-{hvd.rank()}' + dataset_for_rank = f'{dataset_root_for_rank}/MNIST' + + # [HPCNS] If the path already exists, remove it + if os.path.exists(dataset_for_rank): + shutil.rmtree(dataset_for_rank) + + # [HPCNS] Make a copy of the dataset for this rank + shutil.copytree(dataset_file, dataset_for_rank) + + train_dataset = \ + datasets.MNIST(dataset_root_for_rank, train=True, download=False, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + + # Horovod: use DistributedSampler to partition the training data. 
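+    # Note: with num_replicas=hvd.size() and rank=hvd.rank(), every worker
+    # draws its own subset of the samples; shuffling is done by the sampler
+    # and re-seeded each epoch via train_sampler.set_epoch() in train().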
+ train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) + + test_dataset = \ + datasets.MNIST(dataset_root_for_rank, train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + # Horovod: use DistributedSampler to partition the test data. + test_sampler = torch.utils.data.distributed.DistributedSampler( + test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) + test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, + sampler=test_sampler, **kwargs) + + model = Net() + + # By default, Adasum doesn't need scaling up learning rate. + lr_scaler = hvd.size() if not args.use_adasum else 1 + + if args.cuda: + # Move model to GPU. + model.cuda() + # If using GPU Adasum allreduce, scale learning rate by local_size. + if args.use_adasum and hvd.nccl_built(): + lr_scaler = hvd.local_size() + + # Horovod: scale learning rate by lr_scaler. + optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, + momentum=args.momentum) + + # Horovod: broadcast parameters & optimizer state. + hvd.broadcast_parameters(model.state_dict(), root_rank=0) + hvd.broadcast_optimizer_state(optimizer, root_rank=0) + + # Horovod: (optional) compression algorithm. + compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none + + # Horovod: wrap optimizer with DistributedOptimizer. + optimizer = hvd.DistributedOptimizer(optimizer, + named_parameters=model.named_parameters(), + compression=compression, + op=hvd.Adasum if args.use_adasum else hvd.Average, + gradient_predivide_factor=args.gradient_predivide_factor) + + for epoch in range(1, args.epochs + 1): + train(epoch) + test() + + # [HPCNS] Remove the copied dataset + shutil.rmtree(dataset_root_for_rank) -- GitLab From cc8ad9a910b9fabc7ef349770ef2887453a80a05 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 27 Apr 2021 11:13:29 +0200 Subject: [PATCH 5/8] Added readme for the PyTorch directory. --- pytorch/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 pytorch/README.md diff --git a/pytorch/README.md b/pytorch/README.md new file mode 100644 index 0000000..def300c --- /dev/null +++ b/pytorch/README.md @@ -0,0 +1,20 @@ +# Notes + +The source code sample was taken from the Horovod examples repository +[here](https://github.com/horovod/horovod/tree/master/examples/pytorch) +(last checked: April 27, 2021). The sample has been slightly modified. Our +changes are limited to, + +* The data loading mechanism. +* Removal of `filelock` to eliminate dependence on a package that is not + available on the supercomputers. +* A few additional comments pertaining to our custom data loading mechanism. + +**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. +All statements that demonstrate the use of Horovod follow a comment beginning +with `[Horovod]` (as added by Horovod developers). + +The following sample is included: + +1. `mnist.py`: A simple training program for an MNIST classifier that + uses Horovod for data distribution. -- GitLab From efc3bd4a90deb407af59a951e44a83f56993f6ed Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 4 May 2021 08:32:43 +0200 Subject: [PATCH 6/8] Updated the course material so that the examples comply with TF2. 
--- .../examples/mnist_epoch_distributed.py | 17 ++++++++--------- course_material/examples/mnist_single_gpu.py | 13 ++++++------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/course_material/examples/mnist_epoch_distributed.py b/course_material/examples/mnist_epoch_distributed.py index 7c9080e..504b2a8 100644 --- a/course_material/examples/mnist_epoch_distributed.py +++ b/course_material/examples/mnist_epoch_distributed.py @@ -4,8 +4,6 @@ # Version 2.0 (see the NOTICE file for details). """ - This program is an adaptation of the following code sample: - https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py. The program creates and trains a shallow ANN for handwritten digit classification using the MNIST dataset. @@ -13,14 +11,14 @@ example epochs are distributed across the Horovod ranks, not data. To run this sample use the following command on your - workstation/laptop equipped with a GPU: + workstation/laptop: - mpirun -np 1 python -u mnist_epoch_distributed.py + mpirun -np 1 python -u mnist_epoch_distributed.py If you have more than one GPU on your system, you can increase the number of ranks accordingly. - The code has been tested with Python 3.7.5, tensorflow-gpu 1.13.1, and + The code has been tested with Python 3.8.7, tensorflow 2.3.1, and horovod 0.16.2. Note: This code will NOT work on the supercomputers. @@ -30,16 +28,17 @@ import math import tensorflow as tf import horovod.tensorflow.keras as hvd -from tensorflow.python.keras import backend as K # Horovod: initialize Horovod. hvd.init() # Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) +gpus = tf.config.experimental.list_physical_devices('GPU') +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) # Reference to the MNIST dataset mnist = tf.keras.datasets.mnist diff --git a/course_material/examples/mnist_single_gpu.py b/course_material/examples/mnist_single_gpu.py index 794150f..2918cd0 100644 --- a/course_material/examples/mnist_single_gpu.py +++ b/course_material/examples/mnist_single_gpu.py @@ -4,17 +4,16 @@ # Version 2.0 (see the NOTICE file for details). """ - This program is an adaptation of the code sample available at - https://www.tensorflow.org/tutorials/. The program creates - and trains a shallow ANN for handwritten digit classification - using the MNIST dataset. + This program is an adaptation of a previously available code sample + at https://www.tensorflow.org/tutorials/. The program creates and trains a + shallow ANN for handwritten digit classification using the MNIST dataset. To run this sample use the following command on your - workstation/laptop equipped with a GPU: + workstation/laptop: - python -u mnist.py + python -u mnist.py - The code has been tested with Python 3.7.5 and tensorflow-gpu 1.13.1. + The code has been tested with Python 3.8.7 and tensorflow 2.3.1 Note: This code will NOT work on the supercomputers. -- GitLab From 66fcb392f79084c45902829e686f9341fd8c59be Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 18 May 2021 09:48:45 +0200 Subject: [PATCH 7/8] Updated 3rd party license information. 
--- NOTICE | 44 +++++++--------------------- training_data_distribution/README.md | 4 +-- 2 files changed, 13 insertions(+), 35 deletions(-) diff --git a/NOTICE b/NOTICE index 22a9d69..11aba54 100644 --- a/NOTICE +++ b/NOTICE @@ -18,7 +18,7 @@ limitations under the License. Tensorflow -Copyright 2016 The TensorFlow Authors. All rights reserved. +Copyright 2019 The TensorFlow Authors. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -34,38 +34,16 @@ limitations under the License. Keras -All contributions by François Chollet: -Copyright (c) 2015 - 2019, François Chollet. -All rights reserved. +Copyright 2015 The TensorFlow Authors. All rights reserved. -All contributions by Google: -Copyright (c) 2015 - 2019, Google, Inc. -All rights reserved. - -All contributions by Microsoft: -Copyright (c) 2017 - 2019, Microsoft, Inc. -All rights reserved. - -All other contributions: -Copyright (c) 2015 - 2019, the respective contributors. -All rights reserved. - -Licensed under The MIT License (MIT) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. + http://www.apache.org/licenses/LICENSE-2.0 -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/training_data_distribution/README.md b/training_data_distribution/README.md index 6e4028e..374f623 100644 --- a/training_data_distribution/README.md +++ b/training_data_distribution/README.md @@ -1,8 +1,8 @@ # Introduction This example distributes the partitioned MNIST data across multiple ranks -for truly data distributed training of a shallow ANN for handwritten digit -classification. +for truly data distributed training of a shallow Artificial Neural Network for +handwritten digit classification. The Horovod framework is used for seamless distributed training. 
However, instead of distributing epochs, this example distributes data amongst the -- GitLab From 912f6813f32aab1d7221aba4523c5e2bf787b894 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 18 May 2021 10:09:54 +0200 Subject: [PATCH 8/8] Removed the pytorch sample, as it has been moved to the tf2_pytorch branch. --- pytorch/README.md | 20 --- pytorch/jureca_job.sh | 25 ---- pytorch/jusuf_job.sh | 25 ---- pytorch/juwels_booster_job.sh | 25 ---- pytorch/juwels_job.sh | 25 ---- pytorch/mnist.py | 227 ---------------------------------- 6 files changed, 347 deletions(-) delete mode 100644 pytorch/README.md delete mode 100755 pytorch/jureca_job.sh delete mode 100755 pytorch/jusuf_job.sh delete mode 100755 pytorch/juwels_booster_job.sh delete mode 100755 pytorch/juwels_job.sh delete mode 100644 pytorch/mnist.py diff --git a/pytorch/README.md b/pytorch/README.md deleted file mode 100644 index def300c..0000000 --- a/pytorch/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Notes - -The source code sample was taken from the Horovod examples repository -[here](https://github.com/horovod/horovod/tree/master/examples/pytorch) -(last checked: April 27, 2021). The sample has been slightly modified. Our -changes are limited to, - -* The data loading mechanism. -* Removal of `filelock` to eliminate dependence on a package that is not - available on the supercomputers. -* A few additional comments pertaining to our custom data loading mechanism. - -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. -All statements that demonstrate the use of Horovod follow a comment beginning -with `[Horovod]` (as added by Horovod developers). - -The following sample is included: - -1. `mnist.py`: A simple training program for an MNIST classifier that - uses Horovod for data distribution. 
diff --git a/pytorch/jureca_job.sh b/pytorch/jureca_job.sh deleted file mode 100755 index 3959b01..0000000 --- a/pytorch/jureca_job.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TUTORIAL -#SBATCH --gres=gpu:4 -#SBATCH --partition=dc-gpu-devel - -# Load the required modules -module load GCC/9.3.0 -module load OpenMPI/4.1.0rc1 -module load PyTorch/1.7.0-Python-3.8.5 -module load torchvision/0.8.1-Python-3.8.5 -module load Horovod/0.20.3-Python-3.8.5 - -# Make all GPUs visible per node -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -# Run the program -srun python -u mnist.py diff --git a/pytorch/jusuf_job.sh b/pytorch/jusuf_job.sh deleted file mode 100755 index 3ac1490..0000000 --- a/pytorch/jusuf_job.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TUTORIAL -#SBATCH --gres=gpu:1 -#SBATCH --partition=develgpus - -# Load the required modules -module load GCC/9.3.0 -module load OpenMPI/4.1.0rc1 -module load PyTorch/1.7.0-Python-3.8.5 -module load torchvision/0.8.1-Python-3.8.5 -module load Horovod/0.20.3-Python-3.8.5 - -# Make all GPUs visible per node -export CUDA_VISIBLE_DEVICES=0 - -# Run the program -srun python -u mnist.py diff --git a/pytorch/juwels_booster_job.sh b/pytorch/juwels_booster_job.sh deleted file mode 100755 index fd58b1d..0000000 --- a/pytorch/juwels_booster_job.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TUTORIAL -#SBATCH --gres=gpu:4 -#SBATCH --partition=develbooster - -# Load the required modules -module load GCC/9.3.0 -module load OpenMPI/4.1.0rc1 -module load PyTorch/1.7.0-Python-3.8.5 -module load torchvision/0.8.1-Python-3.8.5 -module load Horovod/0.20.3-Python-3.8.5 - -# Make all GPUs visible per node -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -# Run the program -srun python -u mnist.py diff --git a/pytorch/juwels_job.sh b/pytorch/juwels_job.sh deleted file mode 100755 index b91e237..0000000 --- a/pytorch/juwels_job.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TUTORIAL -#SBATCH --gres=gpu:4 -#SBATCH --partition=develgpus - -# Load the required modules -module load GCC/9.3.0 -module load OpenMPI/4.1.0rc1 -module load PyTorch/1.7.0-Python-3.8.5 -module load torchvision/0.8.1-Python-3.8.5 -module load Horovod/0.20.3-Python-3.8.5 - -# Make all GPUs visible per node -export CUDA_VISIBLE_DEVICES=0,1,2,3 - -# Run the program -srun python -u mnist.py diff --git a/pytorch/mnist.py b/pytorch/mnist.py deleted file mode 100644 index 3fa9c44..0000000 --- a/pytorch/mnist.py +++ /dev/null @@ -1,227 +0,0 @@ - -import os -import sys -import shutil -import argparse - -import torch.multiprocessing as mp -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from torchvision import datasets, transforms -import 
torch.utils.data.distributed -import horovod.torch as hvd - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../utils') -from data_utils import DataValidator - - -# Training settings -parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') -parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') -parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') -parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') -parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') -parser.add_argument('--seed', type=int, default=42, metavar='S', - help='random seed (default: 42)') -parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') -parser.add_argument('--use-adasum', action='store_true', default=False, - help='use adasum algorithm to do reduction') -parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, - help='apply gradient predivide factor in optimizer (default: 1.0)') -parser.add_argument('--data-dir', - help='location of the training dataset in the local filesystem (will be downloaded if needed)') - - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x) - - -def train(epoch): - model.train() - # Horovod: set epoch to sampler for shuffling. - train_sampler.set_epoch(epoch) - for batch_idx, (data, target) in enumerate(train_loader): - if args.cuda: - data, target = data.cuda(), target.cuda() - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % args.log_interval == 0: - # Horovod: use train_sampler to determine the number of examples in - # this worker's partition. - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_sampler), - 100. * batch_idx / len(train_loader), loss.item())) - - -def metric_average(val, name): - tensor = torch.tensor(val) - avg_tensor = hvd.allreduce(tensor, name=name) - return avg_tensor.item() - - -def test(): - model.eval() - test_loss = 0. - test_accuracy = 0. 
- for data, target in test_loader: - if args.cuda: - data, target = data.cuda(), target.cuda() - output = model(data) - # sum up batch loss - test_loss += F.nll_loss(output, target, size_average=False).item() - # get the index of the max log-probability - pred = output.data.max(1, keepdim=True)[1] - test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() - - # Horovod: use test_sampler to determine the number of examples in - # this worker's partition. - test_loss /= len(test_sampler) - test_accuracy /= len(test_sampler) - - # Horovod: average metric values across workers. - test_loss = metric_average(test_loss, 'avg_loss') - test_accuracy = metric_average(test_accuracy, 'avg_accuracy') - - # Horovod: print output only on first rank. - if hvd.rank() == 0: - print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( - test_loss, 100. * test_accuracy)) - - -if __name__ == '__main__': - args = parser.parse_args() - args.cuda = not args.no_cuda and torch.cuda.is_available() - - # Horovod: initialize library. - hvd.init() - torch.manual_seed(args.seed) - - if args.cuda: - # Horovod: pin GPU to local rank. - torch.cuda.set_device(hvd.local_rank()) - torch.cuda.manual_seed(args.seed) - - # Horovod: limit # of CPU threads to be used per worker. - torch.set_num_threads(1) - - kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} - # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent - # issues with Infiniband implementations that are not fork-safe - if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and - mp._supports_context and 'forkserver' in mp.get_all_start_methods()): - kwargs['multiprocessing_context'] = 'forkserver' - - # data_dir = args.data_dir or './data' - - # [HPCNS] Name of the dataset file - data_file = 'mnist/pytorch/data' - - # [HPCNS] Path to the directory containing the dataset file - data_dir = DataValidator.validated_data_dir(data_file) - - # [HPCNS] Fully qualified dataset file name - dataset_file = os.path.join(data_dir, data_file) - - # [HPCNS] Dataset filename for this rank - dataset_root_for_rank = f'MNIST-data-{hvd.rank()}' - dataset_for_rank = f'{dataset_root_for_rank}/MNIST' - - # [HPCNS] If the path already exists, remove it - if os.path.exists(dataset_for_rank): - shutil.rmtree(dataset_for_rank) - - # [HPCNS] Make a copy of the dataset for this rank - shutil.copytree(dataset_file, dataset_for_rank) - - train_dataset = \ - datasets.MNIST(dataset_root_for_rank, train=True, download=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - - # Horovod: use DistributedSampler to partition the training data. - train_sampler = torch.utils.data.distributed.DistributedSampler( - train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) - - test_dataset = \ - datasets.MNIST(dataset_root_for_rank, train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - # Horovod: use DistributedSampler to partition the test data. 
- test_sampler = torch.utils.data.distributed.DistributedSampler( - test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) - test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, - sampler=test_sampler, **kwargs) - - model = Net() - - # By default, Adasum doesn't need scaling up learning rate. - lr_scaler = hvd.size() if not args.use_adasum else 1 - - if args.cuda: - # Move model to GPU. - model.cuda() - # If using GPU Adasum allreduce, scale learning rate by local_size. - if args.use_adasum and hvd.nccl_built(): - lr_scaler = hvd.local_size() - - # Horovod: scale learning rate by lr_scaler. - optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, - momentum=args.momentum) - - # Horovod: broadcast parameters & optimizer state. - hvd.broadcast_parameters(model.state_dict(), root_rank=0) - hvd.broadcast_optimizer_state(optimizer, root_rank=0) - - # Horovod: (optional) compression algorithm. - compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - - # Horovod: wrap optimizer with DistributedOptimizer. - optimizer = hvd.DistributedOptimizer(optimizer, - named_parameters=model.named_parameters(), - compression=compression, - op=hvd.Adasum if args.use_adasum else hvd.Average, - gradient_predivide_factor=args.gradient_predivide_factor) - - for epoch in range(1, args.epochs + 1): - train(epoch) - test() - - # [HPCNS] Remove the copied dataset - shutil.rmtree(dataset_root_for_rank) -- GitLab