diff --git a/caffe/README.md b/caffe/README.md deleted file mode 100644 index 1804dceeab230c4754b7616d3c111da8a51873cb..0000000000000000000000000000000000000000 --- a/caffe/README.md +++ /dev/null @@ -1,43 +0,0 @@ -**Caution:** Caffe is no longer being actively developed, which is why we prefer not to support -it as a system-wide module on the supercomputers for long. This is why Caffe is available with -Python 2 support only on JURECA, while it is not at all supported on JUWELS. The users are advised -to switch to other frameworks such as Tensorflow/Keras and PyTorch. - -# Notes - -There are three ways in which Caffe can be used, -1. As a command line tool with only built-in layers -2. As a library from within a Python program. Either only built-in layers can be used, -or one or more custom layers can be written in Python. -3. As a command line tool with one or more custom C++ layers. - -## Caffe as a command line tool - -The `mnist_cmd` sub-directory contains configuration and job scripts for running -Caffe as a command line tool with only built-in layers. This example represents use -case 1 as described above. The `lenet_solver.prototxt` and `lenet_train_test.prototxt` -were taken from the MNIST examples directory available in the Caffe repository -[here](https://github.com/BVLC/caffe/tree/master/examples/mnist). Minor changes have -been made just so the path to the input dataset is correct. The `caffe` command -in the job submission scripts can be modified as follows to run training on -all available GPUs on the node (value for the `-gpu` option has been changed from `0` to `all`): - - caffe train --solver=lenet_solver.prototxt -gpu all - -## Using Caffe within a Python program - -The `lenet_python` sub-directory contains the required files for an example of -using Caffe as a library from within a Python program. This corresponds to use case -2 as described above. 
The `train_lenet.py` file contains source code adapted from -the IPython notebook `01-learning-lenet.ipynb` available in the Caffe examples -[here](https://github.com/BVLC/caffe/tree/master/examples). Running this example -results in the generation of a learning curve plot in the current directory. - -## Caffe with custom C++ layers - -Working with custom C++ layers requires recompiling Caffe with the custom code. As -this is not possible with a system-wide installation, we have decided not to -include an example of this use case. Nevertheless, if you must work with custom -C++ layers and require assistance, please send an email to the JULAIN mailing list -(more information [here](https://lists.fz-juelich.de/mailman/listinfo/ml)). - diff --git a/caffe/lenet_python/.submit_job_jureca_python2.sh b/caffe/lenet_python/.submit_job_jureca_python2.sh deleted file mode 100755 index 75069256157eb55f4122b0ebc2f390b925f89396..0000000000000000000000000000000000000000 --- a/caffe/lenet_python/.submit_job_jureca_python2.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=CAFFE_LENET_PYTHON -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/Devel-2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load Caffe/1.0-Python-2.7.15 - -# Run the program -srun python -u train_lenet.py diff --git a/caffe/lenet_python/lenet_auto_solver.prototxt b/caffe/lenet_python/lenet_auto_solver.prototxt deleted file mode 100644 index 44af3ad6cecd7a8090902160666e5453622f8be6..0000000000000000000000000000000000000000 --- a/caffe/lenet_python/lenet_auto_solver.prototxt +++ /dev/null @@ -1,24 +0,0 @@ -# The train/test net protocol buffer definition -train_net: 
"lenet_auto_train.prototxt" -test_net: "lenet_auto_test.prototxt" -# test_iter specifies how many forward passes the test should carry out. -# In the case of MNIST, we have test batch size 100 and 100 test iterations, -# covering the full 10,000 testing images. -test_iter: 100 -# Carry out testing every 500 training iterations. -test_interval: 500 -# The base learning rate, momentum and the weight decay of the network. -base_lr: 0.01 -momentum: 0.9 -weight_decay: 0.0005 -# The learning rate policy -lr_policy: "inv" -gamma: 0.0001 -power: 0.75 -# Display every 100 iterations -display: 100 -# The maximum number of iterations -max_iter: 10000 -# snapshot intermediate results -snapshot: 5000 -snapshot_prefix: "snapshots/lenet" diff --git a/caffe/lenet_python/submit_job_juron_python2.sh b/caffe/lenet_python/submit_job_juron_python2.sh deleted file mode 100755 index 2025a389b89bb90c6593b598231f14c8fb1fdcf0..0000000000000000000000000000000000000000 --- a/caffe/lenet_python/submit_job_juron_python2.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_LENET_PYTHON - -# Load the Python and Caffe modules -module load python/2.7.14 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train LeNet -python -u train_lenet.py diff --git a/caffe/lenet_python/submit_job_juron_python3.sh b/caffe/lenet_python/submit_job_juron_python3.sh deleted file mode 100755 index 7e737766bcb4ee609fdefab0d52f6adcc95e12e8..0000000000000000000000000000000000000000 --- a/caffe/lenet_python/submit_job_juron_python3.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_LENET_PYTHON - -# Load the Python and Caffe modules -module load python/3.6.1 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# 
Train LeNet -python -u train_lenet.py diff --git a/caffe/lenet_python/train_lenet.py b/caffe/lenet_python/train_lenet.py deleted file mode 100644 index ad5cae3bf4d6a7f1f9a418b802418714efb6ee67..0000000000000000000000000000000000000000 --- a/caffe/lenet_python/train_lenet.py +++ /dev/null @@ -1,107 +0,0 @@ -import os -import sys -import matplotlib - -# Force matplotlib to not use any Xwindows backend. -matplotlib.use('Agg') -import pylab - -import caffe -from caffe import layers as L, params as P - -# Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - - -# Prepares network specification -def lenet(lmdb, batch_size): - # Caffe's version of LeNet: a series of linear and simple nonlinear transformations - n = caffe.NetSpec() - - n.data, n.label = L.Data(batch_size=batch_size, backend=P.Data.LMDB, source=lmdb, - transform_param=dict(scale=1. / 255), ntop=2) - - n.conv1 = L.Convolution(n.data, kernel_size=5, num_output=20, weight_filler=dict(type='xavier')) - n.pool1 = L.Pooling(n.conv1, kernel_size=2, stride=2, pool=P.Pooling.MAX) - n.conv2 = L.Convolution(n.pool1, kernel_size=5, num_output=50, weight_filler=dict(type='xavier')) - n.pool2 = L.Pooling(n.conv2, kernel_size=2, stride=2, pool=P.Pooling.MAX) - n.fc1 = L.InnerProduct(n.pool2, num_output=500, weight_filler=dict(type='xavier')) - n.relu1 = L.ReLU(n.fc1, in_place=True) - n.score = L.InnerProduct(n.relu1, num_output=10, weight_filler=dict(type='xavier')) - n.loss = L.SoftmaxWithLoss(n.score, n.label) - - return n.to_proto() - - -# Names of the directories containing the LMDB files for TRAIN and TEST phases -test_dir = 'mnist/caffe/mnist_test_lmdb' -train_dir = 'mnist/caffe/mnist_train_lmdb' - -# Validated path to the data root -DataValidator.validated_data_dir(train_dir) -data_dir = DataValidator.validated_data_dir(test_dir) - -# Write the prototxt for TRAIN phase -with 
open('lenet_auto_train.prototxt', 'w') as f: - f.write(str(lenet(os.path.join(data_dir, train_dir), 64))) - -# Write the prototxt for TEST phase -with open('lenet_auto_test.prototxt', 'w') as f: - f.write(str(lenet(os.path.join(data_dir, test_dir), 100))) - -# Use the GPU for training -caffe.set_device(0) -caffe.set_mode_gpu() - -# Load the solver and create train and test nets -solver = None # ignore this workaround for lmdb data (can't instantiate two solvers on the same data) -solver = caffe.SGDSolver('lenet_auto_solver.prototxt') - -solver.net.forward() # train net -solver.test_nets[0].forward() # test net (there can be more than one) - -niter = 200 -test_interval = 25 -# losses will also be stored in the log -train_loss = pylab.zeros(niter) -test_acc = pylab.zeros(int(pylab.ceil(niter / test_interval))) -output = pylab.zeros((niter, 8, 10)) - -# the main solver loop -for it in range(niter): - solver.step(1) # SGD by Caffe - - # store the train loss - train_loss[it] = solver.net.blobs['loss'].data - - # store the output on the first test batch - # (start the forward pass at conv1 to avoid loading new data) - solver.test_nets[0].forward(start='conv1') - output[it] = solver.test_nets[0].blobs['score'].data[:8] - - # run a full test every so often - # (Caffe can also do this for us and write to a log, but we show here - # how to do it directly in Python, where more complicated things are easier.) 
- if it % test_interval == 0: - print('Iteration', it, 'testing...') - correct = 0 - for test_it in range(100): - solver.test_nets[0].forward() - correct += sum(solver.test_nets[0].blobs['score'].data.argmax(1) - == solver.test_nets[0].blobs['label'].data) - test_acc[it // test_interval] = correct / 1e4 - -# Plot the training curve -_, ax1 = pylab.subplots() -ax2 = ax1.twinx() -ax1.plot(pylab.arange(niter), train_loss) -ax2.plot(test_interval * pylab.arange(len(test_acc)), test_acc, 'r') -ax1.set_xlabel('iteration') -ax1.set_ylabel('train loss') -ax2.set_ylabel('test accuracy') -ax2.set_title('Test Accuracy: {:.2f}'.format(test_acc[-1])) - -# Save the plot to file. Use "bbox_inches='tight'" to remove surrounding whitespace -pylab.savefig('learning_curve.png', bbox_inches='tight') diff --git a/caffe/mnist_cmd/.submit_job_jureca_python2.sh b/caffe/mnist_cmd/.submit_job_jureca_python2.sh deleted file mode 100755 index 029520e3308a4e322cfd14c3d863e982fb5ac02e..0000000000000000000000000000000000000000 --- a/caffe/mnist_cmd/.submit_job_jureca_python2.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=CAFFE_MNIST_CMD -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/Devel-2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load Caffe/1.0-Python-2.7.15 - -# Train the model using the 'caffe' binary -srun caffe train --solver=lenet_solver.prototxt -gpu 0 \ No newline at end of file diff --git a/caffe/mnist_cmd/lenet_solver.prototxt b/caffe/mnist_cmd/lenet_solver.prototxt deleted file mode 100644 index 103b2e757061c84e3bb00a83a54f55606b3ce64b..0000000000000000000000000000000000000000 --- a/caffe/mnist_cmd/lenet_solver.prototxt +++ 
/dev/null @@ -1,25 +0,0 @@ -# The train/test net protocol buffer definition -net: "lenet_train_test.prototxt" -# test_iter specifies how many forward passes the test should carry out. -# In the case of MNIST, we have test batch size 100 and 100 test iterations, -# covering the full 10,000 testing images. -test_iter: 100 -# Carry out testing every 500 training iterations. -test_interval: 500 -# The base learning rate, momentum and the weight decay of the network. -base_lr: 0.01 -momentum: 0.9 -weight_decay: 0.0005 -# The learning rate policy -lr_policy: "inv" -gamma: 0.0001 -power: 0.75 -# Display every 100 iterations -display: 100 -# The maximum number of iterations -max_iter: 10000 -# snapshot intermediate results -snapshot: 5000 -snapshot_prefix: "snapshots/lenet" -# solver mode: CPU or GPU -solver_mode: GPU diff --git a/caffe/mnist_cmd/lenet_train_test.prototxt b/caffe/mnist_cmd/lenet_train_test.prototxt deleted file mode 100644 index f34ab716ec5467584ac059af3bd5d087a9d2fb34..0000000000000000000000000000000000000000 --- a/caffe/mnist_cmd/lenet_train_test.prototxt +++ /dev/null @@ -1,168 +0,0 @@ -name: "LeNet" -layer { - name: "mnist" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - scale: 0.00390625 - } - data_param { - source: "../../datasets/mnist/caffe/mnist_train_lmdb" - batch_size: 64 - backend: LMDB - } -} -layer { - name: "mnist" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - scale: 0.00390625 - } - data_param { - source: "../../datasets/mnist/caffe/mnist_test_lmdb" - batch_size: 100 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 20 - kernel_size: 5 - stride: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "conv1" - top: 
"pool1" - pooling_param { - pool: MAX - kernel_size: 2 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 50 - kernel_size: 5 - stride: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "conv2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 2 - stride: 2 - } -} -layer { - name: "ip1" - type: "InnerProduct" - bottom: "pool2" - top: "ip1" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - inner_product_param { - num_output: 500 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "ip1" - top: "ip1" -} -layer { - name: "ip2" - type: "InnerProduct" - bottom: "ip1" - top: "ip2" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - inner_product_param { - num_output: 10 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "accuracy" - type: "Accuracy" - bottom: "ip2" - bottom: "label" - top: "accuracy" - include { - phase: TEST - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "ip2" - bottom: "label" - top: "loss" -} diff --git a/caffe/mnist_cmd/snapshots/.gitkeep b/caffe/mnist_cmd/snapshots/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/caffe/mnist_cmd/submit_job_juron_python2.sh b/caffe/mnist_cmd/submit_job_juron_python2.sh deleted file mode 100755 index b5ee63c60aa1dddad9708367d6623deccc57022f..0000000000000000000000000000000000000000 --- a/caffe/mnist_cmd/submit_job_juron_python2.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J 
CAFFE_MNIST_CMD - -# Load the Python and Caffe modules -module load python/2.7.14 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train a model for MNIST -caffe train --solver=lenet_solver.prototxt -gpu 0 \ No newline at end of file diff --git a/caffe/mnist_cmd/submit_job_juron_python3.sh b/caffe/mnist_cmd/submit_job_juron_python3.sh deleted file mode 100755 index bdac4a2aef6d670bff2fcf4a928bf3586df3781b..0000000000000000000000000000000000000000 --- a/caffe/mnist_cmd/submit_job_juron_python3.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_MNIST_CMD - -# Load the Python and Caffe modules -module load python/3.6.1 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train a model for MNIST -caffe train --solver=lenet_solver.prototxt -gpu 0 diff --git a/horovod_data_distributed/juwels_booster_job b/horovod_data_distributed/juwels_booster_job new file mode 100755 index 0000000000000000000000000000000000000000..803e7648c08c91b5a40f30dde6ee758bbf0e688a --- /dev/null +++ b/horovod_data_distributed/juwels_booster_job @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=booster + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/mnist_data_distributed.py 
b/horovod_data_distributed/mnist_data_distributed.py index d4c68c19174058a41d0198b322a0f4035ef22419..b2335a83ed979ee77d2d30c2dc67d541c87ba2e4 100644 --- a/horovod_data_distributed/mnist_data_distributed.py +++ b/horovod_data_distributed/mnist_data_distributed.py @@ -20,7 +20,6 @@ import mpi4py import numpy as np import tensorflow as tf import horovod.tensorflow.keras as hvd -from tensorflow.python.keras import backend as K from hpc4neuro.errors import MpiInitError from hpc4neuro.distribution import DataDistributor @@ -102,10 +101,14 @@ def initialize_hvd_and_mpi(): # Bind the local rank to a specific GPU, so that each rank uses # a different GPU - tf_config = tf.ConfigProto() - tf_config.gpu_options.allow_growth = True - tf_config.gpu_options.visible_device_list = str(hvd.local_rank()) - K.set_session(tf.Session(config=tf_config)) + gpus = tf.config.experimental.list_physical_devices('GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + if gpus: + tf.config.experimental.set_visible_devices( + gpus[hvd.local_rank()], + 'GPU' + ) # Verify that MPI multi-threading is supported. Horovod cannot work # with mpi4py (or any other MPI library) otherwise. @@ -113,8 +116,9 @@ def initialize_hvd_and_mpi(): # https://www.mcs.anl.gov/research/projects/mpi/mpi-standard/mpi-report-2.0/node163.htm#Node163 if not hvd.mpi_threads_supported(): raise MpiInitError( - 'MPI multi-threading is not supported. Horovod cannot work with mpi4py' - 'in this case. Please enable MPI multi-threading and try again.' + 'MPI multi-threading is not supported. Horovod cannot work with ' + 'mpi4py in this case. Please enable MPI multi-threading and try ' + 'again.' 
) # Disable automatic MPI initialization on importing mpi4py.MPI, diff --git a/requirements.txt b/requirements.txt index 79144dccd44dd967fb51438abbcd9589c6d81937..6a4def7c590fc30601ff616009ff388d1198e244 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,24 +1,41 @@ -absl-py==0.8.0 -astor==0.8.0 -cffi==1.12.3 -cloudpickle==1.2.1 -gast==0.3.1 -grpcio==1.23.0 +absl-py==0.12.0 +astunparse==1.6.3 +cachetools==4.2.1 +certifi==2020.12.5 +cffi==1.14.5 +chardet==4.0.0 +cloudpickle==1.6.0 +gast==0.3.3 +google-auth==1.29.0 +google-auth-oauthlib==0.4.4 +google-pasta==0.2.0 +grpcio==1.37.0 h5py==2.10.0 -Markdown==3.1.1 -mock==3.0.5 -mpi4py==3.0.2 -numpy==1.17.2 -protobuf==3.9.1 -psutil==5.6.3 -pycparser==2.19 -six==1.12.0 -Werkzeug==0.15.6 -Keras-Applications==1.0.8 -Keras-Preprocessing==1.1.0 -tensorboard==1.13.1 -tensorflow-estimator==1.13.0 -tensorflow-gpu==1.13.1 +horovod==0.20.3 +hpc4neuro @ git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git@57a560b4085dba2ba3262d4d3238ef70991be877 +idna==2.10 +Keras-Preprocessing==1.1.2 +Markdown==3.3.4 +mpi4py==3.0.3 +numpy==1.18.5 +oauthlib==3.1.0 +opt-einsum==3.3.0 +protobuf==3.15.8 +psutil==5.8.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycparser==2.20 +PyYAML==5.4.1 +requests==2.25.1 +requests-oauthlib==1.3.0 +rsa==4.7.2 +six==1.15.0 +tensorboard==2.5.0 +tensorboard-data-server==0.6.0 +tensorboard-plugin-wit==1.8.0 +tensorflow==2.3.1 +tensorflow-estimator==2.3.0 termcolor==1.1.0 -keras==2.3.1 -horovod==0.16.2 \ No newline at end of file +urllib3==1.26.4 +Werkzeug==1.0.1 +wrapt==1.12.1 diff --git a/caffe/lenet_python/snapshots/.gitkeep b/tensorflow2/checkpoints/.gitkeep similarity index 100% rename from caffe/lenet_python/snapshots/.gitkeep rename to tensorflow2/checkpoints/.gitkeep diff --git a/tensorflow2/juwels_booster_job b/tensorflow2/juwels_booster_job new file mode 100755 index 0000000000000000000000000000000000000000..625afac97d8e8a59df4cb5afdf7ec8233466752f --- /dev/null +++ b/tensorflow2/juwels_booster_job 
@@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=booster + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow2/keras_mnist.py b/tensorflow2/keras_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..e44456030f75c420cfc49110292afcf338c62983 --- /dev/null +++ b/tensorflow2/keras_mnist.py @@ -0,0 +1,107 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import sys + +import tensorflow as tf +import horovod.tensorflow.keras as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. 
+sys.path.insert(0, '../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +(mnist_images, mnist_labels), _ = \ + tf.keras.datasets.mnist.load_data(dataset_file) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), + tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation='softmax') +]) + +# Horovod: adjust learning rate based on number of GPUs. +scaled_lr = 0.001 * hvd.size() +opt = tf.optimizers.Adam(scaled_lr) + +# Horovod: add Horovod DistributedOptimizer. +opt = hvd.DistributedOptimizer(opt) + +# Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow +# uses hvd.DistributedOptimizer() to compute gradients. +mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(), + optimizer=opt, + metrics=['accuracy'], + experimental_run_tf_function=False) + +callbacks = [ + # Horovod: broadcast initial variable states from rank 0 to all other processes. 
+ # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + hvd.callbacks.BroadcastGlobalVariablesCallback(0), + + # Horovod: average metrics among workers at the end of every epoch. + # + # Note: This callback must be in the list before the ReduceLROnPlateau, + # TensorBoard or other metrics-based callbacks. + hvd.callbacks.MetricAverageCallback(), + + # Horovod: using `lr = 0.001 * hvd.size()` from the very beginning leads to worse final + # accuracy. Scale the learning rate `lr = 0.001` ---> `lr = 0.001 * hvd.size()` during + # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. + hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1), +] + +# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. +if hvd.rank() == 0: + callbacks.append(tf.keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) + +# Horovod: write logs on worker 0. +verbose = 1 if hvd.rank() == 0 else 0 + +# Train the model. +# Horovod: adjust number of steps based on number of GPUs. +mnist_model.fit(dataset, steps_per_epoch=50 // hvd.size(), callbacks=callbacks, epochs=10, verbose=verbose) \ No newline at end of file diff --git a/tensorflow2/mnist.py b/tensorflow2/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..53cb1dad8d0c5705954e39df48ea895edd02fb62 --- /dev/null +++ b/tensorflow2/mnist.py @@ -0,0 +1,109 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import sys + +import tensorflow as tf +import horovod.tensorflow as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. 
+hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +(mnist_images, mnist_labels), _ = \ + tf.keras.datasets.mnist.load_data(dataset_file) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), + tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation='softmax') +]) +loss = tf.losses.SparseCategoricalCrossentropy() + +# Horovod: adjust learning rate based on number of GPUs. +opt = tf.optimizers.Adam(0.001 * hvd.size()) + +checkpoint_dir = 'checkpoints/' +checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) + + +@tf.function +def training_step(images, labels, first_batch): + with tf.GradientTape() as tape: + probs = mnist_model(images, training=True) + loss_value = loss(labels, probs) + + # Horovod: add Horovod Distributed GradientTape. + tape = hvd.DistributedGradientTape(tape) + + grads = tape.gradient(loss_value, mnist_model.trainable_variables) + opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) + + # Horovod: broadcast initial variable states from rank 0 to all other processes. 
+ # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + # + # Note: broadcast should be done after the first gradient step to ensure optimizer + # initialization. + if first_batch: + hvd.broadcast_variables(mnist_model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) + + return loss_value + + +# Horovod: adjust number of steps based on number of GPUs. +for batch, (images, labels) in enumerate(dataset.take(1000 // hvd.size())): + loss_value = training_step(images, labels, batch == 0) + + if batch % 10 == 0 and hvd.local_rank() == 0: + print('Step #%d\tLoss: %.6f' % (batch, loss_value)) + +# Horovod: save checkpoints only on worker 0 to prevent other workers from +# corrupting them. +if hvd.rank() == 0: + checkpoint.save(checkpoint_dir)