diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..775c8feb9d6f8d5925ddcf5dc75f13c612e17a3e --- /dev/null +++ b/.gitattributes @@ -0,0 +1,11 @@ +datasets/mnist/caffe/mnist_test_lmdb/data.mdb filter=lfs diff=lfs merge=lfs -text +datasets/mnist/caffe/mnist_test_lmdb/lock.mdb filter=lfs diff=lfs merge=lfs -text +datasets/mnist/caffe/mnist_train_lmdb/data.mdb filter=lfs diff=lfs merge=lfs -text +datasets/mnist/caffe/mnist_train_lmdb/lock.mdb filter=lfs diff=lfs merge=lfs -text +datasets/mnist/keras/mnist.npz filter=lfs diff=lfs merge=lfs -text +datasets/mnist/pytorch/data/processed/training.pt filter=lfs diff=lfs merge=lfs -text +datasets/mnist/pytorch/data/processed/test.pt filter=lfs diff=lfs merge=lfs -text +datasets/mnist/raw/t10k-images-idx3-ubyte.gz filter=lfs diff=lfs merge=lfs -text +datasets/mnist/raw/t10k-labels-idx1-ubyte.gz filter=lfs diff=lfs merge=lfs -text +datasets/mnist/raw/train-images-idx3-ubyte.gz filter=lfs diff=lfs merge=lfs -text +datasets/mnist/raw/train-labels-idx1-ubyte.gz filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..51cee13fcf0d14f8f0314f9878f75a4e92248e14 --- /dev/null +++ b/.gitignore @@ -0,0 +1,115 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +venv3/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# PyCharm +.idea + +keras.json + +# Tensorflow/keras Checkpoints +mnist_convnet_model/ diff --git a/README.md b/README.md index 3128749d575b9b7c91e337269e46c5af01494f64..a832b01d94925dc39a81ee36a55d8d817b2a3b38 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,190 @@ -# ml_dl_on_supercomputers +# Getting started with ML/DL on Supercomputers -Samples and documentation for the "Getting started with ML/DL on Supercomputers" tutorial. \ No newline at end of file +This repository is intended to serve as a tutorial for anyone interested in utilizing the supercomputers +available at the JSC for ML/DL related projects. 
It is assumed that the reader is proficient in one or +more of the following frameworks: + +* [Tensorflow](https://www.tensorflow.org/) +* [Keras](https://keras.io/) +* [PyTorch](https://pytorch.org/) +* [Caffe](http://caffe.berkeleyvision.org/) +* [Horovod](https://github.com/horovod/horovod) + +**Note:** This tutorial is by no means intended as an introduction to ML/DL, or to any of the +above mentioned frameworks. If you are interested in educational resources for beginners, please +visit [this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/wikis/Education) page. + +### A word regarding the code samples + +Samples for each framework are available in the correspondingly named directory. Each such +directory typically contains at least one code sample, which trains a simple artificial neural +network on the canonical MNIST hand-written digit classification task. Moreover, job submission +scripts are included for all the supercomputers on which this tutorial has been tested. The job +scripts will hopefully make it easier to figure out which modules to load. Finally, +a `README.md` file contains further information about the contents of the directory. + +**Disclaimer:** Neither are the samples intended to serve as examples of optimized code, nor do these +represent programming best practices. + +### Changes made to support loading of pre-downloaded datasets + +It is worth mentioning that all the code samples were taken from the corresponding framework's +official samples/tutorials repository, as practitioners are likely familiar with these (links +to the original code samples are included in the directory-local `README.md`). However, the +original examples are designed to automatically download the required dataset in a +framework-defined directory. This is not a feasible option as compute nodes on the supercomputers +do not have access to the Internet. Therefore, the samples have been slightly modified to load data from +the `datasets` directory included in this repository; specific code changes, at least for now, +have been marked by comments prefixed with the `[HPCNS]` tag. For more information see the `README.md` +available in the `datasets` directory. + +## 1. Applying for user accounts on supercomputers + +In case you do not already have an account on your supercomputer of interest, please take a look at the +instructions provided in the following sub-sections. + +### 1.1 JURECA and JUWELS + +For more information on getting accounts on JURECA and JUWELS, click +[here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/ComputingTime/computingTime_node.html). + +### 1.2 JURON + +To get a user account on JURON, please follow the steps below: + +1. Write an email to [Dirk Pleiter](http://www.fz-juelich.de/SharedDocs/Personen/IAS/JSC/EN/staff/pleiter_d.html?nn=362224), +in which please introduce yourself and mention why you need the account. +2. Apply for the account via the [JuDoor](https://dspserv.zam.kfa-juelich.de/judoor/login) portal +(more information about JuDoor is available [here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/JuDoor.html?nn=945700)). +If your work is related to the Human Brain Project (HBP), please join the `PCP0` and `CPCP0` projects. +Otherwise please join the `PADC` and `CPADC` projects. + +## 2. 
Logging on to the supercomputers + +Assuming JURECA is the target supercomputer, following are the steps required to login +(more information [here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/JURECA/UserInfo/QuickIntroduction.html?nn=1803700)). + +1. Use SSH to login: + + `ssh <username>@jureca.fz-juelich.de` +2. Upon successful login, activate your project environment (more information [here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/NewUsageModel_node.html)): + + + `jutil env activate -p <project name> -A <accounting project name>` +3. Change to the project directory: + + `cd $PROJECT` + +You should be in your project directory at this point. If you'd like to clone this repository +elsewhere, please change to that directory. + +**Note:** The same steps are valid for logging on to JURON, except that the server address in +step 1 should be: `juron.fz-juelich.de` + +## 3. Cloning the repository + +In order to store the datasets within the repository, we use Git LFS. This makes cloning the +repository a little bit different. Please find below the instructions on how to clone on different +systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-juelich.de/lfs/). + +**Note:** During the cloning process you will most likely be prompted for your username and +password twice; this is as expected. + +### 3.1 JURECA + +1. Load the Git LFS module: + + `module load git-lfs/2.6.1` +2. Initialize Git LFS: + + `git lfs install` +3. Clone the repository, including the datasets: + + `git lfs clone https://gitlab.version.fz-juelich.de/khalid1/dl_framework_testing.git` + +### 3.2 JURON + +No additional setup is required on JURON. You can simply clone the repository along with the +datasets using the following command: + + git lfs clone https://gitlab.version.fz-juelich.de/khalid1/dl_framework_testing.git + +## 4. Running a sample + +Let us consider a scenario where you would like to run the `mnist.py` sample available in the `keras` +directory. This sample trains a CNN on MNIST using Keras on a single GPU. The following sub-sections list +the steps required for different supercomputers. + +### 4.1 JURECA + +1. Assuming you are in the repository root, change to the keras directory: + + `cd keras` +2. Submit the job to run the sample: + + `sbatch submit_job_jureca_python3.sh` + +That's it; this is all you need for job submission. If you'd like to receive email notifications +regarding the status of the job, add the following statement to the "SLURM job configuration" +block in the `submit_job_jureca_python3.sh` script (replace `<your email address here>` with your +email address). + + #SBATCH --mail-user=<your email address here> + +Output from the job is available in the `error` and `output` files, as specified in the job +configuration. + +### 4.2 JURON + +1. Assuming you are in the repository root, change to the keras directory: + + `cd keras` +2. Submit the job to run the sample: + + `bsub < submit_job_juron_python3.sh` + +Please note that unlike JURECA, JURON uses LSF for job submission, which is why a different +syntax is required for job configuration and submission. Moreover, email notifications are not +supported on JURON. For more information on how to use LSF on JURON, use the following command: + + man 7 juron-lsf + +Output from the job is available in the `error` and `output` files, as specified in the job +configuration. + +## 5. Python 2 support + +All the code samples are compatible with both Python 2 and Python 3. 
However, not all frameworks on all +machines are available for Python 2 (yet); in certain cases these are only available for Python 3. We have +included separate job submission scripts for Python 2 and Python 3. In cases where Python 2 is not +supported, only the job submission script for Python 3 is available. We will try our best to make +all frameworks available with Python 2 as well, but this will not be a priority as the official support +for Python 2 will be discontinued in the year 2020. + +## 6. Distributed training + +[Horovod](https://github.com/horovod/horovod) provides a simple and efficient solution for +training artificial neural networks on multiple GPUs across multiple nodes in a cluster. It can +be used with Tensorflow, Keras, and PyTorch (some other frameworks are supported as well, but +not Caffe). In this repository, the `horovod` directory contains further sub-directories; one +for each compatible framework that has been tested. E.g., there is a `keras` sub-directory that +contains samples that utilize distributed training with Keras and Horovod (more information is available +in the directory-local `README.md`). + +Please note that Horovod currently only supports a distribution strategy where the entire model is +replicated on all GPUs. It is the data that is distributed across the GPUs. If you are interested +in model-parallel training, where the model itself can be split and distributed, a different +solution is required. We hope to add a sample for model-parallel training at a later time. + +Caffe does not support multi-node training. However, it has built-in support for [multi-GPU +training](https://github.com/BVLC/caffe/blob/master/docs/multigpu.md) on a single node (only +via the C/C++ interface). The `mnist_cmd` sample in the `caffe` directory contains the job +script that can be used to train the model on multiple GPUs. Please see the +directory-local `README.md` for further information. + +## Credits + +* **Created by:** Fahad Khalid (SLNS/HPCNS, JSC) +* **Installation of modules on JURON:** Andreas Herten (HPCNS, JSC) +* **Installation of modules on JURECA:** Damian Alvarez (JSC), Rajalekshmi Deepu (SLNS/HPCNS, JSC) +* **Review/suggestions/testing:** Kai Krajsek (SLNS/HPCNS, JSC) diff --git a/caffe/README.md b/caffe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db7976c5db0695311aaced72d53bea908f6481a1 --- /dev/null +++ b/caffe/README.md @@ -0,0 +1,38 @@ +# Notes + +There are three ways in which Caffe can be used, +1. As a command line tool with only built-in layers +2. As a library from within a Python program. Either only built-in layers can be used, +or one or more custom layers can be written in Python. +3. As a command line tool with one or more custom C++ layers. + +## Caffe as a command line tool + +The `mnist_cmd` sub-directory contains configuration and job scripts for running +Caffe as a command line tool with only built-in layers. This example represents use +case 1 as described above. The `lenet_solver.prototxt` and `lenet_train_test.prototxt` +were taken from the MNIST examples directory available in the Caffe repository available +[here](https://github.com/BVLC/caffe/tree/master/examples/mnist). Minor changes have +been made just so the path to the input dataset is correct. 
The `caffe` command +in the job submission scripts can be modified as follows to run training on +all available GPUs on the node (value for the `-gpu` option has been changed from `0` to `all`): + + caffe train --solver=lenet_solver.prototxt -gpu all + +## Using Caffe within a Python program + +The `lenet_python` sub-directory contains the required files for an example of +using Caffe as a library from within a Python program. This corresponds to use case +2 as described above. The `train_lenet.py` file contains source code adapted from +the IPython notebook `01-learning-lenet.ipynb` available in the Caffe examples +[here](https://github.com/BVLC/caffe/tree/master/examples). Running this example +results in the generation of a learning curve plot in the current directory. + +## Caffe with custom C++ layers + +Working with custom C++ layers requires recompiling Caffe with the custom code. As +this is not possible with a system-wide installation, we have decided not to +include an example of this use case. Nevertheless, if you must work with custom +C++ layers and require assistance, please send an email to the mailing list +(more information [here](https://lists.fz-juelich.de/mailman/listinfo/ml)). + diff --git a/caffe/lenet_python/lenet_auto_solver.prototxt b/caffe/lenet_python/lenet_auto_solver.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..44af3ad6cecd7a8090902160666e5453622f8be6 --- /dev/null +++ b/caffe/lenet_python/lenet_auto_solver.prototxt @@ -0,0 +1,24 @@ +# The train/test net protocol buffer definition +train_net: "lenet_auto_train.prototxt" +test_net: "lenet_auto_test.prototxt" +# test_iter specifies how many forward passes the test should carry out. +# In the case of MNIST, we have test batch size 100 and 100 test iterations, +# covering the full 10,000 testing images. +test_iter: 100 +# Carry out testing every 500 training iterations. +test_interval: 500 +# The base learning rate, momentum and the weight decay of the network. 
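+# (With the "inv" policy set below, the effective rate decays as
+# base_lr * (1 + gamma * iter)^(-power) over the course of training.)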
+base_lr: 0.01 +momentum: 0.9 +weight_decay: 0.0005 +# The learning rate policy +lr_policy: "inv" +gamma: 0.0001 +power: 0.75 +# Display every 100 iterations +display: 100 +# The maximum number of iterations +max_iter: 10000 +# snapshot intermediate results +snapshot: 5000 +snapshot_prefix: "snapshots/lenet" diff --git a/caffe/lenet_python/snapshots/.gitkeep b/caffe/lenet_python/snapshots/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/caffe/lenet_python/submit_job_jureca_python2.sh b/caffe/lenet_python/submit_job_jureca_python2.sh new file mode 100755 index 0000000000000000000000000000000000000000..75069256157eb55f4122b0ebc2f390b925f89396 --- /dev/null +++ b/caffe/lenet_python/submit_job_jureca_python2.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=CAFFE_LENET_PYTHON +#SBATCH --gres=gpu:1 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load MVAPICH2/2.3-GDR +module load Caffe/1.0-Python-2.7.15 + +# Run the program +srun python -u train_lenet.py diff --git a/caffe/lenet_python/submit_job_juron_python2.sh b/caffe/lenet_python/submit_job_juron_python2.sh new file mode 100755 index 0000000000000000000000000000000000000000..2025a389b89bb90c6593b598231f14c8fb1fdcf0 --- /dev/null +++ b/caffe/lenet_python/submit_job_juron_python2.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 1 +#BSUB -R "span[ptile=1]" +#BSUB -gpu "num=1" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J CAFFE_LENET_PYTHON + +# Load the Python and Caffe modules +module load python/2.7.14 +module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 + +# Train LeNet +python -u train_lenet.py diff --git a/caffe/lenet_python/submit_job_juron_python3.sh b/caffe/lenet_python/submit_job_juron_python3.sh new file mode 100755 index 0000000000000000000000000000000000000000..7e737766bcb4ee609fdefab0d52f6adcc95e12e8 --- /dev/null +++ b/caffe/lenet_python/submit_job_juron_python3.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 1 +#BSUB -R "span[ptile=1]" +#BSUB -gpu "num=1" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J CAFFE_LENET_PYTHON + +# Load the Python and Caffe modules +module load python/3.6.1 +module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 + +# Train LeNet +python -u train_lenet.py diff --git a/caffe/lenet_python/train_lenet.py b/caffe/lenet_python/train_lenet.py new file mode 100644 index 0000000000000000000000000000000000000000..ad5cae3bf4d6a7f1f9a418b802418714efb6ee67 --- /dev/null +++ b/caffe/lenet_python/train_lenet.py @@ -0,0 +1,107 @@ +import os +import sys +import matplotlib + +# Force matplotlib to not use any Xwindows backend. +matplotlib.use('Agg') +import pylab + +import caffe +from caffe import layers as L, params as P + +# Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. 
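+# The helper lives in the repository-level `utils` directory, which is not on
+# the default Python path, hence the explicit sys.path entry below.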
+sys.path.insert(0, '../../utils') +from data_utils import DataValidator + + +# Prepares network specification +def lenet(lmdb, batch_size): + # Caffe's version of LeNet: a series of linear and simple nonlinear transformations + n = caffe.NetSpec() + + n.data, n.label = L.Data(batch_size=batch_size, backend=P.Data.LMDB, source=lmdb, + transform_param=dict(scale=1. / 255), ntop=2) + + n.conv1 = L.Convolution(n.data, kernel_size=5, num_output=20, weight_filler=dict(type='xavier')) + n.pool1 = L.Pooling(n.conv1, kernel_size=2, stride=2, pool=P.Pooling.MAX) + n.conv2 = L.Convolution(n.pool1, kernel_size=5, num_output=50, weight_filler=dict(type='xavier')) + n.pool2 = L.Pooling(n.conv2, kernel_size=2, stride=2, pool=P.Pooling.MAX) + n.fc1 = L.InnerProduct(n.pool2, num_output=500, weight_filler=dict(type='xavier')) + n.relu1 = L.ReLU(n.fc1, in_place=True) + n.score = L.InnerProduct(n.relu1, num_output=10, weight_filler=dict(type='xavier')) + n.loss = L.SoftmaxWithLoss(n.score, n.label) + + return n.to_proto() + + +# Names of the directories containing the LMDB files for TRAIN and TEST phases +test_dir = 'mnist/caffe/mnist_test_lmdb' +train_dir = 'mnist/caffe/mnist_train_lmdb' + +# Validated path to the data root +DataValidator.validated_data_dir(train_dir) +data_dir = DataValidator.validated_data_dir(test_dir) + +# Write the prototxt for TRAIN phase +with open('lenet_auto_train.prototxt', 'w') as f: + f.write(str(lenet(os.path.join(data_dir, train_dir), 64))) + +# Write the prototxt for TEST phase +with open('lenet_auto_test.prototxt', 'w') as f: + f.write(str(lenet(os.path.join(data_dir, test_dir), 100))) + +# Use the GPU for training +caffe.set_device(0) +caffe.set_mode_gpu() + +# Load the solver and create train and test nets +solver = None # ignore this workaround for lmdb data (can't instantiate two solvers on the same data) +solver = caffe.SGDSolver('lenet_auto_solver.prototxt') + +solver.net.forward() # train net +solver.test_nets[0].forward() # test net (there can be more than one) + +niter = 200 +test_interval = 25 +# losses will also be stored in the log +train_loss = pylab.zeros(niter) +test_acc = pylab.zeros(int(pylab.ceil(niter / test_interval))) +output = pylab.zeros((niter, 8, 10)) + +# the main solver loop +for it in range(niter): + solver.step(1) # SGD by Caffe + + # store the train loss + train_loss[it] = solver.net.blobs['loss'].data + + # store the output on the first test batch + # (start the forward pass at conv1 to avoid loading new data) + solver.test_nets[0].forward(start='conv1') + output[it] = solver.test_nets[0].blobs['score'].data[:8] + + # run a full test every so often + # (Caffe can also do this for us and write to a log, but we show here + # how to do it directly in Python, where more complicated things are easier.) + if it % test_interval == 0: + print('Iteration', it, 'testing...') + correct = 0 + for test_it in range(100): + solver.test_nets[0].forward() + correct += sum(solver.test_nets[0].blobs['score'].data.argmax(1) + == solver.test_nets[0].blobs['label'].data) + test_acc[it // test_interval] = correct / 1e4 + +# Plot the training curve +_, ax1 = pylab.subplots() +ax2 = ax1.twinx() +ax1.plot(pylab.arange(niter), train_loss) +ax2.plot(test_interval * pylab.arange(len(test_acc)), test_acc, 'r') +ax1.set_xlabel('iteration') +ax1.set_ylabel('train loss') +ax2.set_ylabel('test accuracy') +ax2.set_title('Test Accuracy: {:.2f}'.format(test_acc[-1])) + +# Save the plot to file. 
Use "bbox_inches='tight'" to remove surrounding whitespace +pylab.savefig('learning_curve.png', bbox_inches='tight') diff --git a/caffe/mnist_cmd/lenet_solver.prototxt b/caffe/mnist_cmd/lenet_solver.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..103b2e757061c84e3bb00a83a54f55606b3ce64b --- /dev/null +++ b/caffe/mnist_cmd/lenet_solver.prototxt @@ -0,0 +1,25 @@ +# The train/test net protocol buffer definition +net: "lenet_train_test.prototxt" +# test_iter specifies how many forward passes the test should carry out. +# In the case of MNIST, we have test batch size 100 and 100 test iterations, +# covering the full 10,000 testing images. +test_iter: 100 +# Carry out testing every 500 training iterations. +test_interval: 500 +# The base learning rate, momentum and the weight decay of the network. +base_lr: 0.01 +momentum: 0.9 +weight_decay: 0.0005 +# The learning rate policy +lr_policy: "inv" +gamma: 0.0001 +power: 0.75 +# Display every 100 iterations +display: 100 +# The maximum number of iterations +max_iter: 10000 +# snapshot intermediate results +snapshot: 5000 +snapshot_prefix: "snapshots/lenet" +# solver mode: CPU or GPU +solver_mode: GPU diff --git a/caffe/mnist_cmd/lenet_train_test.prototxt b/caffe/mnist_cmd/lenet_train_test.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..f34ab716ec5467584ac059af3bd5d087a9d2fb34 --- /dev/null +++ b/caffe/mnist_cmd/lenet_train_test.prototxt @@ -0,0 +1,168 @@ +name: "LeNet" +layer { + name: "mnist" + type: "Data" + top: "data" + top: "label" + include { + phase: TRAIN + } + transform_param { + scale: 0.00390625 + } + data_param { + source: "../../datasets/mnist/caffe/mnist_train_lmdb" + batch_size: 64 + backend: LMDB + } +} +layer { + name: "mnist" + type: "Data" + top: "data" + top: "label" + include { + phase: TEST + } + transform_param { + scale: 0.00390625 + } + data_param { + source: "../../datasets/mnist/caffe/mnist_test_lmdb" + batch_size: 100 + backend: LMDB + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 20 + kernel_size: 5 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "pool1" + top: "conv2" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + convolution_param { + num_output: 50 + kernel_size: 5 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "ip1" + type: "InnerProduct" + bottom: "pool2" + top: "ip1" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + inner_product_param { + num_output: 500 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "ip1" + top: "ip1" +} +layer { + name: "ip2" + type: "InnerProduct" + bottom: "ip1" + top: "ip2" + param { + lr_mult: 1 + } + param { + lr_mult: 2 + } + inner_product_param { + num_output: 10 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + } + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "ip2" + bottom: "label" + top: "accuracy" + include { 
+ phase: TEST + } +} +layer { + name: "loss" + type: "SoftmaxWithLoss" + bottom: "ip2" + bottom: "label" + top: "loss" +} diff --git a/caffe/mnist_cmd/snapshots/.gitkeep b/caffe/mnist_cmd/snapshots/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/caffe/mnist_cmd/submit_job_jureca_python2.sh b/caffe/mnist_cmd/submit_job_jureca_python2.sh new file mode 100755 index 0000000000000000000000000000000000000000..029520e3308a4e322cfd14c3d863e982fb5ac02e --- /dev/null +++ b/caffe/mnist_cmd/submit_job_jureca_python2.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=CAFFE_MNIST_CMD +#SBATCH --gres=gpu:1 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load MVAPICH2/2.3-GDR +module load Caffe/1.0-Python-2.7.15 + +# Train the model using the 'caffe' binary +srun caffe train --solver=lenet_solver.prototxt -gpu 0 \ No newline at end of file diff --git a/caffe/mnist_cmd/submit_job_juron_python2.sh b/caffe/mnist_cmd/submit_job_juron_python2.sh new file mode 100755 index 0000000000000000000000000000000000000000..b5ee63c60aa1dddad9708367d6623deccc57022f --- /dev/null +++ b/caffe/mnist_cmd/submit_job_juron_python2.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 1 +#BSUB -R "span[ptile=1]" +#BSUB -gpu "num=1" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J CAFFE_MNIST_CMD + +# Load the Python and Caffe modules +module load python/2.7.14 +module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 + +# Train a model for MNIST +caffe train --solver=lenet_solver.prototxt -gpu 0 \ No newline at end of file diff --git a/caffe/mnist_cmd/submit_job_juron_python3.sh b/caffe/mnist_cmd/submit_job_juron_python3.sh new file mode 100755 index 0000000000000000000000000000000000000000..bdac4a2aef6d670bff2fcf4a928bf3586df3781b --- /dev/null +++ b/caffe/mnist_cmd/submit_job_juron_python3.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 1 +#BSUB -R "span[ptile=1]" +#BSUB -gpu "num=1" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J CAFFE_MNIST_CMD + +# Load the Python and Caffe modules +module load python/3.6.1 +module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 + +# Train a model for MNIST +caffe train --solver=lenet_solver.prototxt -gpu 0 diff --git a/datasets/README.md b/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..19e9a405851d6d230941357bb39546e4d31284e0 --- /dev/null +++ b/datasets/README.md @@ -0,0 +1,19 @@ +# Notes + +To keep the code samples as simple as possible, all examples use the +[MNIST](http://yann.lecun.com/exdb/mnist/) dataset for training a Convolutional +Neural Network on the hand-written digit classification problem. Furthermore, we +decided to take code samples from the official models/examples repositories +maintained by the respective framework developers, as these are the same samples one +uses when getting started with the framework. + +However, the original examples are designed to automatically download the required +dataset in a framework-defined directory. This is not a feasible option as compute +nodes on the supercomputers do not have access to the Internet. 
Therefore, the samples +have been slightly modified to load data from this `datasets` directory. It contains +the MNIST dataset in different formats because samples for different frameworks expect +the dataset in a different format. + +It is possible to set the `DL_TEST_DATA_HOME` environment variable to point to a +different directory, however, the contents of that directory must contain a +recursive copy of the `mnist` sub-directory as available here. \ No newline at end of file diff --git a/datasets/mnist/caffe/mnist_test_lmdb/data.mdb b/datasets/mnist/caffe/mnist_test_lmdb/data.mdb new file mode 100644 index 0000000000000000000000000000000000000000..760ab4233ddcb5b432bac7ad418179c380c18127 --- /dev/null +++ b/datasets/mnist/caffe/mnist_test_lmdb/data.mdb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a70974534a27eaa5dc42638940ad311981b0259f1f089ea46c695bfd9c1862da +size 8749056 diff --git a/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb b/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb new file mode 100644 index 0000000000000000000000000000000000000000..eda8c00824c606c2c5eb4d5db6ccbbfb85da9a01 --- /dev/null +++ b/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0667461174c505913de02429312bcbd9c6cab774b4495c7a2bbe7061ce3ccea +size 8192 diff --git a/datasets/mnist/caffe/mnist_train_lmdb/data.mdb b/datasets/mnist/caffe/mnist_train_lmdb/data.mdb new file mode 100644 index 0000000000000000000000000000000000000000..4432b2e157c90b01c117caabfd241e9e54e46bee --- /dev/null +++ b/datasets/mnist/caffe/mnist_train_lmdb/data.mdb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eea94f5e1ea128f16ff0e18f9e287cc2676a54a3218105c525e602f375666c1 +size 50757632 diff --git a/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb b/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb new file mode 100644 index 0000000000000000000000000000000000000000..d961b47989b1ea9cda34eb5a19ed516938c40482 --- /dev/null +++ b/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33569d983c9d6d527cd7d3202c31a2a7395b254fb8076f59b84ecaecb9207906 +size 8192 diff --git a/datasets/mnist/keras/mnist.npz b/datasets/mnist/keras/mnist.npz new file mode 100644 index 0000000000000000000000000000000000000000..c0329306fa8ab17b093038c6fc3033f6a5314f61 --- /dev/null +++ b/datasets/mnist/keras/mnist.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1 +size 11490434 diff --git a/datasets/mnist/pytorch/data/processed/test.pt b/datasets/mnist/pytorch/data/processed/test.pt new file mode 100644 index 0000000000000000000000000000000000000000..94b65e861140519fae72363621049cc6a0c231c7 --- /dev/null +++ b/datasets/mnist/pytorch/data/processed/test.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:351753ceb47ffe74395c0689c44f4e5f3eacd8f8c9d9382531d0e1b86a72eb82 +size 7920442 diff --git a/datasets/mnist/pytorch/data/processed/training.pt b/datasets/mnist/pytorch/data/processed/training.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f292961bf2fbad4e68d0b39fba704b3f0df41cc --- /dev/null +++ b/datasets/mnist/pytorch/data/processed/training.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ca0471d295e5146aed72b4bc651509d4aa83210a8b23cab01e6472152c825ed +size 47520442 diff --git a/datasets/mnist/raw/t10k-images-idx3-ubyte.gz 
b/datasets/mnist/raw/t10k-images-idx3-ubyte.gz new file mode 100644 index 0000000000000000000000000000000000000000..aa17dfe485689242a90be276702dcadd17d406f4 --- /dev/null +++ b/datasets/mnist/raw/t10k-images-idx3-ubyte.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d422c7b0a1c1c79245a5bcf07fe86e33eeafee792b84584aec276f5a2dbc4e6 +size 1648877 diff --git a/datasets/mnist/raw/t10k-labels-idx1-ubyte.gz b/datasets/mnist/raw/t10k-labels-idx1-ubyte.gz new file mode 100644 index 0000000000000000000000000000000000000000..d1995bebe8e5b3faeaae99149ce4eb7a68c5764d --- /dev/null +++ b/datasets/mnist/raw/t10k-labels-idx1-ubyte.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7ae60f92e00ec6debd23a6088c31dbd2371eca3ffa0defaefb259924204aec6 +size 4542 diff --git a/datasets/mnist/raw/train-images-idx3-ubyte.gz b/datasets/mnist/raw/train-images-idx3-ubyte.gz new file mode 100644 index 0000000000000000000000000000000000000000..9e9852c14333d6b633709fec2c6df84941243c9d --- /dev/null +++ b/datasets/mnist/raw/train-images-idx3-ubyte.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609 +size 9912422 diff --git a/datasets/mnist/raw/train-labels-idx1-ubyte.gz b/datasets/mnist/raw/train-labels-idx1-ubyte.gz new file mode 100644 index 0000000000000000000000000000000000000000..a7ebf9b5b685e9014530844158807071ae717f7f --- /dev/null +++ b/datasets/mnist/raw/train-labels-idx1-ubyte.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c +size 28881 diff --git a/horovod/README.md b/horovod/README.md new file mode 100644 index 0000000000000000000000000000000000000000..35499659b5f85b8b1de15b0ea310bd0a1b84cba2 --- /dev/null +++ b/horovod/README.md @@ -0,0 +1,53 @@ +# Notes + +All source code samples were taken from the Horovod examples repository +[here](https://github.com/uber/horovod/tree/master/examples) +(last checked: February 19, 2019). The samples that work with MNIST data have been +slightly modified. Our changes are limited to, + +* The data loading mechanism +* A bit of code cleanup +* A few additional comments pertaining to our custom data loading mechanism + +**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. All +statements that demonstrate the use of Horovod follow a comment beginning with +`[Horovod]` (as added by Horovod developers). + +**Caution:** Where job submission scripts are available for both Python 2 and Python 3, please +do not submit both Python 2 and Python 3 jobs simultaneously, as one of the jobs might fail. If +you would like to try both, please run these in tandem. + +## Keras samples + +The following Keras samples are included: + +1. `mnist.py`: A simple MNIST processing example with only the essential Horovod code +for distributed training. +2. `mnist_advanced.py`: This sample is primarily the same as `mnist.py`. However, a +few more advanced Horovod features are used. + +## PyTorch samples + +The following PyTorch samples are included: + +1. `mnist.py`: Demonstrates distributed training using Horovod with PyTorch. A +simple convolutional neural network is trained on the MNIST dataset. +2. `synthetic_benchmark.py`: A benchmark that can be used to measure performance +of PyTorch with Horovod without using any external dataset. 
+ +**Note:** The job scripts for JURECA are prefixed with `.` for these samples, so that +these scripts do not appear in the directory listing. The reason for doing this is +that our testing revealed issues with multi-node training. As soon as the issue has +been resolved, we'll make the scripts available. + +## Tensorflow samples + +The following Tensorflow samples are included: + +1. `mnist.py`: Demonstrates distributed training using Horovod with the low-level +Tensorflow API. A simple convolutional neural network is trained on the MNIST dataset. +2. `mnist_estimator.py`: Demonstrates distributed training using Horovod with the +high-level Estimator API in Tensorflow. A simple convolutional neural network is +trained on the MNIST dataset. +3. `synthetic_benchmark.py`: A simple benchmark that can be used to measure performance +of Tensorflow with Horovod without using any external dataset. diff --git a/horovod/keras/checkpoints/.gitkeep b/horovod/keras/checkpoints/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/horovod/keras/mnist.py b/horovod/keras/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..b098f2602a721c5e5f1089aca9abc352b21645f1 --- /dev/null +++ b/horovod/keras/mnist.py @@ -0,0 +1,111 @@ +from __future__ import print_function +import os +import sys +import keras +from keras.datasets import mnist +from keras.models import Sequential +from keras.layers import Dense, Dropout, Flatten +from keras.layers import Conv2D, MaxPooling2D +from keras import backend as K +import math +import tensorflow as tf +import horovod.keras as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +config.gpu_options.visible_device_list = str(hvd.local_rank()) +K.set_session(tf.Session(config=config)) + +batch_size = 128 +num_classes = 10 + +# Horovod: adjust number of epochs based on number of GPUs. 
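+# Since every worker runs over the full dataset in each epoch, hvd.size()
+# passes over the data are effectively made per epoch; dividing the epoch
+# count by hvd.size() keeps the total number of passes roughly constant.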
+epochs = int(math.ceil(12.0 / hvd.size())) + +# Input image dimensions +img_rows, img_cols = 28, 28 + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +# [HPCNS] Load MNIST dataset +(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) + +if K.image_data_format() == 'channels_first': + x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) + x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) + input_shape = (1, img_rows, img_cols) +else: + x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) + x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) + input_shape = (img_rows, img_cols, 1) + +x_train = x_train.astype('float32') +x_test = x_test.astype('float32') +x_train /= 255 +x_test /= 255 +print('x_train shape:', x_train.shape) +print(x_train.shape[0], 'train samples') +print(x_test.shape[0], 'test samples') + +# Convert class vectors to binary class matrices +y_train = keras.utils.to_categorical(y_train, num_classes) +y_test = keras.utils.to_categorical(y_test, num_classes) + +model = Sequential() +model.add(Conv2D(32, kernel_size=(3, 3), + activation='relu', + input_shape=input_shape)) +model.add(Conv2D(64, (3, 3), activation='relu')) +model.add(MaxPooling2D(pool_size=(2, 2))) +model.add(Dropout(0.25)) +model.add(Flatten()) +model.add(Dense(128, activation='relu')) +model.add(Dropout(0.5)) +model.add(Dense(num_classes, activation='softmax')) + +# Horovod: adjust learning rate based on number of GPUs. +opt = keras.optimizers.Adadelta(1.0 * hvd.size()) + +# Horovod: add Horovod Distributed Optimizer. +opt = hvd.DistributedOptimizer(opt) + +model.compile(loss=keras.losses.categorical_crossentropy, + optimizer=opt, + metrics=['accuracy']) + +callbacks = [ + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + hvd.callbacks.BroadcastGlobalVariablesCallback(0), +] + +# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. +if hvd.rank() == 0: + callbacks.append(keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) + +model.fit(x_train, y_train, + batch_size=batch_size, + callbacks=callbacks, + epochs=epochs, + verbose=1, + validation_data=(x_test, y_test)) +score = model.evaluate(x_test, y_test, verbose=0) +print('Test loss:', score[0]) +print('Test accuracy:', score[1]) diff --git a/horovod/keras/mnist_advanced.py b/horovod/keras/mnist_advanced.py new file mode 100644 index 0000000000000000000000000000000000000000..9337026bc92b2bd2b4b570976381f8c01f2f87b8 --- /dev/null +++ b/horovod/keras/mnist_advanced.py @@ -0,0 +1,143 @@ +from __future__ import print_function +import os +import sys +import keras +from keras.datasets import mnist +from keras.models import Sequential +from keras.layers import Dense, Dropout, Flatten +from keras.layers import Conv2D, MaxPooling2D +from keras.preprocessing.image import ImageDataGenerator +from keras import backend as K +import tensorflow as tf +import horovod.keras as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. 
+sys.path.insert(0, '../../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +config = tf.ConfigProto() +config.gpu_options.allow_growth = True +config.gpu_options.visible_device_list = str(hvd.local_rank()) +K.set_session(tf.Session(config=config)) + +batch_size = 128 +num_classes = 10 + +# Enough epochs to demonstrate learning rate warmup and the reduction of +# learning rate when training plateaues. +epochs = 12 + +# Input image dimensions +img_rows, img_cols = 28, 28 + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +# [HPCNS] Load MNIST dataset. +(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) + +# Determine how many batches are there in train and test sets +train_batches = len(x_train) // batch_size +test_batches = len(x_test) // batch_size + +if K.image_data_format() == 'channels_first': + x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) + x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) + input_shape = (1, img_rows, img_cols) +else: + x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) + x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) + input_shape = (img_rows, img_cols, 1) + +x_train = x_train.astype('float32') +x_test = x_test.astype('float32') +x_train /= 255 +x_test /= 255 +print('x_train shape:', x_train.shape) +print(x_train.shape[0], 'train samples') +print(x_test.shape[0], 'test samples') + +# Convert class vectors to binary class matrices +y_train = keras.utils.to_categorical(y_train, num_classes) +y_test = keras.utils.to_categorical(y_test, num_classes) + +model = Sequential() +model.add(Conv2D(32, kernel_size=(3, 3), + activation='relu', + input_shape=input_shape)) +model.add(Conv2D(64, (3, 3), activation='relu')) +model.add(MaxPooling2D(pool_size=(2, 2))) +model.add(Dropout(0.25)) +model.add(Flatten()) +model.add(Dense(128, activation='relu')) +model.add(Dropout(0.5)) +model.add(Dense(num_classes, activation='softmax')) + +# Horovod: adjust learning rate based on number of GPUs. +opt = keras.optimizers.Adadelta(lr=1.0 * hvd.size()) + +# Horovod: add Horovod Distributed Optimizer. +opt = hvd.DistributedOptimizer(opt) + +model.compile(loss=keras.losses.categorical_crossentropy, + optimizer=opt, + metrics=['accuracy']) + +callbacks = [ + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + hvd.callbacks.BroadcastGlobalVariablesCallback(0), + + # Horovod: average metrics among workers at the end of every epoch. + # + # Note: This callback must be in the list before the ReduceLROnPlateau, + # TensorBoard or other metrics-based callbacks. + hvd.callbacks.MetricAverageCallback(), + + # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final + # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during + # the first five epochs. See https://arxiv.org/abs/1706.02677 for details. + hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1), + + # Reduce the learning rate if training plateaues. 
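+    # By default this monitors the validation loss; with patience=10 the rate
+    # is only reduced after ten consecutive epochs without improvement.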
+ keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1), +] + +# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. +if hvd.rank() == 0: + callbacks.append(keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) + +# Set up ImageDataGenerators to do data augmentation for the training images. +train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3, + height_shift_range=0.08, zoom_range=0.08) +test_gen = ImageDataGenerator() + +# Train the model. +# Horovod: the training will randomly sample 1 / N batches of training data and +# 3 / N batches of validation data on every worker, where N is the number of workers. +# Over-sampling of validation data helps to increase probability that every validation +# example will be evaluated. +model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size), + steps_per_epoch=train_batches // hvd.size(), + callbacks=callbacks, + epochs=epochs, + verbose=1, + validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size), + validation_steps=3 * test_batches // hvd.size()) + +# Evaluate the model on the full data set. +score = model.evaluate(x_test, y_test, verbose=0) +print('Test loss:', score[0]) +print('Test accuracy:', score[1]) diff --git a/horovod/keras/run_on_localMachine.sh b/horovod/keras/run_on_localMachine.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c9afb4b58ee9f4a42480997dd298b6e33c71a35 --- /dev/null +++ b/horovod/keras/run_on_localMachine.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +# Run the program +mpirun -np 1 -H localhost:1 \ + -bind-to none -map-by slot \ + -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ + -mca pml ob1 -mca btl ^openib \ + python -u mnist.py diff --git a/horovod/keras/submit_job_jureca_python2.sh b/horovod/keras/submit_job_jureca_python2.sh new file mode 100755 index 0000000000000000000000000000000000000000..d3f39c54154eec58a32d10ecc61f44516af76301 --- /dev/null +++ b/horovod/keras/submit_job_jureca_python2.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=2 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=HOROVOD_KERAS_MNIST +#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load MVAPICH2/2.3-GDR +module load TensorFlow/1.12.0-GPU-Python-2.7.15 +module load Keras/2.2.4-GPU-Python-2.7.15 +module load Horovod/0.15.2-GPU-Python-2.7.15 + +# Run the program +srun python -u mnist.py diff --git a/horovod/keras/submit_job_jureca_python3.sh b/horovod/keras/submit_job_jureca_python3.sh new file mode 100755 index 0000000000000000000000000000000000000000..33ba711d3a5f77acc09241b76dc82b404cb48220 --- /dev/null +++ b/horovod/keras/submit_job_jureca_python3.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=2 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=HOROVOD_KERAS_MNIST +#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load MVAPICH2/2.3-GDR +module load TensorFlow/1.12.0-GPU-Python-3.6.6 +module 
load Keras/2.2.4-GPU-Python-3.6.6 +module load Horovod/0.15.2-GPU-Python-3.6.6 + +# Run the program +srun python -u mnist.py diff --git a/horovod/keras/submit_job_juron_python2.sh b/horovod/keras/submit_job_juron_python2.sh new file mode 100644 index 0000000000000000000000000000000000000000..cd5f8dd051c5b46502ac9b3256a7ae0e01dc3572 --- /dev/null +++ b/horovod/keras/submit_job_juron_python2.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 4 +#BSUB -R "span[ptile=2]" +#BSUB -gpu "num=2" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J HOROVOD_KERAS_MNIST + +# Load the required modules +module load python/2.7.14 +module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 +module load horovod/0.15.2 +module load keras/2.2.4 + +# Run the program +mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \ + -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/keras/submit_job_juron_python3.sh b/horovod/keras/submit_job_juron_python3.sh new file mode 100755 index 0000000000000000000000000000000000000000..03182786d1f52c2cb8cacd9e8c709f1c9d93cc40 --- /dev/null +++ b/horovod/keras/submit_job_juron_python3.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 4 +#BSUB -R "span[ptile=2]" +#BSUB -gpu "num=2" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J HOROVOD_KERAS_MNIST + +# Load the required modules +module load python/3.6.1 +module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 +module load horovod/0.15.2 +module load keras/2.2.4 + +# Run the program +mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \ + -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/pytorch/.submit_job_jureca_python2.sh b/horovod/pytorch/.submit_job_jureca_python2.sh new file mode 100755 index 0000000000000000000000000000000000000000..885763864fc144947211309994ec8eb5bf539291 --- /dev/null +++ b/horovod/pytorch/.submit_job_jureca_python2.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=2 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=HOROVOD_PYTORCH_MNIST +#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load MVAPICH2/2.3-GDR +module load PyTorch/1.0.0-GPU-Python-2.7.15 +module load torchvision/0.2.1-GPU-Python-2.7.15 +module load Horovod/0.15.2-GPU-Python-2.7.15 + +# Run the program +srun python -u mnist.py diff --git a/horovod/pytorch/.submit_job_jureca_python3.sh b/horovod/pytorch/.submit_job_jureca_python3.sh new file mode 100755 index 0000000000000000000000000000000000000000..41628882e0e202fb5fea56afabb1c0e3e2dc2a3b --- /dev/null +++ b/horovod/pytorch/.submit_job_jureca_python3.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=2 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=HOROVOD_PYTORCH_MNIST +#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load MVAPICH2/2.3-GDR +module load PyTorch/1.0.0-GPU-Python-3.6.6 +module load 
torchvision/0.2.1-GPU-Python-3.6.6 +module load Horovod/0.15.2-GPU-Python-3.6.6 + +# Run the program +srun python -u mnist.py diff --git a/horovod/pytorch/mnist.py b/horovod/pytorch/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..3d1b9c584ab4079dfddc9fe5f6633ad9ab2145b4 --- /dev/null +++ b/horovod/pytorch/mnist.py @@ -0,0 +1,195 @@ +from __future__ import print_function +import os +import sys +import shutil +import argparse +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms +import torch.utils.data.distributed +import horovod.torch as hvd + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') +parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') +parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') +parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') +parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') +parser.add_argument('--seed', type=int, default=42, metavar='S', + help='random seed (default: 42)') +parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') +args = parser.parse_args() +args.cuda = not args.no_cuda and torch.cuda.is_available() + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/pytorch/data' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize library. +hvd.init() +torch.manual_seed(args.seed) + +if args.cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + torch.cuda.manual_seed(args.seed) + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +# [HPCNS] Dataset filename for this rank +dataset_for_rank = 'MNIST-data-%d' % hvd.rank() + +# [HPCNS] If the path already exists, remove it +if os.path.exists(dataset_for_rank): + shutil.rmtree(dataset_for_rank) + +# [HPCNS] Make a copy of the dataset for this rank +shutil.copytree(dataset_file, dataset_for_rank) + +kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} +train_dataset = \ + datasets.MNIST(dataset_for_rank, train=True, download=False, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) +# Horovod: use DistributedSampler to partition the training data. 
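+# Each of the hvd.size() workers receives its own (approximately disjoint)
+# shard of the indices, so no two workers train on the same examples
+# within an epoch.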
+train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) +train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) + +test_dataset = \ + datasets.MNIST(dataset_for_rank, train=False, download=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) +# Horovod: use DistributedSampler to partition the test data. +test_sampler = torch.utils.data.distributed.DistributedSampler( + test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) +test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, + sampler=test_sampler, **kwargs) + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x) + + +model = Net() + +if args.cuda: + # Move model to GPU. + model.cuda() + +# Horovod: broadcast parameters. +hvd.broadcast_parameters(model.state_dict(), root_rank=0) + +# Horovod: scale learning rate by the number of GPUs. +optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), + momentum=args.momentum) + +# Horovod: (optional) compression algorithm. +compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none + +# Horovod: wrap optimizer with DistributedOptimizer. +optimizer = hvd.DistributedOptimizer(optimizer, + named_parameters=model.named_parameters(), + compression=compression) + + +def train(epoch): + model.train() + # Horovod: set epoch to sampler for shuffling. + train_sampler.set_epoch(epoch) + for batch_idx, (data, target) in enumerate(train_loader): + if args.cuda: + data, target = data.cuda(), target.cuda() + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + # Horovod: use train_sampler to determine the number of examples in + # this worker's partition. + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_sampler), + 100. * batch_idx / len(train_loader), loss.item())) + + +def metric_average(val, name): + tensor = torch.tensor(val) + avg_tensor = hvd.allreduce(tensor, name=name) + return avg_tensor.item() + + +def test(): + model.eval() + test_loss = 0. + test_accuracy = 0. + for data, target in test_loader: + if args.cuda: + data, target = data.cuda(), target.cuda() + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, size_average=False).item() + # get the index of the max log-probability + pred = output.data.max(1, keepdim=True)[1] + test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() + + # Horovod: use test_sampler to determine the number of examples in + # this worker's partition. + test_loss /= len(test_sampler) + test_accuracy /= len(test_sampler) + + # Horovod: average metric values across workers. 
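+    # metric_average() (defined above) wraps hvd.allreduce, which by default
+    # returns the mean of the value across all ranks.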
+ test_loss = metric_average(test_loss, 'avg_loss') + test_accuracy = metric_average(test_accuracy, 'avg_accuracy') + + # Horovod: print output only on first rank. + if hvd.rank() == 0: + print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( + test_loss, 100. * test_accuracy)) + + +for epoch in range(1, args.epochs + 1): + train(epoch) + test() + +# [HPCNS] Remove the copied dataset +shutil.rmtree(dataset_for_rank) diff --git a/horovod/pytorch/run_on_localMachine.sh b/horovod/pytorch/run_on_localMachine.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c9afb4b58ee9f4a42480997dd298b6e33c71a35 --- /dev/null +++ b/horovod/pytorch/run_on_localMachine.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +# Run the program +mpirun -np 1 -H localhost:1 \ + -bind-to none -map-by slot \ + -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ + -mca pml ob1 -mca btl ^openib \ + python -u mnist.py diff --git a/horovod/pytorch/submit_job_juron_python3.sh b/horovod/pytorch/submit_job_juron_python3.sh new file mode 100644 index 0000000000000000000000000000000000000000..126c939b04c3f0cf8b3180e251b009c03ad69d0e --- /dev/null +++ b/horovod/pytorch/submit_job_juron_python3.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 4 +#BSUB -R "span[ptile=2]" +#BSUB -gpu "num=2" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J PYTORCH_HOROVOD_MNIST + +# Load the required modules +module load python/3.6.1 +module load pytorch/1.0.1-gcc_5.4.0-cuda_10.0.130 +module load torchvision/0.2.1 +module load horovod/0.15.2 + +# Run the program +mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \ + -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/pytorch/synthetic_benchmark.py b/horovod/pytorch/synthetic_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..e7a177f8b4e8583cb8169d308660fec8b7fc1664 --- /dev/null +++ b/horovod/pytorch/synthetic_benchmark.py @@ -0,0 +1,110 @@ +from __future__ import print_function + +import argparse +import torch.backends.cudnn as cudnn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data.distributed +from torchvision import models +import horovod.torch as hvd +import timeit +import numpy as np + +# Benchmark settings +parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') + +parser.add_argument('--model', type=str, default='resnet50', + help='model to benchmark') +parser.add_argument('--batch-size', type=int, default=32, + help='input batch size') + +parser.add_argument('--num-warmup-batches', type=int, default=10, + help='number of warm-up batches that don\'t count towards benchmark') +parser.add_argument('--num-batches-per-iter', type=int, default=10, + help='number of batches per benchmark iteration') +parser.add_argument('--num-iters', type=int, default=10, + help='number of benchmark iterations') + +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + +args = parser.parse_args() +args.cuda = not args.no_cuda and torch.cuda.is_available() + +hvd.init() + +if args.cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + +cudnn.benchmark = True + +# Set up standard model. 
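+# getattr() looks the class up by name in torchvision.models, so any
+# architecture exposed there (e.g. resnet50, vgg16) can be selected via
+# --model. Weights are randomly initialised; the benchmark measures
+# throughput only, not accuracy.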
+model = getattr(models, args.model)() + +if args.cuda: + # Move model to GPU. + model.cuda() + +optimizer = optim.SGD(model.parameters(), lr=0.01) + +# Horovod: (optional) compression algorithm. +compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none + +# Horovod: wrap optimizer with DistributedOptimizer. +optimizer = hvd.DistributedOptimizer(optimizer, + named_parameters=model.named_parameters(), + compression=compression) + +# Horovod: broadcast parameters & optimizer state. +hvd.broadcast_parameters(model.state_dict(), root_rank=0) +hvd.broadcast_optimizer_state(optimizer, root_rank=0) + +# Set up fixed fake data +data = torch.randn(args.batch_size, 3, 224, 224) +target = torch.LongTensor(args.batch_size).random_() % 1000 +if args.cuda: + data, target = data.cuda(), target.cuda() + + +def benchmark_step(): + optimizer.zero_grad() + output = model(data) + loss = F.cross_entropy(output, target) + loss.backward() + optimizer.step() + + +def log(s, nl=True): + if hvd.rank() != 0: + return + print(s, end='\n' if nl else '') + + +log('Model: %s' % args.model) +log('Batch size: %d' % args.batch_size) +device = 'GPU' if args.cuda else 'CPU' +log('Number of %ss: %d' % (device, hvd.size())) + +# Warm-up +log('Running warmup...') +timeit.timeit(benchmark_step, number=args.num_warmup_batches) + +# Benchmark +log('Running benchmark...') +img_secs = [] +for x in range(args.num_iters): + time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) + img_sec = args.batch_size * args.num_batches_per_iter / time + log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) + img_secs.append(img_sec) + +# Results +img_sec_mean = np.mean(img_secs) +img_sec_conf = 1.96 * np.std(img_secs) +log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) +log('Total img/sec on %d %s(s): %.1f +-%.1f' % + (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) diff --git a/horovod/tensorflow/checkpoints/.gitkeep b/horovod/tensorflow/checkpoints/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/horovod/tensorflow/mnist.py b/horovod/tensorflow/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..da37944b01335cb3d78b20e5245d9518fae8779e --- /dev/null +++ b/horovod/tensorflow/mnist.py @@ -0,0 +1,169 @@ +# Copyright 2017 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import sys +import tensorflow as tf +import horovod.tensorflow as hvd +import numpy as np +import shutil + +from tensorflow import keras + +layers = tf.layers + +tf.logging.set_verbosity(tf.logging.INFO) + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. 
+sys.path.insert(0, '../../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + + +def conv_model(feature, target, mode): + """2-layer convolution model.""" + # Convert the target to a one-hot tensor of shape (batch_size, 10) and + # with a on-value of 1 for each one-hot vector of length 10. + target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0) + + # Reshape feature to 4d tensor with 2nd and 3rd dimensions being + # image width and height final dimension being the number of color channels. + feature = tf.reshape(feature, [-1, 28, 28, 1]) + + # First conv layer will compute 32 features for each 5x5 patch + with tf.variable_scope('conv_layer1'): + h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5], + activation=tf.nn.relu, padding="SAME") + h_pool1 = tf.nn.max_pool( + h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') + + # Second conv layer will compute 64 features for each 5x5 patch. + with tf.variable_scope('conv_layer2'): + h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5], + activation=tf.nn.relu, padding="SAME") + h_pool2 = tf.nn.max_pool( + h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') + # reshape tensor into a batch of vectors + h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) + + # Densely connected layer with 1024 neurons. + h_fc1 = layers.dropout( + layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu), + rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN) + + # Compute logits (1 per class) and compute loss. + logits = layers.dense(h_fc1, 10, activation=None) + loss = tf.losses.softmax_cross_entropy(target, logits) + + return tf.argmax(logits, 1), loss + + +def train_input_generator(x_train, y_train, batch_size=64): + assert len(x_train) == len(y_train) + while True: + p = np.random.permutation(len(x_train)) + x_train, y_train = x_train[p], y_train[p] + index = 0 + while index <= len(x_train) - batch_size: + yield x_train[index:index + batch_size], \ + y_train[index:index + batch_size], + index += batch_size + + +def main(_): + # Horovod: initialize Horovod. + hvd.init() + + # [HPCNS] Fully qualified dataset file name + dataset_file = os.path.join(data_dir, data_file) + + # [HPCNS] Dataset filename for this rank + dataset_for_rank = os.path.join(data_dir, 'MNIST-data-%d' % hvd.rank()) + + # [HPCNS] Make a copy of the dataset for this rank + shutil.copyfile(dataset_file, dataset_for_rank) + + # [HPCNS] Load MNIST dataset + (x_train, y_train), (x_test, y_test) = \ + keras.datasets.mnist.load_data(dataset_for_rank) + + # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it + # into (-1, 784) to feed into our network. Also, need to normalize the + # features between 0 and 1. + x_train = np.reshape(x_train, (-1, 784)) / 255.0 + x_test = np.reshape(x_test, (-1, 784)) / 255.0 + + # Build model... + with tf.name_scope('input'): + image = tf.placeholder(tf.float32, [None, 784], name='image') + label = tf.placeholder(tf.float32, [None], name='label') + predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN) + + # Horovod: adjust learning rate based on number of GPUs. + opt = tf.train.RMSPropOptimizer(0.001 * hvd.size()) + + # Horovod: add Horovod Distributed Optimizer. 
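+    # The wrapper averages gradients across all ranks with an allreduce
+    # before they are applied, which is why the learning rate above is
+    # scaled by hvd.size().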
+ opt = hvd.DistributedOptimizer(opt) + + global_step = tf.train.get_or_create_global_step() + train_op = opt.minimize(loss, global_step=global_step) + + hooks = [ + # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states + # from rank 0 to all other processes. This is necessary to ensure consistent + # initialization of all workers when training is started with random weights + # or restored from a checkpoint. + hvd.BroadcastGlobalVariablesHook(0), + + # Horovod: adjust number of steps based on number of GPUs. + tf.train.StopAtStepHook(last_step=20000 // hvd.size()), + + tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss}, + every_n_iter=10), + ] + + # Horovod: pin GPU to be used to process local rank (one GPU per process) + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = str(hvd.local_rank()) + + # Horovod: save checkpoints only on worker 0 to prevent other workers from + # corrupting them. + checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None + training_batch_generator = train_input_generator(x_train, + y_train, batch_size=100) + # The MonitoredTrainingSession takes care of session initialization, + # restoring from a checkpoint, saving to a checkpoint, and closing when done + # or an error occurs. + with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, + hooks=hooks, + config=config) as mon_sess: + while not mon_sess.should_stop(): + # Run a training step synchronously. + image_, label_ = next(training_batch_generator) + mon_sess.run(train_op, feed_dict={image: image_, label: label_}) + + # [HPCNS] Remove the copied dataset + os.remove(dataset_for_rank) + + +if __name__ == "__main__": + tf.app.run() diff --git a/horovod/tensorflow/mnist_estimator.py b/horovod/tensorflow/mnist_estimator.py new file mode 100644 index 0000000000000000000000000000000000000000..861de50549b470685462a688643dbf3cd8e86288 --- /dev/null +++ b/horovod/tensorflow/mnist_estimator.py @@ -0,0 +1,223 @@ +# Copyright 2018 Uber Technologies, Inc. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convolutional Neural Network Estimator for MNIST, built with tf.layers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import shutil +import numpy as np +import tensorflow as tf +import horovod.tensorflow as hvd + +from tensorflow import keras + +tf.logging.set_verbosity(tf.logging.INFO) + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. 
+sys.path.insert(0, '../../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + + +def cnn_model_fn(features, labels, mode): + """Model function for CNN.""" + # Input Layer + # Reshape X to 4-D tensor: [batch_size, width, height, channels] + # MNIST images are 28x28 pixels, and have one color channel + input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) + + # Convolutional Layer #1 + # Computes 32 features using a 5x5 filter with ReLU activation. + # Padding is added to preserve width and height. + # Input Tensor Shape: [batch_size, 28, 28, 1] + # Output Tensor Shape: [batch_size, 28, 28, 32] + conv1 = tf.layers.conv2d( + inputs=input_layer, + filters=32, + kernel_size=[5, 5], + padding="same", + activation=tf.nn.relu) + + # Pooling Layer #1 + # First max pooling layer with a 2x2 filter and stride of 2 + # Input Tensor Shape: [batch_size, 28, 28, 32] + # Output Tensor Shape: [batch_size, 14, 14, 32] + pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) + + # Convolutional Layer #2 + # Computes 64 features using a 5x5 filter. + # Padding is added to preserve width and height. + # Input Tensor Shape: [batch_size, 14, 14, 32] + # Output Tensor Shape: [batch_size, 14, 14, 64] + conv2 = tf.layers.conv2d( + inputs=pool1, + filters=64, + kernel_size=[5, 5], + padding="same", + activation=tf.nn.relu) + + # Pooling Layer #2 + # Second max pooling layer with a 2x2 filter and stride of 2 + # Input Tensor Shape: [batch_size, 14, 14, 64] + # Output Tensor Shape: [batch_size, 7, 7, 64] + pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) + + # Flatten tensor into a batch of vectors + # Input Tensor Shape: [batch_size, 7, 7, 64] + # Output Tensor Shape: [batch_size, 7 * 7 * 64] + pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) + + # Dense Layer + # Densely connected layer with 1024 neurons + # Input Tensor Shape: [batch_size, 7 * 7 * 64] + # Output Tensor Shape: [batch_size, 1024] + dense = tf.layers.dense(inputs=pool2_flat, units=1024, + activation=tf.nn.relu) + + # Add dropout operation; 0.6 probability that element will be kept + dropout = tf.layers.dropout( + inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) + + # Logits layer + # Input Tensor Shape: [batch_size, 1024] + # Output Tensor Shape: [batch_size, 10] + logits = tf.layers.dense(inputs=dropout, units=10) + + predictions = { + # Generate predictions (for PREDICT and EVAL mode) + "classes": tf.argmax(input=logits, axis=1), + # Add `softmax_tensor` to the graph. It is used for PREDICT and by the + # `logging_hook`. + "probabilities": tf.nn.softmax(logits, name="softmax_tensor") + } + if mode == tf.estimator.ModeKeys.PREDICT: + return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) + + # Calculate Loss (for both TRAIN and EVAL modes) + onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10) + loss = tf.losses.softmax_cross_entropy( + onehot_labels=onehot_labels, logits=logits) + + # Configure the Training Op (for TRAIN mode) + if mode == tf.estimator.ModeKeys.TRAIN: + # Horovod: scale learning rate by the number of workers. + optimizer = tf.train.MomentumOptimizer( + learning_rate=0.001 * hvd.size(), momentum=0.9) + + # Horovod: add Horovod Distributed Optimizer. 
+ optimizer = hvd.DistributedOptimizer(optimizer) + + train_op = optimizer.minimize( + loss=loss, + global_step=tf.train.get_global_step()) + return tf.estimator.EstimatorSpec(mode=mode, loss=loss, + train_op=train_op) + + # Add evaluation metrics (for EVAL mode) + eval_metric_ops = { + "accuracy": tf.metrics.accuracy( + labels=labels, predictions=predictions["classes"])} + return tf.estimator.EstimatorSpec( + mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) + + +def main(unused_argv): + # Horovod: initialize Horovod. + hvd.init() + + # [HPCNS] Fully qualified dataset file name + dataset_file = os.path.join(data_dir, data_file) + + # [HPCNS] Dataset filename for this rank + dataset_for_rank = os.path.join(data_dir, 'MNIST-data-%d' % hvd.rank()) + + # [HPCNS] Make a copy of the dataset for this rank + shutil.copyfile(dataset_file, dataset_for_rank) + + # [HPCNS] Load MNIST dataset + (train_data, train_labels), (eval_data, eval_labels) = \ + keras.datasets.mnist.load_data(dataset_for_rank) + + # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it + # into (-1, 784) to feed into our network. Also, need to normalize the + # features between 0 and 1. + train_data = np.reshape(train_data, (-1, 784)) / 255.0 + eval_data = np.reshape(eval_data, (-1, 784)) / 255.0 + + # Horovod: pin GPU to be used to process local rank (one GPU per process) + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = str(hvd.local_rank()) + + # Horovod: save checkpoints only on worker 0 to prevent other workers from + # corrupting them. + model_dir = 'checkpoints/mnist_convnet_model' if hvd.rank() == 0 else None + + # Create the Estimator + mnist_classifier = tf.estimator.Estimator( + model_fn=cnn_model_fn, model_dir=model_dir, + config=tf.estimator.RunConfig(session_config=config)) + + # Set up logging for predictions + # Log the values in the "Softmax" tensor with label "probabilities" + tensors_to_log = {"probabilities": "softmax_tensor"} + logging_hook = tf.train.LoggingTensorHook( + tensors=tensors_to_log, every_n_iter=500) + + # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from + # rank 0 to all other processes. This is necessary to ensure consistent + # initialization of all workers when training is started with random weights or + # restored from a checkpoint. + bcast_hook = hvd.BroadcastGlobalVariablesHook(0) + + # Train the model + train_input_fn = tf.estimator.inputs.numpy_input_fn( + x={"x": train_data}, + y=train_labels, + batch_size=100, + num_epochs=None, + shuffle=True) + + # Horovod: adjust number of steps based on number of GPUs. 
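+    # Every worker runs its own copy of the input pipeline, so dividing the
+    # step count by hvd.size() keeps the total number of batches processed
+    # across all workers roughly constant as more GPUs are added.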
+ mnist_classifier.train( + input_fn=train_input_fn, + steps=500 // hvd.size(), + hooks=[logging_hook, bcast_hook]) + + # Evaluate the model and print results + eval_input_fn = tf.estimator.inputs.numpy_input_fn( + x={"x": eval_data}, + y=eval_labels, + num_epochs=1, + shuffle=False) + eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) + print(eval_results) + + # [HPCNS] Remove the copied dataset + os.remove(dataset_for_rank) + + +if __name__ == "__main__": + tf.app.run() diff --git a/horovod/tensorflow/run_on_localMachine.sh b/horovod/tensorflow/run_on_localMachine.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c9afb4b58ee9f4a42480997dd298b6e33c71a35 --- /dev/null +++ b/horovod/tensorflow/run_on_localMachine.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +# Run the program +mpirun -np 1 -H localhost:1 \ + -bind-to none -map-by slot \ + -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ + -mca pml ob1 -mca btl ^openib \ + python -u mnist.py diff --git a/horovod/tensorflow/submit_job_jureca_python2.sh b/horovod/tensorflow/submit_job_jureca_python2.sh new file mode 100755 index 0000000000000000000000000000000000000000..c9a386afeee610280ab4a6c51610f4261ec0ea11 --- /dev/null +++ b/horovod/tensorflow/submit_job_jureca_python2.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=2 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=HOROVOD_TFLOW_MNIST +#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load MVAPICH2/2.3-GDR +module load TensorFlow/1.12.0-GPU-Python-2.7.15 +module load Horovod/0.15.2-GPU-Python-2.7.15 + +# Run the program +srun python -u mnist.py diff --git a/horovod/tensorflow/submit_job_jureca_python3.sh b/horovod/tensorflow/submit_job_jureca_python3.sh new file mode 100755 index 0000000000000000000000000000000000000000..60122fe53c43e4635a16fdf626d5cb183a73d52d --- /dev/null +++ b/horovod/tensorflow/submit_job_jureca_python3.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=2 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=HOROVOD_TFLOW_MNIST +#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load MVAPICH2/2.3-GDR +module load TensorFlow/1.12.0-GPU-Python-3.6.6 +module load Horovod/0.15.2-GPU-Python-3.6.6 + +# Run the program +srun python -u mnist.py diff --git a/horovod/tensorflow/submit_job_juron_python2.sh b/horovod/tensorflow/submit_job_juron_python2.sh new file mode 100644 index 0000000000000000000000000000000000000000..85b2ee684ae7732ea48124530e5f3c4416eea69c --- /dev/null +++ b/horovod/tensorflow/submit_job_juron_python2.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 4 +#BSUB -R "span[ptile=2]" +#BSUB -gpu "num=2" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J HOROVOD_TFLOW_MNIST + +# Load the required modules +module load python/2.7.14 +module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 +module load horovod/0.15.2 + +# Run the program +mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x 
LD_LIBRARY_PATH \ + -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/tensorflow/submit_job_juron_python3.sh b/horovod/tensorflow/submit_job_juron_python3.sh new file mode 100644 index 0000000000000000000000000000000000000000..01075474bae35cafb29c70239f29214de904a6ca --- /dev/null +++ b/horovod/tensorflow/submit_job_juron_python3.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 4 +#BSUB -R "span[ptile=2]" +#BSUB -gpu "num=2" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J HOROVOD_TFLOW_MNIST + +# Load the required modules +module load python/3.6.1 +module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 +module load horovod/0.15.2 + +# Run the program +mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \ + -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/tensorflow/synthetic_benchmark.py b/horovod/tensorflow/synthetic_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..abbdd20fdb933dbde47f7d92f644da2454dbd8e7 --- /dev/null +++ b/horovod/tensorflow/synthetic_benchmark.py @@ -0,0 +1,120 @@ +from __future__ import absolute_import, division, print_function + +import argparse +import os +import numpy as np +import timeit + +import tensorflow as tf +import horovod.tensorflow as hvd +from tensorflow.keras import applications + +# Benchmark settings +parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') + +parser.add_argument('--model', type=str, default='ResNet50', + help='model to benchmark') +parser.add_argument('--batch-size', type=int, default=32, + help='input batch size') + +parser.add_argument('--num-warmup-batches', type=int, default=10, + help='number of warm-up batches that don\'t count towards benchmark') +parser.add_argument('--num-batches-per-iter', type=int, default=10, + help='number of batches per benchmark iteration') +parser.add_argument('--num-iters', type=int, default=10, + help='number of benchmark iterations') + +parser.add_argument('--eager', action='store_true', default=False, + help='enables eager execution') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + +args = parser.parse_args() +args.cuda = not args.no_cuda + +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +config = tf.ConfigProto() +if args.cuda: + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = str(hvd.local_rank()) +else: + os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + config.gpu_options.allow_growth = False + config.gpu_options.visible_device_list = '' + +if args.eager: + tf.enable_eager_execution(config) + +# Set up standard model. +model = getattr(applications, args.model)(weights=None) + +opt = tf.train.GradientDescentOptimizer(0.01) + +# Horovod: (optional) compression algorithm. +compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none + +# Horovod: wrap optimizer with DistributedOptimizer. 
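+# With --fp16-allreduce the gradient tensors are cast to float16 for the
+# allreduce, roughly halving communication volume at a small cost in
+# numerical precision.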
+opt = hvd.DistributedOptimizer(opt, compression=compression) + +init = tf.global_variables_initializer() +bcast_op = hvd.broadcast_global_variables(0) + +data = tf.random_uniform([args.batch_size, 224, 224, 3]) +target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) + + +def loss_function(): + logits = model(data, training=True) + return tf.losses.sparse_softmax_cross_entropy(target, logits) + + +def log(s, nl=True): + if hvd.rank() != 0: + return + print(s, end='\n' if nl else '') + + +log('Model: %s' % args.model) +log('Batch size: %d' % args.batch_size) +device = 'GPU' if args.cuda else 'CPU' +log('Number of %ss: %d' % (device, hvd.size())) + + +def run(benchmark_step): + # Warm-up + log('Running warmup...') + timeit.timeit(benchmark_step, number=args.num_warmup_batches) + + # Benchmark + log('Running benchmark...') + img_secs = [] + for x in range(args.num_iters): + time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) + img_sec = args.batch_size * args.num_batches_per_iter / time + log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) + img_secs.append(img_sec) + + # Results + img_sec_mean = np.mean(img_secs) + img_sec_conf = 1.96 * np.std(img_secs) + log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) + log('Total img/sec on %d %s(s): %.1f +-%.1f' % + (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) + + +if tf.executing_eagerly(): + with tf.device(device): + run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables)) +else: + with tf.Session(config=config) as session: + init.run() + bcast_op.run() + + loss = loss_function() + train_opt = opt.minimize(loss) + run(lambda: session.run(train_opt)) diff --git a/keras/README.md b/keras/README.md new file mode 100644 index 0000000000000000000000000000000000000000..598f4e1f95aca48216c4d10b1e48c18ef7466363 --- /dev/null +++ b/keras/README.md @@ -0,0 +1,13 @@ +# Notes + +The `mnist.py` sample is a slightly modified version of `mnist_cnn.py` +available in the Keras examples repository +[here](https://github.com/keras-team/keras/tree/master/examples) +(last checked: February 19, 2019). Our changes are +limited to, + +* The data loading mechanism +* A bit of code cleanup +* A few additional comments pertaining to our custom data loading mechanism + +**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. \ No newline at end of file diff --git a/keras/mnist.py b/keras/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..c1831694e02ccc3d1546fb8955f5798474c870e6 --- /dev/null +++ b/keras/mnist.py @@ -0,0 +1,87 @@ +"""Trains a simple convnet on the MNIST dataset. + +Gets to 99.25% test accuracy after 12 epochs +(there is still a lot of margin for parameter tuning). +16 seconds per epoch on a GRID K520 GPU. +""" + +from __future__ import print_function +import os +import sys +import keras +from keras.datasets import mnist +from keras.models import Sequential +from keras.layers import Dense, Dropout, Flatten +from keras.layers import Conv2D, MaxPooling2D +from keras import backend as K + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. 
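+# Note: the relative path below assumes the script is launched from within
+# the 'keras' directory, as the accompanying job scripts do.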
+sys.path.insert(0, '../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +batch_size = 128 +num_classes = 10 +epochs = 12 + +# input image dimensions +img_rows, img_cols = 28, 28 + +# [HPCNS] Load MNIST dataset +# the data, split between train and test sets +(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) + +if K.image_data_format() == 'channels_first': + x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) + x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) + input_shape = (1, img_rows, img_cols) +else: + x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) + x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) + input_shape = (img_rows, img_cols, 1) + +x_train = x_train.astype('float32') +x_test = x_test.astype('float32') +x_train /= 255 +x_test /= 255 +print('x_train shape:', x_train.shape) +print(x_train.shape[0], 'train samples') +print(x_test.shape[0], 'test samples') + +# convert class vectors to binary class matrices +y_train = keras.utils.to_categorical(y_train, num_classes) +y_test = keras.utils.to_categorical(y_test, num_classes) + +model = Sequential() +model.add(Conv2D(32, kernel_size=(3, 3), + activation='relu', + input_shape=input_shape)) +model.add(Conv2D(64, (3, 3), activation='relu')) +model.add(MaxPooling2D(pool_size=(2, 2))) +model.add(Dropout(0.25)) +model.add(Flatten()) +model.add(Dense(128, activation='relu')) +model.add(Dropout(0.5)) +model.add(Dense(num_classes, activation='softmax')) + +model.compile(loss=keras.losses.categorical_crossentropy, + optimizer=keras.optimizers.Adadelta(), + metrics=['accuracy']) + +model.fit(x_train, y_train, + batch_size=batch_size, + epochs=epochs, + verbose=1, + validation_data=(x_test, y_test)) +score = model.evaluate(x_test, y_test, verbose=0) +print('Test loss:', score[0]) +print('Test accuracy:', score[1]) diff --git a/keras/run_on_localMachine.sh b/keras/run_on_localMachine.sh new file mode 100644 index 0000000000000000000000000000000000000000..9dade0afcb3dbdad0e3570d1643511cc4bf206bb --- /dev/null +++ b/keras/run_on_localMachine.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +# Run the program +python -u mnist_cnn.py diff --git a/keras/submit_job_jureca_python2.sh b/keras/submit_job_jureca_python2.sh new file mode 100755 index 0000000000000000000000000000000000000000..59cfe31442312248018eedae4fb7ec7f14655875 --- /dev/null +++ b/keras/submit_job_jureca_python2.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=KERAS_MNIST_CNN +#SBATCH --gres=gpu:1 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load TensorFlow/1.12.0-GPU-Python-2.7.15 +module load Keras/2.2.4-GPU-Python-2.7.15 + +# Run the program +srun python -u mnist.py diff --git a/keras/submit_job_jureca_python3.sh b/keras/submit_job_jureca_python3.sh new file mode 100755 index 0000000000000000000000000000000000000000..5057614a50135d9693248abe7ff7a70d44131d6b --- /dev/null +++ 
b/keras/submit_job_jureca_python3.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=KERAS_MNIST_CNN +#SBATCH --gres=gpu:1 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load TensorFlow/1.12.0-GPU-Python-3.6.6 +module load Keras/2.2.4-GPU-Python-3.6.6 + +# Run the program +srun python -u mnist.py diff --git a/keras/submit_job_juron_python2.sh b/keras/submit_job_juron_python2.sh new file mode 100644 index 0000000000000000000000000000000000000000..91ae8c778668e2dd852fd75d59f00ad14d1a78d0 --- /dev/null +++ b/keras/submit_job_juron_python2.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 1 +#BSUB -R "span[ptile=1]" +#BSUB -gpu "num=1" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J KERAS_MNIST_CNN + +# Load the required modules +module load python/2.7.14 +module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 +module load keras/2.2.4 + +# Run the program +python -u mnist.py diff --git a/keras/submit_job_juron_python3.sh b/keras/submit_job_juron_python3.sh new file mode 100644 index 0000000000000000000000000000000000000000..7927b03679f2f4b515c90bcbc564447a23433e08 --- /dev/null +++ b/keras/submit_job_juron_python3.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 1 +#BSUB -R "span[ptile=1]" +#BSUB -gpu "num=1" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J KERAS_MNIST_CNN + +# Load the required modules +module load python/3.6.1 +module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 +module load keras/2.2.4 + +# Run the program +python -u mnist.py diff --git a/pytorch/README.md b/pytorch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ac1ac2f2d168d7843d479c27a82d288faf10176a --- /dev/null +++ b/pytorch/README.md @@ -0,0 +1,13 @@ +# Notes + +The `mnist.py` sample is a slightly modified version of `main.py` +available in the PyTorch examples repository +[here](https://github.com/pytorch/examples/tree/master/mnist) +(last checked: February 19, 2019). Our changes are +limited to, + +* The data loading mechanism +* A bit of code cleanup +* A few additional comments pertaining to our custom data loading mechanism + +**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. \ No newline at end of file diff --git a/pytorch/mnist.py b/pytorch/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..d4092b614e9cc2045952884199c63eafef5f7e5b --- /dev/null +++ b/pytorch/mnist.py @@ -0,0 +1,151 @@ +from __future__ import print_function + +import os +import sys +import shutil +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. 
+sys.path.insert(0, '../utils') +from data_utils import DataValidator + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 20, 5, 1) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + self.fc1 = nn.Linear(4 * 4 * 50, 500) + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = F.max_pool2d(x, 2, 2) + x = F.relu(self.conv2(x)) + x = F.max_pool2d(x, 2, 2) + x = x.view(-1, 4 * 4 * 50) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + +def train(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. * batch_idx / len(train_loader), loss.item())) + + +def test(args, model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss + pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), + 100. * correct / len(test_loader.dataset))) + + +def main(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') + + parser.add_argument('--save-model', action='store_true', default=False, + help='For Saving the current Model') + args = parser.parse_args() + use_cuda = not args.no_cuda and torch.cuda.is_available() + + torch.manual_seed(args.seed) + + device = torch.device("cuda" if use_cuda else "cpu") + + # [HPCNS] Name of the dataset file + data_file = 'mnist/pytorch/data' + + # [HPCNS] Path to the directory containing the dataset file + data_dir = DataValidator.validated_data_dir(data_file) + + # [HPCNS] Fully qualified dataset file name + dataset_file = os.path.join(data_dir, data_file) + + # [HPCNS] A copy of the dataset in the current directory + dataset_copy = 'MNIST-data' + + # [HPCNS] If the path already exists, remove it + if 
os.path.exists(dataset_copy): + shutil.rmtree(dataset_copy) + + # [HPCNS] Make a copy of the dataset, as the torch data loader used + # below expects the dataset in the current directory + shutil.copytree(dataset_file, dataset_copy) + + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + train_loader = torch.utils.data.DataLoader( + datasets.MNIST(dataset_copy, train=True, download=False, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.batch_size, shuffle=True, **kwargs) + test_loader = torch.utils.data.DataLoader( + datasets.MNIST(dataset_copy, train=False, download=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.test_batch_size, shuffle=True, **kwargs) + + model = Net().to(device) + optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) + + for epoch in range(1, args.epochs + 1): + train(args, model, device, train_loader, optimizer, epoch) + test(args, model, device, test_loader) + + if (args.save_model): + torch.save(model.state_dict(), "mnist_cnn.pt") + + # [HPCNS] Remove the copied dataset + shutil.rmtree(dataset_copy) + + +if __name__ == '__main__': + main() diff --git a/pytorch/run_on_localMachine.sh b/pytorch/run_on_localMachine.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c5737c9fc9d6bca93e25fca9f785e52320131fc --- /dev/null +++ b/pytorch/run_on_localMachine.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +# Run the program +python -u mnist.py \ No newline at end of file diff --git a/pytorch/submit_job_jureca_python2.sh b/pytorch/submit_job_jureca_python2.sh new file mode 100755 index 0000000000000000000000000000000000000000..f757354a7784027bb813d98bef11ce4002a5480a --- /dev/null +++ b/pytorch/submit_job_jureca_python2.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=PYTORCH_MNIST +#SBATCH --gres=gpu:1 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load MVAPICH2/2.3-GDR +module load PyTorch/1.0.0-GPU-Python-2.7.15 +module load torchvision/0.2.1-GPU-Python-2.7.15 + +# Run the program +srun python -u mnist.py diff --git a/pytorch/submit_job_jureca_python3.sh b/pytorch/submit_job_jureca_python3.sh new file mode 100755 index 0000000000000000000000000000000000000000..0f66a30a0c87d45da544ed74025df0ee428933f7 --- /dev/null +++ b/pytorch/submit_job_jureca_python3.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=PYTORCH_MNIST +#SBATCH --gres=gpu:1 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load MVAPICH2/2.3-GDR +module load PyTorch/1.0.0-GPU-Python-3.6.6 +module load torchvision/0.2.1-GPU-Python-3.6.6 + +# Run the program +srun python -u mnist.py diff --git a/pytorch/submit_job_juron_python3.sh b/pytorch/submit_job_juron_python3.sh new file mode 100644 index 
0000000000000000000000000000000000000000..061139f19cf8f9cdc03e8d4ced3d1c15f66ae49c --- /dev/null +++ b/pytorch/submit_job_juron_python3.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 1 +#BSUB -R "span[ptile=1]" +#BSUB -gpu "num=1" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J PYTORCH_MNIST + +# Load the required modules +module load python/3.6.1 +module load pytorch/1.0.1-gcc_5.4.0-cuda_10.0.130 +module load torchvision/0.2.1 + +# Run the program +python -u mnist.py \ No newline at end of file diff --git a/tensorflow/README.md b/tensorflow/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cbf485424ae35ac8a1e8fcdd4650ffa8a08114df --- /dev/null +++ b/tensorflow/README.md @@ -0,0 +1,13 @@ +# Notes + +The `mnist.py` sample is a slightly modified version of `convolutional.py` +available in the Tensorflow models repository +[here](https://github.com/tensorflow/models/blob/master/tutorials/image/mnist) +(last checked: February 19, 2019). Our changes are +limited to, + +* The data loading mechanism +* A bit of code cleanup +* A few additional comments pertaining to our custom data loading mechanism + +**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. \ No newline at end of file diff --git a/tensorflow/mnist.py b/tensorflow/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..7ba4bdc5fb1b25bc0744308a26ad22856f729c26 --- /dev/null +++ b/tensorflow/mnist.py @@ -0,0 +1,338 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Simple, end-to-end, LeNet-5-like convolutional MNIST model example. + +This should achieve a test error of 0.7%. Please keep this model as simple and +linear as possible, it is meant as a tutorial for simple convolutional models. +Run with --self_test on the command line to execute a short self-test. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import gzip +import os +import sys +import time + +import numpy +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../utils') +from data_utils import DataValidator + +IMAGE_SIZE = 28 +NUM_CHANNELS = 1 +PIXEL_DEPTH = 255 +NUM_LABELS = 10 +VALIDATION_SIZE = 5000 # Size of the validation set. +SEED = 66478 # Set to None for random seed. +BATCH_SIZE = 64 +NUM_EPOCHS = 10 +EVAL_BATCH_SIZE = 64 +EVAL_FREQUENCY = 100 # Number of steps between evaluations. 
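+# SEED fixes the weight initialisation for reproducibility, and
+# VALIDATION_SIZE examples are split off the training set further below.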
+ +FLAGS = None + + +def data_type(): + """Return the type of the activations, weights, and placeholder variables.""" + if FLAGS.use_fp16: + return tf.float16 + else: + return tf.float32 + + +def extract_data(filename, num_images): + """Extract the images into a 4D tensor [image index, y, x, channels]. + + Values are rescaled from [0, 255] down to [-0.5, 0.5]. + """ + print('Extracting', filename) + with gzip.open(filename) as bytestream: + bytestream.read(16) + buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS) + data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32) + data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH + data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS) + return data + + +def extract_labels(filename, num_images): + """Extract the labels into a vector of int64 label IDs.""" + print('Extracting', filename) + with gzip.open(filename) as bytestream: + bytestream.read(8) + buf = bytestream.read(1 * num_images) + labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.int64) + return labels + + +def fake_data(num_images): + """Generate a fake dataset that matches the dimensions of MNIST.""" + data = numpy.ndarray( + shape=(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS), + dtype=numpy.float32) + labels = numpy.zeros(shape=(num_images,), dtype=numpy.int64) + for image in xrange(num_images): + label = image % 2 + data[image, :, :, 0] = label - 0.5 + labels[image] = label + return data, labels + + +def error_rate(predictions, labels): + """Return the error rate based on dense predictions and sparse labels.""" + return 100.0 - ( + 100.0 * + numpy.sum(numpy.argmax(predictions, 1) == labels) / + predictions.shape[0]) + + +def main(_): + if FLAGS.self_test: + print('Running self-test.') + train_data, train_labels = fake_data(256) + validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE) + test_data, test_labels = fake_data(EVAL_BATCH_SIZE) + num_epochs = 1 + else: + # [HPCNS]: Data files relative to the 'datasets' directory + train_data_filename = 'mnist/raw/train-images-idx3-ubyte.gz' + train_labels_filename = 'mnist/raw/train-labels-idx1-ubyte.gz' + test_data_filename = 'mnist/raw/t10k-images-idx3-ubyte.gz' + test_labels_filename = 'mnist/raw/t10k-labels-idx1-ubyte.gz' + + # [HPCNS]: Update data file information with validated and fully qualified filenames + train_data_filename = os.path.join( + DataValidator.validated_data_dir(train_data_filename), train_data_filename) + train_labels_filename = os.path.join( + DataValidator.validated_data_dir(train_labels_filename), train_labels_filename) + test_data_filename = os.path.join( + DataValidator.validated_data_dir(test_data_filename), test_data_filename) + test_labels_filename = os.path.join( + DataValidator.validated_data_dir(test_labels_filename), test_labels_filename) + + # Extract it into numpy arrays. + train_data = extract_data(train_data_filename, 60000) + train_labels = extract_labels(train_labels_filename, 60000) + test_data = extract_data(test_data_filename, 10000) + test_labels = extract_labels(test_labels_filename, 10000) + + # Generate a validation set. + validation_data = train_data[:VALIDATION_SIZE, ...] + validation_labels = train_labels[:VALIDATION_SIZE] + train_data = train_data[VALIDATION_SIZE:, ...] + train_labels = train_labels[VALIDATION_SIZE:] + num_epochs = NUM_EPOCHS + + train_size = train_labels.shape[0] + + # This is where training samples and labels are fed to the graph. 
+ # These placeholder nodes will be fed a batch of training data at each + # training step using the {feed_dict} argument to the Run() call below. + train_data_node = tf.placeholder( + data_type(), + shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) + train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,)) + eval_data = tf.placeholder( + data_type(), + shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) + + # The variables below hold all the trainable weights. They are passed an + # initial value which will be assigned when we call: + # {tf.global_variables_initializer().run()} + conv1_weights = tf.Variable( + tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32. + stddev=0.1, + seed=SEED, dtype=data_type())) + conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) + conv2_weights = tf.Variable(tf.truncated_normal( + [5, 5, 32, 64], stddev=0.1, + seed=SEED, dtype=data_type())) + conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type())) + fc1_weights = tf.Variable( # fully connected, depth 512. + tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], + stddev=0.1, + seed=SEED, + dtype=data_type())) + fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type())) + fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], + stddev=0.1, + seed=SEED, + dtype=data_type())) + fc2_biases = tf.Variable(tf.constant( + 0.1, shape=[NUM_LABELS], dtype=data_type())) + + # We will replicate the model structure for the training subgraph, as well + # as the evaluation subgraphs, while sharing the trainable parameters. + def model(data, train=False): + """The Model definition.""" + # 2D convolution, with 'SAME' padding (i.e. the output feature map has + # the same size as the input). Note that {strides} is a 4D array whose + # shape matches the data layout: [image index, y, x, depth]. + conv = tf.nn.conv2d(data, + conv1_weights, + strides=[1, 1, 1, 1], + padding='SAME') + # Bias and rectified linear non-linearity. + relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases)) + # Max pooling. The kernel size spec {ksize} also follows the layout of + # the data. Here we have a pooling window of 2, and a stride of 2. + pool = tf.nn.max_pool(relu, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME') + conv = tf.nn.conv2d(pool, + conv2_weights, + strides=[1, 1, 1, 1], + padding='SAME') + relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases)) + pool = tf.nn.max_pool(relu, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME') + # Reshape the feature map cuboid into a 2D matrix to feed it to the + # fully connected layers. + pool_shape = pool.get_shape().as_list() + reshape = tf.reshape( + pool, + [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]]) + # Fully connected layer. Note that the '+' operation automatically + # broadcasts the biases. + hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases) + # Add a 50% dropout during training only. Dropout also scales + # activations such that no rescaling is needed at evaluation time. + if train: + hidden = tf.nn.dropout(hidden, 0.5, seed=SEED) + return tf.matmul(hidden, fc2_weights) + fc2_biases + + # Training computation: logits + cross-entropy loss. + logits = model(train_data_node, True) + loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=train_labels_node, logits=logits)) + + # L2 regularization for the fully connected parameters. 
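+  # Only the dense-layer weights and biases are penalised; the convolutional
+  # filters are left unregularised, and the 5e-4 factor below sets the weight
+  # of the penalty relative to the cross-entropy loss.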
+ regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) + + tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases)) + # Add the regularization term to the loss. + loss += 5e-4 * regularizers + + # Optimizer: set up a variable that's incremented once per batch and + # controls the learning rate decay. + batch = tf.Variable(0, dtype=data_type()) + # Decay once per epoch, using an exponential schedule starting at 0.01. + learning_rate = tf.train.exponential_decay( + 0.01, # Base learning rate. + batch * BATCH_SIZE, # Current index into the dataset. + train_size, # Decay step. + 0.95, # Decay rate. + staircase=True) + # Use simple momentum for the optimization. + optimizer = tf.train.MomentumOptimizer(learning_rate, + 0.9).minimize(loss, + global_step=batch) + + # Predictions for the current training minibatch. + train_prediction = tf.nn.softmax(logits) + + # Predictions for the test and validation, which we'll compute less often. + eval_prediction = tf.nn.softmax(model(eval_data)) + + # Small utility function to evaluate a dataset by feeding batches of data to + # {eval_data} and pulling the results from {eval_predictions}. + # Saves memory and enables this to run on smaller GPUs. + def eval_in_batches(data, sess): + """Get all predictions for a dataset by running it in small batches.""" + size = data.shape[0] + if size < EVAL_BATCH_SIZE: + raise ValueError("batch size for evals larger than dataset: %d" % size) + predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32) + for begin in xrange(0, size, EVAL_BATCH_SIZE): + end = begin + EVAL_BATCH_SIZE + if end <= size: + predictions[begin:end, :] = sess.run( + eval_prediction, + feed_dict={eval_data: data[begin:end, ...]}) + else: + batch_predictions = sess.run( + eval_prediction, + feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]}) + predictions[begin:, :] = batch_predictions[begin - size:, :] + return predictions + + # Create a local session to run the training. + start_time = time.time() + with tf.Session() as sess: + # Run all the initializers to prepare the trainable parameters. + tf.global_variables_initializer().run() + print('Initialized!') + # Loop through training steps. + for step in xrange(int(num_epochs * train_size) // BATCH_SIZE): + # Compute the offset of the current minibatch in the data. + # Note that we could use better randomization across epochs. + offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE) + batch_data = train_data[offset:(offset + BATCH_SIZE), ...] + batch_labels = train_labels[offset:(offset + BATCH_SIZE)] + # This dictionary maps the batch data (as a numpy array) to the + # node in the graph it should be fed to. + feed_dict = {train_data_node: batch_data, + train_labels_node: batch_labels} + # Run the optimizer to update weights. 
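+      # A single run of the optimizer op performs the forward pass, the
+      # backward pass and the momentum update; loss and predictions are
+      # fetched separately only every EVAL_FREQUENCY steps below.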
+ sess.run(optimizer, feed_dict=feed_dict) + # print some extra information once reach the evaluation frequency + if step % EVAL_FREQUENCY == 0: + # fetch some extra nodes' data + l, lr, predictions = sess.run([loss, learning_rate, train_prediction], + feed_dict=feed_dict) + elapsed_time = time.time() - start_time + start_time = time.time() + print('Step %d (epoch %.2f), %.1f ms' % + (step, float(step) * BATCH_SIZE / train_size, + 1000 * elapsed_time / EVAL_FREQUENCY)) + print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr)) + print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels)) + print('Validation error: %.1f%%' % error_rate( + eval_in_batches(validation_data, sess), validation_labels)) + sys.stdout.flush() + # Finally print the result! + test_error = error_rate(eval_in_batches(test_data, sess), test_labels) + print('Test error: %.1f%%' % test_error) + if FLAGS.self_test: + print('test_error', test_error) + assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % ( + test_error,) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--use_fp16', + default=False, + help='Use half floats instead of full floats if True.', + action='store_true') + parser.add_argument( + '--self_test', + default=False, + action='store_true', + help='True if running a self test.') + + FLAGS, unparsed = parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/run_on_localMachine.sh b/tensorflow/run_on_localMachine.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c5737c9fc9d6bca93e25fca9f785e52320131fc --- /dev/null +++ b/tensorflow/run_on_localMachine.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +# Run the program +python -u mnist.py \ No newline at end of file diff --git a/tensorflow/submit_job_jureca_python2.sh b/tensorflow/submit_job_jureca_python2.sh new file mode 100755 index 0000000000000000000000000000000000000000..6672f8c3d71e4a80774914590cbde11325459273 --- /dev/null +++ b/tensorflow/submit_job_jureca_python2.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TENSORFLOW_MNIST +#SBATCH --gres=gpu:1 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load TensorFlow/1.12.0-GPU-Python-2.7.15 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/submit_job_jureca_python3.sh b/tensorflow/submit_job_jureca_python3.sh new file mode 100755 index 0000000000000000000000000000000000000000..c0831c9213d937a41e6d22dc4a0b5c8e07b2e745 --- /dev/null +++ b/tensorflow/submit_job_jureca_python3.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TENSORFLOW_MNIST +#SBATCH --gres=gpu:1 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module use /usr/local/software/jureca/OtherStages +module load Stages/Devel-2018b +module load GCC/7.3.0 +module load TensorFlow/1.12.0-GPU-Python-3.6.6 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/submit_job_juron_python2.sh b/tensorflow/submit_job_juron_python2.sh new 
file mode 100644 index 0000000000000000000000000000000000000000..6270cd2bbe665e5d405f36d96a8cf22ca62f07d6 --- /dev/null +++ b/tensorflow/submit_job_juron_python2.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 1 +#BSUB -R "span[ptile=1]" +#BSUB -gpu "num=1" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J TENSORFLOW_MNIST + +# Load the required modules +module load python/2.7.14 +module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 + +# Run the program +python -u mnist.py diff --git a/tensorflow/submit_job_juron_python3.sh b/tensorflow/submit_job_juron_python3.sh new file mode 100644 index 0000000000000000000000000000000000000000..30fa2043f2059fc8d4d6ac673f52ba0bebb3ac2d --- /dev/null +++ b/tensorflow/submit_job_juron_python3.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +#BSUB -q normal +#BSUB -W 10 +#BSUB -n 1 +#BSUB -R "span[ptile=1]" +#BSUB -gpu "num=1" +#BSUB -e "error.%J.er" +#BSUB -o "output_%J.out" +#BSUB -J TENSORFLOW_MNIST + +# Load the required modules +module load python/3.6.1 +module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 + +# Run the program +python -u mnist.py diff --git a/utils/data_utils.py b/utils/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..df15e9aa067b6e3b1ae20047ebd651c1cbd4c533 --- /dev/null +++ b/utils/data_utils.py @@ -0,0 +1,63 @@ +""" + A collections of utilities for data manipulation. + + It was created to simplify the process of working with pre-downloaded + datasets. + +""" + +import os + + +class DataValidator: + """ + This class provides functions for validation of input data. + + """ + + def __init__(self): + pass + + @staticmethod + def validated_data_dir(filename): + """ + Checks if the given 'filename' exists, and is available in any of the + recognized input data directory locations. If the check is passed, + returns the fully qualified path to the input data directory. + + Parameters + ---------- + filename: + Name of the data file to be checked + + Returns + ------- + string: + Fully qualified path to the input data directory + + """ + + # Check the environment variable + if 'DL_TEST_DATA_HOME' in os.environ: + # Read the data directory path from the environment variable + data_dir = os.environ.get('DL_TEST_DATA_HOME') + else: + # Set path to the 'datasets' directory in the project root + data_dir = os.path.join(os.path.abspath('../datasets')) + + # We are two levels deep when executing Horovod samples + if not os.path.exists(data_dir): + data_dir = os.path.join(os.path.abspath('../../datasets')) + + print('Using %s as the data directory.' % data_dir) + + # Check if the directory exists + assert os.path.exists(data_dir), \ + data_dir + ' refers to a non-existing directory. '\ + 'Please either correctly set the DL_TEST_DATA_HOME environment variable, ' \ + 'or make sure the datasets are available in the project root.' + + assert os.path.exists(os.path.join(data_dir, filename)), \ + 'Unable to locate ' + filename + ' in ' + data_dir + + return data_dir
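+
+# A minimal usage sketch (mirrors what the samples above do):
+#
+#   data_dir = DataValidator.validated_data_dir('mnist/keras/mnist.npz')
+#   dataset_file = os.path.join(data_dir, 'mnist/keras/mnist.npz')
+#
+# The lookup can be redirected by exporting DL_TEST_DATA_HOME to a directory
+# that contains the same 'mnist/...' sub-tree as the bundled 'datasets'
+# directory.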