diff --git a/.gitattributes b/.gitattributes index dbf6f0e70d6c4d59aec7f4640ea71f42aeafb226..36df28f8653e70761509bd5349b120c86ffa9a32 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,7 +1,3 @@ -datasets/mnist/caffe/mnist_test_lmdb/data.mdb filter=lfs diff=lfs merge=lfs -text -datasets/mnist/caffe/mnist_test_lmdb/lock.mdb filter=lfs diff=lfs merge=lfs -text -datasets/mnist/caffe/mnist_train_lmdb/data.mdb filter=lfs diff=lfs merge=lfs -text -datasets/mnist/caffe/mnist_train_lmdb/lock.mdb filter=lfs diff=lfs merge=lfs -text datasets/mnist/keras/mnist.npz filter=lfs diff=lfs merge=lfs -text datasets/mnist/pytorch/data/processed/training.pt filter=lfs diff=lfs merge=lfs -text datasets/mnist/pytorch/data/processed/test.pt filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 9c4d6d54d73dda0e2cb980abdc7a3ae2181155cb..05043c35182c436015ec43525fd8f994c38439fb 100644 --- a/.gitignore +++ b/.gitignore @@ -118,3 +118,6 @@ mnist_convnet_model/ # Error and output files from the supercomputers *.er *.out + +# MacOS +.DS_Store \ No newline at end of file diff --git a/NOTICE b/NOTICE index 22a9d695eaaf27a76217db1873fd8544854f4451..11aba545ddd25a22fcc79f159124998bf12589dd 100644 --- a/NOTICE +++ b/NOTICE @@ -18,7 +18,7 @@ limitations under the License. Tensorflow -Copyright 2016 The TensorFlow Authors. All rights reserved. +Copyright 2019 The TensorFlow Authors. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -34,38 +34,16 @@ limitations under the License. Keras -All contributions by François Chollet: -Copyright (c) 2015 - 2019, François Chollet. -All rights reserved. +Copyright 2015 The TensorFlow Authors. All rights reserved. -All contributions by Google: -Copyright (c) 2015 - 2019, Google, Inc. -All rights reserved. - -All contributions by Microsoft: -Copyright (c) 2017 - 2019, Microsoft, Inc. -All rights reserved. - -All other contributions: -Copyright (c) 2015 - 2019, the respective contributors. -All rights reserved. - -Licensed under The MIT License (MIT) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. + http://www.apache.org/licenses/LICENSE-2.0 -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
+Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md index 0990dfd391900b1bc3fe9fdbf4b9ba979a8bca8a..7c8641841608d025086c6c0911405f2d9dd1a834 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,46 @@ # Getting started with Deep Learning on Supercomputers -This repository is intended to serve as a tutorial for anyone interested in utilizing the supercomputers -available at the Jülich Supercomputing Center (JSC) for deep learning based projects. It is assumed that -the reader is proficient in one or more of the following frameworks: +This repository is intended to serve as a tutorial for anyone interested in +utilizing the supercomputers available at the Jülich Supercomputing Center (JSC) +for deep learning based projects. It is assumed that the reader is proficient in +the following frameworks: * [Tensorflow](https://www.tensorflow.org/) -* [Keras](https://keras.io/) * [Horovod](https://github.com/horovod/horovod) -* [Caffe](http://caffe.berkeleyvision.org/) (limited support) -**Note:** This tutorial is by no means intended as an introduction to deep learning, or to any of the -above mentioned frameworks. If you are interested in educational resources for beginners, please -visit [this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/wikis/Education) page. +**Note:** This tutorial is by no means intended as an introduction to deep +learning, or to any of the above mentioned frameworks. If you are interested in +educational resources for beginners, please visit +[this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/-/wikis/home) +page. ### Announcements -* **November 28, 2019:** Slides and code samples for the "Deep Learning on Supercomputers" talk given -as part of the [Introduction to the programming and usage of the supercomputer resources at Jülich](https://www.fz-juelich.de/SharedDocs/Termine/IAS/JSC/EN/courses/2019/supercomputer-2019-11.html?nn=944302) -course are now available in the `course_material` directory. -* **November 22, 2019:** Samples for Caffe are no longer supported on JURECA due to system-wide -MVAPICH2 module changes. -* **November 18, 2019:** The `horovod_data_distributed` directory has been added that contains code -samples to illustrate proper data-distributed training with Horovod, i.e., a distribution mechanism -where the training data is distributed instead of epochs. Further information is available in the -directory-local `README.md`. -* **September 02, 2019:** Even though PyTorch is available as a system-wide module on the JSC supercomputers, all PyTorch -examples have been removed from this tutorial. This is due to the fact that the tutorial -developers are not currently working with PyTorch, and are therefore not in a position to provide -support for PyTorch related issues. +* **April 26, 2021:** The tutorial has been updated to use Tensorflow 2. Also, + code samples and datasets that are no longer relevant, e.g., those for Caffe, + have been removed. 
+* **November 28, 2019:** Slides and code samples for the "Deep Learning on + Supercomputers" talk given as part of the [Introduction to the programming + and usage of the supercomputer resources at Jülich]( + https://www.fz-juelich.de/SharedDocs/Termine/IAS/JSC/EN/courses/2019/supercomputer-2019-11.html?nn=944302) + course are now available in the `course_material` directory. +* **November 22, 2019:** Samples for Caffe are no longer supported on JURECA + due to system-wide MVAPICH2 module changes. +* **November 18, 2019:** The `horovod_data_distributed` directory has been + added that contains code samples to illustrate proper data-distributed + training with Horovod, i.e., a distribution mechanism where the training data + is distributed instead of epochs. Further information is available in the + directory-local `README.md`. +* **September 02, 2019:** Even though PyTorch is available as a system-wide + module on the JSC supercomputers, all PyTorch examples have been removed from + this tutorial. This is due to the fact that the tutorial developers are not + currently working with PyTorch, and are therefore not in a position to + provide support for PyTorch related issues. * **August 23, 2019:** - * Tensorflow and Keras examples (with and without Horovod) are now fully functional on JUWELS as well. - * Python 2 support has been removed from the tutorial for all frameworks except Caffe. + * Tensorflow and Keras examples (with and without Horovod) are now fully + functional on JUWELS as well. + * Python 2 support has been removed from the tutorial for all frameworks + except Caffe. # Table of contents <!-- TOC --> @@ -38,133 +48,97 @@ support for PyTorch related issues. 1. [A word regarding the code samples](#1-a-word-regarding-the-code-samples) 2. [Changes made to support loading of pre-downloaded datasets](#2-changes-made-to-support-loading-of-pre-downloaded-datasets) 3. [Applying for user accounts on supercomputers](#3-applying-for-user-accounts-on-supercomputers) - * [3.1. JURECA and JUWELS](#31-jureca-and-juwels) - * [3.2. JURON](#32-juron) 4. [Logging on to the supercomputers](#4-logging-on-to-the-supercomputers) - * [4.1. JURECA and JUWELS](#41-jureca-and-juwels) - * [4.2. JURON](#42-juron) 5. [Cloning the repository](#5-cloning-the-repository) - * [5.1. JURECA and JUWELS](#51-jureca-and-juwels) - * [5.2. JURON](#52-juron) 6. [Running a sample](#6-running-a-sample) - * [6.1. JURECA and JUWELS](#61-jureca-and-juwels) - * [6.2. JURON](#62-juron) -7. [Python 2 support](#7-python-2-support) -8. [Distributed training](#8-distributed-training) -9. [Credits](#9-credits) +7. [Distributed training](#7-distributed-training) +8. [Credits](#8-credits) <!-- /TOC --> ## 1. A word regarding the code samples -Samples for each framework are available in the correspondingly named directory. Each such -directory typically contains at least one code sample, which trains a simple artificial neural -network on the canonical MNIST hand-written digit classification task. Moreover, job submission -scripts are included for all the supercomputers on which this tutorial has been tested. The job -scripts will hopefully make it easier to figure out which modules to load. Finally, -a `README.md` file contains further information about the contents of the directory. +Samples for each framework are available in the correspondingly named directory. 
+Each such directory typically contains at least one code sample, which trains a +simple artificial neural network on the canonical MNIST hand-written digit +classification task. Moreover, job submission scripts are included for all the +supercomputers on which this tutorial has been tested. The job scripts will +hopefully make it easier to figure out which modules to load. Finally, a +`README.md` file contains further information about the contents of the +directory. -**Disclaimer:** Neither are the samples intended to serve as examples of optimized code, nor do these -represent programming best practices. +**Disclaimer:** Neither are the samples intended to serve as examples of +optimized code, nor do these represent programming best practices. ## 2. Changes made to support loading of pre-downloaded datasets -It is worth mentioning that all the code samples were taken from the corresponding framework's -official samples/tutorials repository, as practitioners are likely familiar with these (links -to the original code samples are included in the directory-local `README.md`). However, the -original examples are designed to automatically download the required dataset in a -framework-defined directory. This is not a feasible option while working with supercomputers as compute nodes -do not have access to the Internet. Therefore, the samples have been slightly modified to load data from -the `datasets` directory included in this repository; specific code changes, at least for now, -have been marked by comments prefixed with the `[HPCNS]` tag. For more information see the `README.md` -available in the `datasets` directory. +It is worth mentioning that all the code samples were taken from the +corresponding framework's official samples/tutorials repository, as +practitioners are likely familiar with these (links to the original code samples +are included in the directory-local `README.md`). However, the original examples +are designed to automatically download the required dataset in a +framework-defined directory. This is not a feasible option while working with +supercomputers as compute nodes do not have access to the Internet. Therefore, +the samples have been slightly modified to load data from the `datasets` +directory included in this repository; specific code changes, at least for now, +have been marked by comments prefixed with the `[HPCNS]` tag. For more +information see the `README.md` available in the `datasets` directory. ## 3. Applying for user accounts on supercomputers -In case you do not already have an account on your supercomputer of interest, please take a look at the -instructions provided in the following sub-sections. - -### 3.1 JURECA and JUWELS - -For more information on getting accounts on JURECA and JUWELS, click -[here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/ComputingTime/computingTime_node.html). - -### 3.2 JURON - -To get a user account on JURON, please follow the steps below: - -1. Write an email to [Dirk Pleiter](http://www.fz-juelich.de/SharedDocs/Personen/IAS/JSC/EN/staff/pleiter_d.html?nn=362224), -in which please introduce yourself and mention why you need the account. -2. Apply for the account via the [JuDoor](https://dspserv.zam.kfa-juelich.de/judoor/login) portal -(more information about JuDoor is available [here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/JuDoor.html?nn=945700)). -If your work is related to the Human Brain Project (HBP), please join the `PCP0` and `CPCP0` projects. 
-Otherwise please join the `PADC` and `CPADC` projects. +In case you do not already have an account on your supercomputer of interest, +please refer to the instructions available [here]( +http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/ComputingTime/computingTime_node.html), +as you will need to apply for computing time before an account is created for you. ## 4. Logging on to the supercomputers -**Note:** From here on it is assumed that you already have an account on your required supercomputer. +**Note:** From here on it is assumed that you already have an account on your +required supercomputer. -### 4.1 JURECA and JUWELS +**Note:** This tutorial is supported for the following supercomputers: JURECA, +JUWELS, JUWELS Booster, and JUSUF. Following are the steps required to login (more information: [JURECA](https://apps.fz-juelich.de/jsc/hps/jureca/access.html#access), -[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/access.html#access)). +[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/access.html#access), +[JUSUF](https://apps.fz-juelich.de/jsc/hps/jusuf/cluster/access.html)). + +For the purpose of this tutorial, we will assume that our system of interest is +JURECA. If you intend to use a different system, you can simply replace the +system name in the commands below; the procedure is precisely the same for all +machines. -1. Use SSH to login. Use one of the following commands, depending on your target system: +1. Use SSH to login: - `ssh <username>@jureca.fz-juelich.de` or `ssh <username>@juwels.fz-juelich.de` + `ssh -i ~/.ssh/<keyfile> <username>@jureca.fz-juelich.de` 2. Upon successful login, activate your project environment: `jutil env activate -p <name of compute project> -A <name of budget>` - **Note:** To view a list of all project and budget names available to you, please use the following command: - `jutil user projects -o columns`. Each name under the column titled "project" has a corresponding type under the - column titled "project-type". All projects with "project-type" "C" are compute projects, and - can be used in the `<name of compute project>` field for the command above. The `<name of budget>` field should then - contain the corresponding name under the "budgets" column. Please click [here]( + **Note:** To view a list of all project and budget names available to you, + please use the following command: `jutil user projects -o columns`. Each + name under the column titled "project" has a corresponding type under the + column titled "project-type". All projects with "project-type" "C" are + compute projects, and can be used in the `<name of compute project>` field + for the command above. The `<name of budget>` field should then contain the + corresponding name under the "budgets" column. Please click [here]( http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/NewUsageModel_node.html) for more information. 3. Change to the project directory: `cd $PROJECT` -You should be in your project directory at this point. As the project directory is shared with other project -members, it is recommended to create a new directory with your username, and change to that directory. If -you'd like to clone this repository elsewhere, please change to the directory of your choice. - -### 4.2 JURON - -Following are the steps required to login. - -1. Use SSH to login: - - `ssh <username>@juron.fz-juelich.de` -2. 
Upon successful login, activate your project environment (more information -[here](http://www.fz-juelich.de/ias/jsc/EN/Expertise/Supercomputers/NewUsageModel/NewUsageModel_node.html)). - - `jutil env activate -p <name of compute project>` - - The `<name of compute project>` can be either `CPCP0` or `CPADC`, depending on whether you are a member - of `CPCP0` or `CPADC` (to view a list of all project names available to you, please use the following - command: `jutil user projects -o columns`). Note that as opposed to the corresponding section on JURECA, - the `<name of budget>` is not included. This is because the `CPCP0` and `CPADC` projects do not support - accounting. -3. Change to the project directory: - - `cd $PROJECT` - -You should be in your project directory at this point. As the `CPCP0` and `CPADC` project directories -are shared amongst many users from different institutes and organizations, it is recommended to create -a personal directory (named after your username) withing the project directory. You can then use your -personal directory for all your work, including cloning this tutorial. +You should be in your project directory at this point. As the project directory +is shared with other project members, it is recommended to create a new +directory with your username, and change to that directory. If you'd like to +clone this repository elsewhere, please change to the directory of your choice. ## 5. Cloning the repository -In order to store the datasets within the repository, we use Git LFS. This makes cloning the -repository a little bit different. Please find below the instructions on how to clone on different -systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-juelich.de/lfs/). - -### 5.1 JURECA and JUWELS +In order to store the datasets within the repository, we use Git LFS. This makes +cloning the repository slightly different. Please find below the instructions +on how to clone the repository. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-juelich.de/lfs/). 1. Load the Git LFS module: @@ -176,107 +150,66 @@ systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-ju `git lfs clone https://gitlab.version.fz-juelich.de/hpc4ns/dl_on_supercomputers.git` -### 5.2 JURON - -The process is simpler on JURON. You can simply clone the repository along with the datasets using -the following command: - - git lfs clone https://gitlab.version.fz-juelich.de/hpc4ns/dl_on_supercomputers.git - ## 6. Running a sample -Let us consider a scenario where you would like to run the `mnist.py` sample available in the `keras` -directory. This sample trains a CNN on MNIST using Keras on a single GPU. The following sub-sections list -the steps required for different supercomputers. - -### 6.1 JURECA and JUWELS +Let us consider a scenario where you would like to run the `keras_mnist.py` +sample available in the `tensorflow` directory. This sample trains a CNN on +MNIST using Tensorflow's Keras API. The following steps can be used to run +the sample: 1. Change directory to the repository root: `cd dl_on_supercomputers` -2. Change to the keras sub-directory: +2. Change to the tensorflow sub-directory: - `cd keras` + `cd tensorflow` 3. Submit the job to run the sample: - `sbatch submit_job_jureca.sh` or `sbatch submit_job_juwels.sh` + `sbatch jureca_job.sh` -That's it; this is all you need for job submission. 
If you'd like to receive email notifications -regarding the status of the job, add the following statement to the "SLURM job configuration" -block in the `submit_job_jureca.sh` (or `submit_job_juwels.sh`) script (replace `<your email address here>` with your -email address). +That's it; this is all you need for job submission. If you'd like to receive +email notifications regarding the status of the job, add the following statement +to the "SLURM job configuration" block in the `jureca_job.sh` script (replace +`<your email address here>` with your email address). #SBATCH --mail-user=<your email address here> -Output from the job is available in the `error` and `output` files, as specified in the job -configuration. - -**Note:** In the job submission scripts, the `--partition` value is set to `develgpus`, as jobs -are often (but not always) scheduled faster on this partition than the `gpus` partition. However, -resources in `develgpus` are limited -(as described in: [JURECA](https://apps.fz-juelich.de/jsc/hps/jureca/quickintro.html#available-partitions), -[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/quickintro.html#available-partitions)). Therefore, -it is highly recommended that users familiarize themselves with the limitations, and use the `gpus` -partition for all production use, as well as when developing/testing with more resources than are -available on the `develgpus` partition. - -### 6.2 JURON - -1. Change directory to the repository root: - - `cd dl_on_supercomputers` -2. Change to the keras sub-directory: - - `cd keras` -3. Submit the job to run the sample: - - `bsub < submit_job_juron.sh` - -Please note that unlike JURECA and JUWELS, JURON uses LSF for job submission, which is why a different -syntax is required for job configuration and submission. Moreover, email notifications are not -supported on JURON. For more information on how to use LSF on JURON, use the following command: - - man 7 juron-lsf - -Output from the job is available in the `error` and `output` files, as specified in the job -configuration. - -## 7. Python 2 support - -As the official support for Python 2 will be be discontinued in 2020, we decided to encourage our -users to make the switch to Python 3 already. This also enables us to provide better support for -Python 3 based modules, as we no longer have to spend time maintaining Python 2 modules. - -The only exception is Caffe, as on JURECA it is available with Python 2 only. Please note however that -other than on JURON, Caffe is only available in the JURECA Stage 2018b, i.e., one of the previous stages. -We do not intend to provide support for Caffe from Stage 2019a and onward. This is due to the fact that -Caffe is no longer being developed. - -## 8. Distributed training - -[Horovod](https://github.com/horovod/horovod) provides a simple and efficient solution for -training artificial neural networks on multiple GPUs across multiple nodes in a cluster. It can -be used with Tensorflow and Keras (some other frameworks are supported as well, but -not Caffe). In this repository, the `horovod` directory contains further sub-directories; one -for each compatible framework that has been tested. E.g., there is a `keras` sub-directory that -contains samples that utilize distributed training with Keras and Horovod (more information is available -in the directory-local `README.md`). - -Please note that Horovod currently only supports a distribution strategy where the entire model is -replicated on every GPU. It is the data that is distributed across the GPUs. 
If you are interested -in model-parallel training, where the model itself can be split and distributed, a different -solution is required. We hope to add a sample for model-parallel training at a later time. - -Caffe does not support multi-node training. However, it has built-in support for [multi-GPU -training](https://github.com/BVLC/caffe/blob/master/docs/multigpu.md) on a single node (only -via the C/C++ interface). The `mnist_cmd` sample in the `caffe` directory contains the job -script that can be used to train the model on multiple GPUs. Please see the -directory-local `README.md` for further information. - -## 9. Credits +Output from the job is available in the `error` and `output` files, as specified +in the job configuration. + +**Note:** The job scripts for all systems are almost exactly +the same, except for the `--partition` value. This is because partition names +vary from system to system. Nevertheless, for each system, this tutorial uses +the corresponding development partition, e.g., `dc-gpu-devel` on JURECA. This is +because jobs are often (but not always) scheduled faster on this partition than +the production partition. However, resources in the development partitions are +limited (as described in: [JURECA](https://apps.fz-juelich.de/jsc/hps/jureca/quickintro.html#available-partitions), +[JUWELS](https://apps.fz-juelich.de/jsc/hps/juwels/quickintro.html#available-partitions), +and [JUSUF](https://apps.fz-juelich.de/jsc/hps/jusuf/cluster/quickintro.html#quick-avail-partitions)). +Therefore, it is highly recommended that users familiarize themselves with the +limitations, and use the production partition for all production use, as well as +when developing/testing with more resources than are available on the +development partition. + +## 7. Distributed training + +[Horovod](https://github.com/horovod/horovod) provides a simple and efficient +solution for training artificial neural networks on multiple GPUs across +multiple nodes in a cluster. It can be used with Tensorflow (some +other frameworks are supported as well). Since this tutorial primarily concerns +distributed training, only code samples that utilize Horovod are included. + +Please note that Horovod currently only supports a distribution strategy where +the entire model is replicated on every GPU. It is the data that is distributed +across the GPUs. If you are interested in model-parallel training, where the +model itself can be split and distributed, a different solution is required. We +hope to add a sample for model-parallel training at a later time. + +## 8. Credits * **Created by:** Fahad Khalid (SLNS/HPCNS, JSC) * **Installation of modules on JURON:** Andreas Herten (HPCNS, JSC) -* **Installation of modules on JURECA:** Damian Alvarez (JSC), Rajalekshmi Deepu (SLNS/HPCNS, JSC) -* **Review/suggestions/testing:** Kai Krajsek (SLNS/HPCNS, JSC), Tabea Kirchner (SLNS/HPCNS, JSC), -Susanne Wenzel (INM-1) +* **Installation of modules on JURECA:** Damian Alvarez (JSC), Rajalekshmi + Deepu (SLNS/HPCNS, JSC) +* **Review/suggestions/testing:** Kai Krajsek (SLNS/HPCNS, JSC), Tabea + Kirchner (SLNS/HPCNS, JSC), Susanne Wenzel (INM-1) diff --git a/caffe/README.md b/caffe/README.md deleted file mode 100644 index 1804dceeab230c4754b7616d3c111da8a51873cb..0000000000000000000000000000000000000000 --- a/caffe/README.md +++ /dev/null @@ -1,43 +0,0 @@ -**Caution:** Caffe is no longer being actively developed, which is why we prefer not to support -it as a system-wide module on the supercomputers for long. 
This is why Caffe is available with -Python 2 support only on JURECA, while it is not at all supported on JUWELS. The users are advised -to switch to other frameworks such as Tensorflow/Keras and PyTorch. - -# Notes - -There are three ways in which Caffe can be used, -1. As a command line tool with only built-in layers -2. As a library from within a Python program. Either only built-in layers can be used, -or one or more custom layers can be written in Python. -3. As a command line tool with one or more custom C++ layers. - -## Caffe as a command line tool - -The `mnist_cmd` sub-directory contains configuration and job scripts for running -Caffe as a command line tool with only built-in layers. This example represents use -case 1 as described above. The `lenet_solver.prototxt` and `lenet_train_test.prototxt` -were taken from the MNIST examples directory available in the Caffe repository -[here](https://github.com/BVLC/caffe/tree/master/examples/mnist). Minor changes have -been made just so the path to the input dataset is correct. The `caffe` command -in the job submission scripts can be modified as follows to run training on -all available GPUs on the node (value for the `-gpu` option has been changed from `0` to `all`): - - caffe train --solver=lenet_solver.prototxt -gpu all - -## Using Caffe within a Python program - -The `lenet_python` sub-directory contains the required files for an example of -using Caffe as a library from within a Python program. This corresponds to use case -2 as described above. The `train_lenet.py` file contains source code adapted from -the IPython notebook `01-learning-lenet.ipynb` available in the Caffe examples -[here](https://github.com/BVLC/caffe/tree/master/examples). Running this example -results in the generation of a learning curve plot in the current directory. - -## Caffe with custom C++ layers - -Working with custom C++ layers requires recompiling Caffe with the custom code. As -this is not possible with a system-wide installation, we have decided not to -include an example of this use case. Nevertheless, if you must work with custom -C++ layers and require assistance, please send an email to the JULAIN mailing list -(more information [here](https://lists.fz-juelich.de/mailman/listinfo/ml)). 
- diff --git a/caffe/lenet_python/.submit_job_jureca_python2.sh b/caffe/lenet_python/.submit_job_jureca_python2.sh deleted file mode 100755 index 75069256157eb55f4122b0ebc2f390b925f89396..0000000000000000000000000000000000000000 --- a/caffe/lenet_python/.submit_job_jureca_python2.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=CAFFE_LENET_PYTHON -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/Devel-2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load Caffe/1.0-Python-2.7.15 - -# Run the program -srun python -u train_lenet.py diff --git a/caffe/lenet_python/lenet_auto_solver.prototxt b/caffe/lenet_python/lenet_auto_solver.prototxt deleted file mode 100644 index 44af3ad6cecd7a8090902160666e5453622f8be6..0000000000000000000000000000000000000000 --- a/caffe/lenet_python/lenet_auto_solver.prototxt +++ /dev/null @@ -1,24 +0,0 @@ -# The train/test net protocol buffer definition -train_net: "lenet_auto_train.prototxt" -test_net: "lenet_auto_test.prototxt" -# test_iter specifies how many forward passes the test should carry out. -# In the case of MNIST, we have test batch size 100 and 100 test iterations, -# covering the full 10,000 testing images. -test_iter: 100 -# Carry out testing every 500 training iterations. -test_interval: 500 -# The base learning rate, momentum and the weight decay of the network. -base_lr: 0.01 -momentum: 0.9 -weight_decay: 0.0005 -# The learning rate policy -lr_policy: "inv" -gamma: 0.0001 -power: 0.75 -# Display every 100 iterations -display: 100 -# The maximum number of iterations -max_iter: 10000 -# snapshot intermediate results -snapshot: 5000 -snapshot_prefix: "snapshots/lenet" diff --git a/caffe/lenet_python/submit_job_juron_python2.sh b/caffe/lenet_python/submit_job_juron_python2.sh deleted file mode 100755 index 2025a389b89bb90c6593b598231f14c8fb1fdcf0..0000000000000000000000000000000000000000 --- a/caffe/lenet_python/submit_job_juron_python2.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_LENET_PYTHON - -# Load the Python and Caffe modules -module load python/2.7.14 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train LeNet -python -u train_lenet.py diff --git a/caffe/lenet_python/submit_job_juron_python3.sh b/caffe/lenet_python/submit_job_juron_python3.sh deleted file mode 100755 index 7e737766bcb4ee609fdefab0d52f6adcc95e12e8..0000000000000000000000000000000000000000 --- a/caffe/lenet_python/submit_job_juron_python3.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_LENET_PYTHON - -# Load the Python and Caffe modules -module load python/3.6.1 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train LeNet -python -u train_lenet.py diff --git a/caffe/lenet_python/train_lenet.py b/caffe/lenet_python/train_lenet.py deleted file mode 100644 index ad5cae3bf4d6a7f1f9a418b802418714efb6ee67..0000000000000000000000000000000000000000 --- a/caffe/lenet_python/train_lenet.py +++ /dev/null @@ -1,107 +0,0 @@ 
-import os -import sys -import matplotlib - -# Force matplotlib to not use any Xwindows backend. -matplotlib.use('Agg') -import pylab - -import caffe -from caffe import layers as L, params as P - -# Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - - -# Prepares network specification -def lenet(lmdb, batch_size): - # Caffe's version of LeNet: a series of linear and simple nonlinear transformations - n = caffe.NetSpec() - - n.data, n.label = L.Data(batch_size=batch_size, backend=P.Data.LMDB, source=lmdb, - transform_param=dict(scale=1. / 255), ntop=2) - - n.conv1 = L.Convolution(n.data, kernel_size=5, num_output=20, weight_filler=dict(type='xavier')) - n.pool1 = L.Pooling(n.conv1, kernel_size=2, stride=2, pool=P.Pooling.MAX) - n.conv2 = L.Convolution(n.pool1, kernel_size=5, num_output=50, weight_filler=dict(type='xavier')) - n.pool2 = L.Pooling(n.conv2, kernel_size=2, stride=2, pool=P.Pooling.MAX) - n.fc1 = L.InnerProduct(n.pool2, num_output=500, weight_filler=dict(type='xavier')) - n.relu1 = L.ReLU(n.fc1, in_place=True) - n.score = L.InnerProduct(n.relu1, num_output=10, weight_filler=dict(type='xavier')) - n.loss = L.SoftmaxWithLoss(n.score, n.label) - - return n.to_proto() - - -# Names of the directories containing the LMDB files for TRAIN and TEST phases -test_dir = 'mnist/caffe/mnist_test_lmdb' -train_dir = 'mnist/caffe/mnist_train_lmdb' - -# Validated path to the data root -DataValidator.validated_data_dir(train_dir) -data_dir = DataValidator.validated_data_dir(test_dir) - -# Write the prototxt for TRAIN phase -with open('lenet_auto_train.prototxt', 'w') as f: - f.write(str(lenet(os.path.join(data_dir, train_dir), 64))) - -# Write the prototxt for TEST phase -with open('lenet_auto_test.prototxt', 'w') as f: - f.write(str(lenet(os.path.join(data_dir, test_dir), 100))) - -# Use the GPU for training -caffe.set_device(0) -caffe.set_mode_gpu() - -# Load the solver and create train and test nets -solver = None # ignore this workaround for lmdb data (can't instantiate two solvers on the same data) -solver = caffe.SGDSolver('lenet_auto_solver.prototxt') - -solver.net.forward() # train net -solver.test_nets[0].forward() # test net (there can be more than one) - -niter = 200 -test_interval = 25 -# losses will also be stored in the log -train_loss = pylab.zeros(niter) -test_acc = pylab.zeros(int(pylab.ceil(niter / test_interval))) -output = pylab.zeros((niter, 8, 10)) - -# the main solver loop -for it in range(niter): - solver.step(1) # SGD by Caffe - - # store the train loss - train_loss[it] = solver.net.blobs['loss'].data - - # store the output on the first test batch - # (start the forward pass at conv1 to avoid loading new data) - solver.test_nets[0].forward(start='conv1') - output[it] = solver.test_nets[0].blobs['score'].data[:8] - - # run a full test every so often - # (Caffe can also do this for us and write to a log, but we show here - # how to do it directly in Python, where more complicated things are easier.) 
- if it % test_interval == 0: - print('Iteration', it, 'testing...') - correct = 0 - for test_it in range(100): - solver.test_nets[0].forward() - correct += sum(solver.test_nets[0].blobs['score'].data.argmax(1) - == solver.test_nets[0].blobs['label'].data) - test_acc[it // test_interval] = correct / 1e4 - -# Plot the training curve -_, ax1 = pylab.subplots() -ax2 = ax1.twinx() -ax1.plot(pylab.arange(niter), train_loss) -ax2.plot(test_interval * pylab.arange(len(test_acc)), test_acc, 'r') -ax1.set_xlabel('iteration') -ax1.set_ylabel('train loss') -ax2.set_ylabel('test accuracy') -ax2.set_title('Test Accuracy: {:.2f}'.format(test_acc[-1])) - -# Save the plot to file. Use "bbox_inches='tight'" to remove surrounding whitespace -pylab.savefig('learning_curve.png', bbox_inches='tight') diff --git a/caffe/mnist_cmd/.submit_job_jureca_python2.sh b/caffe/mnist_cmd/.submit_job_jureca_python2.sh deleted file mode 100755 index 029520e3308a4e322cfd14c3d863e982fb5ac02e..0000000000000000000000000000000000000000 --- a/caffe/mnist_cmd/.submit_job_jureca_python2.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=CAFFE_MNIST_CMD -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/Devel-2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load Caffe/1.0-Python-2.7.15 - -# Train the model using the 'caffe' binary -srun caffe train --solver=lenet_solver.prototxt -gpu 0 \ No newline at end of file diff --git a/caffe/mnist_cmd/lenet_solver.prototxt b/caffe/mnist_cmd/lenet_solver.prototxt deleted file mode 100644 index 103b2e757061c84e3bb00a83a54f55606b3ce64b..0000000000000000000000000000000000000000 --- a/caffe/mnist_cmd/lenet_solver.prototxt +++ /dev/null @@ -1,25 +0,0 @@ -# The train/test net protocol buffer definition -net: "lenet_train_test.prototxt" -# test_iter specifies how many forward passes the test should carry out. -# In the case of MNIST, we have test batch size 100 and 100 test iterations, -# covering the full 10,000 testing images. -test_iter: 100 -# Carry out testing every 500 training iterations. -test_interval: 500 -# The base learning rate, momentum and the weight decay of the network. 
-base_lr: 0.01 -momentum: 0.9 -weight_decay: 0.0005 -# The learning rate policy -lr_policy: "inv" -gamma: 0.0001 -power: 0.75 -# Display every 100 iterations -display: 100 -# The maximum number of iterations -max_iter: 10000 -# snapshot intermediate results -snapshot: 5000 -snapshot_prefix: "snapshots/lenet" -# solver mode: CPU or GPU -solver_mode: GPU diff --git a/caffe/mnist_cmd/lenet_train_test.prototxt b/caffe/mnist_cmd/lenet_train_test.prototxt deleted file mode 100644 index f34ab716ec5467584ac059af3bd5d087a9d2fb34..0000000000000000000000000000000000000000 --- a/caffe/mnist_cmd/lenet_train_test.prototxt +++ /dev/null @@ -1,168 +0,0 @@ -name: "LeNet" -layer { - name: "mnist" - type: "Data" - top: "data" - top: "label" - include { - phase: TRAIN - } - transform_param { - scale: 0.00390625 - } - data_param { - source: "../../datasets/mnist/caffe/mnist_train_lmdb" - batch_size: 64 - backend: LMDB - } -} -layer { - name: "mnist" - type: "Data" - top: "data" - top: "label" - include { - phase: TEST - } - transform_param { - scale: 0.00390625 - } - data_param { - source: "../../datasets/mnist/caffe/mnist_test_lmdb" - batch_size: 100 - backend: LMDB - } -} -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 20 - kernel_size: 5 - stride: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "conv1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 2 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 50 - kernel_size: 5 - stride: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "conv2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 2 - stride: 2 - } -} -layer { - name: "ip1" - type: "InnerProduct" - bottom: "pool2" - top: "ip1" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - inner_product_param { - num_output: 500 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "ip1" - top: "ip1" -} -layer { - name: "ip2" - type: "InnerProduct" - bottom: "ip1" - top: "ip2" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - inner_product_param { - num_output: 10 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "accuracy" - type: "Accuracy" - bottom: "ip2" - bottom: "label" - top: "accuracy" - include { - phase: TEST - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "ip2" - bottom: "label" - top: "loss" -} diff --git a/caffe/mnist_cmd/snapshots/.gitkeep b/caffe/mnist_cmd/snapshots/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/caffe/mnist_cmd/submit_job_juron_python2.sh b/caffe/mnist_cmd/submit_job_juron_python2.sh deleted file mode 100755 index b5ee63c60aa1dddad9708367d6623deccc57022f..0000000000000000000000000000000000000000 --- a/caffe/mnist_cmd/submit_job_juron_python2.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_MNIST_CMD - -# Load the Python 
and Caffe modules -module load python/2.7.14 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train a model for MNIST -caffe train --solver=lenet_solver.prototxt -gpu 0 \ No newline at end of file diff --git a/caffe/mnist_cmd/submit_job_juron_python3.sh b/caffe/mnist_cmd/submit_job_juron_python3.sh deleted file mode 100755 index bdac4a2aef6d670bff2fcf4a928bf3586df3781b..0000000000000000000000000000000000000000 --- a/caffe/mnist_cmd/submit_job_juron_python3.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J CAFFE_MNIST_CMD - -# Load the Python and Caffe modules -module load python/3.6.1 -module load caffe/1.0-gcc_5.4.0-cuda_10.0.130 - -# Train a model for MNIST -caffe train --solver=lenet_solver.prototxt -gpu 0 diff --git a/course_material/examples/mnist_epoch_distributed.py b/course_material/examples/mnist_epoch_distributed.py index 7c9080e63af23eabeb6a6b47c8e89edf26e7190f..504b2a8b99f2bcc1206159a9314806890dd2c682 100644 --- a/course_material/examples/mnist_epoch_distributed.py +++ b/course_material/examples/mnist_epoch_distributed.py @@ -4,8 +4,6 @@ # Version 2.0 (see the NOTICE file for details). """ - This program is an adaptation of the following code sample: - https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py. The program creates and trains a shallow ANN for handwritten digit classification using the MNIST dataset. @@ -13,14 +11,14 @@ example epochs are distributed across the Horovod ranks, not data. To run this sample use the following command on your - workstation/laptop equipped with a GPU: + workstation/laptop: - mpirun -np 1 python -u mnist_epoch_distributed.py + mpirun -np 1 python -u mnist_epoch_distributed.py If you have more than one GPU on your system, you can increase the number of ranks accordingly. - The code has been tested with Python 3.7.5, tensorflow-gpu 1.13.1, and + The code has been tested with Python 3.8.7, tensorflow 2.3.1, and horovod 0.16.2. Note: This code will NOT work on the supercomputers. @@ -30,16 +28,17 @@ import math import tensorflow as tf import horovod.tensorflow.keras as hvd -from tensorflow.python.keras import backend as K # Horovod: initialize Horovod. hvd.init() # Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) +gpus = tf.config.experimental.list_physical_devices('GPU') +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) # Reference to the MNIST dataset mnist = tf.keras.datasets.mnist diff --git a/course_material/examples/mnist_single_gpu.py b/course_material/examples/mnist_single_gpu.py index 794150fe230348b0001d86158d32a9a9e5e52cbd..2918cd027cb8b5c88d49ba0c83eaa4944f8aa8f4 100644 --- a/course_material/examples/mnist_single_gpu.py +++ b/course_material/examples/mnist_single_gpu.py @@ -4,17 +4,16 @@ # Version 2.0 (see the NOTICE file for details). """ - This program is an adaptation of the code sample available at - https://www.tensorflow.org/tutorials/. The program creates - and trains a shallow ANN for handwritten digit classification - using the MNIST dataset. + This program is an adaptation of a previously available code sample + at https://www.tensorflow.org/tutorials/. 
The program creates and trains a + shallow ANN for handwritten digit classification using the MNIST dataset. To run this sample use the following command on your - workstation/laptop equipped with a GPU: + workstation/laptop: - python -u mnist.py + python -u mnist.py - The code has been tested with Python 3.7.5 and tensorflow-gpu 1.13.1. + The code has been tested with Python 3.8.7 and tensorflow 2.3.1 Note: This code will NOT work on the supercomputers. diff --git a/datasets/mnist/caffe/mnist_test_lmdb/data.mdb b/datasets/mnist/caffe/mnist_test_lmdb/data.mdb deleted file mode 100644 index 760ab4233ddcb5b432bac7ad418179c380c18127..0000000000000000000000000000000000000000 --- a/datasets/mnist/caffe/mnist_test_lmdb/data.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a70974534a27eaa5dc42638940ad311981b0259f1f089ea46c695bfd9c1862da -size 8749056 diff --git a/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb b/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb deleted file mode 100644 index eda8c00824c606c2c5eb4d5db6ccbbfb85da9a01..0000000000000000000000000000000000000000 --- a/datasets/mnist/caffe/mnist_test_lmdb/lock.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e0667461174c505913de02429312bcbd9c6cab774b4495c7a2bbe7061ce3ccea -size 8192 diff --git a/datasets/mnist/caffe/mnist_train_lmdb/data.mdb b/datasets/mnist/caffe/mnist_train_lmdb/data.mdb deleted file mode 100644 index 4432b2e157c90b01c117caabfd241e9e54e46bee..0000000000000000000000000000000000000000 --- a/datasets/mnist/caffe/mnist_train_lmdb/data.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3eea94f5e1ea128f16ff0e18f9e287cc2676a54a3218105c525e602f375666c1 -size 50757632 diff --git a/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb b/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb deleted file mode 100644 index d961b47989b1ea9cda34eb5a19ed516938c40482..0000000000000000000000000000000000000000 --- a/datasets/mnist/caffe/mnist_train_lmdb/lock.mdb +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33569d983c9d6d527cd7d3202c31a2a7395b254fb8076f59b84ecaecb9207906 -size 8192 diff --git a/horovod/README.md b/horovod/README.md deleted file mode 100644 index 3d63a23deb70123b799da301b71b89cdafc7d649..0000000000000000000000000000000000000000 --- a/horovod/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Notes - -All source code samples were taken from the Horovod examples repository -[here](https://github.com/uber/horovod/tree/master/examples) -(last checked: September 02, 2019). The samples that work with MNIST data have been -slightly modified. Our changes are limited to, - -* The data loading mechanism -* A bit of code cleanup -* A few additional comments pertaining to our custom data loading mechanism - -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. All -statements that demonstrate the use of Horovod follow a comment beginning with -`[Horovod]` (as added by Horovod developers). - -## Keras samples - -The following Keras samples are included: - -1. `mnist.py`: A simple MNIST processing example with only the essential Horovod code -for distributed training. -2. `mnist_advanced.py`: This sample is primarily the same as `mnist.py`. However, a -few more advanced Horovod features are used. - -## Tensorflow samples - -The following Tensorflow samples are included: - -1. `mnist.py`: Demonstrates distributed training using Horovod with the low-level -Tensorflow API. 
A simple convolutional neural network is trained on the MNIST dataset. -2. `mnist_estimator.py`: Demonstrates distributed training using Horovod with the -high-level Estimator API in Tensorflow. A simple convolutional neural network is -trained on the MNIST dataset. -3. `synthetic_benchmark.py`: A simple benchmark that can be used to measure performance -of Tensorflow with Horovod without using any external dataset. diff --git a/horovod/keras/checkpoints/.gitkeep b/horovod/keras/checkpoints/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/horovod/keras/mnist.py b/horovod/keras/mnist.py deleted file mode 100644 index 0c46a771047d0adfa6d61017176d6ef2c6de0d67..0000000000000000000000000000000000000000 --- a/horovod/keras/mnist.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). - -from __future__ import print_function -import os -import sys -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K -import math -import tensorflow as tf -import horovod.keras as hvd - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -config.gpu_options.allow_growth = True -config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) - -batch_size = 128 -num_classes = 10 - -# Horovod: adjust number of epochs based on number of GPUs. 
-epochs = int(math.ceil(16.0 / hvd.size())) - -# Input image dimensions -img_rows, img_cols = 28, 28 - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -# [HPCNS] Load MNIST dataset -(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# Convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) - -# Horovod: adjust learning rate based on number of GPUs. -opt = keras.optimizers.Adadelta(1.0 * hvd.size()) - -# Horovod: add Horovod Distributed Optimizer. -opt = hvd.DistributedOptimizer(opt) - -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=opt, - metrics=['accuracy']) - -callbacks = [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - hvd.callbacks.BroadcastGlobalVariablesCallback(0), -] - -# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. -if hvd.rank() == 0: - callbacks.append(keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) - -model.fit(x_train, y_train, - batch_size=batch_size, - callbacks=callbacks, - epochs=epochs, - verbose=1 if hvd.rank() == 0 else 0, - validation_data=(x_test, y_test)) -score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) diff --git a/horovod/keras/mnist_advanced.py b/horovod/keras/mnist_advanced.py deleted file mode 100644 index ba60b6d64d61feac9e19e0af213b5134087f887b..0000000000000000000000000000000000000000 --- a/horovod/keras/mnist_advanced.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/keras_mnist_advanced.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). 
- - -from __future__ import print_function -import os -import sys -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras.preprocessing.image import ImageDataGenerator -from keras import backend as K -import tensorflow as tf -import horovod.keras as hvd - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -config.gpu_options.allow_growth = True -config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) - -batch_size = 128 -num_classes = 10 - -# Enough epochs to demonstrate learning rate warmup and the reduction of -# learning rate when training plateaues. -epochs = 16 - -# Input image dimensions -img_rows, img_cols = 28, 28 - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -# [HPCNS] Load MNIST dataset. -(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) - -# Determine how many batches are there in train and test sets -train_batches = len(x_train) // batch_size -test_batches = len(x_test) // batch_size - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# Convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) - -# Horovod: adjust learning rate based on number of GPUs. -opt = keras.optimizers.Adadelta(lr=1.0 * hvd.size()) - -# Horovod: add Horovod Distributed Optimizer. -opt = hvd.DistributedOptimizer(opt) - -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=opt, - metrics=['accuracy']) - -callbacks = [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - hvd.callbacks.BroadcastGlobalVariablesCallback(0), - - # Horovod: average metrics among workers at the end of every epoch. 
- # - # Note: This callback must be in the list before the ReduceLROnPlateau, - # TensorBoard or other metrics-based callbacks. - hvd.callbacks.MetricAverageCallback(), - - # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final - # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during - # the first five epochs. See https://arxiv.org/abs/1706.02677 for details. - hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1), - - # Reduce the learning rate if training plateaues. - keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1), -] - -# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. -if hvd.rank() == 0: - callbacks.append(keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) - -# Set up ImageDataGenerators to do data augmentation for the training images. -train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3, - height_shift_range=0.08, zoom_range=0.08) -test_gen = ImageDataGenerator() - -# Train the model. -# Horovod: the training will randomly sample 1 / N batches of training data and -# 3 / N batches of validation data on every worker, where N is the number of workers. -# Over-sampling of validation data helps to increase probability that every validation -# example will be evaluated. -model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size), - steps_per_epoch=train_batches // hvd.size(), - callbacks=callbacks, - epochs=epochs, - verbose=1, - validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size), - validation_steps=3 * test_batches // hvd.size()) - -# Evaluate the model on the full data set. -score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) diff --git a/horovod/keras/run_on_localMachine.sh b/horovod/keras/run_on_localMachine.sh deleted file mode 100644 index 9c9afb4b58ee9f4a42480997dd298b6e33c71a35..0000000000000000000000000000000000000000 --- a/horovod/keras/run_on_localMachine.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -mpirun -np 1 -H localhost:1 \ - -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ - -mca pml ob1 -mca btl ^openib \ - python -u mnist.py diff --git a/horovod/keras/submit_job_jureca.sh b/horovod/keras/submit_job_jureca.sh deleted file mode 100755 index 3591bbaed30d611a0e1fbde57b061ec16433e01c..0000000000000000000000000000000000000000 --- a/horovod/keras/submit_job_jureca.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_KERAS_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/keras/submit_job_juron.sh b/horovod/keras/submit_job_juron.sh deleted file mode 100755 index 03182786d1f52c2cb8cacd9e8c709f1c9d93cc40..0000000000000000000000000000000000000000 --- a/horovod/keras/submit_job_juron.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 4 -#BSUB -R "span[ptile=2]" -#BSUB -gpu "num=2" 
-#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J HOROVOD_KERAS_MNIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module load horovod/0.15.2 -module load keras/2.2.4 - -# Run the program -mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \ - -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/keras/submit_job_juwels.sh b/horovod/keras/submit_job_juwels.sh deleted file mode 100755 index 3591bbaed30d611a0e1fbde57b061ec16433e01c..0000000000000000000000000000000000000000 --- a/horovod/keras/submit_job_juwels.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_KERAS_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/tensorflow/checkpoints/.gitkeep b/horovod/tensorflow/checkpoints/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/horovod/tensorflow/mnist.py b/horovod/tensorflow/mnist.py deleted file mode 100644 index 3c780accef6f40d6bb3c95196f4feb69aafb96fe..0000000000000000000000000000000000000000 --- a/horovod/tensorflow/mnist.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). - -import os -import sys -import tensorflow as tf -import horovod.tensorflow as hvd -import numpy as np -import shutil - -from tensorflow import keras - -layers = tf.layers - -tf.logging.set_verbosity(tf.logging.INFO) - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - - -def conv_model(feature, target, mode): - """2-layer convolution model.""" - # Convert the target to a one-hot tensor of shape (batch_size, 10) and - # with a on-value of 1 for each one-hot vector of length 10. - target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0) - - # Reshape feature to 4d tensor with 2nd and 3rd dimensions being - # image width and height final dimension being the number of color channels. - feature = tf.reshape(feature, [-1, 28, 28, 1]) - - # First conv layer will compute 32 features for each 5x5 patch - with tf.variable_scope('conv_layer1'): - h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5], - activation=tf.nn.relu, padding="SAME") - h_pool1 = tf.nn.max_pool( - h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') - - # Second conv layer will compute 64 features for each 5x5 patch. 
- with tf.variable_scope('conv_layer2'): - h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5], - activation=tf.nn.relu, padding="SAME") - h_pool2 = tf.nn.max_pool( - h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') - # reshape tensor into a batch of vectors - h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) - - # Densely connected layer with 1024 neurons. - h_fc1 = layers.dropout( - layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu), - rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN) - - # Compute logits (1 per class) and compute loss. - logits = layers.dense(h_fc1, 10, activation=None) - loss = tf.losses.softmax_cross_entropy(target, logits) - - return tf.argmax(logits, 1), loss - - -def train_input_generator(x_train, y_train, batch_size=64): - assert len(x_train) == len(y_train) - while True: - p = np.random.permutation(len(x_train)) - x_train, y_train = x_train[p], y_train[p] - index = 0 - while index <= len(x_train) - batch_size: - yield x_train[index:index + batch_size], \ - y_train[index:index + batch_size], - index += batch_size - - -def main(_): - # Horovod: initialize Horovod. - hvd.init() - - # [HPCNS] Fully qualified dataset file name - dataset_file = os.path.join(data_dir, data_file) - - # [HPCNS] Dataset filename for this rank - dataset_for_rank = os.path.join(data_dir, 'MNIST-data-%d' % hvd.rank()) - - # [HPCNS] Make a copy of the dataset for this rank - shutil.copyfile(dataset_file, dataset_for_rank) - - # [HPCNS] Load MNIST dataset - (x_train, y_train), (x_test, y_test) = \ - keras.datasets.mnist.load_data(dataset_for_rank) - - # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it - # into (-1, 784) to feed into our network. Also, need to normalize the - # features between 0 and 1. - x_train = np.reshape(x_train, (-1, 784)) / 255.0 - x_test = np.reshape(x_test, (-1, 784)) / 255.0 - - # Build model... - with tf.name_scope('input'): - image = tf.placeholder(tf.float32, [None, 784], name='image') - label = tf.placeholder(tf.float32, [None], name='label') - predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN) - - # Horovod: adjust learning rate based on number of GPUs. - opt = tf.train.AdamOptimizer(0.001 * hvd.size()) - - # Horovod: add Horovod Distributed Optimizer. - opt = hvd.DistributedOptimizer(opt) - - global_step = tf.train.get_or_create_global_step() - train_op = opt.minimize(loss, global_step=global_step) - - hooks = [ - # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states - # from rank 0 to all other processes. This is necessary to ensure consistent - # initialization of all workers when training is started with random weights - # or restored from a checkpoint. - hvd.BroadcastGlobalVariablesHook(0), - - # Horovod: adjust number of steps based on number of GPUs. - tf.train.StopAtStepHook(last_step=20000 // hvd.size()), - - tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss}, - every_n_iter=10), - ] - - # Horovod: pin GPU to be used to process local rank (one GPU per process) - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - config.gpu_options.visible_device_list = str(hvd.local_rank()) - - # Horovod: save checkpoints only on worker 0 to prevent other workers from - # corrupting them. 
- checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None - training_batch_generator = train_input_generator(x_train, - y_train, batch_size=100) - # The MonitoredTrainingSession takes care of session initialization, - # restoring from a checkpoint, saving to a checkpoint, and closing when done - # or an error occurs. - with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, - hooks=hooks, - config=config) as mon_sess: - while not mon_sess.should_stop(): - # Run a training step synchronously. - image_, label_ = next(training_batch_generator) - mon_sess.run(train_op, feed_dict={image: image_, label: label_}) - - # [HPCNS] Remove the copied dataset - os.remove(dataset_for_rank) - - -if __name__ == "__main__": - tf.app.run() diff --git a/horovod/tensorflow/mnist_estimator.py b/horovod/tensorflow/mnist_estimator.py deleted file mode 100644 index 792c0577f5e6324eddca6e54d23d6669a21ab3c4..0000000000000000000000000000000000000000 --- a/horovod/tensorflow/mnist_estimator.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist_estimator.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). - -"""Convolutional Neural Network Estimator for MNIST, built with tf.layers.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import shutil -import numpy as np -import tensorflow as tf -import horovod.tensorflow as hvd - -from tensorflow import keras - -tf.logging.set_verbosity(tf.logging.INFO) - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. -sys.path.insert(0, '../../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - - -def cnn_model_fn(features, labels, mode): - """Model function for CNN.""" - # Input Layer - # Reshape X to 4-D tensor: [batch_size, width, height, channels] - # MNIST images are 28x28 pixels, and have one color channel - input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) - - # Convolutional Layer #1 - # Computes 32 features using a 5x5 filter with ReLU activation. - # Padding is added to preserve width and height. - # Input Tensor Shape: [batch_size, 28, 28, 1] - # Output Tensor Shape: [batch_size, 28, 28, 32] - conv1 = tf.layers.conv2d( - inputs=input_layer, - filters=32, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #1 - # First max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 28, 28, 32] - # Output Tensor Shape: [batch_size, 14, 14, 32] - pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) - - # Convolutional Layer #2 - # Computes 64 features using a 5x5 filter. - # Padding is added to preserve width and height. 
- # Input Tensor Shape: [batch_size, 14, 14, 32] - # Output Tensor Shape: [batch_size, 14, 14, 64] - conv2 = tf.layers.conv2d( - inputs=pool1, - filters=64, - kernel_size=[5, 5], - padding="same", - activation=tf.nn.relu) - - # Pooling Layer #2 - # Second max pooling layer with a 2x2 filter and stride of 2 - # Input Tensor Shape: [batch_size, 14, 14, 64] - # Output Tensor Shape: [batch_size, 7, 7, 64] - pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) - - # Flatten tensor into a batch of vectors - # Input Tensor Shape: [batch_size, 7, 7, 64] - # Output Tensor Shape: [batch_size, 7 * 7 * 64] - pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) - - # Dense Layer - # Densely connected layer with 1024 neurons - # Input Tensor Shape: [batch_size, 7 * 7 * 64] - # Output Tensor Shape: [batch_size, 1024] - dense = tf.layers.dense(inputs=pool2_flat, units=1024, - activation=tf.nn.relu) - - # Add dropout operation; 0.6 probability that element will be kept - dropout = tf.layers.dropout( - inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN) - - # Logits layer - # Input Tensor Shape: [batch_size, 1024] - # Output Tensor Shape: [batch_size, 10] - logits = tf.layers.dense(inputs=dropout, units=10) - - predictions = { - # Generate predictions (for PREDICT and EVAL mode) - "classes": tf.argmax(input=logits, axis=1), - # Add `softmax_tensor` to the graph. It is used for PREDICT and by the - # `logging_hook`. - "probabilities": tf.nn.softmax(logits, name="softmax_tensor") - } - if mode == tf.estimator.ModeKeys.PREDICT: - return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) - - # Calculate Loss (for both TRAIN and EVAL modes) - onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10) - loss = tf.losses.softmax_cross_entropy( - onehot_labels=onehot_labels, logits=logits) - - # Configure the Training Op (for TRAIN mode) - if mode == tf.estimator.ModeKeys.TRAIN: - # Horovod: scale learning rate by the number of workers. - optimizer = tf.train.MomentumOptimizer( - learning_rate=0.001 * hvd.size(), momentum=0.9) - - # Horovod: add Horovod Distributed Optimizer. - optimizer = hvd.DistributedOptimizer(optimizer) - - train_op = optimizer.minimize( - loss=loss, - global_step=tf.train.get_global_step()) - return tf.estimator.EstimatorSpec(mode=mode, loss=loss, - train_op=train_op) - - # Add evaluation metrics (for EVAL mode) - eval_metric_ops = { - "accuracy": tf.metrics.accuracy( - labels=labels, predictions=predictions["classes"])} - return tf.estimator.EstimatorSpec( - mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) - - -def main(unused_argv): - # Horovod: initialize Horovod. - hvd.init() - - # [HPCNS] Fully qualified dataset file name - dataset_file = os.path.join(data_dir, data_file) - - # [HPCNS] Dataset filename for this rank - dataset_for_rank = os.path.join(data_dir, 'MNIST-data-%d' % hvd.rank()) - - # [HPCNS] Make a copy of the dataset for this rank - shutil.copyfile(dataset_file, dataset_for_rank) - - # [HPCNS] Load MNIST dataset - (train_data, train_labels), (eval_data, eval_labels) = \ - keras.datasets.mnist.load_data(dataset_for_rank) - - # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it - # into (-1, 784) to feed into our network. Also, need to normalize the - # features between 0 and 1. 
- train_data = np.reshape(train_data, (-1, 784)) / 255.0 - eval_data = np.reshape(eval_data, (-1, 784)) / 255.0 - - # Horovod: pin GPU to be used to process local rank (one GPU per process) - config = tf.ConfigProto() - config.gpu_options.allow_growth = True - config.gpu_options.visible_device_list = str(hvd.local_rank()) - - # Horovod: save checkpoints only on worker 0 to prevent other workers from - # corrupting them. - model_dir = 'checkpoints/mnist_convnet_model' if hvd.rank() == 0 else None - - # Create the Estimator - mnist_classifier = tf.estimator.Estimator( - model_fn=cnn_model_fn, model_dir=model_dir, - config=tf.estimator.RunConfig(session_config=config)) - - # Set up logging for predictions - # Log the values in the "Softmax" tensor with label "probabilities" - tensors_to_log = {"probabilities": "softmax_tensor"} - logging_hook = tf.train.LoggingTensorHook( - tensors=tensors_to_log, every_n_iter=500) - - # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from - # rank 0 to all other processes. This is necessary to ensure consistent - # initialization of all workers when training is started with random weights or - # restored from a checkpoint. - bcast_hook = hvd.BroadcastGlobalVariablesHook(0) - - # Train the model - train_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": train_data}, - y=train_labels, - batch_size=100, - num_epochs=None, - shuffle=True) - - # Horovod: adjust number of steps based on number of GPUs. - mnist_classifier.train( - input_fn=train_input_fn, - steps=500 // hvd.size(), - hooks=[logging_hook, bcast_hook]) - - # Evaluate the model and print results - eval_input_fn = tf.estimator.inputs.numpy_input_fn( - x={"x": eval_data}, - y=eval_labels, - num_epochs=1, - shuffle=False) - eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) - print(eval_results) - - # [HPCNS] Remove the copied dataset - os.remove(dataset_for_rank) - - -if __name__ == "__main__": - tf.app.run() diff --git a/horovod/tensorflow/run_on_localMachine.sh b/horovod/tensorflow/run_on_localMachine.sh deleted file mode 100644 index 9c9afb4b58ee9f4a42480997dd298b6e33c71a35..0000000000000000000000000000000000000000 --- a/horovod/tensorflow/run_on_localMachine.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -mpirun -np 1 -H localhost:1 \ - -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \ - -mca pml ob1 -mca btl ^openib \ - python -u mnist.py diff --git a/horovod/tensorflow/submit_job_jureca.sh b/horovod/tensorflow/submit_job_jureca.sh deleted file mode 100755 index fd12487ff30450bdb5eecb669ae9b22a38d79bfd..0000000000000000000000000000000000000000 --- a/horovod/tensorflow/submit_job_jureca.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_TFLOW_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/tensorflow/submit_job_juron.sh b/horovod/tensorflow/submit_job_juron.sh deleted file mode 100644 index 01075474bae35cafb29c70239f29214de904a6ca..0000000000000000000000000000000000000000 --- 
a/horovod/tensorflow/submit_job_juron.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 4 -#BSUB -R "span[ptile=2]" -#BSUB -gpu "num=2" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J HOROVOD_TFLOW_MNIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module load horovod/0.15.2 - -# Run the program -mpirun -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \ - -x PATH -mca pml ob1 -mca btl ^openib python -u mnist.py diff --git a/horovod/tensorflow/submit_job_juwels.sh b/horovod/tensorflow/submit_job_juwels.sh deleted file mode 100755 index fd12487ff30450bdb5eecb669ae9b22a38d79bfd..0000000000000000000000000000000000000000 --- a/horovod/tensorflow/submit_job_juwels.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HOROVOD_TFLOW_MNIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/horovod/tensorflow/synthetic_benchmark.py b/horovod/tensorflow/synthetic_benchmark.py deleted file mode 100644 index ee401a5cc8ca05def1a87a14c0d66608bab38b18..0000000000000000000000000000000000000000 --- a/horovod/tensorflow/synthetic_benchmark.py +++ /dev/null @@ -1,120 +0,0 @@ -from __future__ import absolute_import, division, print_function - -import argparse -import os -import numpy as np -import timeit - -import tensorflow as tf -import horovod.tensorflow as hvd -from tensorflow.keras import applications - -# Benchmark settings -parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--fp16-allreduce', action='store_true', default=False, - help='use fp16 compression during allreduce') - -parser.add_argument('--model', type=str, default='ResNet50', - help='model to benchmark') -parser.add_argument('--batch-size', type=int, default=32, - help='input batch size') - -parser.add_argument('--num-warmup-batches', type=int, default=10, - help='number of warm-up batches that don\'t count towards benchmark') -parser.add_argument('--num-batches-per-iter', type=int, default=10, - help='number of batches per benchmark iteration') -parser.add_argument('--num-iters', type=int, default=10, - help='number of benchmark iterations') - -parser.add_argument('--eager', action='store_true', default=False, - help='enables eager execution') -parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - -args = parser.parse_args() -args.cuda = not args.no_cuda - -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -if args.cuda: - config.gpu_options.allow_growth = True - config.gpu_options.visible_device_list = str(hvd.local_rank()) -else: - os.environ["CUDA_VISIBLE_DEVICES"] = "-1" - config.gpu_options.allow_growth = False - config.gpu_options.visible_device_list = '' - -if args.eager: - tf.enable_eager_execution(config) - -# Set up standard model. 
-model = getattr(applications, args.model)(weights=None) - -opt = tf.train.GradientDescentOptimizer(0.01) - -# Horovod: (optional) compression algorithm. -compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none - -# Horovod: wrap optimizer with DistributedOptimizer. -opt = hvd.DistributedOptimizer(opt, compression=compression) - -init = tf.global_variables_initializer() -bcast_op = hvd.broadcast_global_variables(0) - -data = tf.random_uniform([args.batch_size, 224, 224, 3]) -target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) - - -def loss_function(): - probs = model(data, training=True) - return tf.losses.sparse_softmax_cross_entropy(target, probs) - - -def log(s, nl=True): - if hvd.rank() != 0: - return - print(s, end='\n' if nl else '') - - -log('Model: %s' % args.model) -log('Batch size: %d' % args.batch_size) -device = 'GPU' if args.cuda else 'CPU' -log('Number of %ss: %d' % (device, hvd.size())) - - -def run(benchmark_step): - # Warm-up - log('Running warmup...') - timeit.timeit(benchmark_step, number=args.num_warmup_batches) - - # Benchmark - log('Running benchmark...') - img_secs = [] - for x in range(args.num_iters): - time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) - img_sec = args.batch_size * args.num_batches_per_iter / time - log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) - img_secs.append(img_sec) - - # Results - img_sec_mean = np.mean(img_secs) - img_sec_conf = 1.96 * np.std(img_secs) - log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) - log('Total img/sec on %d %s(s): %.1f +-%.1f' % - (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) - - -if tf.executing_eagerly(): - with tf.device(device): - run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables)) -else: - with tf.Session(config=config) as session: - init.run() - bcast_op.run() - - loss = loss_function() - train_opt = opt.minimize(loss) - run(lambda: session.run(train_opt)) diff --git a/horovod_data_distributed/README.md b/horovod_data_distributed/README.md deleted file mode 100644 index 3a13e2b83b5b749240563b152bf70598e62c6335..0000000000000000000000000000000000000000 --- a/horovod_data_distributed/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Introduction - -Please see the main docstring in each program for details. - -# Notes - -On JURECA and JUWELS, the `mnist_data_distributed.py` program requires the [`hpc4neuro.distribution`]( -https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro#1-hpc4neurodistribution) -module for distribution of training data filenames across multiple ranks. On JURON, multiple additional -package are required. Please follow the steps below to setup the environment before submitting the -training job. - -Note that a maximum of eight ranks can be used to run `mnist_data_distributed.py`, as there -are eight training files. - -## JURECA and JUWELS - -1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` -2. Load the system-wide Python module: `module load Python/3.6.8` -3. Install the `hpc4neuro` package: - - `pip install --user git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git` - -4. Submit the job - -## JURON - -1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` -2. Setup a Python virtual environment with the required packages (may take upto 5 minutes): `./setup_juron.sh` -3. 
Submit the job: `bsub < submit_job_juron.sh` - -**Note:** The setup is required only once. Unless you explicitly remove the virtual environment, the same -setup can be used to run the example multiple times. diff --git a/horovod_data_distributed/setup_juron.sh b/horovod_data_distributed/setup_juron.sh deleted file mode 100755 index 7fa1a24a7361187b627c6a0d64dc57c113b843f4..0000000000000000000000000000000000000000 --- a/horovod_data_distributed/setup_juron.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash - -# Load the Python module -module load python/3.6.1 - -# Create a virtual environment -python -m venv venv_dl_hpc4neuro - -# Activate the virtual environment -source venv_dl_hpc4neuro/bin/activate - -# Upgrade pip and setuptools -pip install -U pip setuptools - -# Install mpi4py -env MPICC=/gpfs/software/opt/openmpi/3.1.2-gcc_5.4.0-cuda_10.0.130/bin/mpicc pip install mpi4py - -# Install six -pip install six - -# Install hpc4neuro -pip install git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git - -printf "%s\n\n" "Setup complete." diff --git a/horovod_data_distributed/submit_job_jureca.sh b/horovod_data_distributed/submit_job_jureca.sh deleted file mode 100755 index eedbacaa4c0a9f7e4644f70c0a9386b51ce1a53c..0000000000000000000000000000000000000000 --- a/horovod_data_distributed/submit_job_jureca.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HVD_DATA_DIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR -module load mpi4py/3.0.1-Python-3.6.8 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/submit_job_juron.sh b/horovod_data_distributed/submit_job_juron.sh deleted file mode 100755 index a71bc471dc2f56c06096cd2eb5897e86dfded09f..0000000000000000000000000000000000000000 --- a/horovod_data_distributed/submit_job_juron.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 4 -#BSUB -R "span[ptile=4]" -#BSUB -gpu "num=4" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J HVD_DATA_DIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module load horovod/0.15.2 - -# Activate the virtual environment -source venv_dl_hpc4neuro/bin/activate - -# Run the program -mpirun -bind-to none \ - -map-by slot \ - -x NCCL_DEBUG=INFO \ - -x LD_LIBRARY_PATH \ - -x PATH \ - -mca pml ob1 \ - -mca btl ^openib \ - python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/submit_job_juwels.sh b/horovod_data_distributed/submit_job_juwels.sh deleted file mode 100755 index eedbacaa4c0a9f7e4644f70c0a9386b51ce1a53c..0000000000000000000000000000000000000000 --- a/horovod_data_distributed/submit_job_juwels.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=2 -#SBATCH --ntasks=8 -#SBATCH --ntasks-per-node=4 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=HVD_DATA_DIST -#SBATCH --gres=gpu:4 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load MVAPICH2/2.3.2-GDR 
-module load mpi4py/3.0.1-Python-3.6.8 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Horovod/0.16.2-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist_data_distributed.py diff --git a/keras/README.md b/keras/README.md deleted file mode 100644 index 4e8462ddc50a18a7219ef38e5aacca5283f02411..0000000000000000000000000000000000000000 --- a/keras/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Notes - -The `mnist.py` sample is a slightly modified version of `mnist_cnn.py` -available in the Keras examples repository -[here](https://github.com/keras-team/keras/tree/master/examples) -(last checked: September 02, 2019). Our changes are -limited to, - -* The data loading mechanism -* A bit of code cleanup -* A few additional comments pertaining to our custom data loading mechanism - -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. \ No newline at end of file diff --git a/keras/mnist.py b/keras/mnist.py deleted file mode 100644 index 9fc93f2a56f0aa38318a114e151b7e2a6c2ea15c..0000000000000000000000000000000000000000 --- a/keras/mnist.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py, -# which is also licensed under The MIT License (see the NOTICE file for details). - - -"""Trains a simple convnet on the MNIST dataset. - -Gets to 99.25% test accuracy after 12 epochs -(there is still a lot of margin for parameter tuning). -16 seconds per epoch on a GRID K520 GPU. -""" - -from __future__ import print_function -import os -import sys -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras import backend as K - -# [HPCNS] Import the DataValidator, which can then be used to -# validate and load the path to the already downloaded dataset. 
-sys.path.insert(0, '../utils') -from data_utils import DataValidator - -# [HPCNS] Name of the dataset file -data_file = 'mnist/keras/mnist.npz' - -# [HPCNS] Path to the directory containing the dataset file -data_dir = DataValidator.validated_data_dir(data_file) - -# [HPCNS] Fully qualified dataset file name -dataset_file = os.path.join(data_dir, data_file) - -batch_size = 128 -num_classes = 10 -epochs = 12 - -# input image dimensions -img_rows, img_cols = 28, 28 - -# [HPCNS] Load MNIST dataset -# the data, split between train and test sets -(x_train, y_train), (x_test, y_test) = mnist.load_data(dataset_file) - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) - -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=keras.optimizers.Adadelta(), - metrics=['accuracy']) - -model.fit(x_train, y_train, - batch_size=batch_size, - epochs=epochs, - verbose=1, - validation_data=(x_test, y_test)) -score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) diff --git a/keras/run_on_localMachine.sh b/keras/run_on_localMachine.sh deleted file mode 100644 index 1895ec1cb73518c3c458a819926edd4d99274b8e..0000000000000000000000000000000000000000 --- a/keras/run_on_localMachine.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -python -u mnist.py diff --git a/keras/submit_job_jureca.sh b/keras/submit_job_jureca.sh deleted file mode 100755 index 55feebb6f5ca18e8e6707f918205905a7789613d..0000000000000000000000000000000000000000 --- a/keras/submit_job_jureca.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=KERAS_MNIST_CNN -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCCcore/.8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/keras/submit_job_juron.sh b/keras/submit_job_juron.sh deleted file mode 100644 index 7927b03679f2f4b515c90bcbc564447a23433e08..0000000000000000000000000000000000000000 --- a/keras/submit_job_juron.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R 
"span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J KERAS_MNIST_CNN - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 -module load keras/2.2.4 - -# Run the program -python -u mnist.py diff --git a/keras/submit_job_juwels.sh b/keras/submit_job_juwels.sh deleted file mode 100755 index 429c440b9eaea0afb6cb3e4da2423c863a79f778..0000000000000000000000000000000000000000 --- a/keras/submit_job_juwels.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=KERAS_MNIST -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 -module load Keras/2.2.4-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/requirements.txt b/requirements.txt index 79144dccd44dd967fb51438abbcd9589c6d81937..6a4def7c590fc30601ff616009ff388d1198e244 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,24 +1,41 @@ -absl-py==0.8.0 -astor==0.8.0 -cffi==1.12.3 -cloudpickle==1.2.1 -gast==0.3.1 -grpcio==1.23.0 +absl-py==0.12.0 +astunparse==1.6.3 +cachetools==4.2.1 +certifi==2020.12.5 +cffi==1.14.5 +chardet==4.0.0 +cloudpickle==1.6.0 +gast==0.3.3 +google-auth==1.29.0 +google-auth-oauthlib==0.4.4 +google-pasta==0.2.0 +grpcio==1.37.0 h5py==2.10.0 -Markdown==3.1.1 -mock==3.0.5 -mpi4py==3.0.2 -numpy==1.17.2 -protobuf==3.9.1 -psutil==5.6.3 -pycparser==2.19 -six==1.12.0 -Werkzeug==0.15.6 -Keras-Applications==1.0.8 -Keras-Preprocessing==1.1.0 -tensorboard==1.13.1 -tensorflow-estimator==1.13.0 -tensorflow-gpu==1.13.1 +horovod==0.20.3 +hpc4neuro @ git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git@57a560b4085dba2ba3262d4d3238ef70991be877 +idna==2.10 +Keras-Preprocessing==1.1.2 +Markdown==3.3.4 +mpi4py==3.0.3 +numpy==1.18.5 +oauthlib==3.1.0 +opt-einsum==3.3.0 +protobuf==3.15.8 +psutil==5.8.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycparser==2.20 +PyYAML==5.4.1 +requests==2.25.1 +requests-oauthlib==1.3.0 +rsa==4.7.2 +six==1.15.0 +tensorboard==2.5.0 +tensorboard-data-server==0.6.0 +tensorboard-plugin-wit==1.8.0 +tensorflow==2.3.1 +tensorflow-estimator==2.3.0 termcolor==1.1.0 -keras==2.3.1 -horovod==0.16.2 \ No newline at end of file +urllib3==1.26.4 +Werkzeug==1.0.1 +wrapt==1.12.1 diff --git a/tensorflow/README.md b/tensorflow/README.md index 3bf439c0cf8aaa5020252c1099d6b55b2e6ff07a..a35d6436f70498ff6dd682c2271dd3c35c565dad 100644 --- a/tensorflow/README.md +++ b/tensorflow/README.md @@ -1,13 +1,22 @@ # Notes -The `mnist.py` sample is a slightly modified version of `convolutional.py` -available in the Tensorflow models repository -[here](https://github.com/tensorflow/models/blob/master/tutorials/image/mnist) -(last checked: September 02, 2019). Our changes are -limited to, +All source code samples were taken from the Horovod examples repository +[here](https://github.com/horovod/horovod/tree/master/examples/tensorflow2) +(last checked: April 26, 2021). The samples have been slightly modified. Our +changes are limited to, * The data loading mechanism * A bit of code cleanup * A few additional comments pertaining to our custom data loading mechanism -**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. 
\ No newline at end of file +**Note:** All newly added statements follow a comment beginning with `[HPCNS]`. All +statements that demonstrate the use of Horovod follow a comment beginning with +`[Horovod]` (as added by Horovod developers). + +The following samples are included: + +1. `keras_mnist.py`: A simple training program for an MNIST classifier that + uses the Keras API with Horovod. +2. `mnist.py`: Also a training program for an MNIST classifier, this sample + demonstrates using Horovod's `DistributedGradientTape` with a custom + training loop. diff --git a/caffe/lenet_python/snapshots/.gitkeep b/tensorflow/checkpoints/.gitkeep similarity index 100% rename from caffe/lenet_python/snapshots/.gitkeep rename to tensorflow/checkpoints/.gitkeep diff --git a/tensorflow/jureca_job.sh b/tensorflow/jureca_job.sh new file mode 100755 index 0000000000000000000000000000000000000000..e818bc030359232fb55d097a776828b4e56aa61d --- /dev/null +++ b/tensorflow/jureca_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=dc-gpu-devel + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/jusuf_job.sh b/tensorflow/jusuf_job.sh new file mode 100755 index 0000000000000000000000000000000000000000..24f3c83acc514255de6f3fd0452cfda8ab553da2 --- /dev/null +++ b/tensorflow/jusuf_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:1 +#SBATCH --partition=develgpus + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/juwels_booster_job.sh b/tensorflow/juwels_booster_job.sh new file mode 100755 index 0000000000000000000000000000000000000000..df9cdef250c5c5aed3006f53ba2e32fad6a46db3 --- /dev/null +++ b/tensorflow/juwels_booster_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=develbooster + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/juwels_job.sh b/tensorflow/juwels_job.sh new file mode 100755 index 0000000000000000000000000000000000000000..55831d0785c0bba72bd68a2339a18997cc45dd69 --- /dev/null +++ b/tensorflow/juwels_job.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH 
--ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=develgpus + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist.py diff --git a/tensorflow/keras_mnist.py b/tensorflow/keras_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..b07950c5c62065a82c2fffa4d838061525cbc13f --- /dev/null +++ b/tensorflow/keras_mnist.py @@ -0,0 +1,107 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import sys + +import tensorflow as tf +import horovod.tensorflow.keras as hvd + +# [HPCNS] Import the DataValidator, which can then be used to +# validate and load the path to the already downloaded dataset. +sys.path.insert(0, '../utils') +from data_utils import DataValidator + +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +(mnist_images, mnist_labels), _ = \ + tf.keras.datasets.mnist.load_data(dataset_file) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), + tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation='softmax') +]) + +# Horovod: adjust learning rate based on number of GPUs. +scaled_lr = 0.001 * hvd.size() +opt = tf.optimizers.Adam(scaled_lr) + +# Horovod: add Horovod DistributedOptimizer. +opt = hvd.DistributedOptimizer(opt) + +# Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow +# uses hvd.DistributedOptimizer() to compute gradients. 
+mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(), + optimizer=opt, + metrics=['accuracy'], + experimental_run_tf_function=False) + +callbacks = [ + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + hvd.callbacks.BroadcastGlobalVariablesCallback(0), + + # Horovod: average metrics among workers at the end of every epoch. + # + # Note: This callback must be in the list before the ReduceLROnPlateau, + # TensorBoard or other metrics-based callbacks. + hvd.callbacks.MetricAverageCallback(), + + # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final + # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during + # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. + hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1), +] + +# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. +if hvd.rank() == 0: + callbacks.append(tf.keras.callbacks.ModelCheckpoint('checkpoints/checkpoint-{epoch}.h5')) + +# Horovod: write logs on worker 0. +verbose = 1 if hvd.rank() == 0 else 0 + +# Train the model. +# Horovod: adjust number of steps based on number of GPUs. +mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(), callbacks=callbacks, epochs=10, verbose=verbose) \ No newline at end of file diff --git a/tensorflow/mnist.py b/tensorflow/mnist.py index 30477e153e9c59d2a151b90686049b39885155e4..7e56a705bdf50d337a903c22178196c1b788cc4c 100644 --- a/tensorflow/mnist.py +++ b/tensorflow/mnist.py @@ -1,328 +1,109 @@ -# Copyright (c) 2019 Forschungszentrum Juelich GmbH. -# This code is licensed under MIT license (see the LICENSE file for details). -# This code is derived from https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py, -# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== -"""Simple, end-to-end, LeNet-5-like convolutional MNIST model example. - -This should achieve a test error of 0.7%. Please keep this model as simple and -linear as possible, it is meant as a tutorial for simple convolutional models. -Run with --self_test on the command line to execute a short self-test. 
-""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import gzip import os import sys -import time -import numpy -from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf +import horovod.tensorflow as hvd # [HPCNS] Import the DataValidator, which can then be used to # validate and load the path to the already downloaded dataset. sys.path.insert(0, '../utils') from data_utils import DataValidator -IMAGE_SIZE = 28 -NUM_CHANNELS = 1 -PIXEL_DEPTH = 255 -NUM_LABELS = 10 -VALIDATION_SIZE = 5000 # Size of the validation set. -SEED = 66478 # Set to None for random seed. -BATCH_SIZE = 64 -NUM_EPOCHS = 10 -EVAL_BATCH_SIZE = 64 -EVAL_FREQUENCY = 100 # Number of steps between evaluations. - -FLAGS = None - - -def data_type(): - """Return the type of the activations, weights, and placeholder variables.""" - if FLAGS.use_fp16: - return tf.float16 - else: - return tf.float32 - - -def extract_data(filename, num_images): - """Extract the images into a 4D tensor [image index, y, x, channels]. - - Values are rescaled from [0, 255] down to [-0.5, 0.5]. - """ - print('Extracting', filename) - with gzip.open(filename) as bytestream: - bytestream.read(16) - buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS) - data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32) - data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH - data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS) - return data - - -def extract_labels(filename, num_images): - """Extract the labels into a vector of int64 label IDs.""" - print('Extracting', filename) - with gzip.open(filename) as bytestream: - bytestream.read(8) - buf = bytestream.read(1 * num_images) - labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.int64) - return labels - - -def fake_data(num_images): - """Generate a fake dataset that matches the dimensions of MNIST.""" - data = numpy.ndarray( - shape=(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS), - dtype=numpy.float32) - labels = numpy.zeros(shape=(num_images,), dtype=numpy.int64) - for image in xrange(num_images): - label = image % 2 - data[image, :, :, 0] = label - 0.5 - labels[image] = label - return data, labels - - -def error_rate(predictions, labels): - """Return the error rate based on dense predictions and sparse labels.""" - return 100.0 - ( - 100.0 * - numpy.sum(numpy.argmax(predictions, 1) == labels) / - predictions.shape[0]) - - -def main(_): - if FLAGS.self_test: - print('Running self-test.') - train_data, train_labels = fake_data(256) - validation_data, validation_labels = fake_data(EVAL_BATCH_SIZE) - test_data, test_labels = fake_data(EVAL_BATCH_SIZE) - num_epochs = 1 - else: - # [HPCNS]: Data files relative to the 'datasets' directory - train_data_filename = 'mnist/raw/train-images-idx3-ubyte.gz' - train_labels_filename = 'mnist/raw/train-labels-idx1-ubyte.gz' - test_data_filename = 'mnist/raw/t10k-images-idx3-ubyte.gz' - test_labels_filename = 'mnist/raw/t10k-labels-idx1-ubyte.gz' - - # [HPCNS]: Update data file information with validated and fully qualified filenames - train_data_filename = os.path.join( - DataValidator.validated_data_dir(train_data_filename), train_data_filename) - train_labels_filename = os.path.join( - DataValidator.validated_data_dir(train_labels_filename), train_labels_filename) - test_data_filename = os.path.join( - DataValidator.validated_data_dir(test_data_filename), test_data_filename) - test_labels_filename 
= os.path.join( - DataValidator.validated_data_dir(test_labels_filename), test_labels_filename) - - # Extract it into numpy arrays. - train_data = extract_data(train_data_filename, 60000) - train_labels = extract_labels(train_labels_filename, 60000) - test_data = extract_data(test_data_filename, 10000) - test_labels = extract_labels(test_labels_filename, 10000) - - # Generate a validation set. - validation_data = train_data[:VALIDATION_SIZE, ...] - validation_labels = train_labels[:VALIDATION_SIZE] - train_data = train_data[VALIDATION_SIZE:, ...] - train_labels = train_labels[VALIDATION_SIZE:] - num_epochs = NUM_EPOCHS - - train_size = train_labels.shape[0] - - # This is where training samples and labels are fed to the graph. - # These placeholder nodes will be fed a batch of training data at each - # training step using the {feed_dict} argument to the Run() call below. - train_data_node = tf.placeholder( - data_type(), - shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) - train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,)) - eval_data = tf.placeholder( - data_type(), - shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)) - - # The variables below hold all the trainable weights. They are passed an - # initial value which will be assigned when we call: - # {tf.global_variables_initializer().run()} - conv1_weights = tf.Variable( - tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32. - stddev=0.1, - seed=SEED, dtype=data_type())) - conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type())) - conv2_weights = tf.Variable(tf.truncated_normal( - [5, 5, 32, 64], stddev=0.1, - seed=SEED, dtype=data_type())) - conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type())) - fc1_weights = tf.Variable( # fully connected, depth 512. - tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512], - stddev=0.1, - seed=SEED, - dtype=data_type())) - fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type())) - fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], - stddev=0.1, - seed=SEED, - dtype=data_type())) - fc2_biases = tf.Variable(tf.constant( - 0.1, shape=[NUM_LABELS], dtype=data_type())) - - # We will replicate the model structure for the training subgraph, as well - # as the evaluation subgraphs, while sharing the trainable parameters. - def model(data, train=False): - """The Model definition.""" - # 2D convolution, with 'SAME' padding (i.e. the output feature map has - # the same size as the input). Note that {strides} is a 4D array whose - # shape matches the data layout: [image index, y, x, depth]. - conv = tf.nn.conv2d(data, - conv1_weights, - strides=[1, 1, 1, 1], - padding='SAME') - # Bias and rectified linear non-linearity. - relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases)) - # Max pooling. The kernel size spec {ksize} also follows the layout of - # the data. Here we have a pooling window of 2, and a stride of 2. - pool = tf.nn.max_pool(relu, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME') - conv = tf.nn.conv2d(pool, - conv2_weights, - strides=[1, 1, 1, 1], - padding='SAME') - relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases)) - pool = tf.nn.max_pool(relu, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME') - # Reshape the feature map cuboid into a 2D matrix to feed it to the - # fully connected layers. 
- pool_shape = pool.get_shape().as_list() - reshape = tf.reshape( - pool, - [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]]) - # Fully connected layer. Note that the '+' operation automatically - # broadcasts the biases. - hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases) - # Add a 50% dropout during training only. Dropout also scales - # activations such that no rescaling is needed at evaluation time. - if train: - hidden = tf.nn.dropout(hidden, 0.5, seed=SEED) - return tf.matmul(hidden, fc2_weights) + fc2_biases - - # Training computation: logits + cross-entropy loss. - logits = model(train_data_node, True) - loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=train_labels_node, logits=logits)) - - # L2 regularization for the fully connected parameters. - regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) + - tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases)) - # Add the regularization term to the loss. - loss += 5e-4 * regularizers - - # Optimizer: set up a variable that's incremented once per batch and - # controls the learning rate decay. - batch = tf.Variable(0, dtype=data_type()) - # Decay once per epoch, using an exponential schedule starting at 0.01. - learning_rate = tf.train.exponential_decay( - 0.01, # Base learning rate. - batch * BATCH_SIZE, # Current index into the dataset. - train_size, # Decay step. - 0.95, # Decay rate. - staircase=True) - # Use simple momentum for the optimization. - optimizer = tf.train.MomentumOptimizer(learning_rate, - 0.9).minimize(loss, - global_step=batch) - - # Predictions for the current training minibatch. - train_prediction = tf.nn.softmax(logits) - - # Predictions for the test and validation, which we'll compute less often. - eval_prediction = tf.nn.softmax(model(eval_data)) - - # Small utility function to evaluate a dataset by feeding batches of data to - # {eval_data} and pulling the results from {eval_predictions}. - # Saves memory and enables this to run on smaller GPUs. - def eval_in_batches(data, sess): - """Get all predictions for a dataset by running it in small batches.""" - size = data.shape[0] - if size < EVAL_BATCH_SIZE: - raise ValueError("batch size for evals larger than dataset: %d" % size) - predictions = numpy.ndarray(shape=(size, NUM_LABELS), dtype=numpy.float32) - for begin in xrange(0, size, EVAL_BATCH_SIZE): - end = begin + EVAL_BATCH_SIZE - if end <= size: - predictions[begin:end, :] = sess.run( - eval_prediction, - feed_dict={eval_data: data[begin:end, ...]}) - else: - batch_predictions = sess.run( - eval_prediction, - feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]}) - predictions[begin:, :] = batch_predictions[begin - size:, :] - return predictions - - # Create a local session to run the training. - start_time = time.time() - with tf.Session() as sess: - # Run all the initializers to prepare the trainable parameters. - tf.global_variables_initializer().run() - print('Initialized!') - # Loop through training steps. - for step in xrange(int(num_epochs * train_size) // BATCH_SIZE): - # Compute the offset of the current minibatch in the data. - # Note that we could use better randomization across epochs. - offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE) - batch_data = train_data[offset:(offset + BATCH_SIZE), ...] - batch_labels = train_labels[offset:(offset + BATCH_SIZE)] - # This dictionary maps the batch data (as a numpy array) to the - # node in the graph it should be fed to. 
- feed_dict = {train_data_node: batch_data, - train_labels_node: batch_labels} - # Run the optimizer to update weights. - sess.run(optimizer, feed_dict=feed_dict) - # print some extra information once reach the evaluation frequency - if step % EVAL_FREQUENCY == 0: - # fetch some extra nodes' data - l, lr, predictions = sess.run([loss, learning_rate, train_prediction], - feed_dict=feed_dict) - elapsed_time = time.time() - start_time - start_time = time.time() - print('Step %d (epoch %.2f), %.1f ms' % - (step, float(step) * BATCH_SIZE / train_size, - 1000 * elapsed_time / EVAL_FREQUENCY)) - print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr)) - print('Minibatch error: %.1f%%' % error_rate(predictions, batch_labels)) - print('Validation error: %.1f%%' % error_rate( - eval_in_batches(validation_data, sess), validation_labels)) - sys.stdout.flush() - # Finally print the result! - test_error = error_rate(eval_in_batches(test_data, sess), test_labels) - print('Test error: %.1f%%' % test_error) - if FLAGS.self_test: - print('test_error', test_error) - assert test_error == 0.0, 'expected 0.0 test_error, got %.2f' % ( - test_error,) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--use_fp16', - default=False, - help='Use half floats instead of full floats if True.', - action='store_true') - parser.add_argument( - '--self_test', - default=False, - action='store_true', - help='True if running a self test.') - - FLAGS, unparsed = parser.parse_known_args() - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) +# [HPCNS] Name of the dataset file +data_file = 'mnist/keras/mnist.npz' + +# [HPCNS] Path to the directory containing the dataset file +data_dir = DataValidator.validated_data_dir(data_file) + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +gpus = tf.config.experimental.list_physical_devices('GPU') +for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + +# [HPCNS] Fully qualified dataset file name +dataset_file = os.path.join(data_dir, data_file) + +(mnist_images, mnist_labels), _ = \ + tf.keras.datasets.mnist.load_data(dataset_file) + +dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64)) +) +dataset = dataset.repeat().shuffle(10000).batch(128) + +mnist_model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), + tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation='softmax') +]) +loss = tf.losses.SparseCategoricalCrossentropy() + +# Horovod: adjust learning rate based on number of GPUs. +opt = tf.optimizers.Adam(0.001 * hvd.size()) + +checkpoint_dir = 'checkpoints/' +checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt) + + +@tf.function +def training_step(images, labels, first_batch): + with tf.GradientTape() as tape: + probs = mnist_model(images, training=True) + loss_value = loss(labels, probs) + + # Horovod: add Horovod Distributed GradientTape. 
+ tape = hvd.DistributedGradientTape(tape) + + grads = tape.gradient(loss_value, mnist_model.trainable_variables) + opt.apply_gradients(zip(grads, mnist_model.trainable_variables)) + + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + # + # Note: broadcast should be done after the first gradient step to ensure optimizer + # initialization. + if first_batch: + hvd.broadcast_variables(mnist_model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) + + return loss_value + + +# Horovod: adjust number of steps based on number of GPUs. +for batch, (images, labels) in enumerate(dataset.take(10000 // hvd.size())): + loss_value = training_step(images, labels, batch == 0) + + if batch % 10 == 0 and hvd.local_rank() == 0: + print('Step #%d\tLoss: %.6f' % (batch, loss_value)) + +# Horovod: save checkpoints only on worker 0 to prevent other workers from +# corrupting it. +if hvd.rank() == 0: + checkpoint.save(checkpoint_dir) diff --git a/tensorflow/run_on_localMachine.sh b/tensorflow/run_on_localMachine.sh deleted file mode 100644 index 9c5737c9fc9d6bca93e25fca9f785e52320131fc..0000000000000000000000000000000000000000 --- a/tensorflow/run_on_localMachine.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -# Run the program -python -u mnist.py \ No newline at end of file diff --git a/tensorflow/submit_job_jureca.sh b/tensorflow/submit_job_jureca.sh deleted file mode 100755 index fa294f1cb401c9cda6a1c20ab716419a64262e07..0000000000000000000000000000000000000000 --- a/tensorflow/submit_job_jureca.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TFLOW_MNIST -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCCcore/.8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git a/tensorflow/submit_job_juron.sh b/tensorflow/submit_job_juron.sh deleted file mode 100644 index 30fa2043f2059fc8d4d6ac673f52ba0bebb3ac2d..0000000000000000000000000000000000000000 --- a/tensorflow/submit_job_juron.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -#BSUB -q normal -#BSUB -W 10 -#BSUB -n 1 -#BSUB -R "span[ptile=1]" -#BSUB -gpu "num=1" -#BSUB -e "error.%J.er" -#BSUB -o "output_%J.out" -#BSUB -J TENSORFLOW_MNIST - -# Load the required modules -module load python/3.6.1 -module load tensorflow/1.12.0-gcc_5.4.0-cuda_10.0.130 - -# Run the program -python -u mnist.py diff --git a/tensorflow/submit_job_juwels.sh b/tensorflow/submit_job_juwels.sh deleted file mode 100755 index fda7d98fcf5ab5a7f58b09d200d5c56ef258d361..0000000000000000000000000000000000000000 --- a/tensorflow/submit_job_juwels.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash - -# Slurm job configuration -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --output=output_%j.out -#SBATCH --error=error_%j.er -#SBATCH --time=00:10:00 -#SBATCH --job-name=TFLOW_MNIST -#SBATCH --gres=gpu:1 --partition=develgpus -#SBATCH --mail-type=ALL - -# Load the required modules -module load GCC/8.3.0 -module load TensorFlow/1.13.1-GPU-Python-3.6.8 - -# Run the program -srun python -u mnist.py diff --git 
a/training_data_distribution/README.md b/training_data_distribution/README.md new file mode 100644 index 0000000000000000000000000000000000000000..374f62318fba816b5a7a9429f5445081c7456153 --- /dev/null +++ b/training_data_distribution/README.md @@ -0,0 +1,27 @@ +# Introduction + +This example distributes the partitioned MNIST data across multiple ranks +for truly data distributed training of a shallow Artificial Neural Network for +handwritten digit classification. + +The Horovod framework is used for seamless distributed training. However, +instead of distributing epochs, this example distributes data amongst the +ranks, so that each rank contributes training based on its local subset of +the training data. + +# Notes + +The `mnist_data_distributed.py` program requires the [`hpc4neuro.distribution`]( +https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro#1-hpc4neurodistribution) +module for distribution of training data filenames across multiple ranks. Please +follow the steps below to install this package before submitting the training +job. + +1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/training_data_distribution` +2. Load the system-wide Python module: `module load Python/3.8.5` +3. Install the `hpc4neuro` package: + + `pip install --user git+https://gitlab.version.fz-juelich.de/hpc4ns/hpc4neuro.git` + +**Note:** A maximum of eight ranks can be used to run `mnist_data_distributed.py`, +as there are eight training files. diff --git a/training_data_distribution/jureca_job.sh b/training_data_distribution/jureca_job.sh new file mode 100755 index 0000000000000000000000000000000000000000..96a239bb4dad47e175595c67fb4aad531a4fbc62 --- /dev/null +++ b/training_data_distribution/jureca_job.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=dc-gpu-devel + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/training_data_distribution/jusuf_job.sh b/training_data_distribution/jusuf_job.sh new file mode 100755 index 0000000000000000000000000000000000000000..95c262d0af9da54549f42a7e8e6426072ba3f260 --- /dev/null +++ b/training_data_distribution/jusuf_job.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:1 +#SBATCH --partition=develgpus + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/training_data_distribution/juwels_booster_job.sh 
b/training_data_distribution/juwels_booster_job.sh new file mode 100755 index 0000000000000000000000000000000000000000..374f63d6d5e2f899555b974d138d6dd0bd08d858 --- /dev/null +++ b/training_data_distribution/juwels_booster_job.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=develbooster + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/training_data_distribution/juwels_job.sh b/training_data_distribution/juwels_job.sh new file mode 100755 index 0000000000000000000000000000000000000000..b2b764141593c82565b91fce091620aa15abfbc8 --- /dev/null +++ b/training_data_distribution/juwels_job.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=TUTORIAL +#SBATCH --gres=gpu:4 +#SBATCH --partition=develgpus + +# Load the required modules +module load GCC/9.3.0 +module load OpenMPI/4.1.0rc1 +module load mpi4py/3.0.3-Python-3.8.5 +module load TensorFlow/2.3.1-Python-3.8.5 +module load Horovod/0.20.3-Python-3.8.5 + +# Enable MPI multi-threading for Horovod +export HOROVOD_MPI_THREADS_DISABLE=0 + +# Make all GPUs visible per node +export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/horovod_data_distributed/mnist_data_distributed.py b/training_data_distribution/mnist_data_distributed.py similarity index 94% rename from horovod_data_distributed/mnist_data_distributed.py rename to training_data_distribution/mnist_data_distributed.py index d4c68c19174058a41d0198b322a0f4035ef22419..b2335a83ed979ee77d2d30c2dc67d541c87ba2e4 100644 --- a/horovod_data_distributed/mnist_data_distributed.py +++ b/training_data_distribution/mnist_data_distributed.py @@ -20,7 +20,6 @@ import mpi4py import numpy as np import tensorflow as tf import horovod.tensorflow.keras as hvd -from tensorflow.python.keras import backend as K from hpc4neuro.errors import MpiInitError from hpc4neuro.distribution import DataDistributor @@ -102,10 +101,14 @@ def initialize_hvd_and_mpi(): # Bind the local rank to a specific GPU, so that each rank uses # a different GPU - tf_config = tf.ConfigProto() - tf_config.gpu_options.allow_growth = True - tf_config.gpu_options.visible_device_list = str(hvd.local_rank()) - K.set_session(tf.Session(config=tf_config)) + gpus = tf.config.experimental.list_physical_devices('GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + if gpus: + tf.config.experimental.set_visible_devices( + gpus[hvd.local_rank()], + 'GPU' + ) # Verify that MPI multi-threading is supported. Horovod cannot work # with mpi4py (or any other MPI library) otherwise. 
@@ -113,8 +116,9 @@ def initialize_hvd_and_mpi(): # https://www.mcs.anl.gov/research/projects/mpi/mpi-standard/mpi-report-2.0/node163.htm#Node163 if not hvd.mpi_threads_supported(): raise MpiInitError( - 'MPI multi-threading is not supported. Horovod cannot work with mpi4py' - 'in this case. Please enable MPI multi-threading and try again.' + 'MPI multi-threading is not supported. Horovod cannot work with ' + 'mpi4py in this case. Please enable MPI multi-threading and try ' + 'again.' ) # Disable automatic MPI initialization on importing mpi4py.MPI, diff --git a/utils/data_utils.py b/utils/data_utils.py index f2d10e4111fe70fdc465287a3b5f18bff8d6981f..21a57ad884eece630f52bbbc8f8720969ade20c3 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -47,19 +47,19 @@ class DataValidator: if not os.path.exists(data_dir): data_dir = os.path.join(os.path.abspath('../../datasets')) - print('Using {} as the data directory.'.format(data_dir)) + print(f'Using {data_dir} as the data directory.') # Check if the directory exists if not os.path.exists(data_dir): raise DatasetNotFoundError( - '{} refers to a non-existing directory. Please either correctly set ' - 'the DL_TEST_DATA_HOME environment variable, or make sure the datasets are ' - 'available in the project root.'.format(data_dir) + f'{data_dir} refers to a non-existing directory. Please either ' + f'correctly set the DL_TEST_DATA_HOME environment variable, or ' + f'make sure the datasets are available in the project root.' ) if not os.path.exists(os.path.join(data_dir, filename)): raise DatasetNotFoundError( - 'Unable to locate {} in {}'.format(filename, data_dir) + f'Unable to locate {filename} in {data_dir}' ) return data_dir
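
The `utils/data_utils.py` hunk above only modernizes string formatting, but it may help to see how a sample is expected to call `DataValidator`. The following is a minimal usage sketch assembled from the patterns visible in this patch (notably the new `tensorflow/mnist.py`); it is not a file in the repository, and it assumes it is run from a sample directory one level below the repository root.

```python
# Hypothetical usage sketch for utils/data_utils.py, mirroring how the new
# tensorflow/mnist.py resolves its dataset path. Not part of the repository.
import os
import sys

# The samples add the shared utilities directory to the import path.
sys.path.insert(0, '../utils')
from data_utils import DataValidator

# Dataset file relative to the 'datasets' directory.
data_file = 'mnist/keras/mnist.npz'

# Resolves the data directory (via DL_TEST_DATA_HOME or the project-local
# 'datasets' directory) and raises DatasetNotFoundError if the file is missing.
data_dir = DataValidator.validated_data_dir(data_file)

# Fully qualified dataset file name, as used by the samples.
dataset_file = os.path.join(data_dir, data_file)
print(f'Dataset resolved to {dataset_file}')
```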
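
The new `tensorflow/mnist.py` trains with a custom `tf.GradientTape` loop. For readers who prefer `model.fit`, the sketch below shows the equivalent pattern through Horovod's Keras binding (`horovod.tensorflow.keras`, the module that `mnist_data_distributed.py` imports). It is a minimal illustration, not code from this repository: the local `mnist.npz` path and the tiny dense model are placeholders, and only the documented Horovod pieces (learning-rate scaling, `DistributedOptimizer`, the broadcast callback, rank-0-only output) are assumed.

```python
# Hypothetical sketch: Horovod data-parallel training via the Keras API.
import numpy as np
import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

# Pin each process to a single GPU, as in the custom-loop example.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

# Assumes a local copy of the Keras MNIST archive (e.g., resolved via
# DataValidator); the key names below are the standard ones in mnist.npz.
with np.load('mnist.npz') as f:
    x_train, y_train = f['x_train'] / 255.0, f['y_train']

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax'),
])

# Horovod: scale the learning rate by the number of workers and wrap the
# optimizer so gradients are averaged across ranks.
opt = hvd.DistributedOptimizer(tf.optimizers.Adam(0.001 * hvd.size()))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [
    # Broadcast initial variable states from rank 0 so all workers start
    # from the same weights.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
]

# Keep progress output (and any checkpointing) on rank 0 only.
model.fit(x_train, y_train, batch_size=128, epochs=1,
          callbacks=callbacks, verbose=1 if hvd.rank() == 0 else 0)
```

Launched under MPI (e.g., `srun` with one task per GPU, as in the job scripts above), each rank trains on its own batches while Horovod averages the gradients.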
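
The `training_data_distribution/README.md` added in this patch describes distributing the training *data* (rather than epochs) across ranks, with the actual splitting handled by `hpc4neuro.distribution.DataDistributor`. The sketch below illustrates the same idea with plain `mpi4py` only, since the exact `DataDistributor` API is not reproduced here; the partition file names are made up for illustration.

```python
# Hypothetical sketch of "distribute data, not epochs" using plain mpi4py.
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

if rank == 0:
    # E.g., eight partitioned training files, so at most eight ranks are useful.
    filenames = [f'train_{i}.npz' for i in range(8)]
    # One chunk of file names per rank (simple round-robin split).
    chunks = [filenames[i::size] for i in range(size)]
else:
    chunks = None

# Each rank receives only its own subset of the training files and would then
# build its local tf.data pipeline from them.
local_files = comm.scatter(chunks, root=0)
print(f'Rank {rank} trains on: {local_files}')
```

Run under MPI (e.g., `srun -n 4 python sketch.py`), each rank prints a disjoint subset of the file list, which is the behaviour the README describes for `mnist_data_distributed.py`.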