diff --git a/.gitattributes b/.gitattributes index 775c8feb9d6f8d5925ddcf5dc75f13c612e17a3e..dbf6f0e70d6c4d59aec7f4640ea71f42aeafb226 100644 --- a/.gitattributes +++ b/.gitattributes @@ -9,3 +9,7 @@ datasets/mnist/raw/t10k-images-idx3-ubyte.gz filter=lfs diff=lfs merge=lfs -text datasets/mnist/raw/t10k-labels-idx1-ubyte.gz filter=lfs diff=lfs merge=lfs -text datasets/mnist/raw/train-images-idx3-ubyte.gz filter=lfs diff=lfs merge=lfs -text datasets/mnist/raw/train-labels-idx1-ubyte.gz filter=lfs diff=lfs merge=lfs -text +datasets/mnist/partitioned/train/x/*.npy filter=lfs diff=lfs merge=lfs -text +datasets/mnist/partitioned/train/y/*.npy filter=lfs diff=lfs merge=lfs -text +datasets/mnist/partitioned/test/x/*.npy filter=lfs diff=lfs merge=lfs -text +datasets/mnist/partitioned/test/y/*.npy filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 7f5ba6e51700856088e0b4379f4163ef9baf1bb6..9c4d6d54d73dda0e2cb980abdc7a3ae2181155cb 100644 --- a/.gitignore +++ b/.gitignore @@ -117,4 +117,4 @@ mnist_convnet_model/ # Error and output files from the supercomputers *.er -*.out \ No newline at end of file +*.out diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..78d9698282c7162167dfe5930388a7d8eae47117 --- /dev/null +++ b/LICENSE @@ -0,0 +1,26 @@ +All contents of this work, except for the contents of the "datasets/mnist" +sub-directory are licensed under The MIT License (see license details below). +Contents of the "datasets/mnist" sub-directory are licensed under the Creative +Commons Attribution-ShareAlike 3.0 Unported License (see "datasets/mnist/LICENSE"). 
+ +MIT License + +Copyright (c) 2019 Forschungszentrum Juelich GmbH + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..22a9d695eaaf27a76217db1873fd8544854f4451 --- /dev/null +++ b/NOTICE @@ -0,0 +1,71 @@ +This project includes derived work from the following: + + +Horovod +Copyright 2018 Uber Technologies, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + + +Tensorflow +Copyright 2016 The TensorFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + + +Keras +All contributions by François Chollet: +Copyright (c) 2015 - 2019, François Chollet. +All rights reserved. + +All contributions by Google: +Copyright (c) 2015 - 2019, Google, Inc. +All rights reserved. + +All contributions by Microsoft: +Copyright (c) 2017 - 2019, Microsoft, Inc. +All rights reserved. + +All other contributions: +Copyright (c) 2015 - 2019, the respective contributors. +All rights reserved. + +Licensed under The MIT License (MIT) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index a36ef6218ebf89ecdcbb102c57796fcf9f717e4d..dc59596c5436f8576836a23bc79a8f5dbf679fb5 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,17 @@ visit [this](https://gitlab.version.fz-juelich.de/MLDL_FZJ/MLDL_FZJ_Wiki/wikis/E ### Announcements -* Tensorflow and Keras examples (with and without Horovod) are now fully functional on JUWELS as well. -* Python 2 support has been removed from the tutorial for all frameworks except Caffe. -* Even though PyTorch is available as as system-wide module on the JSC supercomputers, all PyTorch +* **November 18, 2019:** The `horovod_data_distributed` directory has been added that contains code +samples to illustrate proper data-distributed training with Horovod, i.e., a distribution mechanism +where the training data is distributed instead of epochs. Further information is available in the +directory-local `README.md`. +* **September 02, 2019:** Even though PyTorch is available as a system-wide module on the JSC supercomputers, all PyTorch examples have been removed from this tutorial. This is due to the fact that the tutorial developers are not currently working with PyTorch, and are therefore not in a position to provide support for PyTorch related issues. +* **August 23, 2019:** + * Tensorflow and Keras examples (with and without Horovod) are now fully functional on JUWELS as well. + * Python 2 support has been removed from the tutorial for all frameworks except Caffe. 
# Table of contents <!-- TOC --> diff --git a/datasets/mnist/LICENSE b/datasets/mnist/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..65f46d5ba1251569045f48757b724f95dfa8a488 --- /dev/null +++ b/datasets/mnist/LICENSE @@ -0,0 +1,3 @@ +The mnist directory is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. +To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/ or send +a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. diff --git a/datasets/mnist/NOTICE b/datasets/mnist/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..ff1a1c31ca00ff94110cbcfc389c8491412c9768 --- /dev/null +++ b/datasets/mnist/NOTICE @@ -0,0 +1,12 @@ +The contents of the mnist directory are derived from the MNIST dataset: + +Yann LeCun (Courant Institute, NYU) and Corinna Cortes (Google Labs, New York) +hold the copyright of MNIST dataset (http://yann.lecun.com/exdb/mnist), which is +a derivative work from original NIST datasets. MNIST dataset is made available +under the terms of the Creative Commons Attribution-Share Alike 3.0 license. The +license details are available via the following URL: + +http://creativecommons.org/licenses/by-sa/3.0/ + +Individual images and labels have not been changed in this work. The only changes +made are to the dataset format. 
diff --git a/datasets/mnist/partitioned/test/x/0.npy b/datasets/mnist/partitioned/test/x/0.npy new file mode 100644 index 0000000000000000000000000000000000000000..23c2d04cf37739ae508d87d7d9bf717737c36a68 --- /dev/null +++ b/datasets/mnist/partitioned/test/x/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d337b85aa5761f401dd8ef5485ec8365a0254febdf02dc75abd97e852e46672b +size 980128 diff --git a/datasets/mnist/partitioned/test/x/1.npy b/datasets/mnist/partitioned/test/x/1.npy new file mode 100644 index 0000000000000000000000000000000000000000..a24cb47c7acf5aa32333bc5d201aae4af54fcbb8 --- /dev/null +++ b/datasets/mnist/partitioned/test/x/1.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b13a84dd3c642af5a36cd64deda794dec5afe57985db5ae1ef70b2ab9c2c3da +size 980128 diff --git a/datasets/mnist/partitioned/test/x/2.npy b/datasets/mnist/partitioned/test/x/2.npy new file mode 100644 index 0000000000000000000000000000000000000000..a4261b3825bcd94331fb65e0d1be021d2bd7e44e --- /dev/null +++ b/datasets/mnist/partitioned/test/x/2.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce66171e3c983e0a7ac3bfd1e526fa311e1b635d24f3aaffbe9409037e369d88 +size 980128 diff --git a/datasets/mnist/partitioned/test/x/3.npy b/datasets/mnist/partitioned/test/x/3.npy new file mode 100644 index 0000000000000000000000000000000000000000..726e572d4c4d2e46e58e4091ba6f8461538ab6d9 --- /dev/null +++ b/datasets/mnist/partitioned/test/x/3.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfbc845e3668427f859c6dd687e1c440ede252bdb3f51349bb0250a7fc920c6a +size 980128 diff --git a/datasets/mnist/partitioned/test/x/4.npy b/datasets/mnist/partitioned/test/x/4.npy new file mode 100644 index 0000000000000000000000000000000000000000..47dc5edca512450988b570af306e77f3c009bae4 --- /dev/null +++ b/datasets/mnist/partitioned/test/x/4.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3842852bf7abd261b394b32432c97c922c2d4e566ae8a824bec762e96576f974 +size 980128 diff --git a/datasets/mnist/partitioned/test/x/5.npy b/datasets/mnist/partitioned/test/x/5.npy new file mode 100644 index 0000000000000000000000000000000000000000..6c18938cf8a047d24c51473b8507d6a53811d6ba --- /dev/null +++ b/datasets/mnist/partitioned/test/x/5.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56c0f2d2e2e6b4aae57a7f7bfd9cf4a35c81640095ca96c00f0ba6ea487dccb5 +size 980128 diff --git a/datasets/mnist/partitioned/test/x/6.npy b/datasets/mnist/partitioned/test/x/6.npy new file mode 100644 index 0000000000000000000000000000000000000000..2e08250eef0a7f87e66f3d2ad915480665d25447 --- /dev/null +++ b/datasets/mnist/partitioned/test/x/6.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2abcd772fa7008092d33452de27d91ee6865f4eedf876e5ad07265a3ec6a33ee +size 980128 diff --git a/datasets/mnist/partitioned/test/x/7.npy b/datasets/mnist/partitioned/test/x/7.npy new file mode 100644 index 0000000000000000000000000000000000000000..7c3fea6af5ee8f90c9b1c01740642cd3c05ec965 --- /dev/null +++ b/datasets/mnist/partitioned/test/x/7.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7241c5639af8ad3c0197f8f6da20a6a6bea510e5fe8dc340e44a270f6f1efae +size 980128 diff --git a/datasets/mnist/partitioned/test/y/0.npy b/datasets/mnist/partitioned/test/y/0.npy new file mode 100644 index 0000000000000000000000000000000000000000..e73ef7cf9a0966fb3d36a40eb6d9a8c4c6ede7b2 --- /dev/null +++ b/datasets/mnist/partitioned/test/y/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:292a20ed0440011df9e0057018d8b0e2712963f206b2356f72294dc56d2cd305 +size 1378 diff --git a/datasets/mnist/partitioned/test/y/1.npy b/datasets/mnist/partitioned/test/y/1.npy new file mode 100644 index 0000000000000000000000000000000000000000..1bdbf1bfa4029601a39a71d79cf3117e06e1c3a2 --- /dev/null +++ 
b/datasets/mnist/partitioned/test/y/1.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:003428c745e9bf48a3c297dfce0465a2524557a13f4e8b9cc0c3c52c85b87ce0 +size 1378 diff --git a/datasets/mnist/partitioned/test/y/2.npy b/datasets/mnist/partitioned/test/y/2.npy new file mode 100644 index 0000000000000000000000000000000000000000..53ae68c609444ba2c22d78b896b3f2ff0067934e --- /dev/null +++ b/datasets/mnist/partitioned/test/y/2.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d836ac22edbec15473d650cbe049473cc287d31d2b5e229a0b6ddb66edd20057 +size 1378 diff --git a/datasets/mnist/partitioned/test/y/3.npy b/datasets/mnist/partitioned/test/y/3.npy new file mode 100644 index 0000000000000000000000000000000000000000..fe6e890887c615a513aa3b7561dd75845d281e8c --- /dev/null +++ b/datasets/mnist/partitioned/test/y/3.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3074d0129a4571fd780fc94e17925122408aed7fcb5bb5425be00c2c26c7b1e1 +size 1378 diff --git a/datasets/mnist/partitioned/test/y/4.npy b/datasets/mnist/partitioned/test/y/4.npy new file mode 100644 index 0000000000000000000000000000000000000000..21cdec7f772677ee2445d1840e3495e9b79eb88d --- /dev/null +++ b/datasets/mnist/partitioned/test/y/4.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16323c30f940623002003fd2ee96029b6c2b1fbb09583408499ada4748a0c537 +size 1378 diff --git a/datasets/mnist/partitioned/test/y/5.npy b/datasets/mnist/partitioned/test/y/5.npy new file mode 100644 index 0000000000000000000000000000000000000000..01f1a0716b946d47462561f6964c9bc0504bddae --- /dev/null +++ b/datasets/mnist/partitioned/test/y/5.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381c928395e7b1e17d537d4f4d42d752d18e5371a511a69d2038ab4f8d828aa3 +size 1378 diff --git a/datasets/mnist/partitioned/test/y/6.npy b/datasets/mnist/partitioned/test/y/6.npy new file mode 100644 index 
0000000000000000000000000000000000000000..9d5e67fdca97f1b0786d6bd1ea2934a67ba9bafd --- /dev/null +++ b/datasets/mnist/partitioned/test/y/6.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b6692f8f3e1623f9db7f1f0badda307d121538090418b83ba32a04b666809dc +size 1378 diff --git a/datasets/mnist/partitioned/test/y/7.npy b/datasets/mnist/partitioned/test/y/7.npy new file mode 100644 index 0000000000000000000000000000000000000000..61a67b2afd1e1bb502829cc30e4cc6b068545782 --- /dev/null +++ b/datasets/mnist/partitioned/test/y/7.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d535f9dce9161fd74c357de58abc7c4a1ce6364c553837a4351d56f97f26ca53 +size 1378 diff --git a/datasets/mnist/partitioned/train/x/0.npy b/datasets/mnist/partitioned/train/x/0.npy new file mode 100644 index 0000000000000000000000000000000000000000..f8974773849023ecd8cbc2354c26627afcb28fc3 --- /dev/null +++ b/datasets/mnist/partitioned/train/x/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ab9b1dd7a661bea99a3b0dd91e81904b03855ea725e274fe8bd041780cf18f +size 5880128 diff --git a/datasets/mnist/partitioned/train/x/1.npy b/datasets/mnist/partitioned/train/x/1.npy new file mode 100644 index 0000000000000000000000000000000000000000..26f34102e2a7f2099e3b36cec098613ffd909e90 --- /dev/null +++ b/datasets/mnist/partitioned/train/x/1.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4cbee1c3c137ba8fe6033d11a5f097eba3d581b14f813e57d5bb1a39be03b2c +size 5880128 diff --git a/datasets/mnist/partitioned/train/x/2.npy b/datasets/mnist/partitioned/train/x/2.npy new file mode 100644 index 0000000000000000000000000000000000000000..a6a82251bf6c7df52b57b79276ab8bdbd8765820 --- /dev/null +++ b/datasets/mnist/partitioned/train/x/2.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:507c0aa68fb2e56ef97b87f4d4033a51a05657980d1c721fd3247ba6ab78ecdd +size 5880128 diff --git 
a/datasets/mnist/partitioned/train/x/3.npy b/datasets/mnist/partitioned/train/x/3.npy new file mode 100644 index 0000000000000000000000000000000000000000..603237a36583e273446ff5005cb583f79a06eb08 --- /dev/null +++ b/datasets/mnist/partitioned/train/x/3.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f31fb1045ed250ac6d0f1db02181821232ecf9fde11186ffdaffb41aa57f422 +size 5880128 diff --git a/datasets/mnist/partitioned/train/x/4.npy b/datasets/mnist/partitioned/train/x/4.npy new file mode 100644 index 0000000000000000000000000000000000000000..bdb707e7d8181edd9456a12d60a4d8f0ad14f43a --- /dev/null +++ b/datasets/mnist/partitioned/train/x/4.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2368ef45033da7492e3b5c5819d534158ca7b776cf4388bc8250bdcefd64bff5 +size 5880128 diff --git a/datasets/mnist/partitioned/train/x/5.npy b/datasets/mnist/partitioned/train/x/5.npy new file mode 100644 index 0000000000000000000000000000000000000000..3a7a11b919696928a9ed9e91efec7eca25742439 --- /dev/null +++ b/datasets/mnist/partitioned/train/x/5.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:208beab7854df575b67b40191720c52866893779ea145e13a6da3e38b8fe7352 +size 5880128 diff --git a/datasets/mnist/partitioned/train/x/6.npy b/datasets/mnist/partitioned/train/x/6.npy new file mode 100644 index 0000000000000000000000000000000000000000..f598c3e0ae662e43333174da9603680f201936ff --- /dev/null +++ b/datasets/mnist/partitioned/train/x/6.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e5313f1e80e0270b61982dddecaf68dd06cef978b4314af165cdcc39970e164 +size 5880128 diff --git a/datasets/mnist/partitioned/train/x/7.npy b/datasets/mnist/partitioned/train/x/7.npy new file mode 100644 index 0000000000000000000000000000000000000000..3a7db3ddde1d61263eff213def5d9a0c51e189ce --- /dev/null +++ b/datasets/mnist/partitioned/train/x/7.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ffe736405b00cef238fd06ffae326aba880400a1cf2e953c8f5ef5543f4e7c06 +size 5880128 diff --git a/datasets/mnist/partitioned/train/y/0.npy b/datasets/mnist/partitioned/train/y/0.npy new file mode 100644 index 0000000000000000000000000000000000000000..1fe9885c042696693f31815b65c731d999083d6e --- /dev/null +++ b/datasets/mnist/partitioned/train/y/0.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:581d31f1482fd9ae2ad93bf018a31e24dd0b87c5e8299fa062b5b955ffff7f5e +size 7628 diff --git a/datasets/mnist/partitioned/train/y/1.npy b/datasets/mnist/partitioned/train/y/1.npy new file mode 100644 index 0000000000000000000000000000000000000000..046dacf991d5c364d9b778ca8ff325a70835db29 --- /dev/null +++ b/datasets/mnist/partitioned/train/y/1.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a14441423517041fb8fe61312b60754cd0d4eeb8956ef1060a7f57780a768367 +size 7628 diff --git a/datasets/mnist/partitioned/train/y/2.npy b/datasets/mnist/partitioned/train/y/2.npy new file mode 100644 index 0000000000000000000000000000000000000000..b257e2324424b2d84068392a50d91c48fe358e19 --- /dev/null +++ b/datasets/mnist/partitioned/train/y/2.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0234fa84a17b72bdd6ae29c87b4d2c1efccaec8f29e79adf9445a9cd008cd12 +size 7628 diff --git a/datasets/mnist/partitioned/train/y/3.npy b/datasets/mnist/partitioned/train/y/3.npy new file mode 100644 index 0000000000000000000000000000000000000000..659e67079e1dcf3f6085ca6f510589fff5469d66 --- /dev/null +++ b/datasets/mnist/partitioned/train/y/3.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83c8af4ccb59b3a0343fd0f43db4b3b6796962fb871fa4a1e3470a7e506469c0 +size 7628 diff --git a/datasets/mnist/partitioned/train/y/4.npy b/datasets/mnist/partitioned/train/y/4.npy new file mode 100644 index 0000000000000000000000000000000000000000..a5c22bdabce889323e7dab44d39d30fba38aac3a --- /dev/null +++ 
b/datasets/mnist/partitioned/train/y/4.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:810b5e113d4913a7414b38294c8508e5607e0ff4a22c1e6c3c6fa7221ac37e40 +size 7628 diff --git a/datasets/mnist/partitioned/train/y/5.npy b/datasets/mnist/partitioned/train/y/5.npy new file mode 100644 index 0000000000000000000000000000000000000000..512d28d3f550cef7a19ef1587e98535f8fa3be18 --- /dev/null +++ b/datasets/mnist/partitioned/train/y/5.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64c9e94f8b223ad58e2b19341d2b2f2d69823703a2bb664ae8a55565157136fc +size 7628 diff --git a/datasets/mnist/partitioned/train/y/6.npy b/datasets/mnist/partitioned/train/y/6.npy new file mode 100644 index 0000000000000000000000000000000000000000..f7ad45ead73980972c384355d03f42d9f7edbad4 --- /dev/null +++ b/datasets/mnist/partitioned/train/y/6.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ff847487ba5695f4d39241efdb985aeb4e8e7cc70df5bac309bbe9f5025b292 +size 7628 diff --git a/datasets/mnist/partitioned/train/y/7.npy b/datasets/mnist/partitioned/train/y/7.npy new file mode 100644 index 0000000000000000000000000000000000000000..18a15d6e5871c68629244094981ff03c32aab9b2 --- /dev/null +++ b/datasets/mnist/partitioned/train/y/7.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c47569491fc4db006a4b173b2e4dc6f866390598342fa41b0b57e5b5e5f03ff0 +size 7628 diff --git a/horovod/keras/mnist.py b/horovod/keras/mnist.py index e31aa8a009e2fee923023dc18d5e5979c6d70203..0c46a771047d0adfa6d61017176d6ef2c6de0d67 100644 --- a/horovod/keras/mnist.py +++ b/horovod/keras/mnist.py @@ -1,3 +1,8 @@ +# Copyright (c) 2019 Forschungszentrum Juelich GmbH. +# This code is licensed under MIT license (see the LICENSE file for details). +# This code is derived from https://github.com/horovod/horovod/blob/master/examples/keras_mnist.py, +# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). 
+ from __future__ import print_function import os import sys diff --git a/horovod/keras/mnist_advanced.py b/horovod/keras/mnist_advanced.py index bf52fddb7e312d10e4a4bd7a0da7f9ab87368ff3..ba60b6d64d61feac9e19e0af213b5134087f887b 100644 --- a/horovod/keras/mnist_advanced.py +++ b/horovod/keras/mnist_advanced.py @@ -1,3 +1,9 @@ +# Copyright (c) 2019 Forschungszentrum Juelich GmbH. +# This code is licensed under MIT license (see the LICENSE file for details). +# This code is derived from https://github.com/horovod/horovod/blob/master/examples/keras_mnist_advanced.py, +# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). + + from __future__ import print_function import os import sys diff --git a/horovod/tensorflow/mnist.py b/horovod/tensorflow/mnist.py index 8099f1c22a3927c9b38adb7375a60f752b28acf2..3c780accef6f40d6bb3c95196f4feb69aafb96fe 100644 --- a/horovod/tensorflow/mnist.py +++ b/horovod/tensorflow/mnist.py @@ -1,17 +1,7 @@ -# Copyright 2017 Uber Technologies, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== +# Copyright (c) 2019 Forschungszentrum Juelich GmbH. +# This code is licensed under MIT license (see the LICENSE file for details). 
+# This code is derived from https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist.py, +# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). import os import sys diff --git a/horovod/tensorflow/mnist_estimator.py b/horovod/tensorflow/mnist_estimator.py index 861de50549b470685462a688643dbf3cd8e86288..792c0577f5e6324eddca6e54d23d6669a21ab3c4 100644 --- a/horovod/tensorflow/mnist_estimator.py +++ b/horovod/tensorflow/mnist_estimator.py @@ -1,17 +1,8 @@ -# Copyright 2018 Uber Technologies, Inc. All Rights Reserved. -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Copyright (c) 2019 Forschungszentrum Juelich GmbH. +# This code is licensed under MIT license (see the LICENSE file for details). +# This code is derived from https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist_estimator.py, +# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). + """Convolutional Neural Network Estimator for MNIST, built with tf.layers.""" from __future__ import absolute_import diff --git a/horovod_data_distributed/README.md b/horovod_data_distributed/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c0934fc91d3305c4cecf1d144d9f3c204c848a4b --- /dev/null +++ b/horovod_data_distributed/README.md @@ -0,0 +1,24 @@ +# Introduction + +Please see the main docstring in each program for details. 
+ +# Notes + +The `mnist_data_distributed.py` program requires the [`slns.distribution`]( +https://gitlab.version.fz-juelich.de/hpc4ns/slns_utils#1-slnsdistribution) +module for distribution of training data filenames across multiple ranks. +Please follow the steps below to install the required package. + +1. Change to the source directory for this sample, i.e., to `dl_on_supercomputers/horovod_data_distributed` +2. Load the system-wide Python module. + * On JURECA and JUWELS: `module load Python/3.6.8` + * On JURON: `module load Python/3.6.1` +3. Create a Python virtual environment: `python -m venv venv_dl_slns` +4. Activate the virtual environment: `source venv_dl_slns/bin/activate` +5. Install the `slns` package: `python -m pip install git+https://gitlab.version.fz-juelich.de/hpc4ns/slns_utils.git` +6. Open the job submission script you intend to use, and make sure the path to the virtual environment is correct + +Once all the above steps are completed, the job can be submitted. + +**Note:** A maximum of eight ranks can be used to run `mnist_data_distributed.py`, as there are +only eight training files. \ No newline at end of file diff --git a/horovod_data_distributed/mnist_data_distributed.py b/horovod_data_distributed/mnist_data_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..b6add7e7e9210f1b734e02a8475b71ab1e3e0181 --- /dev/null +++ b/horovod_data_distributed/mnist_data_distributed.py @@ -0,0 +1,220 @@ +# Copyright (c) 2019 Forschungszentrum Juelich GmbH. +# This code is licensed under MIT license (see the LICENSE file for details). + +""" + This program distributes the partitioned MNIST data across multiple ranks + for truly data distributed training of a shallow ANN for handwritten digit + classification. + + The Horovod framework is used for seamless distributed training. 
However, + instead of distributing epochs, this program distributes data amongst the + ranks, so that each rank contributes training based on its local subset of + the training data. + +""" + +import os +import sys + +import mpi4py +import numpy as np +import tensorflow as tf +import horovod.tensorflow.keras as hvd +from tensorflow.python.keras import backend as K + +from slns.errors import MpiInitError +from slns.distribution import DataDistributor + +sys.path.insert(0, '../../utils') +from data_utils import DataValidator + + +def get_filenames(path): + """ + Returns a sorted list of names of the files in the 'x' sub-directory of the given path. + + :param path: str. Valid path to an existing directory that contains an 'x' sub-directory. + + :return: list. A sorted list of filenames, where each filename is + of type str. + """ + + absolute_path = os.path.join(os.path.abspath(f'{path}/x')) + + return sorted(os.listdir(absolute_path))  # os.listdir() order is arbitrary + + +def get_concatenated_data(path, filenames): + """ + Loads all files with the given filenames from the given path, + and concatenates all the loaded tensors into one large + tensor. + + :param path: str. Valid path to an existing directory. + :param filenames: list. A list of filenames, where each filename is + of type str. + + :return: np.ndarray. A tensor with all the loaded content. + """ + + arrays = [ + np.load(os.path.join(path, f)) for f in filenames + ] + + return np.concatenate(arrays) + + +def load_dataset(path, filenames): + """ + Loads the input data and the corresponding labels as + two np.ndarray types, and returns these as a tuple. + + :param path: str. Valid path to an existing directory. + :param filenames: list. A list of filenames, where each filename is + of type str. + + :return: Tuple consisting two np.ndarray types. The value at + the first tuple index is the input tensor, while the + other value is the corresponding array of labels. 
+ """ + + x_dir = os.path.join(os.path.abspath(f'{path}/x')) + y_dir = os.path.join(os.path.abspath(f'{path}/y')) + + x = get_concatenated_data(x_dir, filenames) + y = get_concatenated_data(y_dir, filenames) + + return x, y + + +def initialize_hvd_and_mpi(): + """ + Configure and initialize Horovod and MPI. Also, make sure there + are no conflicts between Horovod and mpi4py communicator + initialization. + + :exception: slns.errors.MpiInitError is raised in the case + of initialization failure. + """ + + # Initialize Horovod. + hvd.init() + + # Bind the local rank to a specific GPU, so that each rank uses + # a different GPU + tf_config = tf.ConfigProto() + tf_config.gpu_options.allow_growth = True + tf_config.gpu_options.visible_device_list = str(hvd.local_rank()) + K.set_session(tf.Session(config=tf_config)) + + # Verify that MPI multi-threading is supported. Horovod cannot work + # with mpi4py (or any other MPI library) otherwise. + # More info on MPI multi-threading: + # https://www.mcs.anl.gov/research/projects/mpi/mpi-standard/mpi-report-2.0/node163.htm#Node163 + if not hvd.mpi_threads_supported(): + raise MpiInitError( + 'MPI multi-threading is not supported. Horovod cannot work with mpi4py' + ' in this case. Please enable MPI multi-threading and try again.' + ) + + # Disable automatic MPI initialization on importing mpi4py.MPI, + # as we are relying on Horovod to take care of the initialization. + mpi4py.rc.initialize = False + + # Verify that Horovod and mpi4py are using the same number of ranks + from mpi4py import MPI + if hvd.size() != MPI.COMM_WORLD.Get_size(): + raise MpiInitError( + 'Mismatch in hvd.size() and MPI.COMM_WORLD size.' + f' No. of ranks in Horovod: {hvd.size()}.' + f' No. of ranks in mpi4py: {MPI.COMM_WORLD.Get_size()}' + ) + + +def main(): + """ Orchestrates the distributed training program. 
""" + + # Configure and initialize Horovod and mpi4py + initialize_hvd_and_mpi() + + # Flag to indicate whether this is the root rank (Horovod rank 0) + is_root = hvd.rank() == 0 + + # Decorate get_filenames so that it returns only the subset of filenames + # to be processed by the local rank. mpi4py.MPI is accessible here because + # initialize_hvd_and_mpi() has already imported it. + dist_decorator = DataDistributor( + mpi_comm=mpi4py.MPI.COMM_WORLD, shutdown_on_error=True + ) + get_rank_local_filenames = dist_decorator(get_filenames) + + # Data directory paths + data_sub_dir = 'mnist/partitioned' + data_dir = DataValidator.validated_data_dir(data_sub_dir) + + # Prepare training data from the files assigned to this rank + train_filenames = get_rank_local_filenames( + f'{os.path.join(data_dir, data_sub_dir)}/train') + x_train, y_train = load_dataset( + f'{os.path.join(data_dir, data_sub_dir)}/train', train_filenames) + + # Normalize input samples + x_train = x_train / 255.0 + + if is_root: + # Prepare test data; the complete test set is loaded on the root only + test_filenames = get_filenames( + f'{os.path.join(data_dir, data_sub_dir)}/test') + x_test, y_test = load_dataset( + f'{os.path.join(data_dir, data_sub_dir)}/test', test_filenames) + x_test = x_test / 255.0 + else: + x_test, y_test = None, None + + # Define the model, i.e., the network + model = tf.keras.models.Sequential([ + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(512, activation=tf.nn.relu), + tf.keras.layers.Dense(10, activation=tf.nn.softmax) + ]) + + # Optimizer (Adam with its default settings) + optimizer = tf.keras.optimizers.Adam() + + # Decorate the optimizer with the Horovod Distributed Optimizer + optimizer = hvd.DistributedOptimizer(optimizer) + + # Compile the model + model.compile( + optimizer=optimizer, + loss='sparse_categorical_crossentropy', + metrics=['accuracy'] + ) + + # Fixed No. 
of epochs + epochs = 24 + + # Training callbacks + callbacks = [ + hvd.callbacks.BroadcastGlobalVariablesCallback(0) + ] + + # Train the model using the training set + model.fit( + x=x_train, + y=y_train, + batch_size=32, + epochs=epochs, + verbose=1 if is_root else 0, + callbacks=callbacks + ) + + if is_root: + # Test the model on the test set + score = model.evaluate(x=x_test, y=y_test, verbose=0) + print('Test loss:', score[0]) + print('Test accuracy:', score[1]) + + +if __name__ == '__main__': + main() diff --git a/horovod_data_distributed/submit_job_juwels.sh b/horovod_data_distributed/submit_job_juwels.sh new file mode 100755 index 0000000000000000000000000000000000000000..24f50d842239dcaecd8c4942d1a27f189919a352 --- /dev/null +++ b/horovod_data_distributed/submit_job_juwels.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# Slurm job configuration +#SBATCH --nodes=2 +#SBATCH --ntasks=8 +#SBATCH --ntasks-per-node=4 +#SBATCH --output=output_%j.out +#SBATCH --error=error_%j.er +#SBATCH --time=00:10:00 +#SBATCH --job-name=HVD_DATA_DIST +#SBATCH --gres=gpu:4 --partition=develgpus +#SBATCH --mail-type=ALL + +# Load the required modules +module load GCC/8.3.0 +module load MVAPICH2/2.3.1-GDR +module load TensorFlow/1.13.1-GPU-Python-3.6.8 +module load Horovod/0.16.2-GPU-Python-3.6.8 + +# Source the virtual environment +source activate venv_dl_slns/bin/activate + +# Run the program +srun python -u mnist_data_distributed.py diff --git a/keras/mnist.py b/keras/mnist.py index c1831694e02ccc3d1546fb8955f5798474c870e6..9fc93f2a56f0aa38318a114e151b7e2a6c2ea15c 100644 --- a/keras/mnist.py +++ b/keras/mnist.py @@ -1,3 +1,9 @@ +# Copyright (c) 2019 Forschungszentrum Juelich GmbH. +# This code is licensed under MIT license (see the LICENSE file for details). +# This code is derived from https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py, +# which is also licensed under The MIT License (see the NOTICE file for details). 
+ + """Trains a simple convnet on the MNIST dataset. Gets to 99.25% test accuracy after 12 epochs diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..79144dccd44dd967fb51438abbcd9589c6d81937 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,24 @@ +absl-py==0.8.0 +astor==0.8.0 +cffi==1.12.3 +cloudpickle==1.2.1 +gast==0.3.1 +grpcio==1.23.0 +h5py==2.10.0 +Markdown==3.1.1 +mock==3.0.5 +mpi4py==3.0.2 +numpy==1.17.2 +protobuf==3.9.1 +psutil==5.6.3 +pycparser==2.19 +six==1.12.0 +Werkzeug==0.15.6 +Keras-Applications==1.0.8 +Keras-Preprocessing==1.1.0 +tensorboard==1.13.1 +tensorflow-estimator==1.13.0 +tensorflow-gpu==1.13.1 +termcolor==1.1.0 +keras==2.3.1 +horovod==0.16.2 \ No newline at end of file diff --git a/tensorflow/mnist.py b/tensorflow/mnist.py index 7ba4bdc5fb1b25bc0744308a26ad22856f729c26..30477e153e9c59d2a151b90686049b39885155e4 100644 --- a/tensorflow/mnist.py +++ b/tensorflow/mnist.py @@ -1,17 +1,7 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== +# Copyright (c) 2019 Forschungszentrum Juelich GmbH. +# This code is licensed under MIT license (see the LICENSE file for details). 
+# This code is derived from https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py, +# which is licensed under the Apache License, Version 2.0 (see the NOTICE file for details). """Simple, end-to-end, LeNet-5-like convolutional MNIST model example. diff --git a/utils/data_utils.py b/utils/data_utils.py index 048885633c636c08a14ce04e3519f7ead932a7e1..bab6e035ea82e22108998da561519a70eea94eac 100644 --- a/utils/data_utils.py +++ b/utils/data_utils.py @@ -1,3 +1,6 @@ +# Copyright (c) 2019 Forschungszentrum Juelich GmbH. +# This code is licensed under MIT license (see the LICENSE file for details). + """ A collections of utilities for data manipulation.