From 07a891f459629efd7d0324284e9e25ff8784b2c8 Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Fri, 23 Aug 2019 09:30:55 +0200
Subject: [PATCH] Fixes after testing on JURECA: use per-rank MNIST dataset
 directories and broadcast optimizer state in the PyTorch example, raise epoch
 counts in the Keras examples, and correct Slurm --ntasks (2 nodes x 4 tasks = 8)
 in the job scripts.

---
 .gitignore                                    |  1 +
 README.md                                     |  2 +-
 horovod/keras/mnist.py                        |  2 +-
 horovod/keras/mnist_advanced.py               |  2 +-
 horovod/keras/submit_job_jureca_python3.sh    |  2 +-
 horovod/pytorch/mnist.py                      | 19 ++++++++++++-------
 horovod/pytorch/submit_job_jureca_python3.sh  |  2 +-
 .../tensorflow/submit_job_jureca_python3.sh   |  2 +-
 8 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore
index 492e5e9..7f5ba6e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -89,6 +89,7 @@ celerybeat-schedule
 env/
 venv/
 venv3/
+venv_2019a/
 ENV/
 env.bak/
 venv.bak/
diff --git a/README.md b/README.md
index 3bbadd9..54f0f02 100644
--- a/README.md
+++ b/README.md
@@ -156,7 +156,7 @@ systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-ju
 
 2.  Load the Git LFS module:
 
-    `module load git-lfs/2.6.1`
+    `module load git-lfs`
 3.  Initialize Git LFS:
 
     `git lfs install`
diff --git a/horovod/keras/mnist.py b/horovod/keras/mnist.py
index b098f26..85dd944 100644
--- a/horovod/keras/mnist.py
+++ b/horovod/keras/mnist.py
@@ -35,7 +35,7 @@ batch_size = 128
 num_classes = 10
 
 # Horovod: adjust number of epochs based on number of GPUs.
-epochs = int(math.ceil(12.0 / hvd.size()))
+epochs = int(math.ceil(16.0 / hvd.size()))
 
 # Input image dimensions
 img_rows, img_cols = 28, 28
diff --git a/horovod/keras/mnist_advanced.py b/horovod/keras/mnist_advanced.py
index 9337026..bf52fdd 100644
--- a/horovod/keras/mnist_advanced.py
+++ b/horovod/keras/mnist_advanced.py
@@ -36,7 +36,7 @@ num_classes = 10
 
 # Enough epochs to demonstrate learning rate warmup and the reduction of
 # learning rate when training plateaues.
-epochs = 12
+epochs = 16
 
 # Input image dimensions
 img_rows, img_cols = 28, 28
diff --git a/horovod/keras/submit_job_jureca_python3.sh b/horovod/keras/submit_job_jureca_python3.sh
index 76fa6cd..0a771db 100755
--- a/horovod/keras/submit_job_jureca_python3.sh
+++ b/horovod/keras/submit_job_jureca_python3.sh
@@ -2,7 +2,7 @@
 
 # Slurm job configuration
 #SBATCH --nodes=2
-#SBATCH --ntasks=4
+#SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
diff --git a/horovod/pytorch/mnist.py b/horovod/pytorch/mnist.py
index 4d90a01..4f43193 100644
--- a/horovod/pytorch/mnist.py
+++ b/horovod/pytorch/mnist.py
@@ -53,11 +53,15 @@ if args.cuda:
     torch.cuda.set_device(hvd.local_rank())
     torch.cuda.manual_seed(args.seed)
 
+# Horovod: limit # of CPU threads to be used per worker.
+torch.set_num_threads(1)
+
 # [HPCNS] Fully qualified dataset file name
 dataset_file = os.path.join(data_dir, data_file)
 
 # [HPCNS] Dataset filename for this rank
-dataset_for_rank = 'MNIST'
+dataset_root_for_rank = 'MNIST-data-{}'.format(hvd.rank())
+dataset_for_rank = dataset_root_for_rank + '/MNIST'
 
 # [HPCNS] If the path already exists, remove it
 if os.path.exists(dataset_for_rank):
@@ -68,7 +72,7 @@ shutil.copytree(dataset_file, dataset_for_rank)
 
 kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
 train_dataset = \
-    datasets.MNIST('', train=True, download=False,
+    datasets.MNIST(dataset_root_for_rank, train=True, download=False,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
@@ -80,7 +84,7 @@ train_loader = torch.utils.data.DataLoader(
     train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)
 
 test_dataset = \
-    datasets.MNIST('', train=False, download=False, transform=transforms.Compose([
+    datasets.MNIST(dataset_root_for_rank, train=False, download=False, transform=transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))
     ]))
@@ -116,13 +120,14 @@ if args.cuda:
     # Move model to GPU.
     model.cuda()
 
-# Horovod: broadcast parameters.
-hvd.broadcast_parameters(model.state_dict(), root_rank=0)
-
 # Horovod: scale learning rate by the number of GPUs.
 optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(),
                       momentum=args.momentum)
 
+# Horovod: broadcast parameters & optimizer state.
+hvd.broadcast_parameters(model.state_dict(), root_rank=0)
+hvd.broadcast_optimizer_state(optimizer, root_rank=0)
+
 # Horovod: (optional) compression algorithm.
 compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
 
@@ -192,4 +197,4 @@ for epoch in range(1, args.epochs + 1):
     test()
 
 # [HPCNS] Remove the copied dataset
-shutil.rmtree(dataset_for_rank)
+shutil.rmtree(dataset_root_for_rank)
diff --git a/horovod/pytorch/submit_job_jureca_python3.sh b/horovod/pytorch/submit_job_jureca_python3.sh
index 754793f..5070055 100755
--- a/horovod/pytorch/submit_job_jureca_python3.sh
+++ b/horovod/pytorch/submit_job_jureca_python3.sh
@@ -2,7 +2,7 @@
 
 # Slurm job configuration
 #SBATCH --nodes=2
-#SBATCH --ntasks=4
+#SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
diff --git a/horovod/tensorflow/submit_job_jureca_python3.sh b/horovod/tensorflow/submit_job_jureca_python3.sh
index bf0b4e6..6be2a89 100755
--- a/horovod/tensorflow/submit_job_jureca_python3.sh
+++ b/horovod/tensorflow/submit_job_jureca_python3.sh
@@ -2,7 +2,7 @@
 
 # Slurm job configuration
 #SBATCH --nodes=2
-#SBATCH --ntasks=4
+#SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
-- 
GitLab