From fade019c78cb237d18896e3ebac3e0a3a462f622 Mon Sep 17 00:00:00 2001
From: Fahad Khalid <f.khalid@fz-juelich.de>
Date: Tue, 20 Aug 2019 12:28:25 +0200
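Subject: [PATCH] Updated all scripts (except for Caffe) to use the 2019a
 modules.

In addition to updating the module versions, this patch:

* sets --ntasks-per-node=4 and --gres=gpu:4 (previously 2) in the
  Horovod Keras, PyTorch and TensorFlow job scripts;
* removes the "module use .../OtherStages" and "module load Stages/2018b"
  lines from the job scripts that had them;
* renames horovod/pytorch/.submit_job_jureca_python3.sh to
  submit_job_jureca_python3.sh, so it is no longer a hidden file;
* changes the local dataset copy in both PyTorch examples to a directory
  named 'MNIST' and passes '' as the root to datasets.MNIST;
* renames the TensorFlow job from TENSORFLOW_MNIST to TFLOW_MNIST.

Note: in horovod/pytorch/mnist.py the dataset directory name no longer
includes the Horovod rank, so all ranks now use the same 'MNIST' path.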

---
 horovod/keras/submit_job_jureca_python3.sh       | 16 +++++++---------
 horovod/pytorch/mnist.py                         |  6 +++---
 ...a_python3.sh => submit_job_jureca_python3.sh} | 14 +++++++-------
 horovod/tensorflow/submit_job_jureca_python3.sh  | 15 +++++++--------
 pytorch/mnist.py                                 |  6 +++---
 pytorch/submit_job_jureca_python3.sh             |  9 +++------
 tensorflow/submit_job_jureca_python3.sh          |  2 +-
 7 files changed, 31 insertions(+), 37 deletions(-)
 rename horovod/pytorch/{.submit_job_jureca_python3.sh => submit_job_jureca_python3.sh} (54%)

diff --git a/horovod/keras/submit_job_jureca_python3.sh b/horovod/keras/submit_job_jureca_python3.sh
index 561a45d..76fa6cd 100755
--- a/horovod/keras/submit_job_jureca_python3.sh
+++ b/horovod/keras/submit_job_jureca_python3.sh
@@ -3,22 +3,20 @@
 # Slurm job configuration
 #SBATCH --nodes=2
 #SBATCH --ntasks=4
-#SBATCH --ntasks-per-node=2
+#SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
 #SBATCH --time=00:10:00
 #SBATCH --job-name=HOROVOD_KERAS_MNIST
-#SBATCH --gres=gpu:2 --partition=develgpus
+#SBATCH --gres=gpu:4 --partition=develgpus
 #SBATCH --mail-type=ALL
 
 # Load the required modules
-module use /usr/local/software/jureca/OtherStages
-module load Stages/2018b
-module load GCC/7.3.0
-module load MVAPICH2/2.3-GDR
-module load TensorFlow/1.12.0-GPU-Python-3.6.6
-module load Keras/2.2.4-GPU-Python-3.6.6
-module load Horovod/0.15.2-GPU-Python-3.6.6
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Keras/2.2.4-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
 
 # Run the program
 srun python -u mnist.py
diff --git a/horovod/pytorch/mnist.py b/horovod/pytorch/mnist.py
index 3d1b9c5..4d90a01 100644
--- a/horovod/pytorch/mnist.py
+++ b/horovod/pytorch/mnist.py
@@ -57,7 +57,7 @@ if args.cuda:
 dataset_file = os.path.join(data_dir, data_file)
 
 # [HPCNS] Dataset filename for this rank
-dataset_for_rank = 'MNIST-data-%d' % hvd.rank()
+dataset_for_rank = 'MNIST'
 
 # [HPCNS] If the path already exists, remove it
 if os.path.exists(dataset_for_rank):
@@ -68,7 +68,7 @@ shutil.copytree(dataset_file, dataset_for_rank)
 
 kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
 train_dataset = \
-    datasets.MNIST(dataset_for_rank, train=True, download=False,
+    datasets.MNIST('', train=True, download=False,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
@@ -80,7 +80,7 @@ train_loader = torch.utils.data.DataLoader(
     train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)
 
 test_dataset = \
-    datasets.MNIST(dataset_for_rank, train=False, download=False, transform=transforms.Compose([
+    datasets.MNIST('', train=False, download=False, transform=transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))
     ]))
diff --git a/horovod/pytorch/.submit_job_jureca_python3.sh b/horovod/pytorch/submit_job_jureca_python3.sh
similarity index 54%
rename from horovod/pytorch/.submit_job_jureca_python3.sh
rename to horovod/pytorch/submit_job_jureca_python3.sh
index 1afd801..754793f 100755
--- a/horovod/pytorch/.submit_job_jureca_python3.sh
+++ b/horovod/pytorch/submit_job_jureca_python3.sh
@@ -3,20 +3,20 @@
 # Slurm job configuration
 #SBATCH --nodes=2
 #SBATCH --ntasks=4
-#SBATCH --ntasks-per-node=2
+#SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
 #SBATCH --time=00:10:00
 #SBATCH --job-name=HOROVOD_PYTORCH_MNIST
-#SBATCH --gres=gpu:2 --partition=develgpus
+#SBATCH --gres=gpu:4 --partition=develgpus
 #SBATCH --mail-type=ALL
 
 # Load the required modules
-module load GCC/7.3.0
-module load MVAPICH2/2.3-GDR
-module load PyTorch/1.0.0-GPU-Python-3.6.6
-module load torchvision/0.2.1-GPU-Python-3.6.6
-module load Horovod/0.15.2-GPU-Python-3.6.6
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load PyTorch/1.1.0-GPU-Python-3.6.8
+module load torchvision/0.3.0-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
 
 # Run the program
 srun python -u mnist.py
diff --git a/horovod/tensorflow/submit_job_jureca_python3.sh b/horovod/tensorflow/submit_job_jureca_python3.sh
index 2a50c8b..bf0b4e6 100755
--- a/horovod/tensorflow/submit_job_jureca_python3.sh
+++ b/horovod/tensorflow/submit_job_jureca_python3.sh
@@ -3,21 +3,20 @@
 # Slurm job configuration
 #SBATCH --nodes=2
 #SBATCH --ntasks=4
-#SBATCH --ntasks-per-node=2
+#SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
 #SBATCH --time=00:10:00
 #SBATCH --job-name=HOROVOD_TFLOW_MNIST
-#SBATCH --gres=gpu:2 --partition=develgpus
+#SBATCH --gres=gpu:4 --partition=develgpus
 #SBATCH --mail-type=ALL
 
 # Load the required modules
-module use /usr/local/software/jureca/OtherStages
-module load Stages/2018b
-module load GCC/7.3.0
-module load MVAPICH2/2.3-GDR
-module load TensorFlow/1.12.0-GPU-Python-3.6.6
-module load Horovod/0.15.2-GPU-Python-3.6.6
+module load GCC/8.3.0
+module load MVAPICH2/2.3.1-GDR
+module load TensorFlow/1.13.1-GPU-Python-3.6.8
+module load Keras/2.2.4-GPU-Python-3.6.8
+module load Horovod/0.16.2-GPU-Python-3.6.8
 
 # Run the program
 srun python -u mnist.py
diff --git a/pytorch/mnist.py b/pytorch/mnist.py
index d4092b6..19bcac0 100644
--- a/pytorch/mnist.py
+++ b/pytorch/mnist.py
@@ -108,7 +108,7 @@ def main():
     dataset_file = os.path.join(data_dir, data_file)
 
     # [HPCNS] A copy of the dataset in the current directory
-    dataset_copy = 'MNIST-data'
+    dataset_copy = 'MNIST'
 
     # [HPCNS] If the path already exists, remove it
     if os.path.exists(dataset_copy):
@@ -120,14 +120,14 @@ def main():
 
     kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
     train_loader = torch.utils.data.DataLoader(
-        datasets.MNIST(dataset_copy, train=True, download=False,
+        datasets.MNIST('', train=True, download=False,
                        transform=transforms.Compose([
                            transforms.ToTensor(),
                            transforms.Normalize((0.1307,), (0.3081,))
                        ])),
         batch_size=args.batch_size, shuffle=True, **kwargs)
     test_loader = torch.utils.data.DataLoader(
-        datasets.MNIST(dataset_copy, train=False, download=False, transform=transforms.Compose([
+        datasets.MNIST('', train=False, download=False, transform=transforms.Compose([
             transforms.ToTensor(),
             transforms.Normalize((0.1307,), (0.3081,))
         ])),
diff --git a/pytorch/submit_job_jureca_python3.sh b/pytorch/submit_job_jureca_python3.sh
index d6d0de1..15f53ac 100755
--- a/pytorch/submit_job_jureca_python3.sh
+++ b/pytorch/submit_job_jureca_python3.sh
@@ -12,12 +12,9 @@
 #SBATCH --mail-type=ALL
 
 # Load the required modules
-module use /usr/local/software/jureca/OtherStages
-module load Stages/2018b
-module load GCC/7.3.0
-module load MVAPICH2/2.3-GDR
-module load PyTorch/1.0.0-GPU-Python-3.6.6
-module load torchvision/0.2.1-GPU-Python-3.6.6
+module load GCC/8.3.0
+module load PyTorch/1.1.0-GPU-Python-3.6.8
+module load torchvision/0.3.0-GPU-Python-3.6.8
 
 # Run the program
 srun python -u mnist.py
diff --git a/tensorflow/submit_job_jureca_python3.sh b/tensorflow/submit_job_jureca_python3.sh
index f9791cf..fa294f1 100755
--- a/tensorflow/submit_job_jureca_python3.sh
+++ b/tensorflow/submit_job_jureca_python3.sh
@@ -7,7 +7,7 @@
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
 #SBATCH --time=00:10:00
-#SBATCH --job-name=TENSORFLOW_MNIST
+#SBATCH --job-name=TFLOW_MNIST
 #SBATCH --gres=gpu:1 --partition=develgpus
 #SBATCH --mail-type=ALL
 
-- 
GitLab