From fade019c78cb237d18896e3ebac3e0a3a462f622 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Tue, 20 Aug 2019 12:28:25 +0200 Subject: [PATCH] Updated all scripts (except for Caffe) to use the 2019a modules. --- horovod/keras/submit_job_jureca_python3.sh | 16 +++++++--------- horovod/pytorch/mnist.py | 6 +++--- ...a_python3.sh => submit_job_jureca_python3.sh} | 14 +++++++------- horovod/tensorflow/submit_job_jureca_python3.sh | 15 +++++++-------- pytorch/mnist.py | 6 +++--- pytorch/submit_job_jureca_python3.sh | 9 +++------ tensorflow/submit_job_jureca_python3.sh | 2 +- 7 files changed, 31 insertions(+), 37 deletions(-) rename horovod/pytorch/{.submit_job_jureca_python3.sh => submit_job_jureca_python3.sh} (54%) diff --git a/horovod/keras/submit_job_jureca_python3.sh b/horovod/keras/submit_job_jureca_python3.sh index 561a45d..76fa6cd 100755 --- a/horovod/keras/submit_job_jureca_python3.sh +++ b/horovod/keras/submit_job_jureca_python3.sh @@ -3,22 +3,20 @@ # Slurm job configuration #SBATCH --nodes=2 #SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er #SBATCH --time=00:10:00 #SBATCH --job-name=HOROVOD_KERAS_MNIST -#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --gres=gpu:4 --partition=develgpus #SBATCH --mail-type=ALL # Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load TensorFlow/1.12.0-GPU-Python-3.6.6 -module load Keras/2.2.4-GPU-Python-3.6.6 -module load Horovod/0.15.2-GPU-Python-3.6.6 +module load GCC/8.3.0 +module load MVAPICH2/2.3.1-GDR +module load TensorFlow/1.13.1-GPU-Python-3.6.8 +module load Keras/2.2.4-GPU-Python-3.6.8 +module load Horovod/0.16.2-GPU-Python-3.6.8 # Run the program srun python -u mnist.py diff --git a/horovod/pytorch/mnist.py b/horovod/pytorch/mnist.py index 3d1b9c5..4d90a01 100644 --- a/horovod/pytorch/mnist.py +++ b/horovod/pytorch/mnist.py @@ -57,7 +57,7 @@ if args.cuda: dataset_file = os.path.join(data_dir, data_file) # [HPCNS] Dataset filename for this rank -dataset_for_rank = 'MNIST-data-%d' % hvd.rank() +dataset_for_rank = 'MNIST' # [HPCNS] If the path already exists, remove it if os.path.exists(dataset_for_rank): @@ -68,7 +68,7 @@ shutil.copytree(dataset_file, dataset_for_rank) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ - datasets.MNIST(dataset_for_rank, train=True, download=False, + datasets.MNIST('', train=True, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) @@ -80,7 +80,7 @@ train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) test_dataset = \ - datasets.MNIST(dataset_for_rank, train=False, download=False, transform=transforms.Compose([ + datasets.MNIST('', train=False, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) diff --git a/horovod/pytorch/.submit_job_jureca_python3.sh b/horovod/pytorch/submit_job_jureca_python3.sh similarity index 54% rename from horovod/pytorch/.submit_job_jureca_python3.sh rename to horovod/pytorch/submit_job_jureca_python3.sh index 1afd801..754793f 100755 --- a/horovod/pytorch/.submit_job_jureca_python3.sh +++ b/horovod/pytorch/submit_job_jureca_python3.sh @@ -3,20 +3,20 @@ # Slurm job configuration #SBATCH --nodes=2 #SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er #SBATCH --time=00:10:00 #SBATCH --job-name=HOROVOD_PYTORCH_MNIST -#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --gres=gpu:4 --partition=develgpus #SBATCH --mail-type=ALL # Load the required modules -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load PyTorch/1.0.0-GPU-Python-3.6.6 -module load torchvision/0.2.1-GPU-Python-3.6.6 -module load Horovod/0.15.2-GPU-Python-3.6.6 +module load GCC/8.3.0 +module load MVAPICH2/2.3.1-GDR +module load PyTorch/1.1.0-GPU-Python-3.6.8 +module load torchvision/0.3.0-GPU-Python-3.6.8 +module load Horovod/0.16.2-GPU-Python-3.6.8 # Run the program srun python -u mnist.py diff --git a/horovod/tensorflow/submit_job_jureca_python3.sh b/horovod/tensorflow/submit_job_jureca_python3.sh index 2a50c8b..bf0b4e6 100755 --- a/horovod/tensorflow/submit_job_jureca_python3.sh +++ b/horovod/tensorflow/submit_job_jureca_python3.sh @@ -3,21 +3,20 @@ # Slurm job configuration #SBATCH --nodes=2 #SBATCH --ntasks=4 -#SBATCH --ntasks-per-node=2 +#SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er #SBATCH --time=00:10:00 #SBATCH --job-name=HOROVOD_TFLOW_MNIST -#SBATCH --gres=gpu:2 --partition=develgpus +#SBATCH --gres=gpu:4 --partition=develgpus #SBATCH --mail-type=ALL # Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load TensorFlow/1.12.0-GPU-Python-3.6.6 -module load Horovod/0.15.2-GPU-Python-3.6.6 +module load GCC/8.3.0 +module load MVAPICH2/2.3.1-GDR +module load TensorFlow/1.13.1-GPU-Python-3.6.8 +module load Keras/2.2.4-GPU-Python-3.6.8 +module load Horovod/0.16.2-GPU-Python-3.6.8 # Run the program srun python -u mnist.py diff --git a/pytorch/mnist.py b/pytorch/mnist.py index d4092b6..19bcac0 100644 --- a/pytorch/mnist.py +++ b/pytorch/mnist.py @@ -108,7 +108,7 @@ def main(): dataset_file = os.path.join(data_dir, data_file) # [HPCNS] A copy of the dataset in the current directory - dataset_copy = 'MNIST-data' + dataset_copy = 'MNIST' # [HPCNS] If the path already exists, remove it if os.path.exists(dataset_copy): @@ -120,14 +120,14 @@ def main(): kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch.utils.data.DataLoader( - datasets.MNIST(dataset_copy, train=True, download=False, + datasets.MNIST('', train=True, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), batch_size=args.batch_size, shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader( - datasets.MNIST(dataset_copy, train=False, download=False, transform=transforms.Compose([ + datasets.MNIST('', train=False, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), diff --git a/pytorch/submit_job_jureca_python3.sh b/pytorch/submit_job_jureca_python3.sh index d6d0de1..15f53ac 100755 --- a/pytorch/submit_job_jureca_python3.sh +++ b/pytorch/submit_job_jureca_python3.sh @@ -12,12 +12,9 @@ #SBATCH --mail-type=ALL # Load the required modules -module use /usr/local/software/jureca/OtherStages -module load Stages/2018b -module load GCC/7.3.0 -module load MVAPICH2/2.3-GDR -module load PyTorch/1.0.0-GPU-Python-3.6.6 -module load torchvision/0.2.1-GPU-Python-3.6.6 +module load GCC/8.3.0 +module load PyTorch/1.1.0-GPU-Python-3.6.8 +module load torchvision/0.3.0-GPU-Python-3.6.8 # Run the program srun python -u mnist.py diff --git a/tensorflow/submit_job_jureca_python3.sh b/tensorflow/submit_job_jureca_python3.sh index f9791cf..fa294f1 100755 --- a/tensorflow/submit_job_jureca_python3.sh +++ b/tensorflow/submit_job_jureca_python3.sh @@ -7,7 +7,7 @@ #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er #SBATCH --time=00:10:00 -#SBATCH --job-name=TENSORFLOW_MNIST +#SBATCH --job-name=TFLOW_MNIST #SBATCH --gres=gpu:1 --partition=develgpus #SBATCH --mail-type=ALL -- GitLab