From 87acffcaaf67021c8836d30d397385a7235c226d Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Fri, 23 Aug 2019 09:42:13 +0200 Subject: [PATCH] Updates replicated from jureca_2019_a. --- .gitignore | 1 + horovod/keras/submit_job_juwels_python3.sh | 2 +- horovod/pytorch/mnist.py | 19 ++++++++++++------- horovod/pytorch/submit_job_juwels_python3.sh | 2 +- .../tensorflow/submit_job_juwels_python3.sh | 2 +- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 492e5e9..7f5ba6e 100644 --- a/.gitignore +++ b/.gitignore @@ -89,6 +89,7 @@ celerybeat-schedule env/ venv/ venv3/ +venv_2019a/ ENV/ env.bak/ venv.bak/ diff --git a/horovod/keras/submit_job_juwels_python3.sh b/horovod/keras/submit_job_juwels_python3.sh index 76fa6cd..0a771db 100755 --- a/horovod/keras/submit_job_juwels_python3.sh +++ b/horovod/keras/submit_job_juwels_python3.sh @@ -2,7 +2,7 @@ # Slurm job configuration #SBATCH --nodes=2 -#SBATCH --ntasks=4 +#SBATCH --ntasks=8 #SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er diff --git a/horovod/pytorch/mnist.py b/horovod/pytorch/mnist.py index 4d90a01..4f43193 100644 --- a/horovod/pytorch/mnist.py +++ b/horovod/pytorch/mnist.py @@ -53,11 +53,15 @@ if args.cuda: torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) +# Horovod: limit # of CPU threads to be used per worker. +torch.set_num_threads(1) + # [HPCNS] Fully qualified dataset file name dataset_file = os.path.join(data_dir, data_file) # [HPCNS] Dataset filename for this rank -dataset_for_rank = 'MNIST' +dataset_root_for_rank = 'MNIST-data-{}'.format(hvd.rank()) +dataset_for_rank = dataset_root_for_rank + '/MNIST' # [HPCNS] If the path already exists, remove it if os.path.exists(dataset_for_rank): @@ -68,7 +72,7 @@ shutil.copytree(dataset_file, dataset_for_rank) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ - datasets.MNIST('', train=True, download=False, + datasets.MNIST(dataset_root_for_rank, train=True, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) @@ -80,7 +84,7 @@ train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) test_dataset = \ - datasets.MNIST('', train=False, download=False, transform=transforms.Compose([ + datasets.MNIST(dataset_root_for_rank, train=False, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) @@ -116,13 +120,14 @@ if args.cuda: # Move model to GPU. model.cuda() -# Horovod: broadcast parameters. -hvd.broadcast_parameters(model.state_dict(), root_rank=0) - # Horovod: scale learning rate by the number of GPUs. optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), momentum=args.momentum) +# Horovod: broadcast parameters & optimizer state. +hvd.broadcast_parameters(model.state_dict(), root_rank=0) +hvd.broadcast_optimizer_state(optimizer, root_rank=0) + # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none @@ -192,4 +197,4 @@ for epoch in range(1, args.epochs + 1): test() # [HPCNS] Remove the copied dataset -shutil.rmtree(dataset_for_rank) +shutil.rmtree(dataset_root_for_rank) diff --git a/horovod/pytorch/submit_job_juwels_python3.sh b/horovod/pytorch/submit_job_juwels_python3.sh index 754793f..5070055 100755 --- a/horovod/pytorch/submit_job_juwels_python3.sh +++ b/horovod/pytorch/submit_job_juwels_python3.sh @@ -2,7 +2,7 @@ # Slurm job configuration #SBATCH --nodes=2 -#SBATCH --ntasks=4 +#SBATCH --ntasks=8 #SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er diff --git a/horovod/tensorflow/submit_job_juwels_python3.sh b/horovod/tensorflow/submit_job_juwels_python3.sh index bf0b4e6..6be2a89 100755 --- a/horovod/tensorflow/submit_job_juwels_python3.sh +++ b/horovod/tensorflow/submit_job_juwels_python3.sh @@ -2,7 +2,7 @@ # Slurm job configuration #SBATCH --nodes=2 -#SBATCH --ntasks=4 +#SBATCH --ntasks=8 #SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er -- GitLab