From 07a891f459629efd7d0324284e9e25ff8784b2c8 Mon Sep 17 00:00:00 2001 From: Fahad Khalid <f.khalid@fz-juelich.de> Date: Fri, 23 Aug 2019 09:30:55 +0200 Subject: [PATCH] Updates after testing on JURECA. --- .gitignore | 1 + README.md | 2 +- horovod/keras/mnist.py | 2 +- horovod/keras/mnist_advanced.py | 2 +- horovod/keras/submit_job_jureca_python3.sh | 2 +- horovod/pytorch/mnist.py | 19 ++++++++++++------- horovod/pytorch/submit_job_jureca_python3.sh | 2 +- .../tensorflow/submit_job_jureca_python3.sh | 2 +- 8 files changed, 19 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index 492e5e9..7f5ba6e 100644 --- a/.gitignore +++ b/.gitignore @@ -89,6 +89,7 @@ celerybeat-schedule env/ venv/ venv3/ +venv_2019a/ ENV/ env.bak/ venv.bak/ diff --git a/README.md b/README.md index 3bbadd9..54f0f02 100644 --- a/README.md +++ b/README.md @@ -156,7 +156,7 @@ systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-ju 2. Load the Git LFS module: - `module load git-lfs/2.6.1` + `module load git-lfs` 3. Initialize Git LFS: `git lfs install` diff --git a/horovod/keras/mnist.py b/horovod/keras/mnist.py index b098f26..85dd944 100644 --- a/horovod/keras/mnist.py +++ b/horovod/keras/mnist.py @@ -35,7 +35,7 @@ batch_size = 128 num_classes = 10 # Horovod: adjust number of epochs based on number of GPUs. -epochs = int(math.ceil(12.0 / hvd.size())) +epochs = int(math.ceil(16.0 / hvd.size())) # Input image dimensions img_rows, img_cols = 28, 28 diff --git a/horovod/keras/mnist_advanced.py b/horovod/keras/mnist_advanced.py index 9337026..bf52fdd 100644 --- a/horovod/keras/mnist_advanced.py +++ b/horovod/keras/mnist_advanced.py @@ -36,7 +36,7 @@ num_classes = 10 # Enough epochs to demonstrate learning rate warmup and the reduction of # learning rate when training plateaues. -epochs = 12 +epochs = 16 # Input image dimensions img_rows, img_cols = 28, 28 diff --git a/horovod/keras/submit_job_jureca_python3.sh b/horovod/keras/submit_job_jureca_python3.sh index 76fa6cd..0a771db 100755 --- a/horovod/keras/submit_job_jureca_python3.sh +++ b/horovod/keras/submit_job_jureca_python3.sh @@ -2,7 +2,7 @@ # Slurm job configuration #SBATCH --nodes=2 -#SBATCH --ntasks=4 +#SBATCH --ntasks=8 #SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er diff --git a/horovod/pytorch/mnist.py b/horovod/pytorch/mnist.py index 4d90a01..4f43193 100644 --- a/horovod/pytorch/mnist.py +++ b/horovod/pytorch/mnist.py @@ -53,11 +53,15 @@ if args.cuda: torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) +# Horovod: limit # of CPU threads to be used per worker. +torch.set_num_threads(1) + # [HPCNS] Fully qualified dataset file name dataset_file = os.path.join(data_dir, data_file) # [HPCNS] Dataset filename for this rank -dataset_for_rank = 'MNIST' +dataset_root_for_rank = 'MNIST-data-{}'.format(hvd.rank()) +dataset_for_rank = dataset_root_for_rank + '/MNIST' # [HPCNS] If the path already exists, remove it if os.path.exists(dataset_for_rank): @@ -68,7 +72,7 @@ shutil.copytree(dataset_file, dataset_for_rank) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ - datasets.MNIST('', train=True, download=False, + datasets.MNIST(dataset_root_for_rank, train=True, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) @@ -80,7 +84,7 @@ train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) test_dataset = \ - datasets.MNIST('', train=False, download=False, transform=transforms.Compose([ + datasets.MNIST(dataset_root_for_rank, train=False, download=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) @@ -116,13 +120,14 @@ if args.cuda: # Move model to GPU. model.cuda() -# Horovod: broadcast parameters. -hvd.broadcast_parameters(model.state_dict(), root_rank=0) - # Horovod: scale learning rate by the number of GPUs. optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), momentum=args.momentum) +# Horovod: broadcast parameters & optimizer state. +hvd.broadcast_parameters(model.state_dict(), root_rank=0) +hvd.broadcast_optimizer_state(optimizer, root_rank=0) + # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none @@ -192,4 +197,4 @@ for epoch in range(1, args.epochs + 1): test() # [HPCNS] Remove the copied dataset -shutil.rmtree(dataset_for_rank) +shutil.rmtree(dataset_root_for_rank) diff --git a/horovod/pytorch/submit_job_jureca_python3.sh b/horovod/pytorch/submit_job_jureca_python3.sh index 754793f..5070055 100755 --- a/horovod/pytorch/submit_job_jureca_python3.sh +++ b/horovod/pytorch/submit_job_jureca_python3.sh @@ -2,7 +2,7 @@ # Slurm job configuration #SBATCH --nodes=2 -#SBATCH --ntasks=4 +#SBATCH --ntasks=8 #SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er diff --git a/horovod/tensorflow/submit_job_jureca_python3.sh b/horovod/tensorflow/submit_job_jureca_python3.sh index bf0b4e6..6be2a89 100755 --- a/horovod/tensorflow/submit_job_jureca_python3.sh +++ b/horovod/tensorflow/submit_job_jureca_python3.sh @@ -2,7 +2,7 @@ # Slurm job configuration #SBATCH --nodes=2 -#SBATCH --ntasks=4 +#SBATCH --ntasks=8 #SBATCH --ntasks-per-node=4 #SBATCH --output=output_%j.out #SBATCH --error=error_%j.er -- GitLab