Skip to content
Snippets Groups Projects
Commit 07a891f4 authored by Fahad Khalid
Browse files

Updates after testing on JURECA.

parent fade019c
Branches jureca_2019_a
No related tags found
No related merge requests found
...@@ -89,6 +89,7 @@ celerybeat-schedule ...@@ -89,6 +89,7 @@ celerybeat-schedule
env/ env/
venv/ venv/
venv3/ venv3/
venv_2019a/
ENV/ ENV/
env.bak/ env.bak/
venv.bak/ venv.bak/
......
...@@ -156,7 +156,7 @@ systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-ju ...@@ -156,7 +156,7 @@ systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-ju
2. Load the Git LFS module: 2. Load the Git LFS module:
`module load git-lfs/2.6.1` `module load git-lfs`
3. Initialize Git LFS: 3. Initialize Git LFS:
`git lfs install` `git lfs install`
......
...@@ -35,7 +35,7 @@ batch_size = 128 ...@@ -35,7 +35,7 @@ batch_size = 128
num_classes = 10 num_classes = 10
# Horovod: adjust number of epochs based on number of GPUs. # Horovod: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(12.0 / hvd.size())) epochs = int(math.ceil(16.0 / hvd.size()))
# Input image dimensions # Input image dimensions
img_rows, img_cols = 28, 28 img_rows, img_cols = 28, 28
......
...@@ -36,7 +36,7 @@ num_classes = 10 ...@@ -36,7 +36,7 @@ num_classes = 10
# Enough epochs to demonstrate learning rate warmup and the reduction of # Enough epochs to demonstrate learning rate warmup and the reduction of
# learning rate when training plateaus. # learning rate when training plateaus.
epochs = 12 epochs = 16
# Input image dimensions # Input image dimensions
img_rows, img_cols = 28, 28 img_rows, img_cols = 28, 28
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# Slurm job configuration # Slurm job configuration
#SBATCH --nodes=2 #SBATCH --nodes=2
#SBATCH --ntasks=4 #SBATCH --ntasks=8
#SBATCH --ntasks-per-node=4 #SBATCH --ntasks-per-node=4
#SBATCH --output=output_%j.out #SBATCH --output=output_%j.out
#SBATCH --error=error_%j.er #SBATCH --error=error_%j.er
......
...@@ -53,11 +53,15 @@ if args.cuda: ...@@ -53,11 +53,15 @@ if args.cuda:
torch.cuda.set_device(hvd.local_rank()) torch.cuda.set_device(hvd.local_rank())
torch.cuda.manual_seed(args.seed) torch.cuda.manual_seed(args.seed)
# Horovod: limit # of CPU threads to be used per worker.
torch.set_num_threads(1)
# [HPCNS] Fully qualified dataset file name # [HPCNS] Fully qualified dataset file name
dataset_file = os.path.join(data_dir, data_file) dataset_file = os.path.join(data_dir, data_file)
# [HPCNS] Dataset filename for this rank # [HPCNS] Dataset filename for this rank
dataset_for_rank = 'MNIST' dataset_root_for_rank = 'MNIST-data-{}'.format(hvd.rank())
dataset_for_rank = dataset_root_for_rank + '/MNIST'
# [HPCNS] If the path already exists, remove it # [HPCNS] If the path already exists, remove it
if os.path.exists(dataset_for_rank): if os.path.exists(dataset_for_rank):
...@@ -68,7 +72,7 @@ shutil.copytree(dataset_file, dataset_for_rank) ...@@ -68,7 +72,7 @@ shutil.copytree(dataset_file, dataset_for_rank)
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_dataset = \ train_dataset = \
datasets.MNIST('', train=True, download=False, datasets.MNIST(dataset_root_for_rank, train=True, download=False,
transform=transforms.Compose([ transform=transforms.Compose([
transforms.ToTensor(), transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,)) transforms.Normalize((0.1307,), (0.3081,))
...@@ -80,7 +84,7 @@ train_loader = torch.utils.data.DataLoader( ...@@ -80,7 +84,7 @@ train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)
test_dataset = \ test_dataset = \
datasets.MNIST('', train=False, download=False, transform=transforms.Compose([ datasets.MNIST(dataset_root_for_rank, train=False, download=False, transform=transforms.Compose([
transforms.ToTensor(), transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,)) transforms.Normalize((0.1307,), (0.3081,))
])) ]))
...@@ -116,13 +120,14 @@ if args.cuda: ...@@ -116,13 +120,14 @@ if args.cuda:
# Move model to GPU. # Move model to GPU.
model.cuda() model.cuda()
# Horovod: broadcast parameters.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
# Horovod: scale learning rate by the number of GPUs. # Horovod: scale learning rate by the number of GPUs.
optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(), optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(),
momentum=args.momentum) momentum=args.momentum)
# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Horovod: (optional) compression algorithm. # Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
...@@ -192,4 +197,4 @@ for epoch in range(1, args.epochs + 1): ...@@ -192,4 +197,4 @@ for epoch in range(1, args.epochs + 1):
test() test()
# [HPCNS] Remove the copied dataset # [HPCNS] Remove the copied dataset
shutil.rmtree(dataset_for_rank) shutil.rmtree(dataset_root_for_rank)
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# Slurm job configuration # Slurm job configuration
#SBATCH --nodes=2 #SBATCH --nodes=2
#SBATCH --ntasks=4 #SBATCH --ntasks=8
#SBATCH --ntasks-per-node=4 #SBATCH --ntasks-per-node=4
#SBATCH --output=output_%j.out #SBATCH --output=output_%j.out
#SBATCH --error=error_%j.er #SBATCH --error=error_%j.er
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# Slurm job configuration # Slurm job configuration
#SBATCH --nodes=2 #SBATCH --nodes=2
#SBATCH --ntasks=4 #SBATCH --ntasks=8
#SBATCH --ntasks-per-node=4 #SBATCH --ntasks-per-node=4
#SBATCH --output=output_%j.out #SBATCH --output=output_%j.out
#SBATCH --error=error_%j.er #SBATCH --error=error_%j.er
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment