Commit 87acffca authored by Fahad Khalid

Updates replicated from jureca_2019_a.

parent f2139653
@@ -89,6 +89,7 @@ celerybeat-schedule
 env/
 venv/
 venv3/
+venv_2019a/
 ENV/
 env.bak/
 venv.bak/
@@ -2,7 +2,7 @@
 # Slurm job configuration
 #SBATCH --nodes=2
-#SBATCH --ntasks=4
+#SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
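With --nodes=2 and --ntasks-per-node=4, the total task count is raised from 4 to 8 so that --ntasks equals nodes × ntasks-per-node, presumably one Horovod rank per GPU on a four-GPU node. A minimal sketch (assuming Horovod with the PyTorch backend is installed; not part of this commit) to verify the resulting rank layout from inside such a job:

import horovod.torch as hvd

# Prints one line per task; with the settings above this should report
# hvd.size() == 8, with local ranks 0-3 on each of the two nodes.
hvd.init()
print('global rank {} of {}, local rank {}'.format(
    hvd.rank(), hvd.size(), hvd.local_rank()))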
@@ -53,11 +53,15 @@ if args.cuda:
     torch.cuda.set_device(hvd.local_rank())
     torch.cuda.manual_seed(args.seed)
 # Horovod: limit # of CPU threads to be used per worker.
 torch.set_num_threads(1)
 # [HPCNS] Fully qualified dataset file name
 dataset_file = os.path.join(data_dir, data_file)
 # [HPCNS] Dataset filename for this rank
-dataset_for_rank = 'MNIST'
+dataset_root_for_rank = 'MNIST-data-{}'.format(hvd.rank())
+dataset_for_rank = dataset_root_for_rank + '/MNIST'
 # [HPCNS] If the path already exists, remove it
 if os.path.exists(dataset_for_rank):
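The two-level naming introduced here reflects how torchvision resolves MNIST files: the version in use evidently expects the data under <root>/MNIST, so each rank's private copy is staged at MNIST-data-<rank>/MNIST while the loaders are pointed at dataset_root_for_rank. A hedged sketch of the staging pattern (the source path below is hypothetical; the real script builds it from data_dir and data_file, defined elsewhere):

import os
import shutil
import horovod.torch as hvd

hvd.init()
dataset_file = os.path.join('datasets', 'mnist', 'MNIST')  # hypothetical source path
dataset_root_for_rank = 'MNIST-data-{}'.format(hvd.rank())
dataset_for_rank = dataset_root_for_rank + '/MNIST'

# Remove any stale copy, then give this rank a private copy of the dataset,
# so eight ranks never read and write the same files concurrently.
if os.path.exists(dataset_for_rank):
    shutil.rmtree(dataset_for_rank)
shutil.copytree(dataset_file, dataset_for_rank)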
@@ -68,7 +72,7 @@ shutil.copytree(dataset_file, dataset_for_rank)
 kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
 train_dataset = \
-    datasets.MNIST('', train=True, download=False,
+    datasets.MNIST(dataset_root_for_rank, train=True, download=False,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
@@ -80,7 +84,7 @@ train_loader = torch.utils.data.DataLoader(
     train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)
 test_dataset = \
-    datasets.MNIST('', train=False, download=False, transform=transforms.Compose([
+    datasets.MNIST(dataset_root_for_rank, train=False, download=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ]))
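Both datasets are built with download=False, so they rely entirely on the copy staged above. The train_sampler referenced by the loader is created outside this hunk; in the standard Horovod MNIST example it is a DistributedSampler keyed to the Horovod rank, roughly as follows (a sketch assuming this script follows that example):

import torch.utils.data.distributed

# Partition the training set across workers so each rank sees a distinct shard.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)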
@@ -116,13 +120,14 @@ if args.cuda:
     # Move model to GPU.
     model.cuda()
-# Horovod: broadcast parameters.
-hvd.broadcast_parameters(model.state_dict(), root_rank=0)
 # Horovod: scale learning rate by the number of GPUs.
 optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(),
                       momentum=args.momentum)
+# Horovod: broadcast parameters & optimizer state.
+hvd.broadcast_parameters(model.state_dict(), root_rank=0)
+hvd.broadcast_optimizer_state(optimizer, root_rank=0)
 # Horovod: (optional) compression algorithm.
 compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
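Moving the broadcast below the optimizer's construction is what makes the second call possible: hvd.broadcast_optimizer_state needs an optimizer that already exists, and broadcasting both the parameters and the optimizer state from rank 0 guarantees that all workers start from an identical state. In Horovod's examples the optimizer is then wrapped for distributed gradient averaging, a step outside this hunk; sketched here for context:

# Wrap the local SGD optimizer so that backward() triggers an allreduce of
# gradients across ranks, using the compression selected above.
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=model.named_parameters(),
    compression=compression)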
@@ -192,4 +197,4 @@ for epoch in range(1, args.epochs + 1):
     test()
 # [HPCNS] Remove the copied dataset
-shutil.rmtree(dataset_for_rank)
+shutil.rmtree(dataset_root_for_rank)
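Switching the cleanup target from dataset_for_rank to dataset_root_for_rank matters because removing only the inner MNIST folder would leave an empty MNIST-data-<rank> directory behind after every run. If the copy should also disappear when training crashes, one option (a sketch, not part of the commit; run_training_and_test is a hypothetical stand-in for the script's epoch loop) is a try/finally guard:

import shutil

try:
    run_training_and_test()
finally:
    # Remove the per-rank copy even if an exception was raised above.
    shutil.rmtree(dataset_root_for_rank, ignore_errors=True)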
@@ -2,7 +2,7 @@
 # Slurm job configuration
 #SBATCH --nodes=2
-#SBATCH --ntasks=4
+#SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
@@ -2,7 +2,7 @@
 # Slurm job configuration
 #SBATCH --nodes=2
-#SBATCH --ntasks=4
+#SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er