Commit 07a891f4 authored by Fahad Khalid

Updates after testing on JURECA.

parent fade019c
@@ -89,6 +89,7 @@ celerybeat-schedule
 env/
 venv/
 venv3/
+venv_2019a/
 ENV/
 env.bak/
 venv.bak/
@@ -156,7 +156,7 @@ systems. To learn more about Git LFS, click [here](http://gitlab.pages.jsc.fz-ju
 2. Load the Git LFS module:
 
-    `module load git-lfs/2.6.1`
+    `module load git-lfs`
 
 3. Initialize Git LFS:
 
     `git lfs install`
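A note on the change above: dropping the version suffix means `module load git-lfs` picks up whatever default version the module system provides (standard behavior for Lmod-style module environments), so the instructions presumably keep working after the pinned 2.6.1 module is retired on JURECA.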
@@ -35,7 +35,7 @@ batch_size = 128
 num_classes = 10
 
 # Horovod: adjust number of epochs based on number of GPUs.
-epochs = int(math.ceil(12.0 / hvd.size()))
+epochs = int(math.ceil(16.0 / hvd.size()))
 
 # Input image dimensions
 img_rows, img_cols = 28, 28
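The arithmetic above keeps the total training work roughly constant as workers are added: each of the `hvd.size()` workers runs `ceil(16 / hvd.size())` epochs. A minimal sketch of the effect (the worker counts are illustrative, not from the commit):

```python
import math

# Total epochs a single worker would run; matches the value in the diff.
base_epochs = 16.0

# Per-worker epochs for a few illustrative Horovod world sizes
# (hvd.size() returns the number of workers at run time).
for world_size in (1, 2, 4, 8):
    epochs = int(math.ceil(base_epochs / world_size))
    print('{} workers -> {} epochs each'.format(world_size, epochs))

# Output:
# 1 workers -> 16 epochs each
# 2 workers -> 8 epochs each
# 4 workers -> 4 epochs each
# 8 workers -> 2 epochs each
```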
@@ -36,7 +36,7 @@ num_classes = 10
 
 # Enough epochs to demonstrate learning rate warmup and the reduction of
 # learning rate when training plateaus.
-epochs = 12
+epochs = 16
 
 # Input image dimensions
 img_rows, img_cols = 28, 28
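Unlike the previous script, this one keeps `epochs` fixed rather than dividing by `hvd.size()`: as its own comment says, it needs enough epochs to demonstrate learning-rate warmup and plateau-based reduction regardless of worker count, and the bump from 12 to 16 simply widens that window.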
@@ -2,7 +2,7 @@
 # Slurm job configuration
 #SBATCH --nodes=2
-#SBATCH --ntasks=4
+#SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
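This correction makes the task geometry consistent: with `--nodes=2` and `--ntasks-per-node=4`, the job runs 2 × 4 = 8 tasks in total, so `--ntasks=8` now matches the per-node setting (the old value of 4 did not). The same fix is applied to the other two job scripts further below.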
@@ -53,11 +53,15 @@ if args.cuda:
     torch.cuda.set_device(hvd.local_rank())
     torch.cuda.manual_seed(args.seed)
 
+# Horovod: limit # of CPU threads to be used per worker.
+torch.set_num_threads(1)
+
 # [HPCNS] Fully qualified dataset file name
 dataset_file = os.path.join(data_dir, data_file)
 
 # [HPCNS] Dataset filename for this rank
-dataset_for_rank = 'MNIST'
+dataset_root_for_rank = 'MNIST-data-{}'.format(hvd.rank())
+dataset_for_rank = dataset_root_for_rank + '/MNIST'
 
 # [HPCNS] If the path already exists, remove it
 if os.path.exists(dataset_for_rank):
@@ -68,7 +72,7 @@ shutil.copytree(dataset_file, dataset_for_rank)
 
 kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
 train_dataset = \
-    datasets.MNIST('', train=True, download=False,
+    datasets.MNIST(dataset_root_for_rank, train=True, download=False,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
@@ -80,7 +84,7 @@ train_loader = torch.utils.data.DataLoader(
     train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)
 
 test_dataset = \
-    datasets.MNIST('', train=False, download=False, transform=transforms.Compose([
+    datasets.MNIST(dataset_root_for_rank, train=False, download=False, transform=transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))
     ]))
@@ -116,13 +120,14 @@ if args.cuda:
     # Move model to GPU.
     model.cuda()
 
-# Horovod: broadcast parameters.
-hvd.broadcast_parameters(model.state_dict(), root_rank=0)
-
 # Horovod: scale learning rate by the number of GPUs.
 optimizer = optim.SGD(model.parameters(), lr=args.lr * hvd.size(),
                       momentum=args.momentum)
 
+# Horovod: broadcast parameters & optimizer state.
+hvd.broadcast_parameters(model.state_dict(), root_rank=0)
+hvd.broadcast_optimizer_state(optimizer, root_rank=0)
+
 # Horovod: (optional) compression algorithm.
 compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
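This reshuffling is the substantive fix in the file: `hvd.broadcast_optimizer_state` needs an existing optimizer, so both broadcasts now happen after the optimizer is constructed. Rank 0's initial parameters and optimizer state then reach every worker before the first step, keeping the replicas consistent. A condensed sketch of the resulting order (the model and hyperparameters are placeholders, not the example's own):

```python
import horovod.torch as hvd
import torch.nn as nn
import torch.optim as optim

hvd.init()

# Placeholder model; the real script defines its own network class.
model = nn.Linear(784, 10)

# 1. Create the optimizer first (learning rate scaled by world size).
optimizer = optim.SGD(model.parameters(), lr=0.01 * hvd.size(), momentum=0.5)

# 2. Broadcast rank 0's parameters and optimizer state to all workers.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# 3. Wrap the optimizer so gradients are averaged across workers each step.
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters())
```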
@@ -192,4 +197,4 @@ for epoch in range(1, args.epochs + 1):
     test()
 
 # [HPCNS] Remove the copied dataset
-shutil.rmtree(dataset_for_rank)
+shutil.rmtree(dataset_root_for_rank)
@@ -2,7 +2,7 @@
 # Slurm job configuration
 #SBATCH --nodes=2
-#SBATCH --ntasks=4
+#SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er
@@ -2,7 +2,7 @@
 # Slurm job configuration
 #SBATCH --nodes=2
-#SBATCH --ntasks=4
+#SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=4
 #SBATCH --output=output_%j.out
 #SBATCH --error=error_%j.er