Skip to content
Snippets Groups Projects
Commit 49c75086 authored by Stefan Kesselheim
Browse files

update

parent 42f4501c
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env bash
#SBATCH --account=atmlaml
#SBATCH --partition=develbooster
#SBATCH --nodes=1
#SBATCH --account=training2437
#SBATCH --partition=dc-gpu-devel
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48
# Use only physical cores. (Can use up to 2 threads per core.)
......@@ -16,7 +16,7 @@ curr_dir="$(dirname "$curr_file")"
# Propagate the specified number of CPUs per task to each `srun`.
export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
source "$curr_dir"/activate.sh
source ./activate.sh
export MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
if [ "$SYSTEMNAME" = juwelsbooster ] \
......@@ -33,10 +33,12 @@ export NCCL_SOCKET_IFNAME=ib0
# Prevent GLOO not being able to communicate.
export GLOO_SOCKET_IFNAME=ib0
srun --tasks-per-node=1 copy_and_unpack_data_to_shm.sh
srun env -u CUDA_VISIBLE_DEVICES python -u -m torchrun_jsc \
--nproc_per_node=gpu \
--nnodes="$SLURM_JOB_NUM_NODES" \
--rdzv_id="$SLURM_JOB_ID" \
--rdzv_endpoint="$MASTER_ADDR":"$MASTER_PORT" \
--rdzv_backend=c10d \
"$curr_dir"/main.py "$@"
./train_parallel.py "$@"
......@@ -5,6 +5,7 @@ import torchvision
import torchvision.models
import types
import os
import time
root='/dev/shm/data/'
modelsroot='/p/project1/training2437/kesselheim1/galaxy-classification/models/'
......@@ -35,6 +36,7 @@ for images, labels in datamodule.train_dataloader():
def train(args, model, device, train_loader, optimizer, epoch, loss_fn):
model.train()
start_time=time.time()
for batch_idx, (data, target) in enumerate(train_loader):
target=(target>0.).type(torch.long)
......@@ -50,6 +52,7 @@ def train(args, model, device, train_loader, optimizer, epoch, loss_fn):
100. * batch_idx / len(train_loader), loss.item()))
if args.dry_run:
break
print("--- Epoch time: %s seconds ---" % (time.time() - start_time))
def test(model, device, test_loader, loss_fn):
model.eval()
......@@ -80,7 +83,7 @@ model=model.cuda()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(), lr=0.01)
optimizer=torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs=20
args=types.SimpleNamespace(dry_run=False, log_interval=16)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment