From d60bc6eddf6813800d0c5f54672fb0304f24e755 Mon Sep 17 00:00:00 2001 From: Alexandre Strube <a.strube@fz-juelich.de> Date: Mon, 29 May 2023 23:04:46 +0200 Subject: [PATCH] trying stuff --- src/distrib.py | 5 +++++ src/distrib.slurm | 14 ++++++++++++++ src/serial.slurm | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/distrib.py b/src/distrib.py index 322a146..2b1bf70 100644 --- a/src/distrib.py +++ b/src/distrib.py @@ -1,6 +1,11 @@ from fastai.vision.all import * from fastai.distributed import * from fastai.vision.models.xresnet import * +from accelerate import Accelerator + +# Print status information about the distributed environment +accelerator = Accelerator() +print(accelerator.state) path = rank0_first(untar_data, URLs.IMAGEWOOF_320) dls = DataBlock( diff --git a/src/distrib.slurm b/src/distrib.slurm index 1a86c33..bf7be2f 100644 --- a/src/distrib.slurm +++ b/src/distrib.slurm @@ -10,6 +10,20 @@ #SBATCH --partition=develbooster #SBATCH --gres=gpu:4 +# srun does not inherit cpus-per-task from sbatch +export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} +# so processes know who to talk to +MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)" +# Allow communication over InfiniBand cells. +MASTER_ADDR="${MASTER_ADDR}i" +# Get IP for hostname. +MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')" +MASTER_PORT=6000 +GPUS_PER_NODE=4 +NNODES=$SLURM_JOB_NUM_NODES + + + # Make sure we are on the right directory cd $HOME/2023-may-intro-to-supercompting-jsc/src diff --git a/src/serial.slurm b/src/serial.slurm index 7d38cac..10071ea 100644 --- a/src/serial.slurm +++ b/src/serial.slurm @@ -7,7 +7,7 @@ #SBATCH --output=output.%j #SBATCH --error=err.%j #SBATCH --time=00:40:00 -#SBATCH --partition=booster +#SBATCH --partition=develbooster #SBATCH --gres=gpu:1 # Make sure we are on the right directory -- GitLab