diff --git a/src/distrib.py b/src/distrib.py index 322a146ad353a0c3d827b4498a765d1db7eff264..2b1bf703e952b2652592a6ff5417e7a75f7bba46 100644 --- a/src/distrib.py +++ b/src/distrib.py @@ -1,6 +1,11 @@ from fastai.vision.all import * from fastai.distributed import * from fastai.vision.models.xresnet import * +from accelerate import Accelerator + +# Print status information about the distributed environment +accelerator = Accelerator() +print(accelerator.state) path = rank0_first(untar_data, URLs.IMAGEWOOF_320) dls = DataBlock( diff --git a/src/distrib.slurm b/src/distrib.slurm index 1a86c33303c0c549b5d0ca5fe461e778b90d0173..bf7be2fa2a220e08fd75a2719ed20a78d26488a5 100644 --- a/src/distrib.slurm +++ b/src/distrib.slurm @@ -10,6 +10,20 @@ #SBATCH --partition=develbooster #SBATCH --gres=gpu:4 +# srun does not inherit cpus-per-task from sbatch +export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} +# so processes know who to talk to +MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)" +# Allow communication over InfiniBand cells. +MASTER_ADDR="${MASTER_ADDR}i" +# Get IP for hostname. +MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')" +MASTER_PORT=6000 +GPUS_PER_NODE=4 +NNODES=$SLURM_JOB_NUM_NODES + + + # Make sure we are on the right directory cd $HOME/2023-may-intro-to-supercompting-jsc/src diff --git a/src/serial.slurm b/src/serial.slurm index 7d38cacf8e477c396ed38c20b7b279f8880ef644..10071ea9f092bba271db786c24f5aa2268feb32e 100644 --- a/src/serial.slurm +++ b/src/serial.slurm @@ -7,7 +7,7 @@ #SBATCH --output=output.%j #SBATCH --error=err.%j #SBATCH --time=00:40:00 -#SBATCH --partition=booster +#SBATCH --partition=develbooster #SBATCH --gres=gpu:1 # Make sure we are on the right directory