diff --git a/src/distrib.py b/src/distrib.py index 2b1bf703e952b2652592a6ff5417e7a75f7bba46..322a146ad353a0c3d827b4498a765d1db7eff264 100644 --- a/src/distrib.py +++ b/src/distrib.py @@ -1,11 +1,6 @@ from fastai.vision.all import * from fastai.distributed import * from fastai.vision.models.xresnet import * -from accelerate import Accelerator - -# Print status information about the distributed environment -accelerator = Accelerator() -print(accelerator.state) path = rank0_first(untar_data, URLs.IMAGEWOOF_320) dls = DataBlock( diff --git a/src/distrib.slurm b/src/distrib.slurm index bf7be2fa2a220e08fd75a2719ed20a78d26488a5..24e2ca06334dca1290f256662899b7d1f5094496 100644 --- a/src/distrib.slurm +++ b/src/distrib.slurm @@ -3,12 +3,12 @@ #SBATCH --nodes=1 #SBATCH --job-name=ai-multi-gpu #SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=4 -#SBATCH --output=output.%j -#SBATCH --error=err.%j +#SBATCH --cpus-per-task=48 +#SBATCH --output=out-distrib.%j +#SBATCH --error=err-distrib.%j #SBATCH --time=00:20:00 #SBATCH --partition=develbooster -#SBATCH --gres=gpu:4 +#SBATCH --gres=gpu:4 # srun doesnot inherit cpus-per-task from sbatch export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} @@ -17,13 +17,10 @@ MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)" # Allow communication over InfiniBand cells. MASTER_ADDR="${MASTER_ADDR}i" # Get IP for hostname. -MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')" MASTER_PORT=6000 GPUS_PER_NODE=4 NNODES=$SLURM_JOB_NUM_NODES - - # Make sure we are on the right directory cd $HOME/2023-may-intro-to-supercompting-jsc/src diff --git a/src/serial.slurm b/src/serial.slurm index 10071ea9f092bba271db786c24f5aa2268feb32e..27ac4a821c3441e9fb37fa10340e269ab58d8f52 100644 --- a/src/serial.slurm +++ b/src/serial.slurm @@ -4,8 +4,8 @@ #SBATCH --job-name=ai-serial #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=1 -#SBATCH --output=output.%j -#SBATCH --error=err.%j +#SBATCH --output=out-serial.%j +#SBATCH --error=err-serial.%j #SBATCH --time=00:40:00 #SBATCH --partition=develbooster #SBATCH --gres=gpu:1