distrib.slurm
#!/bin/bash
#SBATCH --account=training2436
#SBATCH --nodes=1
#SBATCH --job-name=ai-multi-gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48
#SBATCH --output=out-distrib.%j
#SBATCH --error=err-distrib.%j
#SBATCH --time=00:20:00
#SBATCH --partition=dc-gpu
#SBATCH --gres=gpu:4
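# Resource summary: one launcher task per node gets all 48 CPUs and all four
# GPUs; accelerate (below) spawns the per-GPU worker processes itself.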
# Without this, srun does not inherit cpus-per-task from sbatch.
export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
# So worker processes know which node to talk to.
MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
# Communicate over InfiniBand: on JSC systems, the InfiniBand interface is
# reachable under the node's hostname with an "i" suffix.
MASTER_ADDR="${MASTER_ADDR}i"
# Get IP for hostname.
export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
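# A hedged alternative to the nslookup line above (an assumption, not part of
# the original script): getent queries the system resolver directly and avoids
# parsing nslookup's output format, which varies between resolvers.
# export MASTER_ADDR="$(getent hosts "${MASTER_ADDR}" | awk '{print $1}')"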
export MASTER_PORT=7010
export GPUS_PER_NODE=4
# Make sure we are in the right directory.
cd "$HOME/2024-11-talk-intro-to-supercompting-jsc/src"
# Load the required modules and Python packages.
source sc_venv_template/activate.sh
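# Launch one accelerate process per node (srun starts ntasks-per-node=1 task);
# each launcher spawns one worker per GPU, so --num_processes is the total
# world size: nodes * GPUS_PER_NODE. Single quotes delay variable expansion
# until the command runs on the compute node, where SLURM_NODEID is set.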
time srun bash -c 'accelerate launch \
--main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \
--multi_gpu \
--mixed_precision no \
--num_processes=$(($SLURM_JOB_NUM_NODES * $GPUS_PER_NODE)) \
--dynamo_backend=no \
--num_machines=$SLURM_JOB_NUM_NODES \
--machine_rank=$SLURM_NODEID \
--rdzv_backend c10d \
distrib.py'
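# Usage sketch (assumes this file is saved as distrib.slurm):
#   sbatch distrib.slurm            # submit the job
#   squeue -u "$USER"               # watch its state
#   tail -f out-distrib.<jobid>     # follow the stdout log named above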