#!/usr/bin/env bash
#SBATCH --account=atmlaml
#SBATCH --partition=develbooster
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
# 96 CPUs w/o nomultithread
# 48 CPUs w/ nomultithread
#SBATCH --cpus-per-task=48
# Use only physical cores.
#SBATCH --hint=nomultithread
#SBATCH --gres=gpu:4
#SBATCH --time=00:15:00

# Fail fast: abort on command errors, unset variables, and failures in any
# pipeline stage instead of launching `srun` with a broken environment.
set -euo pipefail

# Recover this script's on-disk path from SLURM (sbatch copies the submitted
# script into a spool directory, so `$0` does not point at the original file).
curr_file="$(scontrol show job "$SLURM_JOB_ID" \
    | grep '^[[:space:]]*Command=' \
    | head -n 1 \
    | cut -d '=' -f 2-)"
curr_dir="$(dirname "$curr_file")"

# Propagate the specified number of CPUs per task to each `srun`.
export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"

# Leave any already-active virtualenv. `deactivate` is a shell *function*, so
# only its existence can be checked via `command -v`; the original
# `[ -x "$(command -v deactivate)" ]` tested for an executable file named
# "deactivate" and therefore never triggered.
if command -v deactivate > /dev/null 2>&1; then
    deactivate
fi

module --force purge
module load Stages
source "$curr_dir"/modules.sh

if ! [ -d "$curr_dir"/env ]; then
    # Diagnostics belong on stderr.
    echo "Cannot set up \`venv\` on JUWELS Booster compute node." \
         "Please manually execute \`bash set_up.sh\` on a login node." >&2
    exit 1
fi
source "$curr_dir"/env/bin/activate

# First node in the allocation acts as the rendezvous host. Assign first,
# export after, so a failing `scontrol` is not masked by `export`'s own
# (always-zero) exit status.
MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
if [ "$SYSTEMNAME" = juwelsbooster ] \
       || [ "$SYSTEMNAME" = juwels ] \
       || [ "$SYSTEMNAME" = jurecadc ] \
       || [ "$SYSTEMNAME" = jusuf ]; then
    # Allow communication over InfiniBand cells on JSC machines.
    MASTER_ADDR="$MASTER_ADDR"i
fi
export MASTER_ADDR
export MASTER_PORT=54123

# Prevent NCCL not figuring out how to initialize.
export NCCL_SOCKET_IFNAME=ib0
# Prevent GLOO not being able to communicate.
export GLOO_SOCKET_IFNAME=ib0

# Drop any inherited CUDA_VISIBLE_DEVICES so torchrun itself assigns one
# process per GPU (`--nproc_per_node=gpu`).
srun env -u CUDA_VISIBLE_DEVICES python -u -m torchrun_jsc \
    --nproc_per_node=gpu \
    --nnodes="$SLURM_JOB_NUM_NODES" \
    --rdzv_id="$SLURM_JOB_ID" \
    --rdzv_endpoint="$MASTER_ADDR":"$MASTER_PORT" \
    --rdzv_backend=c10d \
    "$curr_dir"/main.py "$@"