#!/usr/bin/env bash
#SBATCH --account=atmlaml
#SBATCH --partition=develbooster
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
# 96 CPUs w/o nomultithread
# 48 CPUs w/ nomultithread
#SBATCH --cpus-per-task=48
# Use only physical cores.
#SBATCH --hint=nomultithread
#SBATCH --gres=gpu:4
#SBATCH --time=00:15:00

# Fail fast: abort on command errors, unset variables, and failures in any
# pipeline stage instead of launching `srun` with a broken environment.
set -euo pipefail

# Recover this script's on-disk path from SLURM (sbatch copies the submitted
# script into a spool directory, so `$0` does not point at the original file).
curr_file="$(scontrol show job "$SLURM_JOB_ID" \
    | grep '^[[:space:]]*Command=' \
    | head -n 1 \
    | cut -d '=' -f 2-)"
curr_dir="$(dirname "$curr_file")"

# Propagate the specified number of CPUs per task to each `srun`.
export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"

# Leave any already-active virtualenv. `deactivate` is a shell *function*, so
# only its existence can be checked via `command -v`; the original
# `[ -x "$(command -v deactivate)" ]` tested for an executable file named
# "deactivate" and therefore never triggered.
if command -v deactivate > /dev/null 2>&1; then
    deactivate
fi

module --force purge
module load Stages
source "$curr_dir"/modules.sh

if ! [ -d "$curr_dir"/env ]; then
    # Diagnostics belong on stderr.
    echo "Cannot set up \`venv\` on JUWELS Booster compute node." \
         "Please manually execute \`bash set_up.sh\` on a login node." >&2
    exit 1
fi
source "$curr_dir"/env/bin/activate

# First node in the allocation acts as the rendezvous host. Assign first,
# export after, so a failing `scontrol` is not masked by `export`'s own
# (always-zero) exit status.
MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
if [ "$SYSTEMNAME" = juwelsbooster ] \
       || [ "$SYSTEMNAME" = juwels ] \
       || [ "$SYSTEMNAME" = jurecadc ] \
       || [ "$SYSTEMNAME" = jusuf ]; then
    # Allow communication over InfiniBand cells on JSC machines.
    MASTER_ADDR="$MASTER_ADDR"i
fi
export MASTER_ADDR
export MASTER_PORT=54123

# Prevent NCCL not figuring out how to initialize.
export NCCL_SOCKET_IFNAME=ib0
# Prevent GLOO not being able to communicate.
export GLOO_SOCKET_IFNAME=ib0

# Drop any inherited CUDA_VISIBLE_DEVICES so torchrun itself assigns one
# process per GPU (`--nproc_per_node=gpu`).
srun env -u CUDA_VISIBLE_DEVICES python -u -m torchrun_jsc \
    --nproc_per_node=gpu \
    --nnodes="$SLURM_JOB_NUM_NODES" \
    --rdzv_id="$SLURM_JOB_ID" \
    --rdzv_endpoint="$MASTER_ADDR":"$MASTER_PORT" \
    --rdzv_backend=c10d \
    "$curr_dir"/main.py "$@"