Commit 2a23d51f authored by Alexandre Strube

from booster

parent 9de19268
Pipeline #140721 passed
#!/bin/bash -x
#SBATCH --account=training2306
#SBATCH --nodes=1
#SBATCH --job-name=ai-multi-gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48
#SBATCH --output=out-distrib.%j
#SBATCH --error=err-distrib.%j
#SBATCH --time=00:20:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
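# The directives above request one node of the booster partition with 4 GPUs,
# 48 CPU cores for the single task, and a 20-minute time limit; output and
# errors go to out-distrib.<jobid> and err-distrib.<jobid>.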
# srun does not inherit cpus-per-task from sbatch
export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
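# (Slurm 22.05 and newer no longer pass --cpus-per-task from sbatch to srun
# automatically, hence the explicit SRUN_CPUS_PER_TASK above.)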
# So that every process knows which node to talk to
MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
# Allow communication over InfiniBand cells.
MASTER_ADDR="${MASTER_ADDR}i"
# Get IP for hostname.
export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
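# Hypothetical example of what the three steps above produce: scontrol returns
# the first node of the allocation, e.g. jwb0001, the "i" suffix selects its
# InfiniBand hostname jwb0001i, and nslookup resolves that name to the IP
# address every rank will use to reach rank 0.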
export MASTER_PORT=7010
export GPUS_PER_NODE=4
export NNODES=$SLURM_JOB_NUM_NODES
# Do not remove: without this workaround the training can hang and nodes get lost
#export CUDA_LAUNCH_BLOCKING=1
# Hide duplicated errors with this hack - properly fixed in PyTorch 1.12
#export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
# Force a crash on NCCL issues such as a hanging broadcast
#export NCCL_ASYNC_ERROR_HANDLING=1
# handle timeouts
export NCCL_IB_TIMEOUT=20
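# NCCL_IB_TIMEOUT is the InfiniBand verbs timeout exponent (roughly
# 4.096 us * 2^value, so 20 is about 4.3 s), giving slow collectives more
# headroom before NCCL reports a transport error.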
# Make sure we are in the right directory
cd $HOME/2023-may-intro-to-supercompting-jsc/src
# This loads modules and python packages
source sc_venv_template/activate.sh
export LOGLEVEL=INFO
# Run the demo
time srun bash -c 'accelerate launch \
--main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \
--multi_gpu \
--mixed_precision=no \
--num_processes=$(($NNODES * $GPUS_PER_NODE)) \
--dynamo_backend=no \
--num_machines=$NNODES \
--machine_rank=$SLURM_PROCID \
--rdzv_conf "rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT rdzv_backend=c10d" \
distrib.py'
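# A minimal way to try this (the filename below is only an illustration):
# save this script as e.g. distrib.sh next to distrib.py, then submit it with
#   sbatch distrib.sh
# and follow the job's output with
#   squeue -u $USER
#   tail -f out-distrib.<jobid>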