Commit 18331bc1 authored by Jan Ebert

Clean up and fix everything

parent 88132c8c
1 merge request: !1 Clean up and fix everything
-#!/bin/bash -x
+#!/bin/bash
 #SBATCH --account=training2306
 #SBATCH --nodes=2
 #SBATCH --job-name=ai-multi-gpu
@@ -10,8 +10,9 @@
 #SBATCH --partition=booster
 #SBATCH --gres=gpu:4
-# srun doesnot inherit cpus-per-task from sbatch
-export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
+# Without this, srun does not inherit cpus-per-task from sbatch.
+export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
 # so processes know who to talk to
 MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 # Allow communication over InfiniBand cells.
@@ -20,19 +21,8 @@ MASTER_ADDR="${MASTER_ADDR}i"
 export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
 export MASTER_PORT=7010
-export GPUS_PER_NODE=4
-export NNODES=$SLURM_JOB_NUM_NODES
-# do not remove or the training will hang and nodes will be lost w/o this workaround
-export CUDA_LAUNCH_BLOCKING=1
-# hide duplicated errors using this hack - will be properly fixed in pt-1.12
-export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
-# force crashing on nccl issues like hanging broadcast
-export NCCL_ASYNC_ERROR_HANDLING=1
-# handle timeouts
-export NCCL_IB_TIMEOUT=20
+export GPUS_PER_NODE=4
 # Make sure we are on the right directory
 cd $HOME/2023-may-intro-to-supercompting-jsc/src
@@ -40,17 +30,29 @@ cd $HOME/2023-may-intro-to-supercompting-jsc/src
 # This loads modules and python packages
 source sc_venv_template/activate.sh
 export LOGLEVEL=INFO
+# Set up accelerate config.
+export ACCELERATE_CONFIG_YAML=accelerate_config_"$SLURM_JOB_ID".yaml
+srun bash -c "((\$SLURM_PROCID)) || cat <<EOT > \"\$ACCELERATE_CONFIG_YAML\"
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: \$SLURM_NODEID
+main_process_ip: '\$MASTER_ADDR'
+main_process_port: \$MASTER_PORT
+main_training_function: main
+mixed_precision: 'no'
+num_machines: \$SLURM_JOB_NUM_NODES
+num_processes: \$((SLURM_JOB_NUM_NODES * GPUS_PER_NODE))
+rdzv_backend: c10d
+same_network: false
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+EOT"
 # Run the demo
 time srun bash -c 'accelerate launch \
-    --main_process_ip $MASTER_ADDR \
-    --main_process_port $MASTER_PORT \
-    --multi_gpu \
-    --mixed_precision=no \
-    --num_processes=$(($NNODES * 4)) \
-    --dynamo_backend=no \
-    --num_machines=$NNODES \
-    --machine_rank=$SLURM_PROCID \
-    --rdzv_conf "rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT rdzv_backend=c10d" \
+    --config_file=$ACCELERATE_CONFIG_YAML \
     distrib.py'
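
A note on the guard in the added config-generation step: in bash, (( EXPR )) exits with status 0 when EXPR is nonzero and with status 1 when it is zero, so "(( $SLURM_PROCID )) || cat <<EOT ..." runs the heredoc only in the task whose SLURM_PROCID is 0 (the backslash before the dollar sign in the script merely protects the expansion from the outer double-quoted srun string). A minimal sketch of the idiom; write_config is a hypothetical stand-in for the heredoc and is not part of the commit:

#!/bin/bash
# Sketch of the rank-0 guard used above: only the task with SLURM_PROCID=0
# ends up writing the Accelerate config file.

write_config() {
    # Hypothetical stand-in for the `cat <<EOT > "$ACCELERATE_CONFIG_YAML"` step.
    echo "task $SLURM_PROCID writes the config"
}

# Simulate a few SLURM tasks locally; under srun, SLURM_PROCID is set per task.
for SLURM_PROCID in 0 1 2 3; do
    export SLURM_PROCID
    (( SLURM_PROCID )) || write_config   # runs write_config only when SLURM_PROCID is 0
done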
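
For reference, because the EOT delimiter is unquoted, the inner shell expands the escaped variables before the file is written. With the job header shown here (2 nodes, 4 GPUs per node, MASTER_PORT 7010), the file written by task 0 should look roughly like the sketch below; the main_process_ip value is a placeholder for whatever nslookup returns for the head node's InfiniBand hostname, and machine_rank is the SLURM_NODEID of the writing task:

compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_process_ip: '<resolved IP of the first node>'   # placeholder
main_process_port: 7010
main_training_function: main
mixed_precision: 'no'
num_machines: 2
num_processes: 8
rdzv_backend: c10d
same_network: false
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false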
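
Read together, the launch step that remains after this change should reduce to the config-driven call below. This is a reconstruction from the added lines of the last hunk, assuming every individually passed flag was dropped in favour of the config file; ACCELERATE_CONFIG_YAML is exported, so it still expands inside the single-quoted inner shell:

time srun bash -c 'accelerate launch \
    --config_file=$ACCELERATE_CONFIG_YAML \
    distrib.py'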