Commit 5f8d6cad authored by Alexandre Strube

Merge branch 'fix' into 'main'

Clean up and fix everything

See merge request strube1/2023-may-intro-to-supercompting-jsc!1
Parents: 88132c8c, 18331bc1
Pipeline #140756 passed
-#!/bin/bash -x
+#!/bin/bash
#SBATCH --account=training2306
#SBATCH --nodes=2
#SBATCH --job-name=ai-multi-gpu
@@ -10,8 +10,9 @@
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
-# srun doesnot inherit cpus-per-task from sbatch
-export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
+# Without this, srun does not inherit cpus-per-task from sbatch.
+export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
# so processes know who to talk to
MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
# Allow communication over InfiniBand cells.
@@ -20,19 +21,8 @@ MASTER_ADDR="${MASTER_ADDR}i"
export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
export MASTER_PORT=7010
+export GPUS_PER_NODE=4
-export NNODES=$SLURM_JOB_NUM_NODES
-# do not remove or the training will hang and nodes will be lost w/o this workaround
-export CUDA_LAUNCH_BLOCKING=1
-# hide duplicated errors using this hack - will be properly fixed in pt-1.12
-export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
-# force crashing on nccl issues like hanging broadcast
-export NCCL_ASYNC_ERROR_HANDLING=1
-# handle timeouts
-export NCCL_IB_TIMEOUT=20
-export GPUS_PER_NODE=4
# Make sure we are on the right directory
cd $HOME/2023-may-intro-to-supercompting-jsc/src
@@ -40,17 +30,29 @@ cd $HOME/2023-may-intro-to-supercompting-jsc/src
# This loads modules and python packages
source sc_venv_template/activate.sh
export LOGLEVEL=INFO
+# Set up accelerate config.
+export ACCELERATE_CONFIG_YAML=accelerate_config_"$SLURM_JOB_ID".yaml
+srun bash -c "((\$SLURM_PROCID)) || cat <<EOT > \"\$ACCELERATE_CONFIG_YAML\"
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: \$SLURM_NODEID
+main_process_ip: '\$MASTER_ADDR'
+main_process_port: \$MASTER_PORT
+main_training_function: main
+mixed_precision: 'no'
+num_machines: \$SLURM_JOB_NUM_NODES
+num_processes: \$((SLURM_JOB_NUM_NODES * GPUS_PER_NODE))
+rdzv_backend: c10d
+same_network: false
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+EOT"
# Run the demo
time srun bash -c 'accelerate launch \
-  --main_process_ip $MASTER_ADDR \
-  --main_process_port $MASTER_PORT \
-  --multi_gpu \
-  --mixed_precision=no \
-  --num_processes=$(($NNODES * 4)) \
-  --dynamo_backend=no \
-  --num_machines=$NNODES \
-  --machine_rank=$SLURM_PROCID \
-  --rdzv_conf "rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT rdzv_backend=c10d" \
+  --config_file=$ACCELERATE_CONFIG_YAML \
  distrib.py'
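
For reference: with --nodes=2 and GPUS_PER_NODE=4, the heredoc above expands on the first node (SLURM_NODEID=0) to roughly the file below. The IP address shown is illustrative only; the real value is whatever the nslookup of the node's InfiniBand hostname returns at run time.

compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_process_ip: '10.11.12.13'  # illustrative address only
main_process_port: 7010
main_training_function: main
mixed_precision: 'no'
num_machines: 2
num_processes: 8
rdzv_backend: c10d
same_network: false
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

accelerate launch reads this file and starts 4 processes per node (8 in total), using MASTER_ADDR:7010 for the c10d rendezvous.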