Commit 18331bc1 authored by Jan Ebert

Clean up and fix everything

parent 88132c8c
1 merge request: !1 Clean up and fix everything
-#!/bin/bash -x
+#!/bin/bash
 #SBATCH --account=training2306
 #SBATCH --nodes=2
 #SBATCH --job-name=ai-multi-gpu
@@ -10,8 +10,9 @@
 #SBATCH --partition=booster
 #SBATCH --gres=gpu:4
-# srun doesnot inherit cpus-per-task from sbatch
-export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
+# Without this, srun does not inherit cpus-per-task from sbatch.
+export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
 # so processes know who to talk to
 MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 # Allow communication over InfiniBand cells.
@@ -20,19 +21,8 @@ MASTER_ADDR="${MASTER_ADDR}i"
 export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
 export MASTER_PORT=7010
-export GPUS_PER_NODE=4
-export NNODES=$SLURM_JOB_NUM_NODES
-# do not remove or the training will hang and nodes will be lost w/o this workaround
-export CUDA_LAUNCH_BLOCKING=1
-# hide duplicated errors using this hack - will be properly fixed in pt-1.12
-export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
-# force crashing on nccl issues like hanging broadcast
-export NCCL_ASYNC_ERROR_HANDLING=1
-# handle timeouts
-export NCCL_IB_TIMEOUT=20
+export GPUS_PER_NODE=4
 # Make sure we are on the right directory
 cd $HOME/2023-may-intro-to-supercompting-jsc/src
@@ -40,17 +30,29 @@ cd $HOME/2023-may-intro-to-supercompting-jsc/src
 # This loads modules and python packages
 source sc_venv_template/activate.sh
 export LOGLEVEL=INFO
+# Set up accelerate config.
+export ACCELERATE_CONFIG_YAML=accelerate_config_"$SLURM_JOB_ID".yaml
+srun bash -c "((\$SLURM_PROCID)) || cat <<EOT > \"\$ACCELERATE_CONFIG_YAML\"
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: \$SLURM_NODEID
+main_process_ip: '\$MASTER_ADDR'
+main_process_port: \$MASTER_PORT
+main_training_function: main
+mixed_precision: 'no'
+num_machines: \$SLURM_JOB_NUM_NODES
+num_processes: \$((SLURM_JOB_NUM_NODES * GPUS_PER_NODE))
+rdzv_backend: c10d
+same_network: false
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+EOT"
 # Run the demo
 time srun bash -c 'accelerate launch \
-    --main_process_ip $MASTER_ADDR \
-    --main_process_port $MASTER_PORT \
-    --multi_gpu \
-    --mixed_precision=no \
-    --num_processes=$(($NNODES * 4)) \
-    --dynamo_backend=no \
-    --num_machines=$NNODES \
-    --machine_rank=$SLURM_PROCID \
-    --rdzv_conf "rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT rdzv_backend=c10d" \
+    --config_file=$ACCELERATE_CONFIG_YAML \
     distrib.py'
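
A note on the guard in the added config-generation step: in bash, (( EXPR )) exits with status 0 when EXPR is nonzero and with status 1 when it is zero, so "(( $SLURM_PROCID )) || cat <<EOT ..." runs the heredoc only in the task whose SLURM_PROCID is 0 (the backslash before the dollar sign in the script merely protects the expansion from the outer double-quoted srun string). A minimal sketch of the idiom; write_config is a hypothetical stand-in for the heredoc and is not part of the commit:

#!/bin/bash
# Sketch of the rank-0 guard used above: only the task with SLURM_PROCID=0
# ends up writing the Accelerate config file.

write_config() {
    # Hypothetical stand-in for the `cat <<EOT > "$ACCELERATE_CONFIG_YAML"` step.
    echo "task $SLURM_PROCID writes the config"
}

# Simulate a few SLURM tasks locally; under srun, SLURM_PROCID is set per task.
for SLURM_PROCID in 0 1 2 3; do
    export SLURM_PROCID
    (( SLURM_PROCID )) || write_config   # runs write_config only when SLURM_PROCID is 0
done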
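
For reference, because the EOT delimiter is unquoted, the inner shell expands the escaped variables before the file is written. With the job header shown here (2 nodes, 4 GPUs per node, MASTER_PORT 7010), the file written by task 0 should look roughly like the sketch below; the main_process_ip value is a placeholder for whatever nslookup returns for the head node's InfiniBand hostname, and machine_rank is the SLURM_NODEID of the writing task:

compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_process_ip: '<resolved IP of the first node>'   # placeholder
main_process_port: 7010
main_training_function: main
mixed_precision: 'no'
num_machines: 2
num_processes: 8
rdzv_backend: c10d
same_network: false
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false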
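
Read together, the launch step that remains after this change should reduce to the config-driven call below. This is a reconstruction from the added lines of the last hunk, assuming every individually passed flag was dropped in favour of the config file; ACCELERATE_CONFIG_YAML is exported, so it still expands inside the single-quoted inner shell:

time srun bash -c 'accelerate launch \
    --config_file=$ACCELERATE_CONFIG_YAML \
    distrib.py'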