diff --git a/src/distrib.slurm b/src/distrib.slurm
index 346cc0e7aadbdeb6d8cf016d9ec0fb4d48f94bab..562d48d44a04819a13f74fa4d4ff2ba389e408ac 100644
--- a/src/distrib.slurm
+++ b/src/distrib.slurm
@@ -1,4 +1,4 @@
-#!/bin/bash -x
+#!/bin/bash
 #SBATCH --account=training2306
 #SBATCH --nodes=2
 #SBATCH --job-name=ai-multi-gpu
@@ -10,8 +10,9 @@
 #SBATCH --partition=booster
 #SBATCH --gres=gpu:4
 
-# srun doesnot inherit cpus-per-task from sbatch
-export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
+# Without this, srun does not inherit cpus-per-task from sbatch.
+export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
+
 # so processes know who to talk to
 MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 # Allow communication over InfiniBand cells.
@@ -20,19 +21,8 @@ MASTER_ADDR="${MASTER_ADDR}i"
 export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
 export MASTER_PORT=7010
 
-export GPUS_PER_NODE=4
-export NNODES=$SLURM_JOB_NUM_NODES
-# do not remove or the training will hang and nodes will be lost w/o this workaround
-export CUDA_LAUNCH_BLOCKING=1
-
-# hide duplicated errors using this hack - will be properly fixed in pt-1.12
-export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
-
-# force crashing on nccl issues like hanging broadcast
-export NCCL_ASYNC_ERROR_HANDLING=1
-# handle timeouts
-export NCCL_IB_TIMEOUT=20
+export GPUS_PER_NODE=4
 
 # Make sure we are on the right directory
 cd $HOME/2023-may-intro-to-supercompting-jsc/src
@@ -40,17 +30,29 @@ cd $HOME/2023-may-intro-to-supercompting-jsc/src
 # This loads modules and python packages
 source sc_venv_template/activate.sh
 
-export LOGLEVEL=INFO
+# Set up accelerate config.
+export ACCELERATE_CONFIG_YAML=accelerate_config_"$SLURM_JOB_ID".yaml
+srun bash -c "((\$SLURM_PROCID)) || cat <<EOT > \"\$ACCELERATE_CONFIG_YAML\"
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: \$SLURM_NODEID
+main_process_ip: '\$MASTER_ADDR'
+main_process_port: \$MASTER_PORT
+main_training_function: main
+mixed_precision: 'no'
+num_machines: \$SLURM_JOB_NUM_NODES
+num_processes: \$((SLURM_JOB_NUM_NODES * GPUS_PER_NODE))
+rdzv_backend: c10d
+same_network: false
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+EOT"
+
 # Run the demo
 time srun bash -c 'accelerate launch \
-    --main_process_ip $MASTER_ADDR \
-    --main_process_port $MASTER_PORT \
-    --multi_gpu \
-    --mixed_precision=no \
-    --num_processes=$(($NNODES * 4)) \
-    --dynamo_backend=no \
-    --num_machines=$NNODES \
-    --machine_rank=$SLURM_PROCID \
-    --rdzv_conf "rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT rdzv_backend=c10d" \
+    --config_file=$ACCELERATE_CONFIG_YAML \
     distrib.py'
-