diff --git a/src/distrib.slurm b/src/distrib.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..517cd42622c1636c209ea98677f64ed2f0dda5c5
--- /dev/null
+++ b/src/distrib.slurm
@@ -0,0 +1,54 @@
+#!/bin/bash -x
+#SBATCH --account=training2306
+#SBATCH --nodes=1
+#SBATCH --job-name=ai-multi-gpu
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=48
+#SBATCH --output=out-distrib.%j
+#SBATCH --error=err-distrib.%j
+#SBATCH --time=00:20:00
+#SBATCH --partition=booster
+#SBATCH --gres=gpu:4
+
+# srun does not inherit cpus-per-task from sbatch
+export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
+# So processes know who to talk to.
+MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
+# Allow communication over InfiniBand cells.
+MASTER_ADDR="${MASTER_ADDR}i"
+# Get IP for hostname.
+export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
+export MASTER_PORT=7010
+export GPUS_PER_NODE=4
+export NNODES=$SLURM_JOB_NUM_NODES
+# do not remove or the training will hang and nodes will be lost w/o this workaround
+#export CUDA_LAUNCH_BLOCKING=1
+
+# hide duplicated errors using this hack - will be properly fixed in pt-1.12
+#export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
+
+# force crashing on nccl issues like hanging broadcast
+#export NCCL_ASYNC_ERROR_HANDLING=1
+
+# handle timeouts
+export NCCL_IB_TIMEOUT=20
+
+# Make sure we are in the right directory
+cd $HOME/2023-may-intro-to-supercompting-jsc/src
+
+# This loads modules and Python packages
+source sc_venv_template/activate.sh
+
+export LOGLEVEL=INFO
+# Run the demo
+time srun bash -c 'accelerate launch \
+    --main_process_ip $MASTER_ADDR \
+    --main_process_port $MASTER_PORT \
+    --multi_gpu \
+    --mixed_precision=no \
+    --num_processes=$(($NNODES * 4)) \
+    --dynamo_backend=no \
+    --num_machines=$NNODES \
+    --machine_rank=$SLURM_PROCID \
+    --rdzv_conf "rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT rdzv_backend=c10d" \
+    distrib.py'
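
The batch script launches distrib.py through accelerate, but that file is not part of this diff. Below is a minimal sketch of what such an entry point could look like, assuming a plain Hugging Face Accelerate training loop; the model, dataset, and hyperparameters are illustrative placeholders, not the course's actual code.

# Hypothetical sketch of distrib.py: a minimal Accelerate training loop.
# Everything below (toy data, Linear model, learning rate) is a placeholder.
import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

def main():
    # Accelerator picks up rank, world size, and device from the launcher
    # (accelerate launch in the job script above).
    accelerator = Accelerator()

    # Toy data and model as stand-ins for the real dataset and network.
    dataset = TensorDataset(torch.randn(1024, 32), torch.randn(1024, 1))
    loader = DataLoader(dataset, batch_size=64, shuffle=True)
    model = torch.nn.Linear(32, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    # Wrap objects so Accelerate handles device placement, DDP, and sharding
    # of the dataloader across the launched processes.
    model, optimizer, loader = accelerator.prepare(model, optimizer, loader)

    loss_fn = torch.nn.MSELoss()
    for epoch in range(3):
        for x, y in loader:
            optimizer.zero_grad()
            loss = loss_fn(model(x), y)
            accelerator.backward(loss)  # replaces loss.backward()
            optimizer.step()
        accelerator.print(f"epoch {epoch}: loss {loss.item():.4f}")

if __name__ == "__main__":
    main()

With a script like this in place, the job would be submitted with sbatch src/distrib.slurm; accelerate launch then spawns the worker processes and has them rendezvous at the MASTER_ADDR and MASTER_PORT exported in the batch script.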