Commit 2a23d51f authored by Alexandre Strube

from booster

parent 9de19268
Pipeline #140721 passed
#!/bin/bash -x
#SBATCH --account=training2306
#SBATCH --nodes=1
#SBATCH --job-name=ai-multi-gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48
#SBATCH --output=out-distrib.%j
#SBATCH --error=err-distrib.%j
#SBATCH --time=00:20:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
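# The directives above request one node of the booster partition with 4 GPUs,
# 48 CPU cores for the single task, and a 20-minute time limit; output and
# errors go to out-distrib.<jobid> and err-distrib.<jobid>.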
# srun does not inherit cpus-per-task from sbatch
export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
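# (Slurm 22.05 and newer no longer pass --cpus-per-task from sbatch to srun
# automatically, hence the explicit SRUN_CPUS_PER_TASK above.)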
# So that every process knows which node to talk to
MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
# Allow communication over InfiniBand cells.
MASTER_ADDR="${MASTER_ADDR}i"
# Get IP for hostname.
export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
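# Hypothetical example of what the three steps above produce: scontrol returns
# the first node of the allocation, e.g. jwb0001, the "i" suffix selects its
# InfiniBand hostname jwb0001i, and nslookup resolves that name to the IP
# address every rank will use to reach rank 0.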
export MASTER_PORT=7010
export GPUS_PER_NODE=4
export NNODES=$SLURM_JOB_NUM_NODES
# Do not remove: without this workaround the training can hang and nodes get lost
#export CUDA_LAUNCH_BLOCKING=1
# Hide duplicated errors with this hack - properly fixed in PyTorch 1.12
#export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
# Force a crash on NCCL issues such as a hanging broadcast
#export NCCL_ASYNC_ERROR_HANDLING=1
# handle timeouts
export NCCL_IB_TIMEOUT=20
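# NCCL_IB_TIMEOUT is the InfiniBand verbs timeout exponent (roughly
# 4.096 us * 2^value, so 20 is about 4.3 s), giving slow collectives more
# headroom before NCCL reports a transport error.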
# Make sure we are in the right directory
cd $HOME/2023-may-intro-to-supercompting-jsc/src
# This loads modules and python packages
source sc_venv_template/activate.sh
export LOGLEVEL=INFO
# Run the demo
time srun bash -c 'accelerate launch \
--main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \
--multi_gpu \
--mixed_precision=no \
--num_processes=$(($NNODES * $GPUS_PER_NODE)) \
--dynamo_backend=no \
--num_machines=$NNODES \
--machine_rank=$SLURM_PROCID \
--rdzv_conf "rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT rdzv_backend=c10d" \
distrib.py'
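# A minimal way to try this (the filename below is only an illustration):
# save this script as e.g. distrib.sh next to distrib.py, then submit it with
#   sbatch distrib.sh
# and follow the job's output with
#   squeue -u $USER
#   tail -f out-distrib.<jobid>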