diff --git a/01-deep-learning-on-supercomputers.md b/01-deep-learning-on-supercomputers.md
index 4e6842bd83ab488be344773a40861f5a18961c06..fcd8ce2d49ab83016f0a050ee06b3b27a52eced2 100644
--- a/01-deep-learning-on-supercomputers.md
+++ b/01-deep-learning-on-supercomputers.md
@@ -378,8 +378,6 @@ date: May 31, 2023
 - Let's take a simple model
 - Run it "serially" (single-gpu)
 - Then make it data parallel among multiple gpus in one node
-- Afterwards, make it data parallel among multiple nodes
-- Finally, make it in model parallel
 
 ---
 
diff --git a/public/01-deep-learning-on-supercomputers.html b/public/01-deep-learning-on-supercomputers.html
index cedece5020201504a0caa13293a34c1c8c2ffe53..54e6f76f8df5ec323c1aa287b476c2e47c1b28a6 100644
--- a/public/01-deep-learning-on-supercomputers.html
+++ b/public/01-deep-learning-on-supercomputers.html
@@ -591,9 +591,6 @@ gpus</li>
 <li class="fragment">Run it “serially” (single-gpu)</li>
 <li class="fragment">Then make it data parallel among multiple
 gpus in one node</li>
-<li class="fragment">Afterwards, make it data parallel among multiple
-nodes</li>
-<li class="fragment">Finally, make it in model parallel</li>
 </ul>
 </section>
 <section id="expected-imports" class="slide level2">
diff --git a/src/distrib.slurm b/src/distrib.slurm
index 517cd42622c1636c209ea98677f64ed2f0dda5c5..346cc0e7aadbdeb6d8cf016d9ec0fb4d48f94bab 100644
--- a/src/distrib.slurm
+++ b/src/distrib.slurm
@@ -1,6 +1,6 @@
 #!/bin/bash -x
 #SBATCH --account=training2306
-#SBATCH --nodes=1
+#SBATCH --nodes=2
 #SBATCH --job-name=ai-multi-gpu
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=48
@@ -18,26 +18,21 @@ MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 MASTER_ADDR="${MASTER_ADDR}i"
 # Get IP for hostname.
 export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
-<<<<<<< HEAD
+
 export MASTER_PORT=7010
 export GPUS_PER_NODE=4
 export NNODES=$SLURM_JOB_NUM_NODES
 
 # do not remove or the training will hang and nodes will be lost w/o this workaround
-#export CUDA_LAUNCH_BLOCKING=1
+export CUDA_LAUNCH_BLOCKING=1
 
 # hide duplicated errors using this hack - will be properly fixed in pt-1.12
-#export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
+export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
 
 # force crashing on nccl issues like hanging broadcast
-#export NCCL_ASYNC_ERROR_HANDLING=1
+export NCCL_ASYNC_ERROR_HANDLING=1
 # handle timeouts
 export NCCL_IB_TIMEOUT=20
-=======
-export MASTER_PORT=6000
-GPUS_PER_NODE=4
-NNODES=$SLURM_JOB_NUM_NODES
->>>>>>> 423694b75394e474f9d309661bdceafb44c402e0
 
 # Make sure we are on the right directory
 cd $HOME/2023-may-intro-to-supercompting-jsc/src
@@ -45,7 +40,6 @@ cd $HOME/2023-may-intro-to-supercompting-jsc/src
 # This loads modules and python packages
 source sc_venv_template/activate.sh
 
-
 export LOGLEVEL=INFO
 # Run the demo
 time srun bash -c 'accelerate launch \