Project: Alexandre Strube / 2023-nov-intro-to-supercompting-jsc

Commit 88132c8c ("fix git stuff")
Authored by Alexandre Strube on May 30, 2023
Parent: 2a23d51f
Branch containing commit: master
Pipeline #140724 passed on May 30, 2023 (stages: test, deploy)
Showing 3 changed files with 5 additions and 16 deletions:

  01-deep-learning-on-supercomputers.md            +0  -2
  public/01-deep-learning-on-supercomputers.html   +0  -3
  src/distrib.slurm                                +5  -11
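The commit message, "fix git stuff", matches what the src/distrib.slurm diff below shows: leftover merge-conflict markers (<<<<<<< HEAD / ======= / >>>>>>>) are deleted and the HEAD side of the conflict is kept. The exact commands the author ran are not recorded in this page; the following is only a minimal sketch of how that kind of cleanup is typically done.

# Locate any leftover conflict markers, then edit the file by hand,
# keeping the HEAD block and deleting the markers and the other branch's lines.
grep -nE '^(<<<<<<<|=======|>>>>>>>)' src/distrib.slurm
# ...edit src/distrib.slurm...
git add src/distrib.slurm
git commit -m "fix git stuff"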
01-deep-learning-on-supercomputers.md  (+0, -2)

@@ -378,8 +378,6 @@ date: May 31, 2023
 - Let's take a simple model
 - Run it "serially" (single-gpu)
 - Then make it data parallel among multiple gpus in one node
-- Afterwards, make it data parallel among multiple nodes
-- Finally, make it in model parallel
 ---
public/01-deep-learning-on-supercomputers.html  (+0, -3)

@@ -591,9 +591,6 @@ gpus</li>
 <li class="fragment">Run it “serially” (single-gpu)</li>
 <li class="fragment">Then make it data parallel among multiple gpus in
 one node</li>
-<li class="fragment">Afterwards, make it data parallel among multiple
-nodes</li>
-<li class="fragment">Finally, make it in model parallel</li>
 </ul>
 </section>
 <section id="expected-imports" class="slide level2">
src/distrib.slurm  (+5, -11)

 #!/bin/bash -x
 #SBATCH --account=training2306
-#SBATCH --nodes=1
+#SBATCH --nodes=2
 #SBATCH --job-name=ai-multi-gpu
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=48
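With --nodes bumped from 1 to 2 while --ntasks-per-node=1 stays, the job now gets one launcher task on each of two nodes. A hypothetical way to submit and sanity-check the allocation (these commands are not part of the commit); the diff continues below:

sbatch src/distrib.slurm                        # submit; Slurm prints the job id
squeue -u "$USER" -o "%.10i %.14j %.3D %.2t"    # the %D column should show 2 nodes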
@@ -18,26 +18,21 @@ MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 MASTER_ADDR="${MASTER_ADDR}i"
 # Get IP for hostname.
 export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
-<<<<<<< HEAD
 export MASTER_PORT=7010
 export GPUS_PER_NODE=4
 export NNODES=$SLURM_JOB_NUM_NODES
 # do not remove or the training will hang and nodes will be lost w/o this workaround
-# export CUDA_LAUNCH_BLOCKING=1
+export CUDA_LAUNCH_BLOCKING=1
 # hide duplicated errors using this hack - will be properly fixed in pt-1.12
-# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
+export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
 # force crashing on nccl issues like hanging broadcast
-# export NCCL_ASYNC_ERROR_HANDLING=1
+export NCCL_ASYNC_ERROR_HANDLING=1
 # handle timeouts
 export NCCL_IB_TIMEOUT=20
-=======
-export MASTER_PORT=6000
-GPUS_PER_NODE=4
-NNODES=$SLURM_JOB_NUM_NODES
->>>>>>> 423694b75394e474f9d309661bdceafb44c402e0
 # Make sure we are on the right directory
 cd $HOME/2023-may-intro-to-supercompting-jsc/src
@@ -45,7 +40,6 @@ cd $HOME/2023-may-intro-to-supercompting-jsc/src
 # This loads modules and python packages
 source sc_venv_template/activate.sh
 export LOGLEVEL=INFO
 # Run the demo
 time srun bash -c 'accelerate launch \
...
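For context, the rendezvous block that the conflict resolution keeps derives rank 0's address from the Slurm node list. A standalone sketch of the same logic, assuming a JSC-style cluster where appending "i" to a node's hostname names its InfiniBand-facing interface (that site convention is the reason for the ${MASTER_ADDR}i line):

#!/bin/bash
# Works only inside a Slurm allocation, where SLURM_JOB_NODELIST is set.
MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
MASTER_ADDR="${MASTER_ADDR}i"   # "<node>i": InfiniBand-facing hostname (assumed site convention)
export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
export MASTER_PORT=7010         # the port the resolved script keeps
echo "rendezvous at ${MASTER_ADDR}:${MASTER_PORT}"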
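The accelerate launch command itself is cut off in this view, so the script's actual flags are not recoverable here. Purely as a hypothetical sketch: with the environment exported above, a multi-node launch using the standard accelerate launch CLI could take this shape (train.py is a placeholder name):

# Hypothetical shape of the truncated command. Every flag below is a real
# `accelerate launch` option, but the original script's arguments are not shown above.
# Single quotes defer expansion to the per-node bash, so $SLURM_PROCID differs per task.
time srun bash -c 'accelerate launch \
    --multi_gpu \
    --num_machines="$NNODES" \
    --num_processes="$((NNODES * GPUS_PER_NODE))" \
    --machine_rank="$SLURM_PROCID" \
    --main_process_ip="$MASTER_ADDR" \
    --main_process_port="$MASTER_PORT" \
    train.py'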