Skip to content
Snippets Groups Projects
Commit 88132c8c authored by Alexandre Strube's avatar Alexandre Strube
Browse files

fix git stuff

parent 2a23d51f
Branches master
No related tags found
No related merge requests found
Pipeline #140724 passed
...@@ -378,8 +378,6 @@ date: May 31, 2023 ...@@ -378,8 +378,6 @@ date: May 31, 2023
- Let's take a simple model - Let's take a simple model
- Run it "serially" (single-gpu) - Run it "serially" (single-gpu)
- Then make it data parallel among multiple gpus in one node - Then make it data parallel among multiple gpus in one node
- Afterwards, make it data parallel among multiple nodes
- Finally, make it in model parallel
--- ---
......
...@@ -591,9 +591,6 @@ gpus</li> ...@@ -591,9 +591,6 @@ gpus</li>
<li class="fragment">Run it “serially” (single-gpu)</li> <li class="fragment">Run it “serially” (single-gpu)</li>
<li class="fragment">Then make it data parallel among multiple gpus in <li class="fragment">Then make it data parallel among multiple gpus in
one node</li> one node</li>
<li class="fragment">Afterwards, make it data parallel among multiple
nodes</li>
<li class="fragment">Finally, make it in model parallel</li>
</ul> </ul>
</section> </section>
<section id="expected-imports" class="slide level2"> <section id="expected-imports" class="slide level2">
......
#!/bin/bash -x #!/bin/bash -x
#SBATCH --account=training2306 #SBATCH --account=training2306
#SBATCH --nodes=1 #SBATCH --nodes=2
#SBATCH --job-name=ai-multi-gpu #SBATCH --job-name=ai-multi-gpu
#SBATCH --ntasks-per-node=1 #SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48 #SBATCH --cpus-per-task=48
...@@ -18,26 +18,21 @@ MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)" ...@@ -18,26 +18,21 @@ MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
MASTER_ADDR="${MASTER_ADDR}i" MASTER_ADDR="${MASTER_ADDR}i"
# Get IP for hostname. # Get IP for hostname.
export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')" export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
<<<<<<< HEAD
export MASTER_PORT=7010 export MASTER_PORT=7010
export GPUS_PER_NODE=4 export GPUS_PER_NODE=4
export NNODES=$SLURM_JOB_NUM_NODES export NNODES=$SLURM_JOB_NUM_NODES
# do not remove or the training will hang and nodes will be lost w/o this workaround # do not remove or the training will hang and nodes will be lost w/o this workaround
#export CUDA_LAUNCH_BLOCKING=1 export CUDA_LAUNCH_BLOCKING=1
# hide duplicated errors using this hack - will be properly fixed in pt-1.12 # hide duplicated errors using this hack - will be properly fixed in pt-1.12
#export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
# force crashing on nccl issues like hanging broadcast # force crashing on nccl issues like hanging broadcast
#export NCCL_ASYNC_ERROR_HANDLING=1 export NCCL_ASYNC_ERROR_HANDLING=1
# handle timeouts # handle timeouts
export NCCL_IB_TIMEOUT=20 export NCCL_IB_TIMEOUT=20
=======
export MASTER_PORT=6000
GPUS_PER_NODE=4
NNODES=$SLURM_JOB_NUM_NODES
>>>>>>> 423694b75394e474f9d309661bdceafb44c402e0
# Make sure we are on the right directory # Make sure we are on the right directory
cd $HOME/2023-may-intro-to-supercompting-jsc/src cd $HOME/2023-may-intro-to-supercompting-jsc/src
...@@ -45,7 +40,6 @@ cd $HOME/2023-may-intro-to-supercompting-jsc/src ...@@ -45,7 +40,6 @@ cd $HOME/2023-may-intro-to-supercompting-jsc/src
# This loads modules and python packages # This loads modules and python packages
source sc_venv_template/activate.sh source sc_venv_template/activate.sh
export LOGLEVEL=INFO export LOGLEVEL=INFO
# Run the demo # Run the demo
time srun bash -c 'accelerate launch \ time srun bash -c 'accelerate launch \
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment