Commit 58fafa14 authored by Alexandre Strube

Cleanup the html and the python file

parent bf6fd520
Pipeline #140757 passed
@@ -378,6 +378,7 @@ date: May 31, 2023
- Let's take a simple model
- Run it "serially" (single GPU; see the serial sketch after this list)
- Then make it data parallel across multiple GPUs in one node
- Then we make it multi-node!
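As a starting point, a minimal serial baseline could look like the following. This is only a sketch: it assumes the IMAGEWOOF_320 layout with `train`/`val` folders and reuses the `xresnet50` model from the distributed version later in this commit; the `ImageDataLoaders` call is an assumption, not the course's exact data pipeline.

```python
# Minimal single-GPU baseline (sketch); dataset and model mirror the
# distributed script further down, but the DataLoaders call is an assumption.
from fastai.vision.all import *
from fastai.vision.models.xresnet import *

path = untar_data(URLs.IMAGEWOOF_320)
dls = ImageDataLoaders.from_folder(path, valid='val', item_tfms=Resize(224))
learn = Learner(dls, xresnet50(n_out=10),
                metrics=[accuracy, top_k_accuracy]).to_fp16()
learn.fine_tune(6)
```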
---
@@ -494,7 +495,6 @@ deepspeed
```
- Run `./setup.sh`
- `source activate.sh`
- `accelerate config`
- Done! You installed everything you need (a quick sanity check is sketched below)
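To confirm the environment is usable before submitting jobs, something like this works (a sketch; it assumes the template venv from `setup.sh` provides `torch` and `accelerate`):

```bash
source activate.sh
python -c "import torch, accelerate; print(torch.__version__, accelerate.__version__)"
accelerate env   # shows the launcher configuration accelerate will use
```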
---
@@ -592,29 +592,7 @@ with learn.distrib_ctx():
## Submission script: data parallel
```bash
#!/bin/bash -x
#SBATCH --nodes=1
#SBATCH --cpus-per-task=48
... rest of the sbatch stuff, removed to fit screen
# srun does not inherit cpus-per-task from sbatch
export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
# so processes know who to talk to
MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
# Allow communication over InfiniBand cells.
MASTER_ADDR="${MASTER_ADDR}i"
# Get IP for hostname.
export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
export MASTER_PORT=6000
export GPUS_PER_NODE=4
export NNODES=$SLURM_JOB_NUM_NODES
cd $HOME/2023-may-intro-to-supercompting-jsc/src
source sc_venv_template/activate.sh
time srun accelerate launch distrib.py
```
- Please check the course repository: [src/distrib.slurm](https://gitlab.jsc.fz-juelich.de/strube1/2023-may-intro-to-supercompting-jsc/-/blob/main/src/distrib.slurm)
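If you would rather not depend on a pre-generated `accelerate config`, the exported variables can also be passed to the launcher explicitly. This is only a sketch of one possible invocation using standard `accelerate launch` flags, not the script shipped in the repository:

```bash
# One srun task per node; the single quotes make each task expand
# $SLURM_NODEID itself, so every node reports its own machine rank.
srun bash -c 'accelerate launch \
    --multi_gpu \
    --num_machines "$NNODES" \
    --num_processes "$((NNODES * GPUS_PER_NODE))" \
    --machine_rank "$SLURM_NODEID" \
    --main_process_ip "$MASTER_ADDR" \
    --main_process_port "$MASTER_PORT" \
    distrib.py'
```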
---
from fastai.vision.all import *
from fastai.distributed import *
from fastai.vision.models.xresnet import *
import os
rank = os.environ.get('SLURM_PROCID')
path = untar_data(URLs.IMAGEWOOF_320)
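# rank0_first() runs the download on rank 0 first while the other ranks wait,
# so every process reuses the same cached copy of the dataset.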
path = rank0_first(untar_data, URLs.IMAGEWOOF_320)
dls = DataBlock(
blocks=(ImageBlock, CategoryBlock),
@@ -15,5 +14,4 @@ dls = DataBlock(
learn = Learner(dls, xresnet50(n_out=10), metrics=[accuracy,top_k_accuracy]).to_fp16()
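# distrib_ctx() wraps the Learner's model in DistributedDataParallel for the
# duration of the with-block, one process per GPU.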
with learn.distrib_ctx():
print("************************ Distributed test4 on rank ", rank)
learn.fine_tune(6)