diff --git a/01-deep-learning-on-supercomputers.md b/01-deep-learning-on-supercomputers.md
index 8941bbbd07e40bb0c48641a5bfeba391f7d88576..4e6842bd83ab488be344773a40861f5a18961c06 100644
--- a/01-deep-learning-on-supercomputers.md
+++ b/01-deep-learning-on-supercomputers.md
@@ -476,7 +476,7 @@ dls = DataBlock(
 
 learn = Learner(dls, xresnet50(n_out=10), metrics=[accuracy,top_k_accuracy]).to_fp16()
 
-learn.fine_tune(48)
+learn.fine_tune(6)
 ```
 
 ---
@@ -496,7 +496,8 @@ deepspeed
 ```
 - Run `./setup.sh`
 - `source activate.sh`
-- Done! You installed everything you need (works with Jupyter too)
+- `accelerate config`
+- Done! You installed everything you need
 
 ---
 
@@ -504,15 +505,16 @@ deepspeed
 
 ```bash
 #!/bin/bash -x
-#SBATCH --account=training2315
+#SBATCH --account=training2306
 #SBATCH --nodes=1
 #SBATCH --job-name=ai-serial
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=1
-#SBATCH --output=output.%j
-#SBATCH --error=err.%j
+#SBATCH --output=out-serial.%j
+#SBATCH --error=err-serial.%j
 #SBATCH --time=00:40:00
-#SBATCH --partition=booster
+#SBATCH --partition=develbooster
+#SBATCH --gres=gpu:1
 
 # Make sure we are on the right directory
 cd $HOME/2023-may-intro-to-supercompting-jsc/src
@@ -532,8 +534,9 @@ time srun python serial.py
 cd [COURSE REPO DIRECTORY]/src
 sbatch serial.slurm
 ```
-- On Juwels Booster, each epoch takes ca. 45 seconds: Around 36 minutes
-- (On a cpu system this would take half a day)
+- On Juwels Booster, this should take about 5 minutes
+- On a CPU system, this would take half a day
+
 ---
 
 ## Going data parallel
@@ -545,7 +548,116 @@ sbatch serial.slurm
 ## Data parallel
 
 ```python
+from fastai.vision.all import *
+from fastai.distributed import *
+from fastai.vision.models.xresnet import *
+
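+# rank 0 downloads and untars the dataset first; the other ranks wait, then reuse it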
+path = rank0_first(untar_data, URLs.IMAGEWOOF_320)
+dls = DataBlock(
+    blocks=(ImageBlock, CategoryBlock),
+    splitter=GrandparentSplitter(valid_name='val'),
+    get_items=get_image_files, get_y=parent_label,
+    item_tfms=[RandomResizedCrop(160), FlipItem(0.5)],
+    batch_tfms=Normalize.from_stats(*imagenet_stats)
+).dataloaders(path, path=path, bs=64)
+
+learn = Learner(dls, xresnet50(n_out=10), metrics=[accuracy,top_k_accuracy]).to_fp16()
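+# distrib_ctx runs training under DistributedDataParallel (one process per GPU)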
+with learn.distrib_ctx():
+    learn.fine_tune(6)
+```
+
+---
+
+## Data Parallel
+
+What changed?
+
+- It was 
+- ```python
+path = untar_data(URLs.IMAGEWOOF_320)
+``` 
+- Became (see the sketch below)
+- ```python
+path = rank0_first(untar_data, URLs.IMAGEWOOF_320)
+```
+- It was
+- ```python
+learn.fine_tune(6)
+```
+- Became
+- ```python
+with learn.distrib_ctx():
+    learn.fine_tune(6)
+```
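+
+The point of the `rank0_first` change is that the dataset is downloaded and
+extracted only once. A rough sketch of the idea, assuming `torch.distributed`
+is already initialised (fastai's actual implementation differs):
+
+```python
+import torch.distributed as dist
+
+def rank0_first_sketch(func, *args, **kwargs):
+    # Rank 0 runs the function first, e.g. downloading and extracting the dataset
+    if dist.get_rank() == 0:
+        result = func(*args, **kwargs)
+    # All other ranks wait here until rank 0 is done
+    dist.barrier()
+    # The remaining ranks now run it too, hitting the already-populated cache
+    if dist.get_rank() != 0:
+        result = func(*args, **kwargs)
+    return result
+```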
+
+---
+
+## Submission script: data parallel
+
+```bash
+#!/bin/bash -x
+#SBATCH --nodes=1
+#SBATCH --cpus-per-task=48
+... rest of the sbatch stuff, removed to fit screen
+
+# srun does not inherit cpus-per-task from sbatch
+export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
+# so processes know who to talk to
+MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
+# Allow communication over InfiniBand cells.
+MASTER_ADDR="${MASTER_ADDR}i"
+# Get IP for hostname.
+export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
+export MASTER_PORT=6000
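+# Juwels Booster nodes have 4 GPUs each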
+export GPUS_PER_NODE=4
+export NNODES=$SLURM_JOB_NUM_NODES  
+
+cd $HOME/2023-may-intro-to-supercompting-jsc/src
+source sc_venv_template/activate.sh
+
+time srun accelerate launch distrib.py
+```
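+
+Submitting it works just like the serial case (assuming the script above is
+saved as `distrib.slurm` in the repo's `src` directory):
+
+```bash
+cd $HOME/2023-may-intro-to-supercompting-jsc/src
+sbatch distrib.slurm
+```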
+
+---
+
+## Let's check the outputs!
+
+- Single gpu:
+- ```bash
+epoch     train_loss  valid_loss  accuracy  top_k_accuracy  time    
+0         2.249933    2.152813    0.225757  0.750573        01:11                          
+epoch     train_loss  valid_loss  accuracy  top_k_accuracy  time    
+0         1.882008    1.895813    0.324510  0.832018        00:44                          
+1         1.837312    1.916380    0.374141  0.845253        00:44                          
+2         1.717144    1.739026    0.378722  0.869941        00:43                          
+3         1.594981    1.637526    0.417664  0.891575        00:44                          
+4         1.460454    1.410519    0.507254  0.920336        00:44                          
+5         1.389946    1.304924    0.538814  0.935862        00:43  
+real	5m44.972s
+```
+
+- Multi gpu:
+- ```bash
+epoch     train_loss  valid_loss  accuracy  top_k_accuracy  time    
+0         2.201540    2.799354    0.202950  0.662513        00:09                        
+epoch     train_loss  valid_loss  accuracy  top_k_accuracy  time    
+0         1.951004    2.059517    0.294761  0.781282        00:08                        
+1         1.929561    1.999069    0.309512  0.792981        00:08                        
+2         1.854629    1.962271    0.314344  0.840285        00:08                        
+3         1.754019    1.687136    0.404883  0.872330        00:08                        
+4         1.643759    1.499526    0.482706  0.906409        00:08                        
+5         1.554356    1.450976    0.502798  0.914547        00:08  
+real	1m19.979s
+```
+
+---
+
+## Some insights
 
+- The distributed run gave up a little accuracy and loss in exchange for a roughly 4x speedup 🏎️
+- Data parallelism is a simple and effective way to distribute a DL workload
+- This is really just a primer - there's much more to it
+- For example, I/O plays a HUGE role on supercomputers
 
 ---
 
diff --git a/public/01-deep-learning-on-supercomputers.html b/public/01-deep-learning-on-supercomputers.html
index 81a11c6dce354de49aca8df1ae55ac72a78c8092..cedece5020201504a0caa13293a34c1c8c2ffe53 100644
--- a/public/01-deep-learning-on-supercomputers.html
+++ b/public/01-deep-learning-on-supercomputers.html
@@ -676,7 +676,7 @@ class="sourceCode python"><code class="sourceCode python"><span id="cb4-1"><a hr
 <span id="cb4-13"><a href="#cb4-13" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb4-14"><a href="#cb4-14" aria-hidden="true" tabindex="-1"></a>learn <span class="op">=</span> Learner(dls, xresnet50(n_out<span class="op">=</span><span class="dv">10</span>), metrics<span class="op">=</span>[accuracy,top_k_accuracy]).to_fp16()</span>
 <span id="cb4-15"><a href="#cb4-15" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-16"><a href="#cb4-16" aria-hidden="true" tabindex="-1"></a>learn.fine_tune(<span class="dv">48</span>)</span></code></pre></div>
+<span id="cb4-16"><a href="#cb4-16" aria-hidden="true" tabindex="-1"></a>learn.fine_tune(<span class="dv">6</span>)</span></code></pre></div>
 </section>
 <section id="venv_template" class="slide level2">
 <h2>Venv_template</h2>
@@ -698,32 +698,33 @@ class="sourceCode python"><code class="sourceCode python"><span id="cb5-1"><a hr
 <span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a>deepspeed</span></code></pre></div></li>
 <li class="fragment">Run <code>./setup.sh</code></li>
 <li class="fragment"><code>source activate.sh</code></li>
-<li class="fragment">Done! You installed everything you need (works with
-Jupyter too)</li>
+<li class="fragment"><code>accelerate config</code></li>
+<li class="fragment">Done! You installed everything you need</li>
 </ul>
 </section>
 <section id="submission-script" class="slide level2">
 <h2>Submission Script</h2>
 <div class="sourceCode" id="cb6"><pre
 class="sourceCode bash"><code class="sourceCode bash"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="co">#!/bin/bash -x</span></span>
-<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --account=training2315</span></span>
+<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --account=training2306</span></span>
 <span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --nodes=1</span></span>
 <span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --job-name=ai-serial</span></span>
 <span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --ntasks-per-node=1</span></span>
 <span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --cpus-per-task=1</span></span>
-<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --output=output.%j</span></span>
-<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --error=err.%j</span></span>
+<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --output=out-serial.%j</span></span>
+<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --error=err-serial.%j</span></span>
 <span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --time=00:40:00</span></span>
-<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --partition=booster</span></span>
-<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Make sure we are on the right directory</span></span>
-<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> <span class="va">$HOME</span>/2023-may-intro-to-supercompting-jsc/src</span>
-<span id="cb6-14"><a href="#cb6-14" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a><span class="co"># This loads modules and python packages</span></span>
-<span id="cb6-16"><a href="#cb6-16" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> sc_venv_template/activate.sh</span>
-<span id="cb6-17"><a href="#cb6-17" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-18"><a href="#cb6-18" aria-hidden="true" tabindex="-1"></a><span class="co"># Run the demo</span></span>
-<span id="cb6-19"><a href="#cb6-19" aria-hidden="true" tabindex="-1"></a><span class="bu">time</span> srun python serial.py</span></code></pre></div>
+<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --partition=develbooster</span></span>
+<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --gres=gpu:1</span></span>
+<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Make sure we are on the right directory</span></span>
+<span id="cb6-14"><a href="#cb6-14" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> <span class="va">$HOME</span>/2023-may-intro-to-supercompting-jsc/src</span>
+<span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-16"><a href="#cb6-16" aria-hidden="true" tabindex="-1"></a><span class="co"># This loads modules and python packages</span></span>
+<span id="cb6-17"><a href="#cb6-17" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> sc_venv_template/activate.sh</span>
+<span id="cb6-18"><a href="#cb6-18" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-19"><a href="#cb6-19" aria-hidden="true" tabindex="-1"></a><span class="co"># Run the demo</span></span>
+<span id="cb6-20"><a href="#cb6-20" aria-hidden="true" tabindex="-1"></a><span class="bu">time</span> srun python serial.py</span></code></pre></div>
 </section>
 <section id="running-it" class="slide level2">
 <h2>Running it</h2>
@@ -731,11 +732,8 @@ class="sourceCode bash"><code class="sourceCode bash"><span id="cb6-1"><a href="
 <li class="fragment"><div class="sourceCode" id="cb7"><pre
 class="sourceCode bash"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> [COURSE REPO DIRECTORY]/src</span>
 <span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="ex">sbatch</span> serial.slurm</span></code></pre></div></li>
-<li class="fragment">On Juwels Booster, each epoch takes ca. 45 seconds:
-Around 36 minutes</li>
-<li class="fragment"><h2
-id="on-a-cpu-system-this-would-take-half-a-day">(On a cpu system this
-would take half a day)</h2></li>
+<li class="fragment">On Juwels Booster, should take about 5 minutes</li>
+<li class="fragment">On a cpu system this would take half a day</li>
 </ul>
 </section>
 <section id="going-data-parallel" class="slide level2">
@@ -747,7 +745,109 @@ differences</li>
 </section>
 <section id="data-parallel-4" class="slide level2">
 <h2>Data parallel</h2>
-<p>```python</p>
+<div class="sourceCode" id="cb8"><pre
+class="sourceCode python"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> fastai.vision.<span class="bu">all</span> <span class="im">import</span> <span class="op">*</span></span>
+<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> fastai.distributed <span class="im">import</span> <span class="op">*</span></span>
+<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> fastai.vision.models.xresnet <span class="im">import</span> <span class="op">*</span></span>
+<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a>path <span class="op">=</span> rank0_first(untar_data, URLs.IMAGEWOOF_320)</span>
+<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a>dls <span class="op">=</span> DataBlock(</span>
+<span id="cb8-7"><a href="#cb8-7" aria-hidden="true" tabindex="-1"></a>    blocks<span class="op">=</span>(ImageBlock, CategoryBlock),</span>
+<span id="cb8-8"><a href="#cb8-8" aria-hidden="true" tabindex="-1"></a>    splitter<span class="op">=</span>GrandparentSplitter(valid_name<span class="op">=</span><span class="st">&#39;val&#39;</span>),</span>
+<span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a>    get_items<span class="op">=</span>get_image_files, get_y<span class="op">=</span>parent_label,</span>
+<span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a>    item_tfms<span class="op">=</span>[RandomResizedCrop(<span class="dv">160</span>), FlipItem(<span class="fl">0.5</span>)],</span>
+<span id="cb8-11"><a href="#cb8-11" aria-hidden="true" tabindex="-1"></a>    batch_tfms<span class="op">=</span>Normalize.from_stats(<span class="op">*</span>imagenet_stats)</span>
+<span id="cb8-12"><a href="#cb8-12" aria-hidden="true" tabindex="-1"></a>).dataloaders(path, path<span class="op">=</span>path, bs<span class="op">=</span><span class="dv">64</span>)</span>
+<span id="cb8-13"><a href="#cb8-13" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb8-14"><a href="#cb8-14" aria-hidden="true" tabindex="-1"></a>learn <span class="op">=</span> Learner(dls, xresnet50(n_out<span class="op">=</span><span class="dv">10</span>), metrics<span class="op">=</span>[accuracy,top_k_accuracy]).to_fp16()</span>
+<span id="cb8-15"><a href="#cb8-15" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> learn.distrib_ctx():</span>
+<span id="cb8-16"><a href="#cb8-16" aria-hidden="true" tabindex="-1"></a>    learn.fine_tune(<span class="dv">6</span>)</span></code></pre></div>
+</section>
+<section id="data-parallel-5" class="slide level2">
+<h2>Data Parallel</h2>
+<p>What changed?</p>
+<ul>
+<li class="fragment">It was</li>
+<li class="fragment"><div class="sourceCode" id="cb9"><pre
+class="sourceCode python"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>path <span class="op">=</span> untar_data(URLs.IMAGEWOOF_320)</span></code></pre></div></li>
+<li class="fragment">Became</li>
+<li class="fragment"><div class="sourceCode" id="cb10"><pre
+class="sourceCode python"><code class="sourceCode python"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a>path <span class="op">=</span> rank0_first(untar_data, URLs.IMAGEWOOF_320)</span></code></pre></div></li>
+<li class="fragment">It was</li>
+<li class="fragment"><div class="sourceCode" id="cb11"><pre
+class="sourceCode python"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a>learn.fine_tune(<span class="dv">6</span>)</span></code></pre></div></li>
+<li class="fragment">Became</li>
+<li class="fragment"><div class="sourceCode" id="cb12"><pre
+class="sourceCode python"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> learn.distrib_ctx():</span>
+<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a>    learn.fine_tune(<span class="dv">6</span>)</span></code></pre></div></li>
+</ul>
+</section>
+<section id="submission-script-data-parallel" class="slide level2">
+<h2>Submission script: data parallel</h2>
+<div class="sourceCode" id="cb13"><pre
+class="sourceCode bash"><code class="sourceCode bash"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="co">#!/bin/bash -x</span></span>
+<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --nodes=1</span></span>
+<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --cpus-per-task=48</span></span>
+<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a><span class="ex">...</span> rest of the sbatch stuff, removed to fit screen</span>
+<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a><span class="co"># srun doesnot inherit cpus-per-task from sbatch</span></span>
+<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">SRUN_CPUS_PER_TASK</span><span class="op">=</span><span class="va">${SLURM_CPUS_PER_TASK}</span></span>
+<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a><span class="co"># so processes know who to talk to</span></span>
+<span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a><span class="va">MASTER_ADDR</span><span class="op">=</span><span class="st">&quot;</span><span class="va">$(</span><span class="ex">scontrol</span> show hostnames <span class="st">&quot;</span><span class="va">$SLURM_JOB_NODELIST</span><span class="st">&quot;</span> <span class="kw">|</span> <span class="fu">head</span> <span class="at">-n</span> 1<span class="va">)</span><span class="st">&quot;</span></span>
+<span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Allow communication over InfiniBand cells.</span></span>
+<span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a><span class="va">MASTER_ADDR</span><span class="op">=</span><span class="st">&quot;</span><span class="va">${MASTER_ADDR}</span><span class="st">i&quot;</span></span>
+<span id="cb13-12"><a href="#cb13-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Get IP for hostname.</span></span>
+<span id="cb13-13"><a href="#cb13-13" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">MASTER_ADDR</span><span class="op">=</span><span class="st">&quot;</span><span class="va">$(</span><span class="ex">nslookup</span> <span class="st">&quot;</span><span class="va">$MASTER_ADDR</span><span class="st">&quot;</span> <span class="kw">|</span> <span class="fu">grep</span> <span class="at">-oP</span> <span class="st">&#39;(?&lt;=Address: ).*&#39;</span><span class="va">)</span><span class="st">&quot;</span></span>
+<span id="cb13-14"><a href="#cb13-14" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">MASTER_PORT</span><span class="op">=</span>6000</span>
+<span id="cb13-15"><a href="#cb13-15" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">GPUS_PER_NODE</span><span class="op">=</span>4</span>
+<span id="cb13-16"><a href="#cb13-16" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">NNODES</span><span class="op">=</span><span class="va">$SLURM_JOB_NUM_NODES</span>  </span>
+<span id="cb13-17"><a href="#cb13-17" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb13-18"><a href="#cb13-18" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> <span class="va">$HOME</span>/2023-may-intro-to-supercompting-jsc/src</span>
+<span id="cb13-19"><a href="#cb13-19" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> sc_venv_template/activate.sh</span>
+<span id="cb13-20"><a href="#cb13-20" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb13-21"><a href="#cb13-21" aria-hidden="true" tabindex="-1"></a><span class="bu">time</span> srun accelerate launch distrib.py</span></code></pre></div>
+</section>
+<section id="lets-check-the-outputs" class="slide level2">
+<h2>Let’s check the outputs!</h2>
+<ul>
+<li class="fragment">Single gpu:</li>
+<li class="fragment"><div class="sourceCode" id="cb14"><pre
+class="sourceCode bash"><code class="sourceCode bash"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="ex">epoch</span>     train_loss  valid_loss  accuracy  top_k_accuracy  time    </span>
+<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="ex">0</span>         2.249933    2.152813    0.225757  0.750573        01:11                          </span>
+<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a><span class="ex">epoch</span>     train_loss  valid_loss  accuracy  top_k_accuracy  time    </span>
+<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a><span class="ex">0</span>         1.882008    1.895813    0.324510  0.832018        00:44                          </span>
+<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a><span class="ex">1</span>         1.837312    1.916380    0.374141  0.845253        00:44                          </span>
+<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a><span class="ex">2</span>         1.717144    1.739026    0.378722  0.869941        00:43                          </span>
+<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a><span class="ex">3</span>         1.594981    1.637526    0.417664  0.891575        00:44                          </span>
+<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a><span class="ex">4</span>         1.460454    1.410519    0.507254  0.920336        00:44                          </span>
+<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a><span class="ex">5</span>         1.389946    1.304924    0.538814  0.935862        00:43  </span>
+<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a><span class="ex">real</span>    5m44.972s</span></code></pre></div></li>
+<li class="fragment">Multi gpu:</li>
+<li class="fragment"><div class="sourceCode" id="cb15"><pre
+class="sourceCode bash"><code class="sourceCode bash"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="ex">epoch</span>     train_loss  valid_loss  accuracy  top_k_accuracy  time    </span>
+<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a><span class="ex">0</span>         2.201540    2.799354    0.202950  0.662513        00:09                        </span>
+<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="ex">epoch</span>     train_loss  valid_loss  accuracy  top_k_accuracy  time    </span>
+<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="ex">0</span>         1.951004    2.059517    0.294761  0.781282        00:08                        </span>
+<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a><span class="ex">1</span>         1.929561    1.999069    0.309512  0.792981        00:08                        </span>
+<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a><span class="ex">2</span>         1.854629    1.962271    0.314344  0.840285        00:08                        </span>
+<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a><span class="ex">3</span>         1.754019    1.687136    0.404883  0.872330        00:08                        </span>
+<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a><span class="ex">4</span>         1.643759    1.499526    0.482706  0.906409        00:08                        </span>
+<span id="cb15-9"><a href="#cb15-9" aria-hidden="true" tabindex="-1"></a><span class="ex">5</span>         1.554356    1.450976    0.502798  0.914547        00:08  </span>
+<span id="cb15-10"><a href="#cb15-10" aria-hidden="true" tabindex="-1"></a><span class="ex">real</span>    1m19.979s</span></code></pre></div></li>
+</ul>
+</section>
+<section id="some-insights" class="slide level2">
+<h2>Some insights</h2>
+<ul>
+<li class="fragment">Distributed run suffered a bit on the accuracy and
+loss in exchange for speed 🏎️</li>
+<li class="fragment">Data parallel is a simple and effective way to
+distribute DL workload</li>
+<li class="fragment">This is really just a primer - there’s much more to
+that</li>
+<li class="fragment">I/O plays a HUGE role on Supercomputers, for
+example</li>
+</ul>
 </section>
 <section id="thats-all-folks" class="slide level2">
 <h2>That’s all folks!</h2>
diff --git a/src/distrib.slurm b/src/distrib.slurm
index 24e2ca06334dca1290f256662899b7d1f5094496..62fcd891276ed90a799186865c6ff52c365703a6 100644
--- a/src/distrib.slurm
+++ b/src/distrib.slurm
@@ -17,7 +17,8 @@ MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 # Allow communication over InfiniBand cells.
 MASTER_ADDR="${MASTER_ADDR}i"
 # Get IP for hostname.
-MASTER_PORT=6000
+export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
+export MASTER_PORT=6000
 GPUS_PER_NODE=4
 NNODES=$SLURM_JOB_NUM_NODES