diff --git a/01-deep-learning-on-supercomputers.md b/01-deep-learning-on-supercomputers.md
index 8941bbbd07e40bb0c48641a5bfeba391f7d88576..4e6842bd83ab488be344773a40861f5a18961c06 100644
--- a/01-deep-learning-on-supercomputers.md
+++ b/01-deep-learning-on-supercomputers.md
@@ -476,7 +476,7 @@ dls = DataBlock(
 
 learn = Learner(dls, xresnet50(n_out=10), metrics=[accuracy,top_k_accuracy]).to_fp16()
 
-learn.fine_tune(48)
+learn.fine_tune(6)
 ```
 
 ---
@@ -496,7 +496,8 @@ deepspeed
 ```
 - Run `./setup.sh`
 - `source activate.sh`
-- Done! You installed everything you need (works with Jupyter too)
+- `accelerate config`
+- Done! You installed everything you need
 
 ---
@@ -504,15 +505,16 @@ deepspeed
 
 ```bash
 #!/bin/bash -x
-#SBATCH --account=training2315
+#SBATCH --account=training2306
 #SBATCH --nodes=1
 #SBATCH --job-name=ai-serial
 #SBATCH --ntasks-per-node=1
 #SBATCH --cpus-per-task=1
-#SBATCH --output=output.%j
-#SBATCH --error=err.%j
+#SBATCH --output=out-serial.%j
+#SBATCH --error=err-serial.%j
 #SBATCH --time=00:40:00
-#SBATCH --partition=booster
+#SBATCH --partition=develbooster
+#SBATCH --gres=gpu:1
 
 # Make sure we are on the right directory
 cd $HOME/2023-may-intro-to-supercompting-jsc/src
 
 # This loads modules and python packages
 source sc_venv_template/activate.sh
 
 # Run the demo
 time srun python serial.py
@@ -532,8 +534,9 @@ time srun python serial.py
 cd [COURSE REPO DIRECTORY]/src
 sbatch serial.slurm
 ```
-- On Juwels Booster, each epoch takes ca. 45 seconds: Around 36 minutes
-- (On a cpu system this would take half a day)
+- On Juwels Booster, should take about 5 minutes
+- On a cpu system this would take half a day
+
 ---
 
 ## Going data parallel
@@ -545,7 +548,116 @@ sbatch serial.slurm
 ## Data parallel
 
 ```python
+from fastai.vision.all import *
+from fastai.distributed import *
+from fastai.vision.models.xresnet import *
+
+path = rank0_first(untar_data, URLs.IMAGEWOOF_320)
+dls = DataBlock(
+    blocks=(ImageBlock, CategoryBlock),
+    splitter=GrandparentSplitter(valid_name='val'),
+    get_items=get_image_files, get_y=parent_label,
+    item_tfms=[RandomResizedCrop(160), FlipItem(0.5)],
+    batch_tfms=Normalize.from_stats(*imagenet_stats)
+).dataloaders(path, path=path, bs=64)
+
+learn = Learner(dls, xresnet50(n_out=10), metrics=[accuracy,top_k_accuracy]).to_fp16()
+with learn.distrib_ctx():
+    learn.fine_tune(6)
+```
+
+---
+
+## Data Parallel
+
+What changed?
+
+- It was
+- ```python
+path = untar_data(URLs.IMAGEWOOF_320)
+```
+- Became
+- ```python
+path = rank0_first(untar_data, URLs.IMAGEWOOF_320)
+```
+- It was
+- ```python
+learn.fine_tune(6)
+```
+- Became
+- ```python
+with learn.distrib_ctx():
+    learn.fine_tune(6)
+```
+
+---
+
+## Submission script: data parallel
+
+```bash
+#!/bin/bash -x
+#SBATCH --nodes=1
+#SBATCH --cpus-per-task=48
+... rest of the sbatch stuff, removed to fit screen
+
+# srun does not inherit cpus-per-task from sbatch
+export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
+# so processes know who to talk to
+MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
+# Allow communication over InfiniBand cells.
+MASTER_ADDR="${MASTER_ADDR}i"
+# Get IP for hostname.
+export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
+export MASTER_PORT=6000
+export GPUS_PER_NODE=4
+export NNODES=$SLURM_JOB_NUM_NODES
+
+cd $HOME/2023-may-intro-to-supercompting-jsc/src
+source sc_venv_template/activate.sh
+
+time srun accelerate launch distrib.py
+```
+
+---
+
+## Let's check the outputs!
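+
+- The logs land in the files named by `--output` and `--error` in the submission scripts. A quick sketch of how to peek at them, reusing the serial job's file names with `4242` as a made-up job ID:
+- ```bash
+squeue -u $USER           # find the job ID of a queued or running job
+tail -f out-serial.4242   # follow training progress (out-serial.%j, %j = job ID)
+tail err-serial.4242      # bash's `time` writes the `real` timing to stderr
+```
+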
+- Single GPU:
+- ```bash
+epoch train_loss valid_loss accuracy top_k_accuracy time
+0 2.249933 2.152813 0.225757 0.750573 01:11
+epoch train_loss valid_loss accuracy top_k_accuracy time
+0 1.882008 1.895813 0.324510 0.832018 00:44
+1 1.837312 1.916380 0.374141 0.845253 00:44
+2 1.717144 1.739026 0.378722 0.869941 00:43
+3 1.594981 1.637526 0.417664 0.891575 00:44
+4 1.460454 1.410519 0.507254 0.920336 00:44
+5 1.389946 1.304924 0.538814 0.935862 00:43
+real 5m44.972s
+```
+
+- Multi GPU:
+- ```bash
+epoch train_loss valid_loss accuracy top_k_accuracy time
+0 2.201540 2.799354 0.202950 0.662513 00:09
+epoch train_loss valid_loss accuracy top_k_accuracy time
+0 1.951004 2.059517 0.294761 0.781282 00:08
+1 1.929561 1.999069 0.309512 0.792981 00:08
+2 1.854629 1.962271 0.314344 0.840285 00:08
+3 1.754019 1.687136 0.404883 0.872330 00:08
+4 1.643759 1.499526 0.482706 0.906409 00:08
+5 1.554356 1.450976 0.502798 0.914547 00:08
+real 1m19.979s
+```
+
+---
+
+## Some insights
+- The distributed run suffered a bit in accuracy and loss in exchange for speed 🏎️
+- Data parallel is a simple and effective way to distribute a DL workload
+- This is really just a primer - there's much more to it
+- For example, I/O plays a HUGE role on supercomputers
 
 ---
diff --git a/public/01-deep-learning-on-supercomputers.html b/public/01-deep-learning-on-supercomputers.html
index 81a11c6dce354de49aca8df1ae55ac72a78c8092..cedece5020201504a0caa13293a34c1c8c2ffe53 100644
--- a/public/01-deep-learning-on-supercomputers.html
+++ b/public/01-deep-learning-on-supercomputers.html
@@ -676,7 +676,7 @@ class="sourceCode python"><code class="sourceCode python"><span id="cb4-1"><a hr
 <span id="cb4-13"><a href="#cb4-13" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb4-14"><a href="#cb4-14" aria-hidden="true" tabindex="-1"></a>learn <span class="op">=</span> Learner(dls, xresnet50(n_out<span class="op">=</span><span class="dv">10</span>), metrics<span class="op">=</span>[accuracy,top_k_accuracy]).to_fp16()</span>
 <span id="cb4-15"><a href="#cb4-15" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-16"><a href="#cb4-16" aria-hidden="true" tabindex="-1"></a>learn.fine_tune(<span class="dv">48</span>)</span></code></pre></div>
+<span id="cb4-16"><a href="#cb4-16" aria-hidden="true" tabindex="-1"></a>learn.fine_tune(<span class="dv">6</span>)</span></code></pre></div>
 </section>
 <section id="venv_template" class="slide level2">
 <h2>Venv_template</h2>
@@ -698,32 +698,33 @@ class="sourceCode python"><code class="sourceCode python"><span id="cb5-1"><a hr
 <span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a>deepspeed</span></code></pre></div></li>
 <li class="fragment">Run <code>./setup.sh</code></li>
 <li class="fragment"><code>source activate.sh</code></li>
-<li class="fragment">Done! You installed everything you need (works with
-Jupyter too)</li>
+<li class="fragment"><code>accelerate config</code></li>
+<li class="fragment">Done! 
You installed everything you need</li> </ul> </section> <section id="submission-script" class="slide level2"> <h2>Submission Script</h2> <div class="sourceCode" id="cb6"><pre class="sourceCode bash"><code class="sourceCode bash"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="co">#!/bin/bash -x</span></span> -<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --account=training2315</span></span> +<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --account=training2306</span></span> <span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --nodes=1</span></span> <span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --job-name=ai-serial</span></span> <span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --ntasks-per-node=1</span></span> <span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --cpus-per-task=1</span></span> -<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --output=output.%j</span></span> -<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --error=err.%j</span></span> +<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --output=out-serial.%j</span></span> +<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --error=err-serial.%j</span></span> <span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --time=00:40:00</span></span> -<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --partition=booster</span></span> -<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Make sure we are on the right directory</span></span> -<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> <span class="va">$HOME</span>/2023-may-intro-to-supercompting-jsc/src</span> -<span id="cb6-14"><a href="#cb6-14" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a><span class="co"># This loads modules and python packages</span></span> -<span id="cb6-16"><a href="#cb6-16" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> sc_venv_template/activate.sh</span> -<span id="cb6-17"><a href="#cb6-17" aria-hidden="true" tabindex="-1"></a></span> -<span id="cb6-18"><a href="#cb6-18" aria-hidden="true" tabindex="-1"></a><span class="co"># Run the demo</span></span> -<span id="cb6-19"><a href="#cb6-19" aria-hidden="true" tabindex="-1"></a><span class="bu">time</span> srun python serial.py</span></code></pre></div> +<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --partition=develbooster</span></span> +<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --gres=gpu:1</span></span> +<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Make sure we are on the right directory</span></span> +<span id="cb6-14"><a 
href="#cb6-14" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> <span class="va">$HOME</span>/2023-may-intro-to-supercompting-jsc/src</span> +<span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb6-16"><a href="#cb6-16" aria-hidden="true" tabindex="-1"></a><span class="co"># This loads modules and python packages</span></span> +<span id="cb6-17"><a href="#cb6-17" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> sc_venv_template/activate.sh</span> +<span id="cb6-18"><a href="#cb6-18" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb6-19"><a href="#cb6-19" aria-hidden="true" tabindex="-1"></a><span class="co"># Run the demo</span></span> +<span id="cb6-20"><a href="#cb6-20" aria-hidden="true" tabindex="-1"></a><span class="bu">time</span> srun python serial.py</span></code></pre></div> </section> <section id="running-it" class="slide level2"> <h2>Running it</h2> @@ -731,11 +732,8 @@ class="sourceCode bash"><code class="sourceCode bash"><span id="cb6-1"><a href=" <li class="fragment"><div class="sourceCode" id="cb7"><pre class="sourceCode bash"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> [COURSE REPO DIRECTORY]/src</span> <span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="ex">sbatch</span> serial.slurm</span></code></pre></div></li> -<li class="fragment">On Juwels Booster, each epoch takes ca. 45 seconds: -Around 36 minutes</li> -<li class="fragment"><h2 -id="on-a-cpu-system-this-would-take-half-a-day">(On a cpu system this -would take half a day)</h2></li> +<li class="fragment">On Juwels Booster, should take about 5 minutes</li> +<li class="fragment">On a cpu system this would take half a day</li> </ul> </section> <section id="going-data-parallel" class="slide level2"> @@ -747,7 +745,109 @@ differences</li> </section> <section id="data-parallel-4" class="slide level2"> <h2>Data parallel</h2> -<p>```python</p> +<div class="sourceCode" id="cb8"><pre +class="sourceCode python"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> fastai.vision.<span class="bu">all</span> <span class="im">import</span> <span class="op">*</span></span> +<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> fastai.distributed <span class="im">import</span> <span class="op">*</span></span> +<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> fastai.vision.models.xresnet <span class="im">import</span> <span class="op">*</span></span> +<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a>path <span class="op">=</span> rank0_first(untar_data, URLs.IMAGEWOOF_320)</span> +<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a>dls <span class="op">=</span> DataBlock(</span> +<span id="cb8-7"><a href="#cb8-7" aria-hidden="true" tabindex="-1"></a> blocks<span class="op">=</span>(ImageBlock, CategoryBlock),</span> +<span id="cb8-8"><a href="#cb8-8" aria-hidden="true" tabindex="-1"></a> splitter<span class="op">=</span>GrandparentSplitter(valid_name<span class="op">=</span><span class="st">'val'</span>),</span> +<span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a> get_items<span class="op">=</span>get_image_files, get_y<span 
class="op">=</span>parent_label,</span> +<span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a> item_tfms<span class="op">=</span>[RandomResizedCrop(<span class="dv">160</span>), FlipItem(<span class="fl">0.5</span>)],</span> +<span id="cb8-11"><a href="#cb8-11" aria-hidden="true" tabindex="-1"></a> batch_tfms<span class="op">=</span>Normalize.from_stats(<span class="op">*</span>imagenet_stats)</span> +<span id="cb8-12"><a href="#cb8-12" aria-hidden="true" tabindex="-1"></a>).dataloaders(path, path<span class="op">=</span>path, bs<span class="op">=</span><span class="dv">64</span>)</span> +<span id="cb8-13"><a href="#cb8-13" aria-hidden="true" tabindex="-1"></a></span> +<span id="cb8-14"><a href="#cb8-14" aria-hidden="true" tabindex="-1"></a>learn <span class="op">=</span> Learner(dls, xresnet50(n_out<span class="op">=</span><span class="dv">10</span>), metrics<span class="op">=</span>[accuracy,top_k_accuracy]).to_fp16()</span> +<span id="cb8-15"><a href="#cb8-15" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> learn.distrib_ctx():</span> +<span id="cb8-16"><a href="#cb8-16" aria-hidden="true" tabindex="-1"></a> learn.fine_tune(<span class="dv">6</span>)</span></code></pre></div> +</section> +<section id="data-parallel-5" class="slide level2"> +<h2>Data Parallel</h2> +<p>What changed?</p> +<ul> +<li class="fragment">It was</li> +<li class="fragment"><div class="sourceCode" id="cb9"><pre +class="sourceCode python"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>path <span class="op">=</span> untar_data(URLs.IMAGEWOOF_320)</span></code></pre></div></li> +<li class="fragment">Became</li> +<li class="fragment"><div class="sourceCode" id="cb10"><pre +class="sourceCode python"><code class="sourceCode python"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a>path <span class="op">=</span> rank0_first(untar_data, URLs.IMAGEWOOF_320)</span></code></pre></div></li> +<li class="fragment">It was</li> +<li class="fragment"><div class="sourceCode" id="cb11"><pre +class="sourceCode python"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a>learn.fine_tune(<span class="dv">6</span>)</span></code></pre></div></li> +<li class="fragment">Became</li> +<li class="fragment"><div class="sourceCode" id="cb12"><pre +class="sourceCode python"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> learn.distrib_ctx():</span> +<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a> learn.fine_tune(<span class="dv">6</span>)</span></code></pre></div></li> +</ul> +</section> +<section id="submission-script-data-parallel" class="slide level2"> +<h2>Submission script: data parallel</h2> +<div class="sourceCode" id="cb13"><pre +class="sourceCode bash"><code class="sourceCode bash"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="co">#!/bin/bash -x</span></span> +<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --nodes=1</span></span> +<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a><span class="co">#SBATCH --cpus-per-task=48</span></span> +<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a><span class="ex">...</span> rest of the sbatch stuff, removed to fit screen</span> +<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" 
tabindex="-1"></a></span>
+<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a><span class="co"># srun does not inherit cpus-per-task from sbatch</span></span>
+<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">SRUN_CPUS_PER_TASK</span><span class="op">=</span><span class="va">${SLURM_CPUS_PER_TASK}</span></span>
+<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a><span class="co"># so processes know who to talk to</span></span>
+<span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a><span class="va">MASTER_ADDR</span><span class="op">=</span><span class="st">"</span><span class="va">$(</span><span class="ex">scontrol</span> show hostnames <span class="st">"</span><span class="va">$SLURM_JOB_NODELIST</span><span class="st">"</span> <span class="kw">|</span> <span class="fu">head</span> <span class="at">-n</span> 1<span class="va">)</span><span class="st">"</span></span>
+<span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Allow communication over InfiniBand cells.</span></span>
+<span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a><span class="va">MASTER_ADDR</span><span class="op">=</span><span class="st">"</span><span class="va">${MASTER_ADDR}</span><span class="st">i"</span></span>
+<span id="cb13-12"><a href="#cb13-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Get IP for hostname.</span></span>
+<span id="cb13-13"><a href="#cb13-13" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">MASTER_ADDR</span><span class="op">=</span><span class="st">"</span><span class="va">$(</span><span class="ex">nslookup</span> <span class="st">"</span><span class="va">$MASTER_ADDR</span><span class="st">"</span> <span class="kw">|</span> <span class="fu">grep</span> <span class="at">-oP</span> <span class="st">'(?<=Address: ).*'</span><span class="va">)</span><span class="st">"</span></span>
+<span id="cb13-14"><a href="#cb13-14" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">MASTER_PORT</span><span class="op">=</span>6000</span>
+<span id="cb13-15"><a href="#cb13-15" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">GPUS_PER_NODE</span><span class="op">=</span>4</span>
+<span id="cb13-16"><a href="#cb13-16" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">NNODES</span><span class="op">=</span><span class="va">$SLURM_JOB_NUM_NODES</span> </span>
+<span id="cb13-17"><a href="#cb13-17" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb13-18"><a href="#cb13-18" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> <span class="va">$HOME</span>/2023-may-intro-to-supercompting-jsc/src</span>
+<span id="cb13-19"><a href="#cb13-19" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> sc_venv_template/activate.sh</span>
+<span id="cb13-20"><a href="#cb13-20" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb13-21"><a href="#cb13-21" aria-hidden="true" tabindex="-1"></a><span class="bu">time</span> srun accelerate launch distrib.py</span></code></pre></div>
+</section>
+<section id="lets-check-the-outputs" class="slide level2">
+<h2>Let’s check the outputs!</h2>
+<ul>
+<li class="fragment">Single GPU:</li>
+<li class="fragment"><div class="sourceCode" id="cb14"><pre
+class="sourceCode bash"><code class="sourceCode bash"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="ex">epoch</span> train_loss valid_loss accuracy top_k_accuracy time </span>
+<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="ex">0</span> 2.249933 2.152813 0.225757 0.750573 01:11 </span>
+<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a><span class="ex">epoch</span> train_loss valid_loss accuracy top_k_accuracy time </span>
+<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a><span class="ex">0</span> 1.882008 1.895813 0.324510 0.832018 00:44 </span>
+<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a><span class="ex">1</span> 1.837312 1.916380 0.374141 0.845253 00:44 </span>
+<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a><span class="ex">2</span> 1.717144 1.739026 0.378722 0.869941 00:43 </span>
+<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a><span class="ex">3</span> 1.594981 1.637526 0.417664 0.891575 00:44 </span>
+<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a><span class="ex">4</span> 1.460454 1.410519 0.507254 0.920336 00:44 </span>
+<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a><span class="ex">5</span> 1.389946 1.304924 0.538814 0.935862 00:43 </span>
+<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a><span class="ex">real</span> 5m44.972s</span></code></pre></div></li>
+<li class="fragment">Multi GPU:</li>
+<li class="fragment"><div class="sourceCode" id="cb15"><pre
+class="sourceCode bash"><code class="sourceCode bash"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="ex">epoch</span> train_loss valid_loss accuracy top_k_accuracy time </span>
+<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a><span class="ex">0</span> 2.201540 2.799354 0.202950 0.662513 00:09 </span>
+<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="ex">epoch</span> train_loss valid_loss accuracy top_k_accuracy time </span>
+<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="ex">0</span> 1.951004 2.059517 0.294761 0.781282 00:08 </span>
+<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a><span class="ex">1</span> 1.929561 1.999069 0.309512 0.792981 00:08 </span>
+<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a><span class="ex">2</span> 1.854629 1.962271 0.314344 0.840285 00:08 </span>
+<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a><span class="ex">3</span> 1.754019 1.687136 0.404883 0.872330 00:08 </span>
+<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a><span class="ex">4</span> 1.643759 1.499526 0.482706 0.906409 00:08 </span>
+<span id="cb15-9"><a href="#cb15-9" aria-hidden="true" tabindex="-1"></a><span class="ex">5</span> 1.554356 1.450976 0.502798 0.914547 00:08 </span>
+<span id="cb15-10"><a href="#cb15-10" aria-hidden="true" tabindex="-1"></a><span class="ex">real</span> 1m19.979s</span></code></pre></div></li>
+</ul>
+</section>
+<section id="some-insights" class="slide level2">
+<h2>Some insights</h2>
+<ul>
+<li class="fragment">The distributed run suffered a bit in accuracy and
+loss in exchange for speed 🏎️</li>
+<li class="fragment">Data parallel is a simple and effective way to
+distribute a DL workload</li>
+<li class="fragment">This is really just a primer - there’s much more to
+it</li>
+<li class="fragment">For example, I/O plays a HUGE role on
+supercomputers</li>
+</ul>
 </section>
 <section id="thats-all-folks" class="slide level2">
 <h2>That’s all folks!</h2>
diff --git a/src/distrib.slurm b/src/distrib.slurm
index 24e2ca06334dca1290f256662899b7d1f5094496..62fcd891276ed90a799186865c6ff52c365703a6 100644
--- a/src/distrib.slurm
+++ b/src/distrib.slurm
@@ -17,7 +17,8 @@ MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 # Allow communication over InfiniBand cells.
 MASTER_ADDR="${MASTER_ADDR}i"
 # Get IP for hostname.
-MASTER_PORT=6000
+export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
+export MASTER_PORT=6000
 GPUS_PER_NODE=4
 NNODES=$SLURM_JOB_NUM_NODES
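
The exported `MASTER_ADDR`/`MASTER_PORT` are the rendezvous coordinates that `torch.distributed` reads from the environment when the processes start up. A minimal sketch of the equivalent explicit launch, assuming Hugging Face Accelerate's standard multi-GPU flags (`--multi_gpu`, `--num_processes`, `--main_process_ip`, `--main_process_port`) - the values below are illustrative, single-node, and not taken from the repo:

```bash
# Illustrative single-node sketch: pass the rendezvous point explicitly
# instead of letting the launcher pick it up from MASTER_ADDR/MASTER_PORT.
accelerate launch \
    --multi_gpu \
    --num_processes="$GPUS_PER_NODE" \
    --main_process_ip="$MASTER_ADDR" \
    --main_process_port="$MASTER_PORT" \
    distrib.py
```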