diff --git a/01-deep-learning-on-supercomputers.md b/01-deep-learning-on-supercomputers.md
index fcd8ce2d49ab83016f0a050ee06b3b27a52eced2..f77a612f27ed9bc3220a44ff01c9e1177a2f9220 100644
--- a/01-deep-learning-on-supercomputers.md
+++ b/01-deep-learning-on-supercomputers.md
@@ -378,6 +378,7 @@ date: May 31, 2023
 - Let's take a simple model
 - Run it "serially" (single-gpu)
 - Then make it data parallel among multiple gpus in one node
+- Then we make it multi-node!
 
 ---
 
@@ -494,7 +495,6 @@ deepspeed
 ```
 - Run `./setup.sh`
 - `source activate.sh`
-- `accelerate config`
 - Done! You installed everything you need
 
 ---
 
@@ -592,29 +592,7 @@
 
 ## Submission script: data parallel
 
-```bash
-#!/bin/bash -x
-#SBATCH --nodes=1
-#SBATCH --cpus-per-task=48
-... rest of the sbatch stuff, removed to fit screen
-
-# srun doesnot inherit cpus-per-task from sbatch
-export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
-# so processes know who to talk to
-MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
-# Allow communication over InfiniBand cells.
-MASTER_ADDR="${MASTER_ADDR}i"
-# Get IP for hostname.
-export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
-export MASTER_PORT=6000
-export GPUS_PER_NODE=4
-export NNODES=$SLURM_JOB_NUM_NODES
-
-cd $HOME/2023-may-intro-to-supercompting-jsc/src
-source sc_venv_template/activate.sh
-
-time srun accelerate launch distrib.py
-```
+- Please check the course repository: [src/distrib.slurm](https://gitlab.jsc.fz-juelich.de/strube1/2023-may-intro-to-supercompting-jsc/-/blob/main/src/distrib.slurm)
 
 ---
 
diff --git a/src/distrib.py b/src/distrib.py
index 4d68aa0714471b910cb6aa1e17f5364fbba70663..4b24a9aedf41e30c2b32f3d19f0e68b5899b68ba 100644
--- a/src/distrib.py
+++ b/src/distrib.py
@@ -1,9 +1,8 @@
 from fastai.vision.all import *
 from fastai.distributed import *
 from fastai.vision.models.xresnet import *
-import os
-rank = os.environ.get('SLURM_PROCID')
-path = untar_data(URLs.IMAGEWOOF_320)
+
+path = rank0_first(untar_data, URLs.IMAGEWOOF_320)
 
 dls = DataBlock(
     blocks=(ImageBlock, CategoryBlock),
@@ -15,5 +14,4 @@ dls = DataBlock(
 learn = Learner(dls, xresnet50(n_out=10), metrics=[accuracy,top_k_accuracy]).to_fp16()
 
 with learn.distrib_ctx():
-    print("************************ Distributed test4 on rank ", rank)
     learn.fine_tune(6)
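
Note on the `src/distrib.py` change for reviewers: fastai's `rank0_first` runs its callable on rank 0 first and only afterwards on the remaining ranks, so the Imagewoof archive is downloaded once instead of racing across every SLURM process, and the manual `SLURM_PROCID` lookup becomes unnecessary. Below is a minimal sketch of the underlying synchronisation pattern, assuming `torch.distributed` is already initialised and the launcher sets a `RANK` environment variable; the helper name is illustrative and not part of the course code:

```python
import os

import torch.distributed as dist


def run_on_rank0_first(fetch):
    """Run `fetch` on rank 0 before all other ranks.

    Non-zero ranks wait at a barrier while rank 0 downloads;
    once released, they call `fetch` too and hit the local cache.
    """
    rank = int(os.environ.get("RANK", "0"))
    if rank != 0:
        dist.barrier()  # wait here until rank 0 has finished fetching
    result = fetch()    # rank 0 downloads; the others read from cache
    if rank == 0:
        dist.barrier()  # rank 0 arrives last, releasing the waiters
    return result
```

Usage would look like `path = run_on_rank0_first(lambda: untar_data(URLs.IMAGEWOOF_320))`; the `rank0_first(untar_data, URLs.IMAGEWOOF_320)` call in the diff bundles this pattern behind fastai's own API.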