From d60bc6eddf6813800d0c5f54672fb0304f24e755 Mon Sep 17 00:00:00 2001
From: Alexandre Strube <a.strube@fz-juelich.de>
Date: Mon, 29 May 2023 23:04:46 +0200
Subject: [PATCH] trying stuff

---
 src/distrib.py    |  5 +++++
 src/distrib.slurm | 14 ++++++++++++++
 src/serial.slurm  |  2 +-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/distrib.py b/src/distrib.py
index 322a146..2b1bf70 100644
--- a/src/distrib.py
+++ b/src/distrib.py
@@ -1,6 +1,11 @@
 from fastai.vision.all import *
 from fastai.distributed import *
 from fastai.vision.models.xresnet import *
+from accelerate import Accelerator
+
+# Print status information about the distributed environment
+accelerator = Accelerator()
+print(accelerator.state)
 
 path = rank0_first(untar_data, URLs.IMAGEWOOF_320)
 dls = DataBlock(
diff --git a/src/distrib.slurm b/src/distrib.slurm
index 1a86c33..bf7be2f 100644
--- a/src/distrib.slurm
+++ b/src/distrib.slurm
@@ -10,6 +10,20 @@
 #SBATCH --partition=develbooster
 #SBATCH --gres=gpu:4                                                                                                                                                                                                                           
 
+# srun does not inherit cpus-per-task from sbatch
+export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK}
+# Use the first host of the job's node list as master, so processes know who to talk to
+MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
+# Append "i" to the hostname to allow communication over InfiniBand cells.
+MASTER_ADDR="${MASTER_ADDR}i"
+# Get IP for hostname.
+MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
+MASTER_PORT=6000
+GPUS_PER_NODE=4
+NNODES=$SLURM_JOB_NUM_NODES  
+
+
+
 # Make sure we are on the right directory
 cd $HOME/2023-may-intro-to-supercompting-jsc/src
 
diff --git a/src/serial.slurm b/src/serial.slurm
index 7d38cac..10071ea 100644
--- a/src/serial.slurm
+++ b/src/serial.slurm
@@ -7,7 +7,7 @@
 #SBATCH --output=output.%j
 #SBATCH --error=err.%j
 #SBATCH --time=00:40:00
-#SBATCH --partition=booster
+#SBATCH --partition=develbooster
 #SBATCH --gres=gpu:1
 
 # Make sure we are on the right directory
-- 
GitLab