From 4d8f63ba6008ffc27559ea632981ea2a6b10c195 Mon Sep 17 00:00:00 2001
From: Alexandre Strube <a.strube@fz-juelich.de>
Date: Wed, 31 May 2023 14:05:30 +0200
Subject: [PATCH] distrib.slurm: pass accelerate options as CLI flags instead of a generated config file

---
 src/distrib.slurm | 35 +++++++++--------------------------
 1 file changed, 9 insertions(+), 26 deletions(-)

diff --git a/src/distrib.slurm b/src/distrib.slurm
index 562d48d..f9471bf 100644
--- a/src/distrib.slurm
+++ b/src/distrib.slurm
@@ -19,9 +19,7 @@ MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 MASTER_ADDR="${MASTER_ADDR}i"
 # Get IP for hostname.
 export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')"
-
 export MASTER_PORT=7010
-
 export GPUS_PER_NODE=4
 
 # Make sure we are on the right directory
@@ -30,29 +28,14 @@ cd $HOME/2023-may-intro-to-supercompting-jsc/src
 # This loads modules and python packages
 source sc_venv_template/activate.sh
 
-# Set up accelerate config.
-export ACCELERATE_CONFIG_YAML=accelerate_config_"$SLURM_JOB_ID".yaml
-srun bash -c "((\$SLURM_PROCID)) || cat <<EOT > \"\$ACCELERATE_CONFIG_YAML\"
-compute_environment: LOCAL_MACHINE
-distributed_type: MULTI_GPU
-downcast_bf16: 'no'
-gpu_ids: all
-machine_rank: \$SLURM_NODEID
-main_process_ip: '\$MASTER_ADDR'
-main_process_port: \$MASTER_PORT
-main_training_function: main
-mixed_precision: 'no'
-num_machines: \$SLURM_JOB_NUM_NODES
-num_processes: \$((SLURM_JOB_NUM_NODES * GPUS_PER_NODE))
-rdzv_backend: c10d
-same_network: false
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
-EOT"
-
-# Run the demo
 time srun bash -c 'accelerate launch \
-    --config_file=$ACCELERATE_CONFIG_YAML \
+    --main_process_ip $MASTER_ADDR \
+    --main_process_port $MASTER_PORT \
+    --multi_gpu \
+    --mixed_precision no \
+    --num_processes=$(($SLURM_JOB_NUM_NODES * $GPUS_PER_NODE)) \
+    --dynamo_backend=no \
+    --num_machines=$SLURM_JOB_NUM_NODES \
+    --machine_rank=$SLURM_NODEID \
+    --rdzv_backend c10d \
     distrib.py'
-- 
GitLab