From 4d8f63ba6008ffc27559ea632981ea2a6b10c195 Mon Sep 17 00:00:00 2001 From: Alexandre Strube <a.strube@fz-juelich.de> Date: Wed, 31 May 2023 14:05:30 +0200 Subject: [PATCH] distrib finally works --- src/distrib.slurm | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/src/distrib.slurm b/src/distrib.slurm index 562d48d..f9471bf 100644 --- a/src/distrib.slurm +++ b/src/distrib.slurm @@ -19,9 +19,7 @@ MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)" MASTER_ADDR="${MASTER_ADDR}i" # Get IP for hostname. export MASTER_ADDR="$(nslookup "$MASTER_ADDR" | grep -oP '(?<=Address: ).*')" - export MASTER_PORT=7010 - export GPUS_PER_NODE=4 # Make sure we are on the right directory @@ -30,29 +28,14 @@ cd $HOME/2023-may-intro-to-supercompting-jsc/src # This loads modules and python packages source sc_venv_template/activate.sh -# Set up accelerate config. -export ACCELERATE_CONFIG_YAML=accelerate_config_"$SLURM_JOB_ID".yaml -srun bash -c "((\$SLURM_PROCID)) || cat <<EOT > \"\$ACCELERATE_CONFIG_YAML\" -compute_environment: LOCAL_MACHINE -distributed_type: MULTI_GPU -downcast_bf16: 'no' -gpu_ids: all -machine_rank: \$SLURM_NODEID -main_process_ip: '\$MASTER_ADDR' -main_process_port: \$MASTER_PORT -main_training_function: main -mixed_precision: 'no' -num_machines: \$SLURM_JOB_NUM_NODES -num_processes: \$((SLURM_JOB_NUM_NODES * GPUS_PER_NODE)) -rdzv_backend: c10d -same_network: false -tpu_env: [] -tpu_use_cluster: false -tpu_use_sudo: false -use_cpu: false -EOT" - -# Run the demo time srun bash -c 'accelerate launch \ - --config_file=$ACCELERATE_CONFIG_YAML \ + --main_process_ip $MASTER_ADDR \ + --main_process_port $MASTER_PORT \ + --multi_gpu \ + --mixed_precision no \ + --num_processes=$(($SLURM_JOB_NUM_NODES * $GPUS_PER_NODE)) \ + --dynamo_backend=no \ + --num_machines=$SLURM_JOB_NUM_NODES \ + --machine_rank=$SLURM_NODEID \ + --rdzv_backend c10d \ distrib.py' -- GitLab