Skip to content
Snippets Groups Projects
Select Git revision
  • ac07de2c619cf2c773b2e27730fe213b4cf64122
  • main default protected
2 results

distrib.slurm

Blame
  • distrib.slurm 1.25 KiB
    #!/bin/bash
    # Multi-GPU training launcher: one srun task per node, 'accelerate' then
    # spawns one process per GPU, rendezvousing at MASTER_ADDR:MASTER_PORT.
    #SBATCH --account=training2436
    #SBATCH --nodes=1
    #SBATCH --job-name=ai-multi-gpu
    #SBATCH --ntasks-per-node=1
    #SBATCH --cpus-per-task=48
    #SBATCH --output=out-distrib.%j
    #SBATCH --error=err-distrib.%j
    #SBATCH --time=00:20:00
    #SBATCH --partition=dc-gpu
    #SBATCH --gres=gpu:4

    # Fail fast: abort on unhandled errors, unset variables, pipeline failures.
    set -euo pipefail

    # Without this, srun does not inherit cpus-per-task from sbatch.
    export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"

    # Rendezvous host: first node in the allocation.
    MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
    # Allow communication over InfiniBand cells (the 'i'-suffixed hostname).
    MASTER_ADDR="${MASTER_ADDR}i"
    # Resolve hostname to IP. getent is more robust than parsing nslookup
    # output, whose format differs across distributions and returns multiple
    # answers for multi-homed hosts.
    export MASTER_ADDR="$(getent hosts "$MASTER_ADDR" | awk '{ print $1; exit }')"
    export MASTER_PORT=7010
    export GPUS_PER_NODE=4

    # Make sure we are in the right directory; abort if it does not exist.
    # NOTE(review): "supercompting" looks like a typo, but it must match the
    # actual repository directory name on disk — verify before changing it.
    cd "$HOME/2024-11-talk-intro-to-supercompting-jsc/src" || exit 1

    # This loads modules and python packages
    source sc_venv_template/activate.sh

    # Single quotes are deliberate: $SLURM_NODEID (and the other variables)
    # must be expanded per task by the shell that srun launches on each node,
    # not by this batch shell. MASTER_ADDR, MASTER_PORT and GPUS_PER_NODE are
    # exported above, so they are visible inside the inner shell too.
    time srun bash -c 'accelerate launch \
        --main_process_ip $MASTER_ADDR \
        --main_process_port $MASTER_PORT \
        --multi_gpu \
        --mixed_precision no \
        --num_processes=$(($SLURM_JOB_NUM_NODES * $GPUS_PER_NODE)) \
        --dynamo_backend=no \
        --num_machines=$SLURM_JOB_NUM_NODES \
        --machine_rank=$SLURM_NODEID \
        --rdzv_backend c10d \
        distrib.py'