    run.sbatch
    #!/usr/bin/env bash
    
    #SBATCH --account=atmlaml
    #SBATCH --partition=develbooster
    #SBATCH --nodes=1
    #SBATCH --ntasks-per-node=1
    # 96 CPUs w/o nomultithread
    # 48 CPUs w/ nomultithread
    #SBATCH --cpus-per-task=48
    # Use only physical cores.
    #SBATCH --hint=nomultithread
    #SBATCH --gres=gpu:4
    #SBATCH --time=00:15:00
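    
    # Example usage (assuming `set_up.sh` has already been run on a login node):
    #   sbatch run.sbatch
    # Any arguments passed after the script name are forwarded to `main.py`
    # via `"$@"` at the bottom of this file; output goes to `slurm-<job ID>.out`
    # in the submission directory by default.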
    
    curr_file="$(scontrol show job "$SLURM_JOB_ID" | grep '^[[:space:]]*Command=' | head -n 1 | cut -d '=' -f 2-)"
    curr_dir="$(dirname "$curr_file")"
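    # (`scontrol show job` reports the submitted script as a line of the form
    # `Command=/path/to/run.sbatch`, so the two assignments above recover the
    # directory this script lives in.)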
    
    # Propagate the specified number of CPUs per task to each `srun`.
    export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
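    # (Recent Slurm versions no longer let `srun` inherit `--cpus-per-task`
    # from `sbatch` automatically, hence the explicit export.)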
    
    # Deactivate any still-active Python virtual environment; `deactivate`
    # is a shell function, so check for it via `command -v`.
    command -v deactivate > /dev/null 2>&1 && deactivate
    
    module --force purge
    module load Stages
    source "$curr_dir"/modules.sh
    
    if ! [ -d "$curr_dir"/env ]; then
        echo "Cannot set up \`venv\` on JUWELS Booster compute node." \
             "Please manually execute \`bash set_up.sh\` on a login node."
        exit 1
    fi
    
    source "$curr_dir"/env/bin/activate
    
    export MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
    if [ "$SYSTEMNAME" = juwelsbooster ] \
           || [ "$SYSTEMNAME" = juwels ] \
           || [ "$SYSTEMNAME" = jurecadc ] \
           || [ "$SYSTEMNAME" = jusuf ]; then
        # Use the InfiniBand-connected hostname (suffix `i`) on JSC machines
        # so that inter-node communication goes over InfiniBand.
        MASTER_ADDR="$MASTER_ADDR"i
    fi
    export MASTER_PORT=54123
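    # (If several jobs could end up on the same node, a port derived from the
    # job ID, e.g. `export MASTER_PORT=$((54000 + SLURM_JOB_ID % 1000))`,
    # avoids collisions; 54123 is simply a fixed choice here.)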
    
    # Pin NCCL to the InfiniBand interface so initialization does not fail.
    export NCCL_SOCKET_IFNAME=ib0
    # Pin Gloo to the InfiniBand interface so it can communicate.
    export GLOO_SOCKET_IFNAME=ib0
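    # (For troubleshooting, `export NCCL_DEBUG=INFO` makes NCCL log its
    # interface and transport selection to the job output.)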
    
    # Launch one `torchrun_jsc` agent per node (a wrapper around `torchrun`
    # for JSC systems); it starts one worker per GPU and performs the
    # rendezvous via the c10d backend. `CUDA_VISIBLE_DEVICES` is unset so
    # the launcher sees all GPUs on the node.
    srun env -u CUDA_VISIBLE_DEVICES python -u -m torchrun_jsc \
           --nproc_per_node=gpu \
           --nnodes="$SLURM_JOB_NUM_NODES" \
           --rdzv_id="$SLURM_JOB_ID" \
           --rdzv_endpoint="$MASTER_ADDR":"$MASTER_PORT" \
           --rdzv_backend=c10d \
           "$curr_dir"/main.py "$@"