Skip to content
Snippets Groups Projects
Select Git revision
  • 108-implement-cpu-id-query-for-apple-m1-hardware
  • devel default
  • 107-compilation-error-when-building-maestro-core-on-m1-apple-processors
  • 58-scripting-interface-to-maestro-core
  • 101-need-ci-test-using-installed-maestro
  • 57-sphinx-documentation
  • 105-memory-leak-in-pm-message-envelope-handling
  • 104-permit-disabling-memory-pool
  • 103-liberl-installation-issue-on-devel
  • 94-maestro-rdma-transport-ignores-max_msg_size-2
  • main protected
  • 102-possible-race-in-check_pm_redundant_interlock-test
  • 97-check-if-shm-provider-can-be-enabled-after-libfabric-1-14-is-in-our-tree-2
  • 100-include-maestro-attributes-h-cannot-include-mamba-header-from-deps-path
  • 97-check-if-shm-provider-can-be-enabled-after-libfabric-1-14-is-in-our-tree
  • 17-job-failed-282354-needs-update-of-mio-interface-and-build-rules
  • 96-test-libfabric-update-to-1-13-or-1-14
  • feature/stop-telemetry-after-all-left
  • 94-maestro-rdma-transport-ignores-max_msg_size
  • 93-improve-performance-of-mstro_attribute_val_cmp_str
  • v0.3_rc1
  • maestro_d65
  • d65_experiments_20211113
  • v0.2
  • v0.2_rc1
  • d3.3
  • d3.3-review
  • d5.5
  • d5.5-review
  • v0.1
  • d3.2
  • d3.2-draft
  • v0.0
33 results

ci-devel.yaml

Blame
  • run.sbatch 1.69 KiB
    #!/usr/bin/env bash

    #SBATCH --account=atmlaml
    #SBATCH --partition=develbooster
    #SBATCH --nodes=1
    #SBATCH --ntasks-per-node=1
    # 96 CPUs w/o nomultithread
    # 48 CPUs w/ nomultithread
    #SBATCH --cpus-per-task=48
    # Use only physical cores.
    #SBATCH --hint=nomultithread
    #SBATCH --gres=gpu:4
    #SBATCH --time=00:15:00

    # Fail fast: abort on unhandled errors (-e), on use of unset
    # variables (-u), and on a failure in any stage of a pipeline
    # (pipefail). Without this, a failed `scontrol`/`source` would let
    # the job continue in a broken state.
    set -euo pipefail

    # Locate the directory this batch script was submitted from: ask Slurm
    # for the job's record and strip the `Command=` prefix from its
    # submitted-command line, then drop the file name.
    job_record="$(scontrol show job "$SLURM_JOB_ID")"
    curr_file="$(printf '%s\n' "$job_record" | sed -n 's/^[[:space:]]*Command=//p' | head -n 1)"
    curr_dir="$(dirname "$curr_file")"

    # Give every `srun` below the task's full CPU allocation.
    export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
    
    # Leave any currently active Python virtual environment. `deactivate`
    # is a shell *function* defined by venv's `activate`, not an
    # executable file, so probe for it with `command -v` directly: the
    # previous `[ -x "$(command -v deactivate)" ]` tested for an
    # executable file literally named "deactivate" and never fired.
    if command -v deactivate > /dev/null 2>&1; then
        deactivate
    fi

    # Start from a clean module environment, then load the project's
    # module set.
    module --force purge
    module load Stages
    source "$curr_dir"/modules.sh

    # The venv must already exist; per the message below it is created by
    # `set_up.sh` on a login node beforehand.
    if ! [ -d "$curr_dir"/env ]; then
        echo "Cannot set up \`venv\` on JUWELS Booster compute node." \
             "Please manually execute \`bash set_up.sh\` on a login node."
        exit 1
    fi

    source "$curr_dir"/env/bin/activate
    
    # Rendezvous endpoint for torch distributed: the first node in the
    # allocation acts as the master.
    master_addr="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
    case "$SYSTEMNAME" in
        juwelsbooster|juwels|jurecadc|jusuf)
            # Allow communication over InfiniBand cells on JSC machines
            # (the "i" suffix selects the InfiniBand hostname).
            master_addr="${master_addr}i"
            ;;
    esac
    # Assign separately from the `export` so a failing command
    # substitution is not masked by `export`'s own exit status
    # (ShellCheck SC2155).
    export MASTER_ADDR="$master_addr"
    # Allow overriding the rendezvous port from the environment; fall
    # back to the previous fixed default.
    export MASTER_PORT="${MASTER_PORT:-54123}"

    # Prevent NCCL not figuring out how to initialize.
    export NCCL_SOCKET_IFNAME=ib0
    # Prevent GLOO not being able to communicate.
    export GLOO_SOCKET_IFNAME=ib0

    # NOTE(review): CUDA_VISIBLE_DEVICES is unset so torchrun's
    # `--nproc_per_node=gpu` can enumerate the node's GPUs itself —
    # confirm against the torchrun_jsc wrapper's expectations.
    srun env -u CUDA_VISIBLE_DEVICES python -u -m torchrun_jsc \
           --nproc_per_node=gpu \
           --nnodes="$SLURM_JOB_NUM_NODES" \
           --rdzv_id="$SLURM_JOB_ID" \
           --rdzv_endpoint="$MASTER_ADDR":"$MASTER_PORT" \
           --rdzv_backend=c10d \
           "$curr_dir"/main.py "$@"