PyTorch at JSC
This recipe provides two examples of running parallel PyTorch training of
a ResNet-50 on fake image data.
The examples cover training (1) in data-parallel fashion using
`DistributedDataParallel` (DDP),
which is useful for models that fit on one GPU, and
(2) in data- and model-parallel fashion using
`FullyShardedDataParallel` (FSDP),
which is useful for models that do not fit on one GPU.
For more information about DDP vs. FSDP, there is also this great article about when either of the two results in faster training: https://medium.com/pytorch/pytorch-data-parallel-best-practices-on-google-cloud-6c8da2be180d
General
We will sometimes use standard parallel computing lingo in this guide and assume a basic understanding of how processes execute in parallel. If you are unfamiliar with the terms "rank" or "world size" in the context of parallel computing, or with how parallel code actually works in practice, what follows is a brief description. Since this section is deliberately brief, consider checking out other tutorials, such as one of our courses, for more detailed descriptions and examples.
Before we get to the descriptions, always keep in mind that we start
independent processes that do not actually know about each other. Each
process just happily does its own thing until we – at one point – call
a function that actually communicates across the processes (like
`torch.distributed.all_reduce`, which sums up a tensor across all the
processes by default). All processes evaluate the same code at roughly
the same time (we are never exactly parallel or completely
synchronized due to nature being noisy) but in different, distinct
Python processes, so when we say that a process is "evaluating" the
code, we really just mean that one of the processes reached this code
at some point in time, and now this process is doing something with
the code, but we have no idea about what's going on in the other
processes. We do assign numbers to our processes so that we can at
least figure out which process is evaluating the code; making
decisions based on these numbers is one of the main paradigms of
parallel code.
- world size: the number of processes across all nodes
- rank: the index of the evaluating process, goes from 0 to world size - 1
- local world size: the number of processes per node
- local rank: the index of the evaluating process considering only processes on the same node, goes from 0 to local world size - 1 on each node. This means that once you are on more than one node, multiple processes will have the same local ranks if they are on different nodes. In our case, we always start as many processes per node as we have GPUs on them, so the local rank can also be considered the per-node index of the GPU that the evaluating process should use.
Again, since this is a very brief explanation, consider looking into other tutorials such as one of our courses for more information if this wasn't helpful to you.
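As a minimal sketch of these terms in code (this is not part of the example; it assumes the script was started via `torchrun`/`torchrun_jsc`, which sets the `LOCAL_RANK` and `LOCAL_WORLD_SIZE` environment variables, and that the process group has already been initialized):

```python
import os

import torch.distributed as dist

rank = dist.get_rank()              # 0 .. world size - 1, across all nodes
world_size = dist.get_world_size()  # number of processes across all nodes
local_rank = int(os.environ['LOCAL_RANK'])              # 0 .. local world size - 1
local_world_size = int(os.environ['LOCAL_WORLD_SIZE'])  # processes on this node

print(f'rank {rank} of {world_size} '
      f'(local rank {local_rank} of {local_world_size} on this node)')
```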
Parallel code
The most important part about parallel code in general is managing
output correctly. If you start to write to the same file with multiple
processes at the same time, it will become corrupted; the file system
does not know that we are writing to it in parallel. Similarly, if we
print our training progress, we do not want to do it from each process
individually; otherwise we would get the same output as many times as
we have processes. Thankfully, there is a very simple way to handle
this: whenever we create a directory or a file, or write something, we
make sure that it is only done by one process, by checking whether we
are on a certain process. Usually process 0 (the first process) is
chosen for this because you will always have a process 0, even if you
are not running multiple processes in parallel. You can use
`torch.distributed.get_rank() == 0` to check whether the process
evaluating the code is process 0, and then you can for example write
model checkpoints or print progress if – and only if – the evaluating
process is, in fact, process 0.
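For example, a minimal sketch of such a rank-0-only check (assuming an initialized process group; the model here is just a stand-in):

```python
import torch
import torch.distributed as dist

model = torch.nn.Linear(8, 2)  # stand-in for the real model

# Only process 0 writes checkpoints and prints progress.
if dist.get_rank() == 0:
    torch.save(model.state_dict(), 'checkpoint.pt')
    print('checkpoint saved')
```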
We use the local rank (which PyTorch sets as the environment variable
`LOCAL_RANK`) to select a different GPU on each node to place our
model and data on. Because we are not using a higher-level library, we
need to do this manually.
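A rough sketch of this manual GPU selection (the model is a stand-in):

```python
import os

import torch

# `LOCAL_RANK` is set for each process by `torchrun`/`torchrun_jsc`.
local_rank = int(os.environ['LOCAL_RANK'])
device = torch.device('cuda', local_rank)
torch.cuda.set_device(device)

# The model (and later each batch of data) is placed on this per-process GPU.
model = torch.nn.Linear(8, 2).to(device)  # stand-in for the real model
```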
Parallel data processing
To make use of all the CPUs on the system, we can enable parallel data
processing by passing the `--train-num-workers` and
`--valid-num-workers` arguments. These just set the corresponding
`num_workers` argument for the respective `DataLoader`s.
It can be a good idea to use all available CPUs except one, which is
kept for the main process.
For example: in our example `sbatch` script for JUWELS Booster, we
select 12 CPUs per task. We can then pass
`--train-num-workers=8 --valid-num-workers=3`. With 8 + 3 = 11 workers
in total, we leave one CPU for our main process.
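As a sketch of what these arguments end up controlling (the datasets and batch size here are stand-ins, not the example's actual values):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Stand-ins for the real training and validation datasets.
train_data = TensorDataset(torch.randn(64, 3, 224, 224), torch.zeros(64, dtype=torch.long))
valid_data = TensorDataset(torch.randn(16, 3, 224, 224), torch.zeros(16, dtype=torch.long))

# `--train-num-workers` and `--valid-num-workers` set `num_workers` here.
train_loader = DataLoader(train_data, batch_size=32, num_workers=8)
valid_loader = DataLoader(valid_data, batch_size=32, num_workers=3)
```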
Launching the processes
PyTorch needs us to specify an endpoint, which is used to initialize
its distributed system. One part of this endpoint is the master
address (`MASTER_ADDR`), which is an IP or hostname that all processes
have to connect to for initialization. We can thankfully obtain the
hostname of our job's first allocated node using Slurm relatively
easily. However, on JSC systems, we should not use this hostname as-is
because of awkward naming with regard to InfiniBand network
interfaces. If we used this hostname, nodes that are "too far apart"
would not be able to talk to each other. The simple fix is to append
an "i" to the hostname to use the correct network interface. Since
this special case only affects some JSC systems, we query the machine
name and append the "i" only if necessary in the example.
As if all of the above wasn't enough, PyTorch also won't be able to do
communication via Gloo, the distributed backend it uses for another
initialization step, unless we set the environment variable
`GLOO_SOCKET_IFNAME=ib0`.
The `MASTER_PORT` part of the endpoint can be a relatively arbitrary
port number.
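The example's `sbatch` script does this in bash; purely as an illustration, a rough Python equivalent of the logic described above might look like the sketch below (the list of affected systems mirrors the PyTorch Lightning patch shown later; the port number is arbitrary):

```python
import os
import subprocess

# Hostname of the job's first allocated node, obtained via Slurm.
first_node = subprocess.check_output(
    ['scontrol', 'show', 'hostnames', os.environ['SLURM_JOB_NODELIST']],
    text=True,
).splitlines()[0]

# On the affected JSC systems, append an "i" so the InfiniBand interface is used.
if os.getenv('SYSTEMNAME', '') in ['juwelsbooster', 'juwels', 'jurecadc', 'jusuf']:
    first_node += 'i'

os.environ['MASTER_ADDR'] = first_node
os.environ['MASTER_PORT'] = '29500'       # relatively arbitrary port number
os.environ['GLOO_SOCKET_IFNAME'] = 'ib0'  # required for Gloo communication
```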
PyTorch offers the `torchrun` API for launching parallel processes.
Sadly, there are some issues with this API that make its usage on JSC
systems difficult because of the aforementioned special hostname
handling (see here for more information).
Thankfully, there are options to fix these issues:
- Use wrappers, such as `torchrun_jsc`. It modifies the underlying code on-the-fly to fix the issues. `torchrun_jsc`/`python -m torchrun_jsc` is a drop-in replacement for `torchrun`/`python -m torch.distributed.run` and can be installed via `pip`: `python -m pip install torchrun_jsc`.
- Use PyTorch as provided by the module system. We include patches to ensure that the errors in `torchrun` are fixed and that it reliably works on our system.
In our example, we always use the wrapper, even if we are already
using the module system, to show off how to use it. This way, you can
apply the same template to your own projects that may use
`pip`-installed PyTorch versions, or even a container. This also means
that you need to set up a virtual environment with `torchrun_jsc`
installed before being able to use the example out-of-the-box. This
can be done by executing `bash set_up.sh` once on a login node.
Job submission
As a reminder, before being able to submit a job, you have to manually
create an environment by executing `bash set_up.sh` once on a login
node.

The `sbatch` scripts are written so that they take arguments like a
usual script. To launch a job with different arguments, you can just
pass your desired arguments for the Python script to `sbatch`, like so:
```bash
sbatch run.sbatch --train-num-workers=8 --valid-num-workers=3
```
Warnings upon PyTorch Distributed initialization
You can safely ignore warnings like the following:
```
[W socket.cpp:436] [c10d] The server socket cannot be initialized on [::]:54123 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:663] [c10d] The client socket cannot be initialized to connect to [jwb0001i.juwels]:54123 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:663] [c10d] The client socket cannot be initialized to connect to [jwb0001i.juwels]:32164 (errno: 97 - Address family not supported by protocol).
```
We have not noticed performance degradations or errors once PyTorch started to emit these warnings.
File system problems
Due to file system limits on the number of inodes ("number of files") and the amount of storage space available to us, we can run into issues.
Cache directories
PyTorch and the libraries it uses like to save compiled code, downloaded models, or downloaded datasets to cache directories. By default, most of these point to your home directory, quickly consuming the limited available space.
Ideally, you can soft-link these cache directories to a project's
SCRATCH directory, or set the corresponding environment variables in
your scripts. As a general recommendation, soft-linking the entire
`~/.cache` directory to SCRATCH is a good idea because many programs,
including `pip`, use it. This will also handle most of the cache
directories mentioned here.
Here are some of the variables concerning various libraries and their default values:
- PyTorch Hub: `TORCH_HOME="$HOME"/.cache/torch/hub`
- PyTorch extensions: `TORCH_EXTENSIONS_DIR="$HOME"/.cache/torch_extensions`
- Triton (PyTorch dependency): `TRITON_CACHE_DIR="$HOME"/.triton/cache`
- HuggingFace: `HF_HOME="$HOME"/.cache/huggingface`
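As an illustration only (the example scripts would typically export such variables in the shell instead), the caches could be redirected in Python before the relevant imports; this sketch assumes a `SCRATCH` environment variable pointing to your project's scratch directory:

```python
import os

scratch = os.environ['SCRATCH']  # assumed to point to your project's SCRATCH

os.environ.setdefault('TORCH_HOME', os.path.join(scratch, '.cache/torch/hub'))
os.environ.setdefault('TORCH_EXTENSIONS_DIR', os.path.join(scratch, '.cache/torch_extensions'))
os.environ.setdefault('TRITON_CACHE_DIR', os.path.join(scratch, '.triton/cache'))
os.environ.setdefault('HF_HOME', os.path.join(scratch, '.cache/huggingface'))

# Import the libraries only after the variables have been set.
import torch  # noqa: E402
```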
`venv` directories
The `venv`s we create can contain very many small files, or very large
binary blobs of compiled code. Both of these can lead to us reaching
file system limits. To avoid these problems, set up your `venv`s in
SCRATCH. The example scripts here do not follow this practice for
simplicity, but please consider it in your own projects. Be mindful
that files in SCRATCH are deleted after 90 days of not being touched,
so make sure that the environment is reproducible (e.g., by saving an
up-to-date `modules.sh` and `requirements.txt` in PROJECT).
GPU kernel compilation
Sometimes, additional specifications are required to build GPU kernels.
GPU architecture selection
Some libraries may require you to explicitly specify the compute architecture of your GPU for them to successfully build.
This can be done by setting the environment variable
`TORCH_CUDA_ARCH_LIST`. It can be used to specify a list of CUDA
compute capabilities of the target GPU architectures that kernels will
be built for. Entries can be separated using semicolons. Compute
capabilities for various CUDA-enabled GPUs can be found on this page:
https://developer.nvidia.com/cuda-gpus
For example, if we want to build kernels that target NVIDIA V100 (compute capability 7.0), NVIDIA A100 (compute capability 8.0), and NVIDIA H100 (compute capability 9.0) GPUs, we would set:
```bash
export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0"
```

Note that the value has to be quoted so the shell does not interpret the semicolons.
Kernels not being compiled
You may also find that some Python packages do not build GPU kernels
by default even if `TORCH_CUDA_ARCH_LIST` is specified. This can
happen if kernels are only built when a GPU is actually found on the
system setting up the environment. Since we are building the
environment on a login node, we won't have a GPU available. However,
we are still able to compile kernels, as the kernel compiler is
available, and that is all we require. Usually, libraries offer an
escape hatch via environment variables so you can still force GPU
kernel compilation manually. If these are not documented, you can try
to look for such escape hatches in the package's `setup.py`. Maybe an
AI chatbot can be helpful in finding these.
PyTorch Lightning
If you are using PyTorch Lightning, you should launch jobs
differently. Instead of using `torchrun`, you can just launch the
Python script directly. This change also requires you to set
`#SBATCH --ntasks-per-node=4`. Again, this is because instead of
`torchrun` launching the processes, we want to let Slurm do it and
then let PyTorch Lightning handle initialization between the processes
all by itself.
In the following code snippets, remember this is just an
example/template for PyTorch Lightning usage. You need to write your
own `main.py` with PyTorch Lightning in mind. Here are the relevant
changes to the `sbatch` script if you want to start a parallel job
using PyTorch Lightning:
Instead of

```bash
#SBATCH --ntasks-per-node=1
```

use

```bash
#SBATCH --ntasks-per-node=4
```

Instead of

```bash
srun env -u CUDA_VISIBLE_DEVICES python -u -m torchrun_jsc \
    --nproc_per_node=gpu \
    --nnodes="$SLURM_JOB_NUM_NODES" \
    --rdzv_id="$SLURM_JOB_ID" \
    --rdzv_endpoint="$MASTER_ADDR":"$MASTER_PORT" \
    --rdzv_backend=c10d \
    "$curr_dir"/main.py "$@"
```

use

```bash
srun env -u CUDA_VISIBLE_DEVICES python -u "$curr_dir"/main.py "$@"
```
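For reference, a minimal, hypothetical `main.py` skeleton for this Lightning setup could look like the sketch below (it assumes the Lightning ≥2 namespace; the model, data, and `Trainer` arguments are placeholders you would replace with your own):

```python
import os

import lightning.pytorch as pl
import torch
from torch.utils.data import DataLoader, TensorDataset


class ToyModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(8, 2)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


def main():
    data = TensorDataset(torch.randn(256, 8), torch.randint(0, 2, (256,)))
    trainer = pl.Trainer(
        accelerator='gpu',
        devices=int(os.environ['SLURM_NTASKS_PER_NODE']),  # 4, matching the sbatch header
        num_nodes=int(os.environ['SLURM_JOB_NUM_NODES']),
        strategy='ddp',
        max_epochs=1,
    )
    # Lightning detects the Slurm environment and sets up the processes itself.
    trainer.fit(ToyModel(), DataLoader(data, batch_size=32))


if __name__ == '__main__':
    main()
```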
Additionally, if using PyTorch Lightning, you may encounter issues
when running jobs on many nodes. This is because of the aforementioned
hostname issue. If you export the `MASTER_ADDR` environment variable
like in the example scripts (that means including the "i"), PyTorch
Lightning will be able to pick it up. However, this is only
implemented for PyTorch Lightning ≥2.1. Previous versions require
modifying the underlying PyTorch Lightning code, which can be achieved
by putting the following code at the start of your main script:
```python
import os

try:
    from lightning.pytorch.plugins.environments import SLURMEnvironment
except (ModuleNotFoundError, ImportError):
    # For PyTorch Lightning <2, this namespace needs to be used instead.
    from pytorch_lightning.plugins.environments import SLURMEnvironment


def patch_lightning_slurm_master_addr():
    # Do not patch anything if we're not on a Jülich machine.
    if os.getenv('SYSTEMNAME', '') not in [
            'juwelsbooster',
            'juwels',
            'jurecadc',
            'jusuf',
    ]:
        return

    old_resolver = SLURMEnvironment.resolve_root_node_address

    def new_resolver(*args):
        nodes = args[-1]
        # Append an "i" for communication over InfiniBand.
        return old_resolver(nodes) + 'i'

    SLURMEnvironment.__old_resolve_root_node_address = old_resolver
    SLURMEnvironment.resolve_root_node_address = new_resolver


patch_lightning_slurm_master_addr()
```
Advanced PyTorch Distributed debugging
To enable logging for the Python parts of PyTorch Distributed, please check out the official documentation. At the time of writing this recipe, there is no easy way to enable logging for all PyTorch Distributed modules. Instead, you have to enable it individually, e.g.:
```bash
export TORCH_LOGS='+torch.distributed.elastic.agent.server.api,+torch.distributed.elastic.agent.server.local_elastic_agent,+torch.distributed.elastic.rendezvous.dynamic_rendezvous,+torch.distributed.elastic.rendezvous.c10d_rendezvous_backend,+torch.distributed.elastic.rendezvous.utils,+torch.distributed.distributed_c10d'
```
If you are facing very advanced problems and want to enable debug logging for PyTorch Distributed C++ parts, you have to set the following environment variables:
```bash
export TORCH_DISTRIBUTED_DEBUG=DETAIL
export TORCH_CPP_LOG_LEVEL=INFO
```
DDP
This example will use `DistributedDataParallel` (DDP)
to do data-parallel training. This means that we will evaluate the
same model on different batches on each GPU. We copy the model at the
start of training to all GPUs so we have the same initial setup on
each process. Whenever we take gradients (`loss.backward()`), the
gradients are averaged across all processes, so that each update step
will be the same across all processes. This way, our model never
diverges between processes; the model is always ensured to stay the
same on all GPUs.
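In code, the DDP wrapping is roughly the following sketch (assuming an initialized process group and the per-process device selection shown earlier; the model is a stand-in for the ResNet-50):

```python
import os

import torch
from torch.nn.parallel import DistributedDataParallel as DDP

local_rank = int(os.environ['LOCAL_RANK'])
device = torch.device('cuda', local_rank)

model = torch.nn.Linear(8, 2).to(device)  # stand-in for the real model
# DDP broadcasts the initial parameters and averages gradients during
# `loss.backward()`.
model = DDP(model, device_ids=[local_rank])
```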
DDP considerations
The initial copy of the model to all processes and the reduction of
gradients are all taken care of by the DDP wrapper. However, if
we only added the DDP wrapper, our model would train on the same data
on each process; we wouldn't actually gain anything from the
parallelization. So in addition, we have to split our data so that
each process sees its own distinct subset. This is also called
"sharding" the data, because each process obtains its own shard of the
full dataset. Thankfully, PyTorch offers the `DistributedSampler`,
which we can apply to our `DataLoader`s
to achieve the aforementioned sharding of data. Note that for each
training epoch, we have to explicitly set the epoch on the sampler so
that data is shuffled differently on each epoch.
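Sketched out, the sharding and per-epoch shuffling look roughly like this (assuming an initialized process group; the dataset is a stand-in):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

# Stand-in for the real training dataset.
train_data = TensorDataset(torch.randn(256, 8), torch.zeros(256, dtype=torch.long))

# Each process only iterates over its own shard of the data.
sampler = DistributedSampler(train_data, shuffle=True)
train_loader = DataLoader(train_data, batch_size=32, sampler=sampler)

for epoch in range(10):
    # Required so that shuffling differs between epochs.
    sampler.set_epoch(epoch)
    for batch in train_loader:
        ...  # training step
```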
Another thing to keep in mind: while our model is the same on each process, the data it gets evaluated on and, accordingly, the loss, will be different on each process. To get more comparable metrics, we average the loss over all processes before printing it in the example.
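A small sketch of such an averaging helper (assuming an initialized process group):

```python
import torch
import torch.distributed as dist


def average_across_processes(value: torch.Tensor) -> torch.Tensor:
    """Average a tensor (e.g., the loss) over all processes."""
    result = value.detach().clone()
    dist.all_reduce(result, op=dist.ReduceOp.SUM)
    return result / dist.get_world_size()
```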
Finally, when using DDP, the batch size changes depending on the number of processes we use. That is because we only configure the "local batch size", i.e., the batch size per process. The "global batch size" is obtained by multiplying the local batch size by the number of processes. If we scale up the number of processes, we obtain a larger batch size; this, in turn, will change what learning rate we should use. A very simple heuristic is to scale the base learning rate you would use for the local batch size proportionally to the number of processes, i.e., multiply the base learning rate by the number of processes. This is automatically done in the code so that it "just works" with a large number of processes, but ideally you would tune the learning rate manually for the global batch size you use.
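A tiny sketch of this scaling heuristic with made-up numbers (assuming an initialized process group):

```python
import torch.distributed as dist

base_lr = 0.1          # learning rate tuned for the local batch size
local_batch_size = 32  # batch size per process

world_size = dist.get_world_size()                 # e.g., 16 processes
global_batch_size = local_batch_size * world_size  # e.g., 32 * 16 = 512
scaled_lr = base_lr * world_size                   # e.g., 0.1 * 16 = 1.6
```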
FSDP
Currently missing.