diff --git a/README.md b/README.md index 9205aef7966b7db558dfba623096d07f270dbbae..2073c67fbe1ca82e12e035881be7df7a8689aba6 100644 --- a/README.md +++ b/README.md @@ -262,9 +262,10 @@ use Instead of ```shell -srun python -u -m torchrun_jsc \ - --nproc_per_node="$DEVICES_PER_NODE" \ +srun env -u CUDA_VISIBLE_DEVICES python -u -m torchrun_jsc \ + --nproc_per_node=gpu \ --nnodes="$SLURM_JOB_NUM_NODES" \ + --rdzv_id="$SLURM_JOB_ID" \ --rdzv_endpoint="$MASTER_ADDR":"$MASTER_PORT" \ --rdzv_backend=c10d \ "$curr_dir"/main.py "$@" @@ -273,7 +274,7 @@ srun python -u -m torchrun_jsc \ use ```shell -srun python -u "$curr_dir"/main.py "$@" +srun env -u CUDA_VISIBLE_DEVICES python -u "$curr_dir"/main.py "$@" ``` Additionally, if using PyTorch Lightning, you may encounter issues @@ -301,6 +302,7 @@ def patch_lightning_slurm_master_addr(): 'juwelsbooster', 'juwels', 'jurecadc', + 'jusuf', ]: return