diff --git a/pytorch-ddp-example/activate.sh b/pytorch-ddp-example/activate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4083f0a8698b4961ff10e693e2d8b286447b14d6
--- /dev/null
+++ b/pytorch-ddp-example/activate.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+if [ -z "$curr_dir" ]; then
+    curr_file="${BASH_SOURCE[0]:-${(%):-%x}}"
+    curr_dir="$(dirname "$curr_file")"
+fi
+
+venv_dir="$curr_dir"/env
+
+[ -x "$(command -v deactivate)" ] && deactivate
+
+module --force purge
+source "$curr_dir"/modules.sh
+
+if ! [ -d "$venv_dir" ]; then
+    echo "Cannot set up \`venv\` on JUWELS Booster compute node." \
+         "Please manually execute \`bash set_up.sh\` on a login node."
+    exit 1
+fi
+
+source "$venv_dir"/bin/activate
diff --git a/pytorch-ddp-example/modules.sh b/pytorch-ddp-example/modules.sh
index d690e3e9e52faedf2e004db1609a1cd5a3e7115d..f99ea5fed2fb939bf9ccf13932842a42814b40f7 100644
--- a/pytorch-ddp-example/modules.sh
+++ b/pytorch-ddp-example/modules.sh
@@ -1,3 +1,4 @@
 #!/usr/bin/env sh
 
+module load Stages
 module load GCC OpenMPI PyTorch torchvision
diff --git a/pytorch-ddp-example/run.sbatch b/pytorch-ddp-example/run.sbatch
index 68c316553a013fd710eb02594abf53aa493f264b..27c4e4c7fcaf20640dd687841d9ece816bf7d8c2 100644
--- a/pytorch-ddp-example/run.sbatch
+++ b/pytorch-ddp-example/run.sbatch
@@ -18,19 +18,7 @@ curr_dir="$(dirname "$curr_file")"
 # Propagate the specified number of CPUs per task to each `srun`.
 export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
 
-[ -x "$(command -v deactivate)" ] && deactivate
-
-module --force purge
-module load Stages
-source "$curr_dir"/modules.sh
-
-if ! [ -d "$curr_dir"/env ]; then
-    echo "Cannot set up \`venv\` on JUWELS Booster compute node." \
-         "Please manually execute \`bash set_up.sh\` on a login node."
-    exit 1
-fi
-
-source "$curr_dir"/env/bin/activate
+source "$curr_dir"/activate.sh
 
 export MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 if [ "$SYSTEMNAME" = juwelsbooster ] \
diff --git a/pytorch-fsdp-example/activate.sh b/pytorch-fsdp-example/activate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4083f0a8698b4961ff10e693e2d8b286447b14d6
--- /dev/null
+++ b/pytorch-fsdp-example/activate.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+if [ -z "$curr_dir" ]; then
+    curr_file="${BASH_SOURCE[0]:-${(%):-%x}}"
+    curr_dir="$(dirname "$curr_file")"
+fi
+
+venv_dir="$curr_dir"/env
+
+[ -x "$(command -v deactivate)" ] && deactivate
+
+module --force purge
+source "$curr_dir"/modules.sh
+
+if ! [ -d "$venv_dir" ]; then
+    echo "Cannot set up \`venv\` on JUWELS Booster compute node." \
+         "Please manually execute \`bash set_up.sh\` on a login node."
+    exit 1
+fi
+
+source "$venv_dir"/bin/activate
diff --git a/pytorch-fsdp-example/modules.sh b/pytorch-fsdp-example/modules.sh
index d690e3e9e52faedf2e004db1609a1cd5a3e7115d..f99ea5fed2fb939bf9ccf13932842a42814b40f7 100644
--- a/pytorch-fsdp-example/modules.sh
+++ b/pytorch-fsdp-example/modules.sh
@@ -1,3 +1,4 @@
 #!/usr/bin/env sh
 
+module load Stages
 module load GCC OpenMPI PyTorch torchvision
diff --git a/pytorch-fsdp-example/run.sbatch b/pytorch-fsdp-example/run.sbatch
index 4af8053e43d773ef0069351d915889b948e6f8a2..4b404e41f7116bdbbcc8f63d20990c34e74ec4ee 100644
--- a/pytorch-fsdp-example/run.sbatch
+++ b/pytorch-fsdp-example/run.sbatch
@@ -18,19 +18,7 @@ curr_dir="$(dirname "$curr_file")"
 # Propagate the specified number of CPUs per task to each `srun`.
 export SRUN_CPUS_PER_TASK="$SLURM_CPUS_PER_TASK"
 
-[ -x "$(command -v deactivate)" ] && deactivate
-
-module --force purge
-module load Stages
-source "$curr_dir"/modules.sh
-
-if ! [ -d "$curr_dir"/env ]; then
-    echo "Cannot set up \`venv\` on JUWELS Booster compute node." \
-         "Please manually execute \`bash set_up.sh\` on a login node."
-    exit 1
-fi
-
-source "$curr_dir"/env/bin/activate
+source "$curr_dir"/activate.sh
 
 export MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
 if [ "$SYSTEMNAME" = juwelsbooster ] \