Skip to content
Snippets Groups Projects
Commit 3d88d3f4 authored by Jóhannes Nordal
Browse files

Merge branch '1-add-leonardo-system' into 'main'

Update 6 files

Closes #1

See merge request !3
parents 60611845 f53b1283
Branches
No related tags found
1 merge request!3Update 6 files
...@@ -13,7 +13,8 @@ ...@@ -13,7 +13,8 @@
"JUWELS", "JUWELS",
"LUMI", "LUMI",
"MockHPCSystem", "MockHPCSystem",
"VEGA" "VEGA",
"LEONARDO"
] ]
} }
}, },
...@@ -95,6 +96,15 @@ ...@@ -95,6 +96,15 @@
"Pytorch-DDP", "Pytorch-DDP",
"DeepSpeed" "DeepSpeed"
] ]
},
{
"key": [
"LEONARDO"
],
"value": [
"Pytorch-DDP",
"Horovod"
]
} }
] ]
} }
...@@ -195,6 +205,14 @@ ...@@ -195,6 +205,14 @@
"dp-esb", "dp-esb",
"dp-dam" "dp-dam"
] ]
},
{
"key": [
"LEONARDO"
],
"value": [
"boost_usr_prod"
]
} }
] ]
} }
...@@ -638,6 +656,14 @@ ...@@ -638,6 +656,14 @@
[ [
"DEEP", "DEEP",
"DeepSpeed" "DeepSpeed"
],
[
"LEONARDO",
"Horovod"
],
[
"LEONARDO",
"Pytorch-DDP"
] ]
] ]
} }
...@@ -722,6 +748,14 @@ ...@@ -722,6 +748,14 @@
[ [
"DEEP", "DEEP",
"DeepSpeed" "DeepSpeed"
],
[
"LEONARDO",
"Horovod"
],
[
"LEONARDO",
"Pytorch-DDP"
] ]
] ]
} }
...@@ -735,7 +769,8 @@ ...@@ -735,7 +769,8 @@
"JUWELS": "https://apps.fz-juelich.de/jsc/hps/juwels/index.html", "JUWELS": "https://apps.fz-juelich.de/jsc/hps/juwels/index.html",
"LUMI": "https://docs.lumi-supercomputer.eu/software/", "LUMI": "https://docs.lumi-supercomputer.eu/software/",
"VEGA": "https://doc.vega.izum.si", "VEGA": "https://doc.vega.izum.si",
"Cyclone": "https://hpcf.cyi.ac.cy/documentation/" "Cyclone": "https://hpcf.cyi.ac.cy/documentation/",
"LEONARDO": "https://wiki.u-gov.it/confluence/display/SCAIUS/HPC+User+Guide"
}, },
"software": { "software": {
"Pytorch-DDP": "https://pytorch.org/tutorials/intermediate/ddp_tutorial.html", "Pytorch-DDP": "https://pytorch.org/tutorials/intermediate/ddp_tutorial.html",
......
#!/usr/bin/bash
# Slurm batch-script template for launching a Horovod job on LEONARDO.
#
# Tokens written as %name% (%account%, %partition%, %nodes%, %executable%)
# are placeholders, presumably substituted by a script generator before the
# file is submitted -- verify against the generator.
#
# NOTE(review): no --time directive is set here, so the job relies on
# whatever default wall-time the chosen partition applies -- confirm this is
# intended.
# NOTE(review): "--gpus=" below has no value; confirm whether the generator
# fills it in or whether the directive should be dropped in favour of
# --gpus-per-node alone.
#SBATCH --job-name=job
#SBATCH --account=%account%
#SBATCH --output=job.out
#SBATCH --error=job.err
#SBATCH --partition=%partition%
#SBATCH --nodes=%nodes%
#SBATCH --gpus=
#SBATCH --gpus-per-node=4
# The MODULES BEGIN/END markers delimit the module list for this
# system/software pair; presumably they are parsed by tooling, so keep them
# verbatim.
#MODULES BEGIN LEONARDO Horovod
ml cudnn/8.9.7.29-12--gcc--12.2.0-cuda-12.1
ml parallel-netcdf/1.12.3--openmpi--4.1.6--gcc--12.2.0
ml hdf5/1.14.3--gcc--12.2.0
ml py-mpi4py/3.1.4--openmpi--4.1.6--gcc--12.2.0
ml python/3.11.6--gcc--12.2.0-nlkgjki
ml nccl/2.19.1-1--gcc--12.2.0-cuda-12.1
ml cuda/12.1
#MODULES END
# "your/env_path" is a literal stand-in: replace it with the path of the
# Python virtual environment to activate.
source your/env_path/bin/activate
# Expose four GPU indices, matching --gpus-per-node=4 above.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
# Launch the program through srun with CPU binding disabled; python's -u flag
# keeps stdout/stderr unbuffered so output reaches job.out/job.err promptly.
# NOTE(review): no --ntasks-per-node is requested, so the number of Horovod
# processes started per node depends on Slurm defaults -- confirm it matches
# the four GPUs per node.
srun --cpu-bind=none python3 -u %executable%
\ No newline at end of file
{"template": "LEONARDO_Horovod_script.sh"}
\ No newline at end of file
#!/usr/bin/bash
# Slurm batch-script template for launching PyTorch DistributedDataParallel
# training via torchrun on LEONARDO.
#
# Tokens written as %name% (%account%, %partition%, %nodes%, %executable%)
# are placeholders, presumably substituted by a script generator before the
# file is submitted -- verify against the generator.
#
# NOTE(review): "--gpus=" below has no value; confirm whether the generator
# fills it in or whether the directive should be dropped in favour of
# --gpus-per-node alone.
#SBATCH --job-name=job
#SBATCH --account=%account%
#SBATCH --output=job.out
#SBATCH --error=job.err
#SBATCH --time=01:00:00
#SBATCH --partition=%partition%
#SBATCH --nodes=%nodes%
#SBATCH --gpus=
#SBATCH --gpus-per-node=4
# The MODULES BEGIN/END markers delimit the module list for this
# system/software pair; presumably they are parsed by tooling, so keep them
# verbatim.
#MODULES BEGIN LEONARDO Pytorch-DDP
ml cudnn/8.9.7.29-12--gcc--12.2.0-cuda-12.1
ml parallel-netcdf/1.12.3--openmpi--4.1.6--gcc--12.2.0
ml hdf5/1.14.3--gcc--12.2.0
ml py-mpi4py/3.1.4--openmpi--4.1.6--gcc--12.2.0
ml python/3.11.6--gcc--12.2.0-nlkgjki
ml nccl/2.19.1-1--gcc--12.2.0-cuda-12.1
ml cuda/12.1
#MODULES END
# "your/env_path" is a literal stand-in: replace it with the path of the
# Python virtual environment to activate.
source your/env_path/bin/activate
# Expose four GPU indices, matching --gpus-per-node=4 above.
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export PSP_CUDA=1
export PSP_UCP=1
# This script does not request --cpus-per-task, so SLURM_CPUS_PER_TASK may be
# unset; fall back to 1 thread instead of exporting an empty OMP_NUM_THREADS.
export OMP_NUM_THREADS="${SLURM_CPUS_PER_TASK:-1}"
# One torchrun launcher is started per srun task. Quoting matters here:
#   * Unescaped $VARS and the $(scontrol ...) substitution are expanded once,
#     by this batch shell, so every node receives the same rendezvous
#     endpoint (first hostname of the allocation, port 29400).
#   * The backslash-escaped \$( ... ) is evaluated by the per-task inner
#     shell, where SLURM_NODEID differs: node 0 yields is_host=1, every other
#     node yields is_host=0.
# The closing double quote after %executable% terminates the bash -c string.
srun --cpu-bind=none bash -c "torchrun \
--nnodes=$SLURM_NNODES \
--nproc_per_node=$SLURM_GPUS_PER_NODE \
--rdzv_id=$SLURM_JOB_ID \
--rdzv_conf=is_host=\$( ((SLURM_NODEID)) && echo 0 || echo 1) \
--rdzv_backend=c10d \
--rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)':29400 \
%executable%"
{"template": "LEONARDO_Pytorch-DDP_script.sh"}
\ No newline at end of file
{
"partition": {
"boost_usr_prod": {
"nodes": 64
}
}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment