diff --git a/data/form-schema.json b/data/form-schema.json
index 1cb083290b6440cc3bfc8864319069f00badd255..93dec36bbb99778a0d427579bedb922e03805785 100644
--- a/data/form-schema.json
+++ b/data/form-schema.json
@@ -13,7 +13,8 @@
           "JUWELS",
           "LUMI",
           "MockHPCSystem",
-          "VEGA"
+          "VEGA",
+          "LEONARDO"
         ]
       }
     },
@@ -95,6 +96,15 @@
               "Pytorch-DDP",
               "DeepSpeed"
             ]
+          },
+          {
+            "key": [
+              "LEONARDO"
+            ],
+            "value": [
+              "Pytorch-DDP",
+              "Horovod"
+            ]
           }
         ]
       }
@@ -195,6 +205,14 @@
               "dp-esb",
               "dp-dam"
             ]
+          },
+          {
+            "key": [
+              "LEONARDO"
+            ],
+            "value": [
+              "boost_usr_prod"
+            ]
           }
         ]
       }
@@ -638,6 +656,14 @@
           [
             "DEEP",
             "DeepSpeed"
+          ],
+          [
+            "LEONARDO",
+            "Horovod"
+          ],
+          [
+            "LEONARDO",
+            "Pytorch-DDP"
           ]
         ]
       }
@@ -722,6 +748,14 @@
           [
             "DEEP",
             "DeepSpeed"
+          ],
+          [
+            "LEONARDO",
+            "Horovod"
+          ],
+          [
+            "LEONARDO",
+            "Pytorch-DDP"
           ]
         ]
       }
@@ -735,7 +769,8 @@
       "JUWELS": "https://apps.fz-juelich.de/jsc/hps/juwels/index.html",
       "LUMI": "https://docs.lumi-supercomputer.eu/software/",
       "VEGA": "https://doc.vega.izum.si",
-      "Cyclone": "https://hpcf.cyi.ac.cy/documentation/"
+      "Cyclone": "https://hpcf.cyi.ac.cy/documentation/",
+      "LEONARDO": "https://wiki.u-gov.it/confluence/display/SCAIUS/HPC+User+Guide"
     },
     "software": {
       "Pytorch-DDP": "https://pytorch.org/tutorials/intermediate/ddp_tutorial.html",
diff --git a/scripts/LEONARDO/.gitkeep b/scripts/LEONARDO/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/scripts/LEONARDO/Horovod/LEONARDO_Horovod_script.sh b/scripts/LEONARDO/Horovod/LEONARDO_Horovod_script.sh
new file mode 100644
index 0000000000000000000000000000000000000000..58c2f8535a90e6adfc27234d30d41e702832ea91
--- /dev/null
+++ b/scripts/LEONARDO/Horovod/LEONARDO_Horovod_script.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/bash
+
+#SBATCH --job-name=job
+#SBATCH --account=%account%
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --time=01:00:00
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --ntasks-per-node=4
+#SBATCH --gpus-per-node=4
+
+#MODULES BEGIN LEONARDO Horovod
+ml cudnn/8.9.7.29-12--gcc--12.2.0-cuda-12.1
+ml parallel-netcdf/1.12.3--openmpi--4.1.6--gcc--12.2.0
+ml hdf5/1.14.3--gcc--12.2.0
+ml py-mpi4py/3.1.4--openmpi--4.1.6--gcc--12.2.0
+ml python/3.11.6--gcc--12.2.0-nlkgjki
+ml nccl/2.19.1-1--gcc--12.2.0-cuda-12.1
+ml cuda/12.1
+#MODULES END
+
+source your/env_path/bin/activate
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+# srun starts --ntasks-per-node processes on every node, i.e. one per requested GPU.
+srun --cpu-bind=none python3 -u %executable%
\ No newline at end of file
diff --git a/scripts/LEONARDO/Horovod/lamec.json b/scripts/LEONARDO/Horovod/lamec.json
new file mode 100644
index 0000000000000000000000000000000000000000..64028e0da49b727fe64636ede88963fe30f5cfad
--- /dev/null
+++ b/scripts/LEONARDO/Horovod/lamec.json
@@ -0,0 +1 @@
+{"template": "LEONARDO_Horovod_script.sh"}
\ No newline at end of file
diff --git a/scripts/LEONARDO/Pytorch-DDP/LEONARDO_Pytorch-DDP_script.sh b/scripts/LEONARDO/Pytorch-DDP/LEONARDO_Pytorch-DDP_script.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c78199016d20200b1d1e45114987849c66f38efd
--- /dev/null
+++ b/scripts/LEONARDO/Pytorch-DDP/LEONARDO_Pytorch-DDP_script.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/bash
+
+#SBATCH --job-name=job
+#SBATCH --account=%account%
+#SBATCH --output=job.out
+#SBATCH --error=job.err
+#SBATCH --time=01:00:00
+#SBATCH --partition=%partition%
+#SBATCH --nodes=%nodes%
+#SBATCH --gpus-per-node=4
+
+#MODULES BEGIN LEONARDO Pytorch-DDP
+ml cudnn/8.9.7.29-12--gcc--12.2.0-cuda-12.1
+ml parallel-netcdf/1.12.3--openmpi--4.1.6--gcc--12.2.0
+ml hdf5/1.14.3--gcc--12.2.0
+ml py-mpi4py/3.1.4--openmpi--4.1.6--gcc--12.2.0
+ml python/3.11.6--gcc--12.2.0-nlkgjki
+ml nccl/2.19.1-1--gcc--12.2.0-cuda-12.1
+ml cuda/12.1
+#MODULES END
+
+source your/env_path/bin/activate
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+export PSP_CUDA=1
+export PSP_UCP=1
+# This script requests no --cpus-per-task, so SLURM_CPUS_PER_TASK may be unset: fall back to 1.
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-1}
+
+# Launch torchrun through srun; the first host in SLURM_JOB_NODELIST is the c10d rendezvous endpoint.
+srun --cpu-bind=none bash -c "torchrun \
+    --nnodes=$SLURM_NNODES \
+    --nproc_per_node=$SLURM_GPUS_PER_NODE \
+    --rdzv_id=$SLURM_JOB_ID \
+    --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)':29400 \
+    %executable%"
diff --git a/scripts/LEONARDO/Pytorch-DDP/lamec.json b/scripts/LEONARDO/Pytorch-DDP/lamec.json
new file mode 100644
index 0000000000000000000000000000000000000000..48f6f3906a829cefc250e9e60fa18488044071d2
--- /dev/null
+++ b/scripts/LEONARDO/Pytorch-DDP/lamec.json
@@ -0,0 +1 @@
+{"template": "LEONARDO_Pytorch-DDP_script.sh"}
\ No newline at end of file
diff --git a/scripts/LEONARDO/sysinfo.json b/scripts/LEONARDO/sysinfo.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae1151b3c4feb8bfea261255384d405736b55eb8
--- /dev/null
+++ b/scripts/LEONARDO/sysinfo.json
@@ -0,0 +1,7 @@
+{
+    "partition": {
+        "boost_usr_prod": {
+            "nodes": 64
+        }
+    }
+}
\ No newline at end of file