From 49785d16d904135d9da45ec977524f74b7b14d02 Mon Sep 17 00:00:00 2001
From: Sebastian Achilles <s.achilles@fz-juelich.de>
Date: Sat, 3 Jun 2023 16:39:25 +0200
Subject: [PATCH] JR H100: update stage

---
Review notes (free text placed between the three-dash marker and the
diffstat, i.e. outside the commit message and outside every hunk):

Summary of what the hunks below do
 * Add a new easyconfig for NVSHMEM 2.8.0 (toolchain gompi 2022a, CUDA 12.0)
   to jureca_spr_overlay. The configure step is skipped and the build and
   install steps are driven by variables exported in prebuildopts, which
   preinstallopts reuses unchanged.
 * Add a new SYSTEM-toolchain easyconfig nvidia-driver with version
   'default' and realversion 525.105.17; a postinstall command removes
   lib64/libnvidia-ml.so* from the install directory.
 * OpenMPI 4.1.4: set easyblock to ConfigureMake explicitly, add configure
   options (shared libs, hwloc, UCX, verbs, libevent, CUDA) and add
   sanity_check_paths for binaries, shared libraries and headers.
 * UCC 'default': bump local_realversion from 1.1.0-rc1 to 1.2.0-rc1 and
   replace the checksum with a filename-keyed entry.

Points to confirm before merging
 * NOTE(review): NVSHMEM exports NVSHMEM_USE_GDRCOPY=1 and
   GDRCOPY_HOME=${EBROOTGDRCOPY}, but its dependencies list only CUDA and
   NCCL. Presumably GDRCopy is loaded indirectly; verify, otherwise
   GDRCOPY_HOME is exported empty.
 * NOTE(review): OpenMPI configopts now carries --with-libevent twice with
   different values: the added --with-libevent=$EBROOTLIBEVENT and the
   unchanged --with-libevent=external on the SLURM integration line.
   Confirm which value is intended and drop the other.
 * NOTE(review): in the OpenMPI hunk the line closing preconfigopts is
   removed and re-added with only its leading whitespace changed; this
   looks unintentional.
 * NOTE(review): nvidia-driver source_urls uses plain http. The download is
   pinned by a checksum, but confirm whether https can be used instead.
 * NOTE(review): leading indentation and the spacing before inline comments
   in this patch are assumed to follow the usual easyconfig layout (4-space
   indent, two spaces before '#'). Run 'git apply --check' against the
   target tree to confirm the context lines still match.

 .../NVSHMEM-2.8.0-gompi-2022a-CUDA-12.0.eb    | 71 +++++++++++++++++++
 .../n/nvidia-driver/nvidia-driver-default.eb  | 27 +++++++
 .../o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb     | 21 +++++-
 .../u/UCC/UCC-default-GCCcore-11.3.0.eb       |  4 +-
 4 files changed, 119 insertions(+), 4 deletions(-)
 create mode 100644 Overlays/jureca_spr_overlay/n/NVSHMEM/NVSHMEM-2.8.0-gompi-2022a-CUDA-12.0.eb
 create mode 100644 Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb

diff --git a/Overlays/jureca_spr_overlay/n/NVSHMEM/NVSHMEM-2.8.0-gompi-2022a-CUDA-12.0.eb b/Overlays/jureca_spr_overlay/n/NVSHMEM/NVSHMEM-2.8.0-gompi-2022a-CUDA-12.0.eb
new file mode 100644
index 000000000..0d7ea6832
--- /dev/null
+++ b/Overlays/jureca_spr_overlay/n/NVSHMEM/NVSHMEM-2.8.0-gompi-2022a-CUDA-12.0.eb
@@ -0,0 +1,71 @@
+easyblock = 'ConfigureMake'
+
+name = 'NVSHMEM'
+version = '2.8.0'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+local_cuda_version = '12.0'
+
+homepage = 'https://developer.nvidia.com/nvshmem'
+description = """NVSHMEM is a parallel programming interface based on OpenSHMEM that provides
+efficient and scalable communication for NVIDIA GPU clusters. NVSHMEM creates a
+global address space for data that spans the memory of multiple GPUs and can be
+accessed with fine-grained GPU-initiated operations, CPU-initiated operations,
+and operations on CUDA streams.
+"""
+
+toolchain = {'name': 'gompi', 'version': '2022a'}
+
+download_instructions = """The sources of NVSHMEM can be downloaded at NVIDIA's webpage when you have signed up for
+their (free) developer program:
+https://developer.nvidia.com/nvshmem-downloads"""
+
+sources = ['%(namelower)s_src_%(version)s-3.txz']
+checksums = ['7d4ef226630a94b587d18e02c27decc8b41d6f4ee52a26e25644b23cd18da81f']
+
+builddependencies = [
+    ('Autotools', '20220317'),
+    ('pkgconf', '1.8.0'),
+]
+
+dependencies = [
+    ('CUDA', local_cuda_version, '', SYSTEM),
+    ('NCCL', 'default', f'-CUDA-{local_cuda_version}'),
+]
+
+skipsteps = ['configure']
+
+prebuildopts = 'export %s &&' % ' '.join([
+    'NVSHMEM_USE_GDRCOPY=1',
+    'GDRCOPY_HOME=${EBROOTGDRCOPY}',
+
+    'MPI_HOME=${EBROOTOPENMPI}',
+    'NVSHMEM_MPI_SUPPORT=1',
+    'NVSHMEMTEST_USE_MPI_LAUNCHER=1',
+
+    'NCCL_HOME=${EBROOTNCCL}',
+    'NVSHMEM_USE_NCCL=1',
+
+    'NVSHMEM_BUILDDIR=%(builddir)s',
+    'NVSHMEM_EXAMPLES_BUILDDIR=${NVSHMEM_BUILDDIR}/examples/obj',
+    'NVSHMEM_OTHERTEST_BUILDDIR=${NVSHMEM_BUILDDIR}/othertest/obj',
+    'NVSHMEM_TEST_BUILDDIR=${NVSHMEM_BUILDDIR}/test/obj',
+    'NVSHMEM_PERFTEST_BUILDDIR=${NVSHMEM_BUILDDIR}/perftest/obj',
+
+    'NVSHMEM_PREFIX=%(installdir)s',
+    'NVSHMEM_EXAMPLES_INSTALL=${NVSHMEM_PREFIX}/examples',
+    'NVSHMEM_OTHERTEST_INSTALL=${NVSHMEM_PREFIX}/othertest',
+    'NVSHMEM_PERFTEST_INSTALL=${NVSHMEM_PREFIX}/perftest',
+    'NVSHMEM_TEST_INSTALL=${NVSHMEM_PREFIX}/test',
+])
+
+preinstallopts = prebuildopts
+
+sanity_check_paths = {
+    'files': ['lib/libnvshmem.a', 'lib/nvshmem_bootstrap_mpi.%s' % SHLIB_EXT],
+    'dirs': ['include']
+}
+
+modextravars = {'NVSHMEM_HOME': '%(installdir)s'}
+
+moduleclass = 'devel'
diff --git a/Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb
new file mode 100644
index 000000000..2900da59e
--- /dev/null
+++ b/Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb
@@ -0,0 +1,27 @@
+name = 'nvidia-driver'
+version = 'default'
+realversion = '525.105.17'
+
+homepage = 'https://developer.nvidia.com/cuda-toolkit'
+description = f"""
+This is a set of libraries normally installed by the NVIDIA driver installer.
+
+The real version of this package is {realversion}.
+"""
+
+site_contacts = 'sc@fz-juelich.de'
+
+toolchain = SYSTEM
+
+source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion]
+sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion]
+checksums = ['c635a21a282c9b53485f19ebb64a0f4b536a968b94d4d97629e0bc547a58142a']
+
+# To avoid conflicts between NVML and the kernel driver
+postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*']
+
+modluafooter = '''
+add_property("arch","gpu")
+'''
+
+moduleclass = 'system'
diff --git a/Overlays/jureca_spr_overlay/o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb b/Overlays/jureca_spr_overlay/o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb
index b3e731028..18778a139 100644
--- a/Overlays/jureca_spr_overlay/o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb
+++ b/Overlays/jureca_spr_overlay/o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb
@@ -1,3 +1,5 @@
+easyblock = 'ConfigureMake'
+
 name = 'OpenMPI'
 version = '4.1.4'
 
@@ -51,15 +53,30 @@ preconfigopts = ' && '.join([
     'aclocal',
     'automake',
     ''
-])
+    ])
 
-configopts = '--without-orte '
+configopts = '--enable-shared '
+configopts += '--with-hwloc=$EBROOTHWLOC '  # hwloc support
+configopts += '--with-ucx=$EBROOTUCX '
+configopts += '--with-verbs '
+configopts += '--with-libevent=$EBROOTLIBEVENT '
+configopts += '--without-orte '
 configopts += '--without-psm2 '
 configopts += '--disable-oshmem '
+configopts += '--with-cuda=$EBROOTCUDA '
 configopts += '--with-ime=/opt/ddn/ime '
 configopts += '--with-gpfs '
 
 # to enable SLURM integration (site-specific)
 configopts += '--with-slurm --with-pmix=external --with-libevent=external --with-ompi-pmix-rte'
 
+local_libs = ["mpi_mpifh", "mpi", "ompitrace", "open-pal", "open-rte"]
+sanity_check_paths = {
+    'files': ["bin/%s" % local_binfile for local_binfile in ["ompi_info", "opal_wrapper"]] +
+             ["lib/lib%s.%s" % (local_libfile, SHLIB_EXT) for local_libfile in local_libs] +
+             ["include/%s.h" % x for x in ["mpi-ext", "mpif-config",
+                                           "mpif", "mpi", "mpi_portable_platform"]],
+    'dirs': [],
+}
+
 moduleclass = 'mpi'
diff --git a/Overlays/jureca_spr_overlay/u/UCC/UCC-default-GCCcore-11.3.0.eb b/Overlays/jureca_spr_overlay/u/UCC/UCC-default-GCCcore-11.3.0.eb
index 6a90ac037..a508309e8 100644
--- a/Overlays/jureca_spr_overlay/u/UCC/UCC-default-GCCcore-11.3.0.eb
+++ b/Overlays/jureca_spr_overlay/u/UCC/UCC-default-GCCcore-11.3.0.eb
@@ -7,7 +7,7 @@ easyblock = 'ConfigureMake'
 
 name = 'UCC'
 version = 'default'
-local_realversion = '1.1.0-rc1'
+local_realversion = '1.2.0-rc1'
 
 homepage = 'https://www.openucx.org/'
 description = f"""UCC (Unified Collective Communication) is a collective
@@ -23,7 +23,7 @@ toolchainopts = {'pic': True}
 source_urls = ['https://github.com/openucx/ucc/archive/refs/tags']
 sources = [f'v{local_realversion}.tar.gz']
 checksums = [
-    '4af76d706a788af081c4a6ce566b6d4e33d75629ce9a8a7b8eec1760eff13168',  # v1.1.0-rc1.tar.gz
+    {'v1.2.0-rc1.tar.gz': 'ae6384eecec5054e2c5e960dfc03c083f5f98afaed17276a306c6fe27db4354b'},
 ]
 
 builddependencies = [
-- 
GitLab