diff --git a/Golden_Repo/h/hwloc/hwloc-2.7.1-GCCcore-11.3.0.eb b/Golden_Repo/h/hwloc/hwloc-2.7.1-GCCcore-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..b0427412f810a96ebe7e4bad1383ecab827d5bf7
--- /dev/null
+++ b/Golden_Repo/h/hwloc/hwloc-2.7.1-GCCcore-11.3.0.eb
@@ -0,0 +1,45 @@
+easyblock = 'ConfigureMake'
+
+name = 'hwloc'
+version = '2.7.1'
+
+homepage = 'https://www.open-mpi.org/projects/hwloc/'
+
+description = """
+ The Portable Hardware Locality (hwloc) software package provides a portable
+ abstraction (across OS, versions, architectures, ...) of the hierarchical
+ topology of modern architectures, including NUMA memory nodes, sockets, shared
+ caches, cores and simultaneous multithreading. It also gathers various system
+ attributes such as cache and memory information as well as the locality of I/O
+ devices such as network interfaces, InfiniBand HCAs or GPUs. It primarily
+ aims at helping applications with gathering information about modern computing
+ hardware so as to exploit it accordingly and efficiently.
+"""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+
+source_urls = ['https://www.open-mpi.org/software/hwloc/v%(version_major_minor)s/downloads/']
+sources = [SOURCE_TAR_GZ]
+checksums = ['4cb0a781ed980b03ad8c48beb57407aa67c4b908e45722954b9730379bc7f6d5']
+
+builddependencies = [
+    ('binutils', '2.38'),
+]
+
+dependencies = [
+    ('numactl', '2.0.15', '', SYSTEM),
+    ('libxml2', '2.9.13'),
+    ('libpciaccess', '0.16'),
+]
+
+configopts = "--enable-libnuma=$EBROOTNUMACTL "
+configopts += "--disable-cairo --disable-opencl --disable-cuda --disable-nvml --disable-gl --disable-libudev "
+
+sanity_check_paths = {
+    'files': ['bin/lstopo', 'include/hwloc/linux.h',
+              'lib/libhwloc.%s' % SHLIB_EXT],
+    'dirs': ['share/man/man3'],
+}
+sanity_check_commands = ['lstopo']
+
+moduleclass = 'system'
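Note: the "--enable-libnuma=$EBROOTNUMACTL" option above relies on EasyBuild's convention that the module file of every dependency exports an EBROOT<NAME> environment variable pointing at its installation prefix. A minimal sketch of the substitution that happens at configure time (the fallback path is purely illustrative, not a real installation):

    import os

    # EasyBuild-generated modules export EBROOT<NAME> for each loaded dependency
    ebroot_numactl = os.environ.get('EBROOTNUMACTL', '/opt/easybuild/software/numactl/2.0.15')
    print('./configure --enable-libnuma=%s' % ebroot_numactl)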
+""" + +toolchain = {'name': 'GCCcore', 'version': '11.3.0'} +toolchainopts = {'pic': True} + +source_urls = ['https://github.com/%(name)s/%(name)s/releases/download/release-%(version)s-stable/'] +sources = ['%(name)s-%(version)s-stable.tar.gz'] +checksums = ['92e6de1be9ec176428fd2367677e61ceffc2ee1cb119035037a27d346b0403bb'] + +builddependencies = [ + ('binutils', '2.38'), + ('pkgconf', '1.8.0'), +] + +dependencies = [ + ('zlib', '1.2.12'), + ('OpenSSL', '1.1', '', True), +] + +sanity_check_paths = { + 'files': ['bin/event_rpcgen.py', 'include/event.h', 'include/event2/event.h', + 'lib/libevent_core.%s' % SHLIB_EXT, 'lib/pkgconfig/libevent.pc'], + 'dirs': [], +} + +moduleclass = 'lib' diff --git a/Golden_Repo/l/libpciaccess/libpciaccess-0.16-GCCcore-11.3.0.eb b/Golden_Repo/l/libpciaccess/libpciaccess-0.16-GCCcore-11.3.0.eb new file mode 100644 index 0000000000000000000000000000000000000000..1f03c09bfad07f06d203d6544f62f2bf00c1307e --- /dev/null +++ b/Golden_Repo/l/libpciaccess/libpciaccess-0.16-GCCcore-11.3.0.eb @@ -0,0 +1,26 @@ +easyblock = 'ConfigureMake' + +name = 'libpciaccess' +version = '0.16' + +homepage = 'https://cgit.freedesktop.org/xorg/lib/libpciaccess/' +description = """Generic PCI access library.""" + +toolchain = {'name': 'GCCcore', 'version': '11.3.0'} + +source_urls = ['https://www.x.org/releases/individual/lib/'] +sources = [SOURCE_TAR_GZ] +checksums = ['84413553994aef0070cf420050aa5c0a51b1956b404920e21b81e96db6a61a27'] + +builddependencies = [ + ('binutils', '2.38'), + ('Autotools', '20220317'), + ('xorg-macros', '1.19.3'), +] + +sanity_check_paths = { + 'files': ['include/pciaccess.h', 'lib/libpciaccess.a'], + 'dirs': ['lib/pkgconfig'], +} + +moduleclass = 'system' diff --git a/Golden_Repo/n/NCCL/NCCL-2.14.3-1-GCCcore-11.3.0-CUDA-11.7.eb b/Golden_Repo/n/NCCL/NCCL-2.14.3-1-GCCcore-11.3.0-CUDA-11.7.eb new file mode 100644 index 0000000000000000000000000000000000000000..5c27b17ca75f06764ed5adc098fce6d8c29873a3 --- /dev/null +++ b/Golden_Repo/n/NCCL/NCCL-2.14.3-1-GCCcore-11.3.0-CUDA-11.7.eb @@ -0,0 +1,30 @@ +name = 'NCCL' +version = '2.14.3-1' +versionsuffix = '-CUDA-%(cudashortver)s' + +homepage = 'https://developer.nvidia.com/nccl' +description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective +communication primitives that are performance optimized for NVIDIA GPUs.""" + +toolchain = {'name': 'GCCcore', 'version': '11.3.0'} + +github_account = 'NVIDIA' +sources = [{ + 'filename': '%(name)s-%(version)s.tar.gz', + 'git_config': { + 'url': 'https://github.com/NVIDIA/', + 'repo_name': 'nccl', + 'tag': 'v%(version)s', + 'recursive': True, + }, +}] +checksums = ['a05e153f0508e05be76e19b8262c2ea4f8996aedaaa873dcd241e061202422a6'] + +builddependencies = [('binutils', '2.38')] + +dependencies = [ + ('CUDA', '11.7', '', True), + ('UCX', 'default', '', SYSTEM), +] + +moduleclass = 'lib' diff --git a/Golden_Repo/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch b/Golden_Repo/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch new file mode 100644 index 0000000000000000000000000000000000000000..8c1a1f8ec353b5e35b8ee3912b4ecb0373de0c94 --- /dev/null +++ b/Golden_Repo/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch @@ -0,0 +1,456 @@ +If Open MPI is built with support for CUDA there's a small +(up to 10%) performance penalty for small messages due to overhead +in the datatype memory copy functionality. + +This eliminates most of this overhead as follows: +1. 
diff --git a/Golden_Repo/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch b/Golden_Repo/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch
new file mode 100644
index 0000000000000000000000000000000000000000..8c1a1f8ec353b5e35b8ee3912b4ecb0373de0c94
--- /dev/null
+++ b/Golden_Repo/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch
@@ -0,0 +1,456 @@
+If Open MPI is built with support for CUDA, there is a small
+(up to 10%) performance penalty for small messages due to overhead
+in the datatype memory copy functionality.
+
+This eliminates most of this overhead as follows:
+1. Separate compilation of CUDA code paths in pack/unpack routines
+   instead of runtime checks in inner loops, similar to the existing
+   checksum functionality.
+2. Expose the opal_cuda_enabled variable so it can be checked directly
+   in opal_datatype_copy_content_same_ddt() instead of calling a
+   function.
+3. Eliminate the cbmemcpy function pointer, as it always points to
+   opal_cuda_memcpy() and a direct call is cheaper.
+
+Signed-off-by: Bart Oldeman <bart.oldeman@calculquebec.ca>
+
+diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am
+index daaaa8e4b0..ef2da1cd81 100644
+--- a/opal/datatype/Makefile.am
++++ b/opal/datatype/Makefile.am
+@@ -44,6 +44,11 @@ noinst_LTLIBRARIES = \
+ # these sources will be compiled with the special -D
+ libdatatype_reliable_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c
+ libdatatype_reliable_la_CFLAGS = -DCHECKSUM $(AM_CFLAGS)
++if OPAL_cuda_support
++libdatatype_gpu_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c
++libdatatype_gpu_la_CFLAGS = -DOPAL_DATATYPE_PACK_UNPACK_GPU $(AM_CFLAGS)
++noinst_LTLIBRARIES += libdatatype_gpu.la
++endif
+
+ # these sources will be compiled with the normal CFLAGS only
+ libdatatype_la_SOURCES = \
+@@ -69,6 +74,9 @@ libdatatype_la_SOURCES = \
+     opal_datatype_unpack.c
+
+ libdatatype_la_LIBADD = libdatatype_reliable.la
++if OPAL_cuda_support
++libdatatype_la_LIBADD += libdatatype_gpu.la
++endif
+
+ # Conditionally install the header files
+ if WANT_INSTALL_HEADERS
+diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c
+index 3931d99d17..33aebe2612 100644
+--- a/opal/datatype/opal_convertor.c
++++ b/opal/datatype/opal_convertor.c
+@@ -40,8 +40,6 @@
+ #include "opal/datatype/opal_convertor_internal.h"
+ #if OPAL_CUDA_SUPPORT
+ #include "opal/datatype/opal_datatype_cuda.h"
+-#define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
+-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
+ #endif
+
+ static void opal_convertor_construct( opal_convertor_t* convertor )
+@@ -51,9 +49,6 @@ static void opal_convertor_construct( opal_convertor_t* convertor )
+     convertor->partial_length = 0;
+     convertor->remoteArch = opal_local_arch;
+     convertor->flags = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
+-#if OPAL_CUDA_SUPPORT
+-    convertor->cbmemcpy = &opal_cuda_memcpy;
+-#endif
+ }
+
+
+@@ -241,11 +236,7 @@ int32_t opal_convertor_pack( opal_convertor_t* pConv,
+             if( OPAL_LIKELY(NULL == iov[i].iov_base) )
+                 iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
+             else
+-#if OPAL_CUDA_SUPPORT
+-                MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
+-#else
+                 MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
+-#endif
+             pending_length -= iov[i].iov_len;
+             base_pointer += iov[i].iov_len;
+         }
+@@ -258,11 +249,7 @@ complete_contiguous_data_pack:
+     iov[i].iov_len = pending_length;
+     if( OPAL_LIKELY(NULL == iov[i].iov_base) )
+         iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
+     else
+-#if OPAL_CUDA_SUPPORT
+-        MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
+-#else
+         MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
+-#endif
+     pConv->bConverted = pConv->local_size;
+     *out_size = i + 1;
+     pConv->flags |= CONVERTOR_COMPLETED;
+@@ -296,11 +283,7 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv,
+             if( iov[i].iov_len >= pending_length ) {
+                 goto complete_contiguous_data_unpack;
+             }
+-#if OPAL_CUDA_SUPPORT
+-            MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv );
+-#else
+             MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
+-#endif
+             pending_length -= iov[i].iov_len;
+             base_pointer += iov[i].iov_len;
+         }
+@@ -310,11 +293,7 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv,
+
+ complete_contiguous_data_unpack:
+     iov[i].iov_len = pending_length;
+-#if OPAL_CUDA_SUPPORT
+-    MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv );
+-#else
+     MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
+-#endif
+     pConv->bConverted = pConv->local_size;
+     *out_size = i + 1;
+     pConv->flags |= CONVERTOR_COMPLETED;
+@@ -530,7 +509,7 @@ size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
+ \
+     convertor->remote_size = convertor->local_size; \
+     if( OPAL_LIKELY(convertor->remoteArch == opal_local_arch) ) { \
+-        if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) && \
++        if( !(convertor->flags & (CONVERTOR_WITH_CHECKSUM | CONVERTOR_CUDA)) && \
+             ((convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) || \
+              ((convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && (1 == count))) ) { \
+             return OPAL_SUCCESS; \
+@@ -541,8 +520,8 @@ size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
+     opal_convertor_compute_remote_size( convertor ); \
+     assert( NULL != convertor->use_desc->desc ); \
+     /* For predefined datatypes (contiguous) do nothing more */ \
+-    /* if checksum is enabled then always continue */ \
+-    if( ((convertor->flags & (CONVERTOR_WITH_CHECKSUM | OPAL_DATATYPE_FLAG_NO_GAPS)) \
++    /* if checksum or cuda is enabled then always continue */ \
++    if( ((convertor->flags & (CONVERTOR_WITH_CHECKSUM | CONVERTOR_CUDA | OPAL_DATATYPE_FLAG_NO_GAPS)) \
+         == OPAL_DATATYPE_FLAG_NO_GAPS) && \
+         ((convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) == \
+          (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) { \
+@@ -592,7 +571,19 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
+             }
+         }
+     } else
+-#endif /* defined(CHECKSUM) */
++#elif OPAL_CUDA_SUPPORT
++    if (OPAL_UNLIKELY(convertor->flags & CONVERTOR_CUDA)) {
++        if (OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS))) {
++            convertor->fAdvance = opal_unpack_general_gpu;
++        } else {
++            if (convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) {
++                convertor->fAdvance = opal_unpack_homogeneous_contig_gpu;
++            } else {
++                convertor->fAdvance = opal_generic_simple_unpack_gpu;
++            }
++        }
++    } else
++#endif /* defined(CHECKSUM) || OPAL_CUDA_SUPPORT */
+     if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) {
+         convertor->fAdvance = opal_unpack_general;
+     } else {
+@@ -636,7 +627,25 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
+             }
+         }
+     } else
+-#endif /* defined(CHECKSUM) */
++#elif OPAL_CUDA_SUPPORT
++    if (OPAL_UNLIKELY(convertor->flags & CONVERTOR_CUDA)) {
++        if (CONVERTOR_SEND_CONVERSION
++            == (convertor->flags & (CONVERTOR_SEND_CONVERSION | CONVERTOR_HOMOGENEOUS))) {
++            convertor->fAdvance = opal_pack_general_gpu;
++        } else {
++            if (datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) {
++                if (((datatype->ub - datatype->lb) == (ptrdiff_t) datatype->size)
++                    || (1 >= convertor->count)) {
++                    convertor->fAdvance = opal_pack_homogeneous_contig_gpu;
++                } else {
++                    convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_gpu;
++                }
++            } else {
++                convertor->fAdvance = opal_generic_simple_pack_gpu;
++            }
++        }
++    } else
++#endif /* defined(CHECKSUM) || OPAL_CUDA_SUPPORT */
+     if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) {
+         convertor->fAdvance = opal_pack_general;
+     } else {
+@@ -694,9 +703,6 @@ int opal_convertor_clone( const opal_convertor_t* source,
+         destination->bConverted = source->bConverted;
+         destination->stack_pos = source->stack_pos;
+     }
+-#if OPAL_CUDA_SUPPORT
+-    destination->cbmemcpy = source->cbmemcpy;
+-#endif
+     return OPAL_SUCCESS;
+ }
+
+diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h
+index b24d94c37b..53b6f0d526 100644
+--- a/opal/datatype/opal_convertor.h
++++ b/opal/datatype/opal_convertor.h
+@@ -118,7 +118,6 @@ struct opal_convertor_t {
+     dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */
+
+ #if OPAL_CUDA_SUPPORT
+-    memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */
+     void * stream; /**< CUstream for async copy */
+ #endif
+ };
+diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c
+index c70bdd24df..d7c10af3dc 100644
+--- a/opal/datatype/opal_datatype_copy.c
++++ b/opal/datatype/opal_datatype_copy.c
+@@ -86,14 +86,6 @@ static size_t opal_datatype_memop_block_size = 128 * 1024;
+ #define MEM_OP opal_cuda_memmove
+ #include "opal_datatype_copy.h"
+
+-#define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function) \
+-    do { \
+-        if (true == cuda_device_bufs) { \
+-            fct = copy_function; \
+-        } \
+-    } while(0)
+-#else
+-#define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function)
+ #endif
+
+ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, int32_t count,
+@@ -102,10 +94,6 @@ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, in
+     ptrdiff_t extent;
+     int32_t (*fct)( const opal_datatype_t*, int32_t, char*, char*);
+
+-#if OPAL_CUDA_SUPPORT
+-    bool cuda_device_bufs = opal_cuda_check_bufs(destination_base, source_base);
+-#endif
+-
+     DO_DEBUG( opal_output( 0, "opal_datatype_copy_content_same_ddt( %p, %d, dst %p, src %p )\n",
+                            (void*)datatype, count, (void*)destination_base, (void*)source_base ); );
+
+@@ -122,20 +110,25 @@ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, in
+     extent = (datatype->true_ub - datatype->true_lb) + (count - 1) * (datatype->ub - datatype->lb);
+
+     fct = non_overlap_copy_content_same_ddt;
+-    SET_CUDA_COPY_FCT(cuda_device_bufs, fct, non_overlap_cuda_copy_content_same_ddt);
+     if( destination_base < source_base ) {
+         if( (destination_base + extent) > source_base ) {
+             /* memmove */
+             fct = overlap_copy_content_same_ddt;
+-            SET_CUDA_COPY_FCT(cuda_device_bufs, fct, overlap_cuda_copy_content_same_ddt);
+         }
+     } else {
+         if( (source_base + extent) > destination_base ) {
+             /* memmove */
+             fct = overlap_copy_content_same_ddt;
+-            SET_CUDA_COPY_FCT(cuda_device_bufs, fct, overlap_cuda_copy_content_same_ddt);
+         }
+     }
++
++#if OPAL_CUDA_SUPPORT
++    if (OPAL_UNLIKELY(opal_cuda_enabled) && opal_cuda_check_bufs(destination_base, source_base)) {
++        fct = (fct == non_overlap_copy_content_same_ddt ?
++               non_overlap_cuda_copy_content_same_ddt : overlap_cuda_copy_content_same_ddt);
++    }
++#endif
++
+     return fct( datatype, count, destination_base, source_base );
+ }
+
+diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c
+index 7869f17e90..f3b3cef8da 100644
+--- a/opal/datatype/opal_datatype_cuda.c
++++ b/opal/datatype/opal_datatype_cuda.c
+@@ -20,7 +20,7 @@
+
+ static bool initialized = false;
+ int opal_cuda_verbose = 0;
+-static int opal_cuda_enabled = 0; /* Starts out disabled */
++int opal_cuda_enabled = 1; /* Starts out enabled */
+ static int opal_cuda_output = 0;
+ static void opal_cuda_support_init(void);
+ static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL;
+@@ -48,10 +48,6 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
+         opal_cuda_support_init();
+     }
+
+-    /* This is needed to handle case where convertor is not fully initialized
+-     * like when trying to do a sendi with convertor on the statck */
+-    convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy;
+-
+     /* If not enabled, then nothing else to do */
+     if (!opal_cuda_enabled) {
+         return;
+@@ -192,6 +188,7 @@ static void opal_cuda_support_init(void)
+
+     /* Callback into the common cuda initialization routine. This is only
+      * set if some work had been done already in the common cuda code.*/
++    opal_cuda_enabled = 0;
+     if (NULL != common_cuda_initialization_function) {
+         if (0 == common_cuda_initialization_function(&ftable)) {
+             opal_cuda_enabled = 1;
+diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h
+index 2789320520..d512e24550 100644
+--- a/opal/datatype/opal_datatype_cuda.h
++++ b/opal/datatype/opal_datatype_cuda.h
+@@ -30,4 +30,6 @@ void* opal_cuda_memmove(void * dest, void * src, size_t size);
+ void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *));
+ void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream);
+
++extern int opal_cuda_enabled;
++
+ #endif
+diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c
+index b4e03a9bea..f42e292e0b 100644
+--- a/opal/datatype/opal_datatype_pack.c
++++ b/opal/datatype/opal_datatype_pack.c
+@@ -45,6 +45,11 @@
+ #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_checksum
+ #define opal_generic_simple_pack_function opal_generic_simple_pack_checksum
+ #define opal_pack_general_function opal_pack_general_checksum
++#elif defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
++#define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_gpu
++#define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_gpu
++#define opal_generic_simple_pack_function opal_generic_simple_pack_gpu
++#define opal_pack_general_function opal_pack_general_gpu
+ #else
+ #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig
+ #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps
+diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h
+index 2a2e79180d..7fbf0c88e2 100644
+--- a/opal/datatype/opal_datatype_pack.h
++++ b/opal/datatype/opal_datatype_pack.h
+@@ -19,11 +19,12 @@
+
+ #include "opal_config.h"
+
+-#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
+ /* Make use of existing macro to do CUDA style memcpy */
++#include "opal_datatype_cuda.h"
+ #undef MEMCPY_CSUM
+ #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
+-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
++    opal_cuda_memcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
+ #endif
+
+ /**
+diff --git a/opal/datatype/opal_datatype_prototypes.h b/opal/datatype/opal_datatype_prototypes.h
+index 668397112b..111f74f2a4 100644
+--- a/opal/datatype/opal_datatype_prototypes.h
++++ b/opal/datatype/opal_datatype_prototypes.h
+@@ -39,6 +39,16 @@ OPAL_DECLSPEC int32_t
+ opal_unpack_general_checksum( opal_convertor_t* pConvertor,
+                               struct iovec* iov, uint32_t* out_size,
+                               size_t* max_data );
++#if OPAL_CUDA_SUPPORT
++OPAL_DECLSPEC int32_t
++opal_pack_general_gpu( opal_convertor_t* pConvertor,
++                       struct iovec* iov, uint32_t* out_size,
++                       size_t* max_data );
++OPAL_DECLSPEC int32_t
++opal_unpack_general_gpu( opal_convertor_t* pConvertor,
++                         struct iovec* iov, uint32_t* out_size,
++                         size_t* max_data );
++#endif
+
+ /*
+  * Now the internal functions
+@@ -83,6 +93,28 @@ int32_t
+ opal_generic_simple_unpack_checksum( opal_convertor_t* pConvertor,
+                                      struct iovec* iov, uint32_t* out_size,
+                                      size_t* max_data );
++#if OPAL_CUDA_SUPPORT
++int32_t
++opal_pack_homogeneous_contig_gpu( opal_convertor_t* pConv,
++                                  struct iovec* iov, uint32_t* out_size,
++                                  size_t* max_data );
++int32_t
++opal_pack_homogeneous_contig_with_gaps_gpu( opal_convertor_t* pConv,
++                                            struct iovec* iov, uint32_t* out_size,
++                                            size_t* max_data );
++int32_t
++opal_generic_simple_pack_gpu( opal_convertor_t* pConvertor,
++                              struct iovec* iov, uint32_t* out_size,
++                              size_t* max_data );
++int32_t
++opal_unpack_homogeneous_contig_gpu( opal_convertor_t* pConv,
++                                    struct iovec* iov, uint32_t* out_size,
++                                    size_t* max_data );
++int32_t
++opal_generic_simple_unpack_gpu( opal_convertor_t* pConvertor,
++                                struct iovec* iov, uint32_t* out_size,
++                                size_t* max_data );
++#endif
+
+ END_C_DECLS
+
+diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c
+index 26a5810dc0..668b6624aa 100644
+--- a/opal/datatype/opal_datatype_unpack.c
++++ b/opal/datatype/opal_datatype_unpack.c
+@@ -46,6 +46,10 @@
+ #define opal_unpack_general_function opal_unpack_general_checksum
+ #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_checksum
+ #define opal_generic_simple_unpack_function opal_generic_simple_unpack_checksum
++#elif defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
++#define opal_unpack_general_function opal_unpack_general_gpu
++#define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_gpu
++#define opal_generic_simple_unpack_function opal_generic_simple_unpack_gpu
+ #else
+ #define opal_unpack_general_function opal_unpack_general
+ #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig
+@@ -204,10 +208,10 @@ opal_unpack_partial_predefined(opal_convertor_t *pConvertor, const dt_elem_desc_
+     MEMCPY( temporary + start_position, partial_data, length );
+
+     /* Save the original content of the user memory */
+-#if OPAL_CUDA_SUPPORT
++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
+     /* In the case where the data is being unpacked from device memory, need to
+      * use the special host to device memory copy. */
+-    pConvertor->cbmemcpy(saved_data, user_data, data_length, pConvertor );
++    opal_cuda_memcpy(saved_data, user_data, data_length, pConvertor );
+ #else
+     MEMCPY( saved_data, user_data, data_length );
+ #endif
+@@ -222,15 +226,15 @@
+
+     /* Rebuild the data by pulling back the unmodified bytes from the original
+      * content in the user memory. */
+-#if OPAL_CUDA_SUPPORT
++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
+     /* Need to copy the modified user_data again so we can see which
+      * bytes need to be converted back to their original values. */
+     {
+         char resaved_data[16];
+-        pConvertor->cbmemcpy(resaved_data, user_data, data_length, pConvertor );
++        opal_cuda_memcpy(resaved_data, user_data, data_length, pConvertor );
+         for(size_t i = 0; i < data_length; i++ ) {
+             if( unused_byte == resaved_data[i] )
+-                pConvertor->cbmemcpy(&user_data[i], &saved_data[i], 1, pConvertor);
++                opal_cuda_memcpy(&user_data[i], &saved_data[i], 1, pConvertor);
+         }
+     }
+ #else
+diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h
+index 33db837882..4159a475fc 100644
+--- a/opal/datatype/opal_datatype_unpack.h
++++ b/opal/datatype/opal_datatype_unpack.h
+@@ -19,11 +19,12 @@
+
+ #include "opal_config.h"
+
+-#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
+ /* Make use of existing macro to do CUDA style memcpy */
++#include "opal_datatype_cuda.h"
+ #undef MEMCPY_CSUM
+ #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
+-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
++    opal_cuda_memcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
+ #endif
+
+ /**
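Note: the OpenMPI easyconfig below pins this patch by its sha256 checksum, so any edit to the patch file, including its prose header, requires refreshing that entry. A small stdlib-only Python helper to recompute it:

    import hashlib

    def sha256sum(path, bufsize=1 << 20):
        """Return the hex sha256 digest of a file, read in 1 MiB chunks."""
        digest = hashlib.sha256()
        with open(path, 'rb') as f:
            while chunk := f.read(bufsize):
                digest.update(chunk)
        return digest.hexdigest()

    print(sha256sum('OpenMPI-4.1.1_opal-datatype-cuda-performance.patch'))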
diff --git a/Golden_Repo/o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb b/Golden_Repo/o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..4a007d5cc467e2b2dc1d0b101d94e00ac1de18c0
--- /dev/null
+++ b/Golden_Repo/o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb
@@ -0,0 +1,67 @@
+name = 'OpenMPI'
+version = '4.1.4'
+
+homepage = 'https://www.open-mpi.org/'
+description = """The Open MPI Project is an open source MPI-3 implementation."""
+
+toolchain = {'name': 'GCC', 'version': '11.3.0'}
+toolchainopts = {'pic': True}
+
+source_urls = ['https://www.open-mpi.org/software/ompi/v%(version_major_minor)s/downloads']
+sources = [SOURCELOWER_TAR_BZ2]
+patches = [
+    'OpenMPI-4.1.1_opal-datatype-cuda-performance.patch',
+]
+checksums = [
+    '92912e175fd1234368c8730c03f4996fe5942e7479bb1d10059405e7f2b3930d',  # openmpi-4.1.4.tar.bz2
+    # OpenMPI-4.1.1_opal-datatype-cuda-performance.patch
+    'b767c7166cf0b32906132d58de5439c735193c9fd09ec3c5c11db8d5fa68750e',
+]
+
+osdependencies = [
+    # needed for --with-verbs
+    ('libibverbs-dev', 'libibverbs-devel', 'rdma-core-devel'),
+    # needed for --with-pmix
+    ('pmix-devel'),
+]
+
+builddependencies = [
+    ('pkgconf', '1.8.0'),
+    ('Perl', '5.34.1'),
+    ('Autotools', '20220317'),
+]
+
+dependencies = [
+    ('zlib', '1.2.12'),
+    ('hwloc', '2.7.1'),
+    ('UCX', 'default', '', SYSTEM),
+    ('CUDA', '11.7', '', SYSTEM),
+    ('libevent', '2.1.12'),
+    ('PMIx', '3.2.3'),  # We rely on this version since it is the newest supported by psmgmt
+    ('UCC', 'default'),
+]
+
+# Regenerate configure and the Makefile.ins to pick up the build-system changes
+# from the datatype patch above, by running a subset of autogen.pl sufficient
+# to achieve this without doing the full, long-running regeneration.
+preconfigopts = ' && '.join([
+    'cd config',
+    'autom4te --language=m4sh opal_get_version.m4sh -o opal_get_version.sh',
+    'cd ..',
+    'autoconf',
+    'autoheader',
+    'aclocal',
+    'automake',
+    ''
+])
+
+configopts = '--without-orte '
+configopts += '--without-psm2 '
+configopts += '--disable-oshmem '
+configopts += '--with-ime=/opt/ddn/ime '
+configopts += '--with-gpfs '
+
+# to enable SLURM integration (site-specific)
+configopts += '--with-slurm --with-pmix=external --with-libevent=external --with-ompi-pmix-rte'
+
+moduleclass = 'mpi'
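Note: the trailing empty string in the ' && '.join(...) above is deliberate: it makes the joined command end in ' && ', so EasyBuild's configure command chains directly onto the preconfigure steps. Reproducing the expression verbatim shows the resulting shell prefix:

    preconfigopts = ' && '.join([
        'cd config',
        'autom4te --language=m4sh opal_get_version.m4sh -o opal_get_version.sh',
        'cd ..',
        'autoconf',
        'autoheader',
        'aclocal',
        'automake',
        ''
    ])
    print(preconfigopts)
    # cd config && autom4te --language=m4sh opal_get_version.m4sh -o opal_get_version.sh
    # && cd .. && autoconf && autoheader && aclocal && automake &&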
diff --git a/Golden_Repo/p/PMIx/PMIx-3.2.3-GCCcore-11.3.0.eb b/Golden_Repo/p/PMIx/PMIx-3.2.3-GCCcore-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..953711ddd2cd2583c5834a759e53234acf6753c7
--- /dev/null
+++ b/Golden_Repo/p/PMIx/PMIx-3.2.3-GCCcore-11.3.0.eb
@@ -0,0 +1,48 @@
+##
+# Author: Robert Mijakovic <robert.mijakovic@lxp.lu>
+##
+easyblock = 'ConfigureMake'
+
+name = 'PMIx'
+version = '3.2.3'
+
+homepage = 'https://pmix.org/'
+description = """Process Management for Exascale Environments
+PMI Exascale (PMIx) represents an attempt to
+provide an extended version of the PMI standard specifically designed
+to support clusters up to and including exascale sizes. The overall
+objective of the project is not to branch the existing pseudo-standard
+definitions - in fact, PMIx fully supports both of the existing PMI-1
+and PMI-2 APIs - but rather to (a) augment and extend those APIs to
+eliminate some current restrictions that impact scalability, and (b)
+provide a reference implementation of the PMI-server that demonstrates
+the desired level of scalability.
+"""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+toolchainopts = {'pic': True}
+
+source_urls = ['https://github.com/openpmix/openpmix/releases/download/v%(version)s']
+sources = ['%(namelower)s-%(version)s.tar.bz2']
+checksums = ['9b835f23c2f94a193c14012ee68b3657a61c568598cdd1212a3716b32d41a135']
+
+builddependencies = [('binutils', '2.38')]
+
+dependencies = [
+    ('libevent', '2.1.12'),
+    ('zlib', '1.2.12'),
+    ('hwloc', '2.7.1'),
+]
+
+configopts = ' --with-libevent=$EBROOTLIBEVENT --with-zlib=$EBROOTZLIB'
+configopts += ' --with-hwloc=$EBROOTHWLOC'
+configopts += ' --enable-pmix-binaries'
+
+buildopts = 'V=1'
+
+sanity_check_paths = {
+    'files': ['bin/pevent', 'bin/plookup', 'bin/pmix_info', 'bin/pps'],
+    'dirs': ['etc', 'include', 'lib', 'share']
+}
+
+moduleclass = 'lib'
diff --git a/Golden_Repo/u/UCC/NCCL-2.14.3-1.tar.gz b/Golden_Repo/u/UCC/NCCL-2.14.3-1.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..3400371cf58a722255fc02131699649f13f0a0b3
Binary files /dev/null and b/Golden_Repo/u/UCC/NCCL-2.14.3-1.tar.gz differ
diff --git a/Golden_Repo/u/UCC/UCC-default-GCCcore-11.3.0.eb b/Golden_Repo/u/UCC/UCC-default-GCCcore-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..308921f966b85ac070c9bf0caab2f38bce48ca14
--- /dev/null
+++ b/Golden_Repo/u/UCC/UCC-default-GCCcore-11.3.0.eb
@@ -0,0 +1,56 @@
+# Figure out the correct GPU arch here: easyconfig templates can't be used,
+# since some systems have more than one compute capability.
+# The 'local_' prefix is to appease the easyconfig checker.
+import os as local_os
+
+easyblock = 'ConfigureMake'
+
+name = 'UCC'
+version = 'default'
+local_version = '1.1.0-rc1'
+
+homepage = 'https://www.openucx.org/'
+description = """UCC (Unified Collective Communication) is a collective
+communication operations API and library that is flexible, complete, and
+feature-rich for current and emerging programming models and runtimes.
+"""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+toolchainopts = {'pic': True}
+
+source_urls = ['https://github.com/openucx/ucc/archive/refs/tags']
+sources = [f'v{local_version}.tar.gz']
+checksums = [
+    '4af76d706a788af081c4a6ce566b6d4e33d75629ce9a8a7b8eec1760eff13168',  # v1.1.0-rc1.tar.gz
+]
+
+builddependencies = [
+    ('binutils', '2.38'),
+    ('Autotools', '20220317'),
+]
+
+dependencies = [
+    ('UCX', 'default', '', SYSTEM),
+    ('CUDA', '11.7', '', SYSTEM),
+    ('NCCL', '2.14.3-1', '-CUDA-%(cudashortver)s'),
+]
+
+preconfigopts = "./autogen.sh && "
+
+local_gpu_cc = local_os.environ["EASYBUILD_CUDA_COMPUTE_CAPABILITIES"].split(',')[0].replace('.', '')
+
+configopts = "--enable-optimizations "
+configopts += f"--with-nvcc-gencode=-gencode=arch=compute_{local_gpu_cc},code=sm_{local_gpu_cc} "
+configopts += "--with-ucx=$EBROOTUCX "
+configopts += "--with-cuda=$EBROOTCUDA "
+configopts += "--with-nccl=$EBROOTNCCL "
+configopts += "--with-sharp=/opt/mellanox/sharp "
+
+sanity_check_paths = {
+    'files': ['bin/ucc_info'],
+    'dirs': ['include', 'lib']
+}
+
+sanity_check_commands = ["ucc_info -c"]
+
+moduleclass = 'lib'
diff --git a/Golden_Repo/x/xorg-macros/xorg-macros-1.19.3-GCCcore-11.3.0.eb b/Golden_Repo/x/xorg-macros/xorg-macros-1.19.3-GCCcore-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..5858842fac29bab516a05f03ec8900415f48c541
--- /dev/null
+++ b/Golden_Repo/x/xorg-macros/xorg-macros-1.19.3-GCCcore-11.3.0.eb
@@ -0,0 +1,27 @@
+easyblock = 'ConfigureMake'
+
+name = 'xorg-macros'
+version = '1.19.3'
+
+homepage = 'https://cgit.freedesktop.org/xorg/util/macros'
+description = """X.org macros utilities."""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+
+source_urls = ['https://gitlab.freedesktop.org/xorg/util/macros/-/archive/util-macros-%(version)s']
+sources = ['macros-util-macros-%(version)s.tar.gz']
+checksums = ['8205d210a580da0938f5ce4392a96b60cf1d9a5f792eaa1474fa4c1977aef4d0']
+
+builddependencies = [
+    ('binutils', '2.38'),
+    ('Autotools', '20220317'),
+]
+
+preconfigopts = './autogen.sh && '
+
+sanity_check_paths = {
+    'files': ['share/pkgconfig/xorg-macros.pc'],
+    'dirs': [],
+}
+
+moduleclass = 'devel'
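Note: the UCC easyconfig above derives the nvcc gencode flag from the first entry of $EASYBUILD_CUDA_COMPUTE_CAPABILITIES. A worked example of that parsing, with an assumed (illustrative) value of the variable:

    import os

    # illustrative value; on a real system this comes from EasyBuild's configuration
    os.environ.setdefault('EASYBUILD_CUDA_COMPUTE_CAPABILITIES', '8.0,9.0')
    gpu_cc = os.environ['EASYBUILD_CUDA_COMPUTE_CAPABILITIES'].split(',')[0].replace('.', '')
    print(f'--with-nvcc-gencode=-gencode=arch=compute_{gpu_cc},code=sm_{gpu_cc}')
    # --with-nvcc-gencode=-gencode=arch=compute_80,code=sm_80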