diff --git a/Golden_Repo/h/hwloc/hwloc-2.7.1-GCCcore-11.3.0.eb b/Golden_Repo/h/hwloc/hwloc-2.7.1-GCCcore-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..b0427412f810a96ebe7e4bad1383ecab827d5bf7
--- /dev/null
+++ b/Golden_Repo/h/hwloc/hwloc-2.7.1-GCCcore-11.3.0.eb
@@ -0,0 +1,45 @@
+easyblock = 'ConfigureMake'
+
+name = 'hwloc'
+version = '2.7.1'
+
+homepage = 'https://www.open-mpi.org/projects/hwloc/'
+
+description = """
+ The Portable Hardware Locality (hwloc) software package provides a portable
+ abstraction (across OS, versions, architectures, ...) of the hierarchical
+ topology of modern architectures, including NUMA memory nodes, sockets, shared
+ caches, cores and simultaneous multithreading. It also gathers various system
+ attributes such as cache and memory information as well as the locality of I/O
+ devices such as network interfaces, InfiniBand HCAs or GPUs. It primarily
+ aims at helping applications with gathering information about modern computing
+ hardware so as to exploit it accordingly and efficiently.
+"""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+
+source_urls = ['https://www.open-mpi.org/software/hwloc/v%(version_major_minor)s/downloads/']
+sources = [SOURCE_TAR_GZ]
+checksums = ['4cb0a781ed980b03ad8c48beb57407aa67c4b908e45722954b9730379bc7f6d5']
+
+builddependencies = [
+    ('binutils', '2.38'),
+]
+
+dependencies = [
+    ('numactl', '2.0.15', '', SYSTEM),
+    ('libxml2', '2.9.13'),
+    ('libpciaccess', '0.16'),
+]
+
+configopts = "--enable-libnuma=$EBROOTNUMACTL "
+configopts += "--disable-cairo --disable-opencl --disable-cuda --disable-nvml --disable-gl --disable-libudev "
+
+sanity_check_paths = {
+    'files': ['bin/lstopo', 'include/hwloc/linux.h',
+              'lib/libhwloc.%s' % SHLIB_EXT],
+    'dirs': ['share/man/man3'],
+}
+sanity_check_commands = ['lstopo']
+
+moduleclass = 'system'
diff --git a/Golden_Repo/l/libevent/libevent-2.1.12-GCCcore-11.3.0.eb b/Golden_Repo/l/libevent/libevent-2.1.12-GCCcore-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..09681e8d363257771e185eb3bff58c4b4abd9a50
--- /dev/null
+++ b/Golden_Repo/l/libevent/libevent-2.1.12-GCCcore-11.3.0.eb
@@ -0,0 +1,38 @@
+easyblock = 'ConfigureMake'
+
+name = 'libevent'
+version = '2.1.12'
+
+homepage = 'https://libevent.org/'
+
+description = """
+ The libevent API provides a mechanism to execute a callback function when
+ a specific event occurs on a file descriptor or after a timeout has been
+ reached.  Furthermore, libevent also support callbacks due to signals or
+ regular timeouts.
+"""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+toolchainopts = {'pic': True}
+
+source_urls = ['https://github.com/%(name)s/%(name)s/releases/download/release-%(version)s-stable/']
+sources = ['%(name)s-%(version)s-stable.tar.gz']
+checksums = ['92e6de1be9ec176428fd2367677e61ceffc2ee1cb119035037a27d346b0403bb']
+
+builddependencies = [
+    ('binutils', '2.38'),
+    ('pkgconf', '1.8.0'),
+]
+
+dependencies = [
+    ('zlib', '1.2.12'),
+    ('OpenSSL', '1.1', '', True),
+]
+
+sanity_check_paths = {
+    'files': ['bin/event_rpcgen.py', 'include/event.h', 'include/event2/event.h',
+              'lib/libevent_core.%s' % SHLIB_EXT, 'lib/pkgconfig/libevent.pc'],
+    'dirs': [],
+}
+
+moduleclass = 'lib'
diff --git a/Golden_Repo/l/libpciaccess/libpciaccess-0.16-GCCcore-11.3.0.eb b/Golden_Repo/l/libpciaccess/libpciaccess-0.16-GCCcore-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..1f03c09bfad07f06d203d6544f62f2bf00c1307e
--- /dev/null
+++ b/Golden_Repo/l/libpciaccess/libpciaccess-0.16-GCCcore-11.3.0.eb
@@ -0,0 +1,26 @@
+easyblock = 'ConfigureMake'
+
+name = 'libpciaccess'
+version = '0.16'
+
+homepage = 'https://cgit.freedesktop.org/xorg/lib/libpciaccess/'
+description = """Generic PCI access library."""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+
+source_urls = ['https://www.x.org/releases/individual/lib/']
+sources = [SOURCE_TAR_GZ]
+checksums = ['84413553994aef0070cf420050aa5c0a51b1956b404920e21b81e96db6a61a27']
+
+builddependencies = [
+    ('binutils', '2.38'),
+    ('Autotools', '20220317'),
+    ('xorg-macros', '1.19.3'),
+]
+
+sanity_check_paths = {
+    'files': ['include/pciaccess.h', 'lib/libpciaccess.a'],
+    'dirs': ['lib/pkgconfig'],
+}
+
+moduleclass = 'system'
diff --git a/Golden_Repo/n/NCCL/NCCL-2.14.3-1-GCCcore-11.3.0-CUDA-11.7.eb b/Golden_Repo/n/NCCL/NCCL-2.14.3-1-GCCcore-11.3.0-CUDA-11.7.eb
new file mode 100644
index 0000000000000000000000000000000000000000..5c27b17ca75f06764ed5adc098fce6d8c29873a3
--- /dev/null
+++ b/Golden_Repo/n/NCCL/NCCL-2.14.3-1-GCCcore-11.3.0-CUDA-11.7.eb
@@ -0,0 +1,30 @@
+name = 'NCCL'
+version = '2.14.3-1'
+versionsuffix = '-CUDA-%(cudashortver)s'
+
+homepage = 'https://developer.nvidia.com/nccl'
+description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
+communication primitives that are performance optimized for NVIDIA GPUs."""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+
+github_account = 'NVIDIA'
+sources = [{
+    'filename': '%(name)s-%(version)s.tar.gz',
+    'git_config': {
+        'url': 'https://github.com/NVIDIA/',
+        'repo_name': 'nccl',
+        'tag': 'v%(version)s',
+        'recursive': True,
+    },
+}]
+checksums = ['a05e153f0508e05be76e19b8262c2ea4f8996aedaaa873dcd241e061202422a6']
+
+builddependencies = [('binutils', '2.38')]
+
+dependencies = [
+    ('CUDA', '11.7', '', True),
+    ('UCX', 'default', '', SYSTEM),
+]
+
+moduleclass = 'lib'
diff --git a/Golden_Repo/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch b/Golden_Repo/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch
new file mode 100644
index 0000000000000000000000000000000000000000..8c1a1f8ec353b5e35b8ee3912b4ecb0373de0c94
--- /dev/null
+++ b/Golden_Repo/o/OpenMPI/OpenMPI-4.1.1_opal-datatype-cuda-performance.patch
@@ -0,0 +1,456 @@
+If Open MPI is built with support for CUDA there's a small
+(up to 10%) performance penalty for small messages due to overhead
+in the datatype memory copy functionality.
+
+This eliminates most of this overhead as follows:
+1. Seperate compilation of CUDA code paths in pack/unpack routines
+   instead of runtime checks in inner loops, similar to the existing
+   checksum functionality.
+2. Expose opal_cuda_enabled variable so it can be checked directly
+   in opal_datatype_copy_content_same_ddt() instead of calling a
+   function.
+3. Eliminate cbmemcpy function pointer as it always points to
+   opal_cuda_memcpy(), and a direct call is cheaper.
+
+Signed off by Bart Oldeman <bart.oldeman@calculquebec.ca>
+
+diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am
+index daaaa8e4b0..ef2da1cd81 100644
+--- a/opal/datatype/Makefile.am
++++ b/opal/datatype/Makefile.am
+@@ -44,6 +44,11 @@ noinst_LTLIBRARIES = \
+ # these sources will be compiled with the special -D
+ libdatatype_reliable_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c
+ libdatatype_reliable_la_CFLAGS = -DCHECKSUM $(AM_CFLAGS)
++if OPAL_cuda_support
++libdatatype_gpu_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c
++libdatatype_gpu_la_CFLAGS = -DOPAL_DATATYPE_PACK_UNPACK_GPU $(AM_CFLAGS)
++noinst_LTLIBRARIES += libdatatype_gpu.la
++endif
+ 
+ # these sources will be compiled with the normal CFLAGS only
+ libdatatype_la_SOURCES = \
+@@ -69,6 +74,9 @@ libdatatype_la_SOURCES = \
+         opal_datatype_unpack.c
+ 
+ libdatatype_la_LIBADD = libdatatype_reliable.la
++if OPAL_cuda_support
++libdatatype_la_LIBADD += libdatatype_gpu.la
++endif
+ 
+ # Conditionally install the header files
+ if WANT_INSTALL_HEADERS
+diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c
+index 3931d99d17..33aebe2612 100644
+--- a/opal/datatype/opal_convertor.c
++++ b/opal/datatype/opal_convertor.c
+@@ -40,8 +40,6 @@
+ #include "opal/datatype/opal_convertor_internal.h"
+ #if OPAL_CUDA_SUPPORT
+ #include "opal/datatype/opal_datatype_cuda.h"
+-#define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
+-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
+ #endif
+ 
+ static void opal_convertor_construct( opal_convertor_t* convertor )
+@@ -51,9 +49,6 @@ static void opal_convertor_construct( opal_convertor_t* convertor )
+     convertor->partial_length = 0;
+     convertor->remoteArch     = opal_local_arch;
+     convertor->flags          = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
+-#if OPAL_CUDA_SUPPORT
+-    convertor->cbmemcpy       = &opal_cuda_memcpy;
+-#endif
+ }
+ 
+ 
+@@ -241,11 +236,7 @@ int32_t opal_convertor_pack( opal_convertor_t* pConv,
+             if( OPAL_LIKELY(NULL == iov[i].iov_base) )
+                 iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
+             else
+-#if OPAL_CUDA_SUPPORT
+-                MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
+-#else
+                 MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
+-#endif
+             pending_length -= iov[i].iov_len;
+             base_pointer += iov[i].iov_len;
+         }
+@@ -258,11 +249,7 @@ complete_contiguous_data_pack:
+         if( OPAL_LIKELY(NULL == iov[i].iov_base) )
+             iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
+         else
+-#if OPAL_CUDA_SUPPORT
+-            MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
+-#else
+             MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
+-#endif
+         pConv->bConverted = pConv->local_size;
+         *out_size = i + 1;
+         pConv->flags |= CONVERTOR_COMPLETED;
+@@ -296,11 +283,7 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv,
+             if( iov[i].iov_len >= pending_length ) {
+                 goto complete_contiguous_data_unpack;
+             }
+-#if OPAL_CUDA_SUPPORT
+-            MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv );
+-#else
+             MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
+-#endif
+             pending_length -= iov[i].iov_len;
+             base_pointer += iov[i].iov_len;
+         }
+@@ -310,11 +293,7 @@ int32_t opal_convertor_unpack( opal_convertor_t* pConv,
+ 
+ complete_contiguous_data_unpack:
+         iov[i].iov_len = pending_length;
+-#if OPAL_CUDA_SUPPORT
+-        MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv );
+-#else
+         MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
+-#endif
+         pConv->bConverted = pConv->local_size;
+         *out_size = i + 1;
+         pConv->flags |= CONVERTOR_COMPLETED;
+@@ -530,7 +509,7 @@ size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
+                                                                         \
+         convertor->remote_size = convertor->local_size;                 \
+         if( OPAL_LIKELY(convertor->remoteArch == opal_local_arch) ) {   \
+-            if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) &&        \
++            if( !(convertor->flags & (CONVERTOR_WITH_CHECKSUM | CONVERTOR_CUDA)) &&        \
+                 ((convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) || \
+                  ((convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && (1 == count))) ) { \
+                 return OPAL_SUCCESS;                                    \
+@@ -541,8 +520,8 @@ size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
+         opal_convertor_compute_remote_size( convertor );                \
+         assert( NULL != convertor->use_desc->desc );                    \
+         /* For predefined datatypes (contiguous) do nothing more */     \
+-        /* if checksum is enabled then always continue */               \
+-        if( ((convertor->flags & (CONVERTOR_WITH_CHECKSUM | OPAL_DATATYPE_FLAG_NO_GAPS)) \
++        /* if checksum or cuda is enabled then always continue */       \
++        if( ((convertor->flags & (CONVERTOR_WITH_CHECKSUM | CONVERTOR_CUDA | OPAL_DATATYPE_FLAG_NO_GAPS)) \
+              == OPAL_DATATYPE_FLAG_NO_GAPS) &&                          \
+             ((convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) == \
+              (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) {              \
+@@ -592,7 +571,19 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
+             }
+         }
+     } else
+-#endif  /* defined(CHECKSUM) */
++#elif OPAL_CUDA_SUPPORT
++    if (OPAL_UNLIKELY(convertor->flags & CONVERTOR_CUDA)) {
++        if (OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS))) {
++            convertor->fAdvance = opal_unpack_general_gpu;
++        } else {
++            if (convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) {
++                convertor->fAdvance = opal_unpack_homogeneous_contig_gpu;
++            } else {
++                convertor->fAdvance = opal_generic_simple_unpack_gpu;
++            }
++        }
++    } else
++#endif  /* defined(CHECKSUM) || OPAL_CUDA_SUPPORT */
+         if( OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS)) ) {
+             convertor->fAdvance = opal_unpack_general;
+         } else {
+@@ -636,7 +627,25 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
+             }
+         }
+     } else
+-#endif  /* defined(CHECKSUM) */
++#elif OPAL_CUDA_SUPPORT
++    if (OPAL_UNLIKELY(convertor->flags & CONVERTOR_CUDA)) {
++        if (CONVERTOR_SEND_CONVERSION
++            == (convertor->flags & (CONVERTOR_SEND_CONVERSION | CONVERTOR_HOMOGENEOUS))) {
++            convertor->fAdvance = opal_pack_general_gpu;
++        } else {
++            if (datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) {
++                if (((datatype->ub - datatype->lb) == (ptrdiff_t) datatype->size)
++                    || (1 >= convertor->count)) {
++                    convertor->fAdvance = opal_pack_homogeneous_contig_gpu;
++                } else {
++                    convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_gpu;
++                }
++            } else {
++                convertor->fAdvance = opal_generic_simple_pack_gpu;
++            }
++        }
++    } else
++#endif  /* defined(CHECKSUM) || OPAL_CUDA_SUPPORT */
+         if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) {
+             convertor->fAdvance = opal_pack_general;
+         } else {
+@@ -694,9 +703,6 @@ int opal_convertor_clone( const opal_convertor_t* source,
+         destination->bConverted = source->bConverted;
+         destination->stack_pos  = source->stack_pos;
+     }
+-#if OPAL_CUDA_SUPPORT
+-    destination->cbmemcpy   = source->cbmemcpy;
+-#endif
+     return OPAL_SUCCESS;
+ }
+ 
+diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h
+index b24d94c37b..53b6f0d526 100644
+--- a/opal/datatype/opal_convertor.h
++++ b/opal/datatype/opal_convertor.h
+@@ -118,7 +118,6 @@ struct opal_convertor_t {
+     dt_stack_t                    static_stack[DT_STATIC_STACK_SIZE];  /**< local stack for small datatypes */
+ 
+ #if OPAL_CUDA_SUPPORT
+-    memcpy_fct_t                  cbmemcpy;       /**< memcpy or cuMemcpy */
+     void *                        stream;         /**< CUstream for async copy */
+ #endif
+ };
+diff --git a/opal/datatype/opal_datatype_copy.c b/opal/datatype/opal_datatype_copy.c
+index c70bdd24df..d7c10af3dc 100644
+--- a/opal/datatype/opal_datatype_copy.c
++++ b/opal/datatype/opal_datatype_copy.c
+@@ -86,14 +86,6 @@ static size_t opal_datatype_memop_block_size = 128 * 1024;
+ #define MEM_OP opal_cuda_memmove
+ #include "opal_datatype_copy.h"
+ 
+-#define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function)     \
+-    do {                                                            \
+-        if (true == cuda_device_bufs) {                             \
+-            fct = copy_function;                                    \
+-        }                                                           \
+-    } while(0)
+-#else
+-#define SET_CUDA_COPY_FCT(cuda_device_bufs, fct, copy_function)
+ #endif
+ 
+ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, int32_t count,
+@@ -102,10 +94,6 @@ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, in
+     ptrdiff_t extent;
+     int32_t (*fct)( const opal_datatype_t*, int32_t, char*, char*);
+ 
+-#if OPAL_CUDA_SUPPORT
+-    bool cuda_device_bufs = opal_cuda_check_bufs(destination_base, source_base);
+-#endif
+-
+     DO_DEBUG( opal_output( 0, "opal_datatype_copy_content_same_ddt( %p, %d, dst %p, src %p )\n",
+                            (void*)datatype, count, (void*)destination_base, (void*)source_base ); );
+ 
+@@ -122,20 +110,25 @@ int32_t opal_datatype_copy_content_same_ddt( const opal_datatype_t* datatype, in
+     extent = (datatype->true_ub - datatype->true_lb) + (count - 1) * (datatype->ub - datatype->lb);
+ 
+     fct = non_overlap_copy_content_same_ddt;
+-    SET_CUDA_COPY_FCT(cuda_device_bufs, fct, non_overlap_cuda_copy_content_same_ddt);
+     if( destination_base < source_base ) {
+         if( (destination_base + extent) > source_base ) {
+             /* memmove */
+             fct = overlap_copy_content_same_ddt;
+-            SET_CUDA_COPY_FCT(cuda_device_bufs, fct, overlap_cuda_copy_content_same_ddt);
+         }
+     } else {
+         if( (source_base + extent) > destination_base ) {
+             /* memmove */
+             fct = overlap_copy_content_same_ddt;
+-            SET_CUDA_COPY_FCT(cuda_device_bufs, fct, overlap_cuda_copy_content_same_ddt);
+         }
+     }
++
++#if OPAL_CUDA_SUPPORT
++    if (OPAL_UNLIKELY(opal_cuda_enabled) && opal_cuda_check_bufs(destination_base, source_base)) {
++        fct = (fct == non_overlap_copy_content_same_ddt ?
++               non_overlap_cuda_copy_content_same_ddt : overlap_cuda_copy_content_same_ddt);
++    }
++#endif
++
+     return fct( datatype, count, destination_base, source_base );
+ }
+ 
+diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c
+index 7869f17e90..f3b3cef8da 100644
+--- a/opal/datatype/opal_datatype_cuda.c
++++ b/opal/datatype/opal_datatype_cuda.c
+@@ -20,7 +20,7 @@
+ 
+ static bool initialized = false;
+ int opal_cuda_verbose = 0;
+-static int opal_cuda_enabled = 0; /* Starts out disabled */
++int opal_cuda_enabled = 1; /* Starts out enabled */
+ static int opal_cuda_output = 0;
+ static void opal_cuda_support_init(void);
+ static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL;
+@@ -48,10 +48,6 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf)
+         opal_cuda_support_init();
+     }
+ 
+-    /* This is needed to handle case where convertor is not fully initialized
+-     * like when trying to do a sendi with convertor on the statck */
+-    convertor->cbmemcpy = (memcpy_fct_t)&opal_cuda_memcpy;
+-
+     /* If not enabled, then nothing else to do */
+     if (!opal_cuda_enabled) {
+         return;
+@@ -192,6 +188,7 @@ static void opal_cuda_support_init(void)
+ 
+     /* Callback into the common cuda initialization routine. This is only
+      * set if some work had been done already in the common cuda code.*/
++    opal_cuda_enabled = 0;
+     if (NULL != common_cuda_initialization_function) {
+         if (0 == common_cuda_initialization_function(&ftable)) {
+             opal_cuda_enabled = 1;
+diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h
+index 2789320520..d512e24550 100644
+--- a/opal/datatype/opal_datatype_cuda.h
++++ b/opal/datatype/opal_datatype_cuda.h
+@@ -30,4 +30,6 @@ void* opal_cuda_memmove(void * dest, void * src, size_t size);
+ void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *));
+ void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream);
+ 
++extern int opal_cuda_enabled;
++
+ #endif
+diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c
+index b4e03a9bea..f42e292e0b 100644
+--- a/opal/datatype/opal_datatype_pack.c
++++ b/opal/datatype/opal_datatype_pack.c
+@@ -45,6 +45,11 @@
+ #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_checksum
+ #define opal_generic_simple_pack_function               opal_generic_simple_pack_checksum
+ #define opal_pack_general_function                      opal_pack_general_checksum
++#elif defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
++#define opal_pack_homogeneous_contig_function           opal_pack_homogeneous_contig_gpu
++#define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_gpu
++#define opal_generic_simple_pack_function               opal_generic_simple_pack_gpu
++#define opal_pack_general_function                      opal_pack_general_gpu
+ #else
+ #define opal_pack_homogeneous_contig_function           opal_pack_homogeneous_contig
+ #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps
+diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h
+index 2a2e79180d..7fbf0c88e2 100644
+--- a/opal/datatype/opal_datatype_pack.h
++++ b/opal/datatype/opal_datatype_pack.h
+@@ -19,11 +19,12 @@
+ 
+ #include "opal_config.h"
+ 
+-#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
+ /* Make use of existing macro to do CUDA style memcpy */
++#include "opal_datatype_cuda.h"
+ #undef MEMCPY_CSUM
+ #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
+-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
++    opal_cuda_memcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
+ #endif
+ 
+ /**
+diff --git a/opal/datatype/opal_datatype_prototypes.h b/opal/datatype/opal_datatype_prototypes.h
+index 668397112b..111f74f2a4 100644
+--- a/opal/datatype/opal_datatype_prototypes.h
++++ b/opal/datatype/opal_datatype_prototypes.h
+@@ -39,6 +39,16 @@ OPAL_DECLSPEC int32_t
+ opal_unpack_general_checksum( opal_convertor_t* pConvertor,
+                               struct iovec* iov, uint32_t* out_size,
+                               size_t* max_data );
++#if OPAL_CUDA_SUPPORT
++OPAL_DECLSPEC int32_t
++opal_pack_general_gpu( opal_convertor_t* pConvertor,
++                        struct iovec* iov, uint32_t* out_size,
++                        size_t* max_data );
++OPAL_DECLSPEC int32_t
++opal_unpack_general_gpu( opal_convertor_t* pConvertor,
++                          struct iovec* iov, uint32_t* out_size,
++                          size_t* max_data );
++#endif
+ 
+ /*
+  * Now the internal functions
+@@ -83,6 +93,28 @@ int32_t
+ opal_generic_simple_unpack_checksum( opal_convertor_t* pConvertor,
+                                      struct iovec* iov, uint32_t* out_size,
+                                      size_t* max_data );
++#if OPAL_CUDA_SUPPORT
++int32_t
++opal_pack_homogeneous_contig_gpu( opal_convertor_t* pConv,
++                                   struct iovec* iov, uint32_t* out_size,
++                                   size_t* max_data );
++int32_t
++opal_pack_homogeneous_contig_with_gaps_gpu( opal_convertor_t* pConv,
++                                             struct iovec* iov, uint32_t* out_size,
++                                             size_t* max_data );
++int32_t
++opal_generic_simple_pack_gpu( opal_convertor_t* pConvertor,
++                               struct iovec* iov, uint32_t* out_size,
++                               size_t* max_data );
++int32_t
++opal_unpack_homogeneous_contig_gpu( opal_convertor_t* pConv,
++                                     struct iovec* iov, uint32_t* out_size,
++                                     size_t* max_data );
++int32_t
++opal_generic_simple_unpack_gpu( opal_convertor_t* pConvertor,
++                                 struct iovec* iov, uint32_t* out_size,
++                                 size_t* max_data );
++#endif
+ 
+ END_C_DECLS
+ 
+diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c
+index 26a5810dc0..668b6624aa 100644
+--- a/opal/datatype/opal_datatype_unpack.c
++++ b/opal/datatype/opal_datatype_unpack.c
+@@ -46,6 +46,10 @@
+ #define opal_unpack_general_function            opal_unpack_general_checksum
+ #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_checksum
+ #define opal_generic_simple_unpack_function     opal_generic_simple_unpack_checksum
++#elif defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
++#define opal_unpack_general_function            opal_unpack_general_gpu
++#define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_gpu
++#define opal_generic_simple_unpack_function     opal_generic_simple_unpack_gpu
+ #else
+ #define opal_unpack_general_function            opal_unpack_general
+ #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig
+@@ -204,10 +208,10 @@ opal_unpack_partial_predefined(opal_convertor_t *pConvertor, const dt_elem_desc_
+     MEMCPY( temporary + start_position, partial_data, length );
+ 
+     /* Save the original content of the user memory */
+-#if OPAL_CUDA_SUPPORT
++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
+     /* In the case where the data is being unpacked from device memory, need to
+      * use the special host to device memory copy. */
+-    pConvertor->cbmemcpy(saved_data, user_data, data_length, pConvertor );
++    opal_cuda_memcpy(saved_data, user_data, data_length, pConvertor );
+ #else
+     MEMCPY( saved_data, user_data, data_length );
+ #endif
+@@ -222,15 +226,15 @@ opal_unpack_partial_predefined(opal_convertor_t *pConvertor, const dt_elem_desc_
+ 
+     /* Rebuild the data by pulling back the unmodified bytes from the original
+      * content in the user memory. */
+-#if OPAL_CUDA_SUPPORT
++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
+     /* Need to copy the modified user_data again so we can see which
+      * bytes need to be converted back to their original values. */
+     {
+         char resaved_data[16];
+-        pConvertor->cbmemcpy(resaved_data, user_data, data_length, pConvertor );
++        opal_cuda_memcpy(resaved_data, user_data, data_length, pConvertor );
+         for(size_t i = 0; i < data_length; i++ ) {
+             if( unused_byte == resaved_data[i] )
+-                pConvertor->cbmemcpy(&user_data[i], &saved_data[i], 1, pConvertor);
++                opal_cuda_memcpy(&user_data[i], &saved_data[i], 1, pConvertor);
+         }
+     }
+ #else
+diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h
+index 33db837882..4159a475fc 100644
+--- a/opal/datatype/opal_datatype_unpack.h
++++ b/opal/datatype/opal_datatype_unpack.h
+@@ -19,11 +19,12 @@
+ 
+ #include "opal_config.h"
+ 
+-#if !defined(CHECKSUM) && OPAL_CUDA_SUPPORT
++#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
+ /* Make use of existing macro to do CUDA style memcpy */
++#include "opal_datatype_cuda.h"
+ #undef MEMCPY_CSUM
+ #define MEMCPY_CSUM( DST, SRC, BLENGTH, CONVERTOR ) \
+-    CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
++    opal_cuda_memcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
+ #endif
+ 
+ /**
diff --git a/Golden_Repo/o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb b/Golden_Repo/o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..4a007d5cc467e2b2dc1d0b101d94e00ac1de18c0
--- /dev/null
+++ b/Golden_Repo/o/OpenMPI/OpenMPI-4.1.4-GCC-11.3.0.eb
@@ -0,0 +1,67 @@
+name = 'OpenMPI'
+version = '4.1.4'
+
+homepage = 'https://www.open-mpi.org/'
+description = """The Open MPI Project is an open source MPI-3 implementation."""
+
+toolchain = {'name': 'GCC', 'version': '11.3.0'}
+toolchainopts = {'pic': True}
+
+source_urls = ['https://www.open-mpi.org/software/ompi/v%(version_major_minor)s/downloads']
+sources = [SOURCELOWER_TAR_BZ2]
+patches = [
+    'OpenMPI-4.1.1_opal-datatype-cuda-performance.patch',
+]
+checksums = [
+    '92912e175fd1234368c8730c03f4996fe5942e7479bb1d10059405e7f2b3930d',  # openmpi-4.1.4.tar.bz2
+    # OpenMPI-4.1.1_opal-datatype-cuda-performance.patch
+    'b767c7166cf0b32906132d58de5439c735193c9fd09ec3c5c11db8d5fa68750e',
+]
+
+osdependencies = [
+    # needed for --with-verbs
+    ('libibverbs-dev', 'libibverbs-devel', 'rdma-core-devel'),
+    # needed for --with-pmix
+    ('pmix-devel'),
+]
+
+builddependencies = [
+    ('pkgconf', '1.8.0'),
+    ('Perl', '5.34.1'),
+    ('Autotools', '20220317'),
+]
+
+dependencies = [
+    ('zlib', '1.2.12'),
+    ('hwloc', '2.7.1'),
+    ('UCX', 'default', '', SYSTEM),
+    ('CUDA', '11.7', '', SYSTEM),
+    ('libevent', '2.1.12'),
+    ('PMIx', '3.2.3'),  # We rely on this version since it is the newest supported by psmgmt
+    ('UCC', 'default'),
+]
+
+# Update configure to include changes from the opal-datatype-cuda-performance
+# patch by running a subset of autogen.pl sufficient to achieve this
+# without doing the full, long-running regeneration.
+preconfigopts = ' && '.join([
+    'cd config',
+    'autom4te --language=m4sh opal_get_version.m4sh -o opal_get_version.sh',
+    'cd ..',
+    'autoconf',
+    'autoheader',
+    'aclocal',
+    'automake',
+    ''
+])
+
+configopts = '--without-orte '
+configopts += '--without-psm2 '
+configopts += '--disable-oshmem '
+configopts += '--with-ime=/opt/ddn/ime '
+configopts += '--with-gpfs '
+
+# to enable SLURM integration (site-specific)
+configopts += '--with-slurm --with-pmix=external --with-libevent=external --with-ompi-pmix-rte'
+
+moduleclass = 'mpi'
diff --git a/Golden_Repo/p/PMIx/PMIx-3.2.3-GCCcore-11.3.0.eb b/Golden_Repo/p/PMIx/PMIx-3.2.3-GCCcore-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..953711ddd2cd2583c5834a759e53234acf6753c7
--- /dev/null
+++ b/Golden_Repo/p/PMIx/PMIx-3.2.3-GCCcore-11.3.0.eb
@@ -0,0 +1,48 @@
+##
+# Author:    Robert Mijakovic <robert.mijakovic@lxp.lu>
+##
+easyblock = 'ConfigureMake'
+
+name = 'PMIx'
+version = '3.2.3'
+
+homepage = 'https://pmix.org/'
+description = """Process Management for Exascale Environments
+PMI Exascale (PMIx) represents an attempt to
+provide an extended version of the PMI standard specifically designed
+to support clusters up to and including exascale sizes. The overall
+objective of the project is not to branch the existing pseudo-standard
+definitions - in fact, PMIx fully supports both of the existing PMI-1
+and PMI-2 APIs - but rather to (a) augment and extend those APIs to
+eliminate some current restrictions that impact scalability, and (b)
+provide a reference implementation of the PMI-server that demonstrates
+the desired level of scalability.
+"""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+toolchainopts = {'pic': True}
+
+source_urls = ['https://github.com/openpmix/openpmix/releases/download/v%(version)s']
+sources = ['%(namelower)s-%(version)s.tar.bz2']
+checksums = ['9b835f23c2f94a193c14012ee68b3657a61c568598cdd1212a3716b32d41a135']
+
+builddependencies = [('binutils', '2.38')]
+
+dependencies = [
+    ('libevent', '2.1.12'),
+    ('zlib', '1.2.12'),
+    ('hwloc', '2.7.1'),
+]
+
+configopts = ' --with-libevent=$EBROOTLIBEVENT --with-zlib=$EBROOTZLIB'
+configopts += ' --with-hwloc=$EBROOTHWLOC'
+configopts += ' --enable-pmix-binaries'
+
+buildopts = 'V=1'
+
+sanity_check_paths = {
+    'files': ['bin/pevent', 'bin/plookup', 'bin/pmix_info', 'bin/pps'],
+    'dirs': ['etc', 'include', 'lib', 'share']
+}
+
+moduleclass = 'lib'
diff --git a/Golden_Repo/u/UCC/NCCL-2.14.3-1.tar.gz b/Golden_Repo/u/UCC/NCCL-2.14.3-1.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..3400371cf58a722255fc02131699649f13f0a0b3
Binary files /dev/null and b/Golden_Repo/u/UCC/NCCL-2.14.3-1.tar.gz differ
diff --git a/Golden_Repo/u/UCC/UCC-default-GCCcore-11.3.0.eb b/Golden_Repo/u/UCC/UCC-default-GCCcore-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..308921f966b85ac070c9bf0caab2f38bce48ca14
--- /dev/null
+++ b/Golden_Repo/u/UCC/UCC-default-GCCcore-11.3.0.eb
@@ -0,0 +1,56 @@
+# For figuring out the correct GPU arch. Easyconfig templates can't be used,
+# since some systems have more than one compute capability.
+# The 'local_' prefix is to appease the easyconfig checker.
+import os as local_os
+
+easyblock = 'ConfigureMake'
+
+name = 'UCC'
+version = 'default'
+local_version = '1.1.0-rc1'
+
+homepage = 'https://www.openucx.org/'
+description = """UCC (Unified Collective Communication) is a collective
+communication operations API and library that is flexible, complete, and
+feature-rich for current and emerging programming models and runtimes.
+"""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+toolchainopts = {'pic': True}
+
+source_urls = ['https://github.com/openucx/ucc/archive/refs/tags']
+sources = [f'v{local_version}.tar.gz']
+checksums = [
+    '4af76d706a788af081c4a6ce566b6d4e33d75629ce9a8a7b8eec1760eff13168',  # v1.1.0-rc1.tar.gz
+]
+
+builddependencies = [
+    ('binutils', '2.38'),
+    ('Autotools', '20220317'),
+]
+
+dependencies = [
+    ('UCX', 'default', '', SYSTEM),
+    ('CUDA', '11.7', '', SYSTEM),
+    ('NCCL', '2.14.3-1', '-CUDA-%(cudashortver)s'),
+]
+
+preconfigopts = "./autogen.sh && "
+
+local_gpu_cc = local_os.environ["EASYBUILD_CUDA_COMPUTE_CAPABILITIES"].split(',')[0].replace('.','')
+
+configopts = "--enable-optimizations "
+configopts += f"--with-nvcc-gencode=-gencode=arch=compute_{local_gpu_cc},code=sm_{local_gpu_cc} "
+configopts += "--with-ucx=$EBROOTUCX "
+configopts += "--with-cuda=$EBROOTCUDA "
+configopts += "--with-nccl=$EBROOTNCCL "
+configopts += "--with-sharp=/opt/mellanox/sharp "
+
+sanity_check_paths = {
+    'files': ['bin/ucc_info'],
+    'dirs': ['include', 'lib']
+}
+
+sanity_check_commands = ["ucc_info -c"]
+
+moduleclass = 'lib'
diff --git a/Golden_Repo/x/xorg-macros/xorg-macros-1.19.3-GCCcore-11.3.0.eb b/Golden_Repo/x/xorg-macros/xorg-macros-1.19.3-GCCcore-11.3.0.eb
new file mode 100644
index 0000000000000000000000000000000000000000..5858842fac29bab516a05f03ec8900415f48c541
--- /dev/null
+++ b/Golden_Repo/x/xorg-macros/xorg-macros-1.19.3-GCCcore-11.3.0.eb
@@ -0,0 +1,27 @@
+easyblock = 'ConfigureMake'
+
+name = 'xorg-macros'
+version = '1.19.3'
+
+homepage = 'https://cgit.freedesktop.org/xorg/util/macros'
+description = """X.org macros utilities."""
+
+toolchain = {'name': 'GCCcore', 'version': '11.3.0'}
+
+source_urls = ['https://gitlab.freedesktop.org/xorg/util/macros/-/archive/util-macros-%(version)s']
+sources = ['macros-util-macros-%(version)s.tar.gz']
+checksums = ['8205d210a580da0938f5ce4392a96b60cf1d9a5f792eaa1474fa4c1977aef4d0']
+
+builddependencies = [
+    ('binutils', '2.38'),
+    ('Autotools', '20220317'),
+]
+
+preconfigopts = './autogen.sh && '
+
+sanity_check_paths = {
+    'files': ['share/pkgconfig/xorg-macros.pc'],
+    'dirs': [],
+}
+
+moduleclass = 'devel'