Skip to content
Snippets Groups Projects
Commit 86984f71 authored by George Katevenis
Browse files

Add easyconfig for XHC, bundled with OpenMPI v5 (pre-release)

parent c5d65e8e
No related branches found
No related tags found
No related merge requests found
# Testing hint: if an oversubscription error shows up, set
#   PRTE_MCA_rmaps_default_mapping_policy=:OVERSUBSCRIBE
# (this differs from OpenMPI v4's OMPI_MCA_rmaps_base_oversubscribe=1).
# TODO: How to set an env var in an easyconfig?

easyblock = 'ConfigureMake'

# ------------------------------
# Versions of the two bundled pieces, plus the location and SHA256 of the
# XHC tarball. These local_* names are reused further down in this file.
local_xhc_version = '1.2'
local_xhc_sources = '/p/project/deepsea/wp5/xhc/source/xhc-v%s.tar.gz' % local_xhc_version
local_xhc_checksums = 'e184e86aa90a092f53c878fdcbfe8699a0b125bdca236b434dbdbae6253f02e8'
local_ompi_version = '5.0.0rc7'
# ------------------------------

name = 'OpenMPI-XHC'
# Combined version string: <Open MPI version>-<XHC version>
version = '-'.join((local_ompi_version, local_xhc_version))

homepage = 'https://www.open-mpi.org/'
description = 'The Open MPI Project is an open source MPI-3 implementation.'

toolchain = dict(name='GCC', version='11.3.0')
toolchainopts = dict(pic=True)
# Open MPI is obtained through a git clone (at the release tag) instead of the
# release URL, because the git submodules will be needed; the .git directory
# is kept in the resulting tarball.
local_ompi_git_source = {
    'filename': 'ompi-' + local_ompi_version + '.tar.gz',
    'git_config': {
        'url': 'https://github.com/open-mpi',
        'repo_name': 'ompi',
        'tag': 'v' + local_ompi_version,
        'keep_git_dir': True,
    },
}

# The XHC tarball is unpacked and the resulting xhc-* directory is moved into
# the Open MPI tree under ompi/mca/coll/xhc.
local_xhc_source = {
    'filename': local_xhc_sources,
    'extract_cmd': 'tar -xzvf %s && mv xhc-* ./ompi/ompi/mca/coll/xhc',
}

sources = [local_ompi_git_source, local_xhc_source]

patches = ['OpenMPI-XHC-fixes.patch']

# Three entries for the two sources and the one patch above; the first entry
# carries no checksum (None).
checksums = [
    None,
    local_xhc_checksums,
    '7b3e9454aa7ac28569a00f74e543c79ded0305c68502238efb6f42a83ce53a8e',
]
# Packages expected from the operating system.
osdependencies = [
    # needed for --with-verbs
    # NOTE(review): no --with-verbs flag is passed in configopts in this file;
    # confirm whether this OS dependency is still required.
    ('libibverbs-dev', 'libibverbs-devel', 'rdma-core-devel'),
]

# Tools needed only while building (the developer checkout runs autogen.pl).
builddependencies = [
    ('Autotools', '20220317'),
    ('pkg-config', '0.29.2'),
    ('Perl', '5.34.1'),
    ('Pandoc', '2.19.2', '', SYSTEM),
    ('flex', '2.6.4'),
]

# Dependencies of the installed software. hwloc, libevent and PMIx are left
# commented out because the internal (bundled) copies are used instead.
dependencies = [
    ('zlib', '1.2.12'),
    # ('hwloc', '2.5.0'),  # internal
    ('UCX', 'default'),
    ('CUDA', '11.7', '', SYSTEM),
    # ('libevent', '2.1.12'),  # internal
    # ('PMIx', '4.1.0'),  # internal
]
# Developer checkout is necessary for XHC, and these are necessary for developer checkout.
# Each step is chained with ' && ', including the last one, so the resulting
# string ends in ' && '.
local_preconfig_steps = [
    'git submodule update --init --recursive',
    '(cd 3rd-party/openpmix && git am ../../pmix_fix_flex.patch)',
    './autogen.pl',
]
preconfigopts = ''.join(local_step + ' && ' for local_step in local_preconfig_steps)

local_configure_flags = [
    '--enable-shared',
    '--with-hwloc=internal',
    # OMPI has trouble with ambiguity between lib and lib64 when detecting
    # UCX's libdir, so the libdir is given explicitly.
    '--with-ucx=$EBROOTUCX',
    '--with-ucx-libdir=$EBROOTUCX/lib',
    '--with-libevent=internal',
    '--with-pmix=internal',
    '--with-slurm',
    '--without-psm2',
    '--disable-oshmem',
    '--with-cuda=$EBROOTCUDA',
    '--with-xpmem=/p/project/deepsea/wp5/xpmem',
    # libportals is available on deepv but not on cluster nodes.
    '--without-portals4',
]
# Every flag is followed by a single space (trailing space included).
configopts = ''.join(local_flag + ' ' for local_flag in local_configure_flags)

# -------------------------------
# No IME or GPFS in DEEP
# configopts += '--with-ime=/opt/ddn/ime '
# configopts += '--with-gpfs '
# disable MPI1 compatibility for now, see what breaks...
# configopts = '--enable-mpi1-compatibility '
# to enable SLURM integration (site-specific)
# configopts += '--with-slurm --with-pmi=/usr/include/slurm --with-pmi-libdir=/usr'
# -------------------------------
# Trigger autofs mount of these filesystems, so that Open MPI's imperfect
# opal_path_nfs test won't fail (https://github.com/open-mpi/ompi/issues/10152)
# pretestcmds/postbuildcmds = ['stat /direct/Software /p/{arch,usersoftware,fastdata}']
skipsteps = ['test']  # not sure how to do the above though!

# Names of the binaries, shared libraries and headers the installation is
# checked for.
local_libs = ['mpi_mpifh', 'mpi', 'open-pal']
local_bins = ['ompi_info', 'opal_wrapper']
local_headers = ['mpi-ext', 'mpif-config', 'mpif', 'mpi', 'mpi_portable_platform']

local_expected_files = ['bin/%s' % local_bin for local_bin in local_bins]
local_expected_files += ['lib/lib%s.%s' % (local_lib, SHLIB_EXT) for local_lib in local_libs]
local_expected_files += ['include/%s.h' % local_header for local_header in local_headers]

sanity_check_paths = {
    'files': local_expected_files,
    'dirs': [],
}

moduleclass = 'mpi'
From 848096fcec8956eaf6982ef4021e0ede72e6186e Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <schuchart@icl.utk.edu>
Date: Wed, 16 Mar 2022 09:33:21 -0400
Subject: [PATCH 1/5] smsc/xpmem: Fix bound alignment
The upper bound of the mapped region must include the last byte of
the range and not reach past the aligned range.
Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
---
opal/mca/smsc/xpmem/smsc_xpmem_module.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/opal/mca/smsc/xpmem/smsc_xpmem_module.c b/opal/mca/smsc/xpmem/smsc_xpmem_module.c
index d2954c1e31..80a0729649 100644
--- a/opal/mca/smsc/xpmem/smsc_xpmem_module.c
+++ b/opal/mca/smsc/xpmem/smsc_xpmem_module.c
@@ -116,7 +116,7 @@ void *mca_smsc_xpmem_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t fla
int rc;
base = OPAL_DOWN_ALIGN((uintptr_t) remote_ptr, attach_align, uintptr_t);
- bound = OPAL_ALIGN((uintptr_t) remote_ptr + size - 1, attach_align, uintptr_t) + 1;
+ bound = OPAL_ALIGN((uintptr_t) remote_ptr + size, attach_align, uintptr_t);
if (OPAL_UNLIKELY(bound > xpmem_endpoint->address_max)) {
bound = xpmem_endpoint->address_max;
}
--
2.31.1
From 61e00ee395f66963c069d3e28bea788d57360ade Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <schuchart@icl.utk.edu>
Date: Wed, 16 Mar 2022 09:36:37 -0400
Subject: [PATCH 2/5] smsc/xpmem: retry with page upper bound if aligned range
cannot be mapped
The aligned range computed in mca_smsc_xpmem_map_peer_region may
reach past the end of the stack, which may cause the mapping to fail.
Retrying with an actual page as upper bound has a better chance to succeed.
Signed-off-by: Joseph Schuchart <schuchart@icl.utk.edu>
---
opal/mca/smsc/xpmem/smsc_xpmem_module.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/opal/mca/smsc/xpmem/smsc_xpmem_module.c b/opal/mca/smsc/xpmem/smsc_xpmem_module.c
index 80a0729649..6a3444a35d 100644
--- a/opal/mca/smsc/xpmem/smsc_xpmem_module.c
+++ b/opal/mca/smsc/xpmem/smsc_xpmem_module.c
@@ -23,6 +23,7 @@
#include "opal/mca/smsc/base/base.h"
#include "opal/mca/smsc/xpmem/smsc_xpmem_internal.h"
#include "opal/util/minmax.h"
+#include "opal/util/sys_limits.h"
OBJ_CLASS_INSTANCE(mca_smsc_xpmem_endpoint_t, opal_object_t, NULL, NULL);
@@ -157,8 +158,14 @@ void *mca_smsc_xpmem_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t fla
reg->rcache_context = xpmem_attach(xpmem_addr, bound - base, NULL);
if (OPAL_UNLIKELY((void *) -1 == reg->rcache_context)) {
- OBJ_RELEASE(reg);
- return NULL;
+ /* retry with the page as upper bound */
+ bound = OPAL_ALIGN((uintptr_t) remote_ptr + size, opal_getpagesize(), uintptr_t);
+ reg->bound = (unsigned char *) bound;
+ reg->rcache_context = xpmem_attach(xpmem_addr, bound - base, NULL);
+ if (OPAL_UNLIKELY((void *) -1 == reg->rcache_context)) {
+ OBJ_RELEASE(reg);
+ return NULL;
+ }
}
opal_memchecker_base_mem_defined(reg->rcache_context, bound - base);
@@ -307,5 +314,5 @@ mca_smsc_xpmem_module_t mca_smsc_xpmem_module = {
.copy_from = mca_smsc_xpmem_copy_from,
.map_peer_region = mca_smsc_xpmem_map_peer_region,
.unmap_peer_region = mca_smsc_xpmem_unmap_peer_region,
- },
+ },
};
--
2.31.1
From 553bf8adf30d28da53b7d7462468e3ba4e0146e3 Mon Sep 17 00:00:00 2001
From: cc-riscv64 <cc-riscv64>
Date: Thu, 28 Apr 2022 16:42:18 +0000
Subject: [PATCH 3/5] Fix mpi_comm_dup_with_info
---
ompi/communicator/comm.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c
index 8f9c95ade1..98ed989f28 100644
--- a/ompi/communicator/comm.c
+++ b/ompi/communicator/comm.c
@@ -963,6 +963,7 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
ompi_comm_assert_subscribe (newcomp, OMPI_COMM_ASSERT_LAZY_BARRIER);
ompi_comm_assert_subscribe (newcomp, OMPI_COMM_ASSERT_ACTIVE_POLL);
if (info) {
+ opal_info_dup(info, &newcomp->super.s_info);
opal_infosubscribe_change_info(&newcomp->super, info);
}
@@ -1068,6 +1069,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
ompi_comm_assert_subscribe (newcomp, OMPI_COMM_ASSERT_LAZY_BARRIER);
ompi_comm_assert_subscribe (newcomp, OMPI_COMM_ASSERT_ACTIVE_POLL);
if (info) {
+ opal_info_dup(info, &newcomp->super.s_info);
opal_infosubscribe_change_info(&newcomp->super, info);
}
--
2.31.1
From d5ca5c9baf308fc855563edf0b61d07eae8e3420 Mon Sep 17 00:00:00 2001
From: George Katevenis <gkatev@ics.forth.gr>
Date: Thu, 6 Oct 2022 14:35:52 +0300
Subject: [PATCH 4/5] Initialize opal/smsc outside of btl/sm, to enable its use
without it
Signed-off-by: George Katevenis <gkatev@ics.forth.gr>
---
ompi/instance/instance.c | 4 ++++
opal/mca/btl/sm/btl_sm_component.c | 9 ++++-----
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c
index 03dad6faeb..a713a5617d 100644
--- a/ompi/instance/instance.c
+++ b/ompi/instance/instance.c
@@ -480,6 +480,10 @@ static int ompi_mpi_instance_init_common (void)
/* Select which MPI components to use */
+ if (OPAL_SUCCESS != (ret = mca_smsc_base_select())) {
+ return ompi_instance_print_error ("mca_smsc_base_select() failed", ret);
+ }
+
if (OMPI_SUCCESS != (ret = mca_pml_base_select (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) {
return ompi_instance_print_error ("mca_pml_base_select() failed", ret);
}
diff --git a/opal/mca/btl/sm/btl_sm_component.c b/opal/mca/btl/sm/btl_sm_component.c
index d3b6bfb69d..de865f9fb4 100644
--- a/opal/mca/btl/sm/btl_sm_component.c
+++ b/opal/mca/btl/sm/btl_sm_component.c
@@ -40,7 +40,6 @@
#include "opal/mca/btl/sm/btl_sm_fbox.h"
#include "opal/mca/btl/sm/btl_sm_fifo.h"
#include "opal/mca/btl/sm/btl_sm_frag.h"
-#include "opal/mca/smsc/base/base.h"
#include "opal/mca/smsc/smsc.h"
#ifdef HAVE_SYS_STAT_H
@@ -332,8 +331,8 @@ mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enab
/* no fast boxes allocated initially */
component->num_fbox_in_endpoints = 0;
- rc = mca_smsc_base_select();
- if (OPAL_SUCCESS == rc) {
+ bool have_smsc = (NULL != mca_smsc);
+ if (have_smsc) {
mca_btl_sm.super.btl_flags |= MCA_BTL_FLAGS_RDMA;
mca_btl_sm.super.btl_get = mca_btl_sm_get;
mca_btl_sm.super.btl_put = mca_btl_sm_put;
@@ -355,11 +354,11 @@ mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enab
} else {
BTL_ERROR(("single-copy component requires registration but could not provide the "
"registration handle size"));
- rc = (int) handle_size;
+ have_smsc = false;
}
}
}
- if (OPAL_SUCCESS != rc) {
+ if (!have_smsc) {
mca_btl_sm.super.btl_flags &= ~MCA_BTL_FLAGS_RDMA;
mca_btl_sm.super.btl_get = NULL;
mca_btl_sm.super.btl_put = NULL;
--
2.31.1
From 77081cff10f5a3b04052f34e6e5c89fb64384f70 Mon Sep 17 00:00:00 2001
From: George Katevenis <gkatev@ics.forth.gr>
Date: Fri, 19 Jan 2024 13:58:51 +0200
Subject: [PATCH 5/5] Add patch to fix flex in internal openpmix (openpmix/#2606)
---
pmix_fix_flex.patch | 78 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 78 insertions(+)
create mode 100644 pmix_fix_flex.patch
diff --git a/pmix_fix_flex.patch b/pmix_fix_flex.patch
new file mode 100644
index 0000000000..ded91bf4c1
--- /dev/null
+++ b/pmix_fix_flex.patch
@@ -0,0 +1,78 @@
+From 1286709db150ea2540f8a1d20f286a858c7a07df Mon Sep 17 00:00:00 2001
+From: Ralph Castain <rhc@pmix.org>
+Date: Tue, 24 May 2022 19:05:00 -0700
+Subject: [PATCH] Require flex only when keyval_lex.c is not provided
+
+We currently require flex whenever we are in a Git clone, but that
+really isn't the requirement. We need flex whenever the flex output
+files are not present - otherwise, you can build just fine. So open
+things up a bit by tying the flex requirement to the actual one
+(i.e., that the flex output file exist).
+
+Signed-off-by: Ralph Castain <rhc@pmix.org>
+---
+ config/pmix.m4 | 19 -------------------
+ configure.ac | 20 +++++++++++++++++++-
+ 2 files changed, 19 insertions(+), 20 deletions(-)
+
+diff --git a/config/pmix.m4 b/config/pmix.m4
+index 9c5f83df30..c870d90a9c 100644
+--- a/config/pmix.m4
++++ b/config/pmix.m4
+@@ -1026,25 +1026,6 @@ AC_DEFUN([PMIX_DEFINE_ARGS],[
+ AC_DEFINE_UNQUOTED(PMIX_ENABLE_DLOPEN_SUPPORT, $PMIX_ENABLE_DLOPEN_SUPPORT,
+ [Whether we want to enable dlopen support])
+
+-#
+-# Is this a developer copy?
+-#
+-
+-if test -e $PMIX_TOP_SRCDIR/.git; then
+- PMIX_DEVEL=1
+- # check for Flex
+- AC_PROG_LEX(yywrap)
+- if test "x$LEX" != xflex; then
+- AC_MSG_WARN([PMIx requires Flex to build from non-tarball sources,])
+- AC_MSG_WARN([but Flex was not found. Please install Flex into])
+- AC_MSG_WARN([your path and try again])
+- AC_MSG_ERROR([Cannot continue])
+- fi
+-else
+- PMIX_DEVEL=0
+-fi
+-
+-
+ #
+ # Developer picky compiler options
+ #
+diff --git a/configure.ac b/configure.ac
+index e0f207a2d0..7d3fe1de4a 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -209,7 +209,25 @@ PMIX_SETUP_WRAPPER_INIT
+ # This did not exist pre AM 1.11.x (where x is somewhere >0 and <3),
+ # but it is necessary in AM 1.12.x.
+ m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
+-AC_PROG_LEX(yywrap)
++
++#
++# Is this a developer copy?
++#
++
++if test -e $PMIX_TOP_SRCDIR/.git; then
++ PMIX_DEVEL=1
++else
++ PMIX_DEVEL=0
++fi
++# check for Flex
++AC_PROG_LEX(noyywrap)
++if test "x$LEX" != xflex && test ! -e $PMIX_TOP_SRCDIR/util/keyval/keyval_lex.c; then
++ AC_MSG_WARN([PMIx requires Flex to build from sources that were not])
++ AC_MSG_WARN([fully pre-processed (e.g., an official release tarball),])
++ AC_MSG_WARN([but Flex was not found. Please install Flex into])
++ AC_MSG_WARN([your path and try again])
++ AC_MSG_ERROR([Cannot continue])
++fi
+
+ ############################################################################
+ # Configuration options
--
2.43.0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment