Skip to content
Snippets Groups Projects
Commit b4286445 authored by Damian Alvarez's avatar Damian Alvarez
Browse files

To support the separation of the UCX settings from the MPI settings, which...

To support the separation of the UCX settings from the MPI settings, which provides a better view of what is tweaked, and allows reusing UCX settings for all MPIs
parent 24e6ad76
No related branches found
No related tags found
No related merge requests found
Showing
with 180 additions and 184 deletions
...@@ -228,6 +228,23 @@ family("mpi") ...@@ -228,6 +228,23 @@ family("mpi")
ec.log.info( ec.log.info(
"[parse hook] Injecting Lmod mpi family and mpi-settings loading") "[parse hook] Injecting Lmod mpi family and mpi-settings loading")
# UCX requires loading UCX-settings
# NOTE(review): "ec.name in 'UCX'" is a *substring* test against the string
# 'UCX', so it is also True for ec.name equal to 'U', 'C', 'X', 'UC' or 'CX'.
# The intent is almost certainly an exact match: ec.name == 'UCX'
# (or: ec.name in ['UCX']). Confirm and fix before merging.
if ec.name in 'UCX' and '/p/software' in install_path().lower():
key = "modluafooter"
# Lua footer injected into the generated module file so that loading UCX
# also loads the site UCX-settings module (unless one is already loaded).
value = '''
if not ( isloaded("UCX-settings") ) then
load("UCX-settings")
end
'''
# Append to an existing modluafooter rather than clobbering it; skip the
# append when the snippet is already present (hook may run more than once).
if key in ec_dict:
if not value in ec_dict[key]:
ec[key] = "\n".join([ec[key], value])
else:
ec[key] = value
ec.log.info(
"[parse hook] Injecting UCX-settings loading")
# Check if we need to use 'modaltsoftname' # Check if we need to use 'modaltsoftname'
if ec.name in REQUIRE_MODALTSOFTNAME: if ec.name in REQUIRE_MODALTSOFTNAME:
key = "modaltsoftname" key = "modaltsoftname"
......
...@@ -19,6 +19,7 @@ CORE = 'Core' ...@@ -19,6 +19,7 @@ CORE = 'Core'
COMPILER = 'Compiler' COMPILER = 'Compiler'
MPI = 'MPI' MPI = 'MPI'
MPI_SETTINGS = 'MPI_settings' MPI_SETTINGS = 'MPI_settings'
COMM_SETTINGS = 'comm_settings'
MODULECLASS_COMPILER = 'compiler' MODULECLASS_COMPILER = 'compiler'
MODULECLASS_MPI = 'mpi' MODULECLASS_MPI = 'mpi'
...@@ -57,6 +58,9 @@ mpi_relevant_versions = { ...@@ -57,6 +58,9 @@ mpi_relevant_versions = {
# MPIs with settings modules # MPIs with settings modules
mpi_with_settings = ['psmpi', 'impi', 'OpenMPI', 'BullMPI'] mpi_with_settings = ['psmpi', 'impi', 'OpenMPI', 'BullMPI']
# Communication packages with settings modules
comm_pkg_with_settings = ['UCX', 'NCCL']
class FlexibleCustomHierarchicalMNS(HierarchicalMNS): class FlexibleCustomHierarchicalMNS(HierarchicalMNS):
"""Class implementing an example hierarchical module naming scheme.""" """Class implementing an example hierarchical module naming scheme."""
def is_short_modname_for(self, short_modname, name): def is_short_modname_for(self, short_modname, name):
...@@ -170,6 +174,9 @@ class FlexibleCustomHierarchicalMNS(HierarchicalMNS): ...@@ -170,6 +174,9 @@ class FlexibleCustomHierarchicalMNS(HierarchicalMNS):
stripped_name = re.sub('-settings$', '', ec['name']) stripped_name = re.sub('-settings$', '', ec['name'])
if stripped_name in mpi_with_settings: if stripped_name in mpi_with_settings:
subdir = os.path.join(MPI_SETTINGS, stripped_name, ec['version']) subdir = os.path.join(MPI_SETTINGS, stripped_name, ec['version'])
# or a module is for a communication package with settings # or a module is for a communication package with settings
elif stripped_name in comm_pkg_with_settings and '-settings' in ec['name']:
subdir = os.path.join(COMM_SETTINGS, stripped_name)
else: else:
tc_comp_name, tc_comp_ver = self._find_relevant_compiler_info(tc_comp_info) tc_comp_name, tc_comp_ver = self._find_relevant_compiler_info(tc_comp_info)
tc_mpi = det_toolchain_mpi(ec) tc_mpi = det_toolchain_mpi(ec)
...@@ -266,4 +273,7 @@ class FlexibleCustomHierarchicalMNS(HierarchicalMNS): ...@@ -266,4 +273,7 @@ class FlexibleCustomHierarchicalMNS(HierarchicalMNS):
if ec['name'] in mpi_with_settings: if ec['name'] in mpi_with_settings:
paths.append(os.path.join(MPI_SETTINGS, mpi_name, mpi_ver)) paths.append(os.path.join(MPI_SETTINGS, mpi_name, mpi_ver))
elif ec['name'] in ['UCX', 'NCCL']:
paths.append(os.path.join(COMM_SETTINGS, ec['name']))
return paths return paths
...@@ -2,10 +2,10 @@ easyblock = 'SystemBundle' ...@@ -2,10 +2,10 @@ easyblock = 'SystemBundle'
name = 'impi-settings' name = 'impi-settings'
version = '2021' version = '2021'
versionsuffix = 'UCX-UD' versionsuffix = 'UCX'
homepage = '' homepage = ''
description = """This is a module to load the IntelMPI configuration with UCX and UD as TL""" description = 'This is a module to load the IntelMPI configuration with UCX'
site_contacts = 'd.alvarez@fz-juelich.de' site_contacts = 'd.alvarez@fz-juelich.de'
...@@ -16,7 +16,7 @@ source_urls = [] ...@@ -16,7 +16,7 @@ source_urls = []
sources = [] sources = []
modextravars = { modextravars = {
'UCX_TLS': 'ud_x,sm,self', 'FI_PROVIDER': 'mlx',
'I_MPI_PMI_VALUE_LENGTH_MAX': '900', 'I_MPI_PMI_VALUE_LENGTH_MAX': '900',
# Needed for PSM and harmless for InfiniBand. For ParaStation it is set on the pscom module # Needed for PSM and harmless for InfiniBand. For ParaStation it is set on the pscom module
'HFI_NO_CPUAFFINITY': 'YES', 'HFI_NO_CPUAFFINITY': 'YES',
......
# Easyconfig: settings-only bundle (installs no software) that generates a
# module exporting IntelMPI tuning for large scale hybrid (MPI+threads) jobs.
easyblock = 'SystemBundle'
name = 'impi-settings'
version = '2021'
versionsuffix = 'large-job-hybrid'
homepage = ''
description = """This is a module to load the IntelMPI configuration for large scale hybrid jobs"""
site_contacts = 'd.alvarez@fz-juelich.de'
# No toolchain and no sources: the module only sets environment variables.
toolchain = SYSTEM
source_urls = []
sources = []
# Environment variables exported by the generated module file.
modextravars = {
# dc_x (DC) transport plus shared-memory/self — presumably chosen for its
# scalability at large node counts; verify against site UCX guidance.
'UCX_TLS': 'dc_x,sm,self',
'I_MPI_PMI_VALUE_LENGTH_MAX': '900',
# Needed for PSM and harmless for InfiniBand. For ParaStation it is set on the pscom module
'HFI_NO_CPUAFFINITY': 'YES',
}
moduleclass = 'system'
...@@ -5,7 +5,7 @@ version = '2021' ...@@ -5,7 +5,7 @@ version = '2021'
versionsuffix = 'plain' versionsuffix = 'plain'
homepage = '' homepage = ''
description = """This is a module to load the default IntelMPI configuration""" description = 'This is a module to load the default IntelMPI configuration. It relies on the default order for libfabric'
site_contacts = 'd.alvarez@fz-juelich.de' site_contacts = 'd.alvarez@fz-juelich.de'
...@@ -16,7 +16,6 @@ source_urls = [] ...@@ -16,7 +16,6 @@ source_urls = []
sources = [] sources = []
modextravars = { modextravars = {
'UCX_TLS': 'dc_x,sm,self',
'I_MPI_PMI_VALUE_LENGTH_MAX': '900', 'I_MPI_PMI_VALUE_LENGTH_MAX': '900',
# Needed for PSM and harmless for InfiniBand. For ParaStation it is set on the pscom module # Needed for PSM and harmless for InfiniBand. For ParaStation it is set on the pscom module
'HFI_NO_CPUAFFINITY': 'YES', 'HFI_NO_CPUAFFINITY': 'YES',
......
# Easyconfig: settings-only bundle (installs no software) that generates a
# module exporting OpenMPI/UCX environment tuning for CUDA jobs.
easyblock = 'SystemBundle'
name = 'OpenMPI-settings'
version = '4.1'
versionsuffix = 'CUDA-low-latency'
homepage = ''
description = '''This is a module to load the default OpenMPI configuration
This module is otherwise equivalent to mpi-settings/CUDA, but enables UCX_MEMTYPE_CACHE. Please read the URL below to
understand if this is something you can use:
http://openucx.github.io/ucx/faq.html#7-what-are-the-current-limitations-of-using-gpu-memory
'''
# Message printed when the module is loaded (same caveat as the description).
modloadmsg = '''
This module is otherwise equivalent to mpi-settings/CUDA, but enables UCX_MEMTYPE_CACHE. Please read the URL below to
understand if this is something you can use:
http://openucx.github.io/ucx/faq.html#7-what-are-the-current-limitations-of-using-gpu-memory
'''
site_contacts = 'd.alvarez@fz-juelich.de'
# No toolchain and no sources: the module only sets environment variables.
toolchain = SYSTEM
source_urls = []
sources = []
# Environment variables exported by the generated module file.
modextravars = {
'SLURM_MPI_TYPE': 'pspmix',
# RC transport plus shared-memory/self and the CUDA-aware transports.
'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
# 'y' here is the only difference vs the plain CUDA settings module
# (see modloadmsg above); it has known GPU-memory limitations (see URL).
'UCX_MEMTYPE_CACHE': 'y',
'OMPI_MCA_mca_base_component_show_load_errors': '1',
'OMPI_MCA_mpi_param_check': '1',
'OMPI_MCA_mpi_show_handle_leaks': '1',
'OMPI_MCA_mpi_warn_on_fork': '1',
# Disable uct for the time being due to:
# https://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX#running-open-mpi-with-ucx
# Also openib, since it is deprecated and should be substituted by the UCX support in the pml
'OMPI_MCA_btl': '^uct,openib',
'OMPI_MCA_btl_openib_allow_ib': '1',
'OMPI_MCA_bml_r2_show_unreach_errors': '0',
'OMPI_MCA_coll': '^ml',
'OMPI_MCA_coll_hcoll_enable': '1',
'OMPI_MCA_coll_hcoll_np': '0',
# Point-to-point messaging and one-sided communication go through UCX.
'OMPI_MCA_pml': 'ucx',
'OMPI_MCA_osc': '^rdma',
'OMPI_MCA_opal_abort_print_stack': '1',
'OMPI_MCA_opal_set_max_sys_limits': '1',
'OMPI_MCA_opal_event_include': 'epoll',
'OMPI_MCA_btl_openib_warn_default_gid_prefix': '0',
# OMPIO does not seem to work reliably on our system
'OMPI_MCA_io': 'romio321',
}
moduleclass = 'system'
# Easyconfig: settings-only bundle (installs no software) that generates a
# module exporting OpenMPI/UCX environment tuning for GPU-less nodes.
easyblock = 'SystemBundle'
name = 'OpenMPI-settings'
version = '4.1'
versionsuffix = 'UCX-RC'
homepage = ''
# Fixed typo in the original description ("load the an OpenMPI").
description = """This is a module to load an OpenMPI configuration for nodes not equipped with GPUs"""
site_contacts = 'd.alvarez@fz-juelich.de'
# No toolchain and no sources: the module only sets environment variables.
toolchain = SYSTEM
source_urls = []
sources = []
# Environment variables exported by the generated module file.
modextravars = {
    'SLURM_MPI_TYPE': 'pspmix',
    # RC transport plus shared-memory/self; no CUDA transports on these nodes.
    'UCX_TLS': 'rc_x,self,sm',
    'UCX_MEMTYPE_CACHE': 'n',
    'OMPI_MCA_mca_base_component_show_load_errors': '1',
    'OMPI_MCA_mpi_param_check': '1',
    'OMPI_MCA_mpi_show_handle_leaks': '1',
    'OMPI_MCA_mpi_warn_on_fork': '1',
    # Disable uct for the time being due to:
    # https://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX#running-open-mpi-with-ucx
    # Also openib, since it is deprecated and should be substituted by the UCX support in the pml
    'OMPI_MCA_btl': '^uct,openib',
    'OMPI_MCA_btl_openib_allow_ib': '1',
    'OMPI_MCA_bml_r2_show_unreach_errors': '0',
    'OMPI_MCA_coll': '^ml',
    'OMPI_MCA_coll_hcoll_enable': '1',
    'OMPI_MCA_coll_hcoll_np': '0',
    # Point-to-point messaging and one-sided communication go through UCX.
    'OMPI_MCA_pml': 'ucx',
    'OMPI_MCA_osc': '^rdma',
    'OMPI_MCA_opal_abort_print_stack': '1',
    'OMPI_MCA_opal_set_max_sys_limits': '1',
    'OMPI_MCA_opal_event_include': 'epoll',
    'OMPI_MCA_btl_openib_warn_default_gid_prefix': '0',
    # OMPIO does not seem to work reliably on our system
    'OMPI_MCA_io': 'romio321',
}
moduleclass = 'system'
...@@ -2,10 +2,10 @@ easyblock = 'SystemBundle' ...@@ -2,10 +2,10 @@ easyblock = 'SystemBundle'
name = 'OpenMPI-settings' name = 'OpenMPI-settings'
version = '4.1' version = '4.1'
versionsuffix = 'CUDA' versionsuffix = 'plain'
homepage = '' homepage = ''
description = """This is a module to load the default OpenMPI configuration""" description = 'This is a module to load the default OpenMPI configuration. It relies on UCX.'
site_contacts = 'd.alvarez@fz-juelich.de' site_contacts = 'd.alvarez@fz-juelich.de'
...@@ -16,8 +16,6 @@ source_urls = [] ...@@ -16,8 +16,6 @@ source_urls = []
sources = [] sources = []
modextravars = { modextravars = {
'SLURM_MPI_TYPE': 'pspmix', 'SLURM_MPI_TYPE': 'pspmix',
'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
'UCX_MEMTYPE_CACHE': 'n',
'OMPI_MCA_mca_base_component_show_load_errors': '1', 'OMPI_MCA_mca_base_component_show_load_errors': '1',
'OMPI_MCA_mpi_param_check': '1', 'OMPI_MCA_mpi_param_check': '1',
'OMPI_MCA_mpi_show_handle_leaks': '1', 'OMPI_MCA_mpi_show_handle_leaks': '1',
......
# Easyconfig: settings-only bundle (installs no software) that generates a
# module exporting ParaStationMPI/UCX environment tuning for CUDA jobs.
easyblock = 'SystemBundle'
name = 'psmpi-settings'
version = '5.4'
versionsuffix = 'CUDA-low-latency-UD'
homepage = ''
description = '''This is a module to load the default ParaStationMPI configuration
This module is otherwise equivalent to mpi-settings/CUDA, but enables UCX_MEMTYPE_CACHE. Please read the URL below to
understand if this is something you can use:
http://openucx.github.io/ucx/faq.html#7-what-are-the-current-limitations-of-using-gpu-memory
'''
# Message printed when the module is loaded (same caveat as the description).
modloadmsg = '''
This module is otherwise equivalent to mpi-settings/CUDA, but enables UCX_MEMTYPE_CACHE. Please read the URL below to
understand if this is something you can use:
http://openucx.github.io/ucx/faq.html#7-what-are-the-current-limitations-of-using-gpu-memory
'''
site_contacts = 'd.alvarez@fz-juelich.de'
# No toolchain and no sources: the module only sets environment variables.
toolchain = SYSTEM
source_urls = []
sources = []
# Environment variables exported by the generated module file.
# PSP_* are pscom/ParaStationMPI knobs — presumably: CUDA awareness on,
# pscom shared-memory off (UCX 'sm' is used instead), UCX plugin on, hard
# abort on fatal errors. Verify against ParaStationMPI documentation.
modextravars = {
'PSP_CUDA': '1',
'PSP_SHM': '0',
'PSP_UCP': '1',
'PSP_HARD_ABORT': '1',
# UD transport plus shared-memory/self and the CUDA-aware transports.
'UCX_TLS': 'ud_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
# 'y' is the low-latency difference vs mpi-settings/CUDA (see modloadmsg).
'UCX_MEMTYPE_CACHE': 'y',
}
moduleclass = 'system'
...@@ -2,10 +2,10 @@ easyblock = 'SystemBundle' ...@@ -2,10 +2,10 @@ easyblock = 'SystemBundle'
name = 'psmpi-settings' name = 'psmpi-settings'
version = '5.4' version = '5.4'
versionsuffix = 'CUDA-UD' versionsuffix = 'CUDA'
homepage = '' homepage = ''
description = 'This is a module to load the default ParaStationMPI configuration' description = 'This is a module to load the ParaStationMPI configuration. It enables UCX as a communication library and CUDA-aware features.'
site_contacts = 'd.alvarez@fz-juelich.de' site_contacts = 'd.alvarez@fz-juelich.de'
...@@ -15,12 +15,11 @@ source_urls = [] ...@@ -15,12 +15,11 @@ source_urls = []
sources = [] sources = []
modextravars = { modextravars = {
'PSP_OPENIB': '0',
'PSP_UCP': '1',
'PSP_CUDA': '1', 'PSP_CUDA': '1',
'PSP_SHM': '0', 'PSP_SHM': '0',
'PSP_UCP': '1',
'PSP_HARD_ABORT': '1', 'PSP_HARD_ABORT': '1',
'UCX_TLS': 'ud_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
'UCX_MEMTYPE_CACHE': 'n',
} }
moduleclass = 'system' moduleclass = 'system'
...@@ -2,10 +2,10 @@ easyblock = 'SystemBundle' ...@@ -2,10 +2,10 @@ easyblock = 'SystemBundle'
name = 'psmpi-settings' name = 'psmpi-settings'
version = '5.4' version = '5.4'
versionsuffix = 'UCX-UD' versionsuffix = 'UCX'
homepage = '' homepage = ''
description = 'This is a module to load the ParaStationMPI configuration. It enables UCX with UD as transport' description = 'This is a module to load the ParaStationMPI configuration. It enables UCX as a communication library.'
site_contacts = 'd.alvarez@fz-juelich.de' site_contacts = 'd.alvarez@fz-juelich.de'
...@@ -18,7 +18,6 @@ modextravars = { ...@@ -18,7 +18,6 @@ modextravars = {
'PSP_OPENIB': '0', 'PSP_OPENIB': '0',
'PSP_UCP': '1', 'PSP_UCP': '1',
'PSP_HARD_ABORT': '1', 'PSP_HARD_ABORT': '1',
'UCX_TLS': 'ud_x,self,sm',
} }
moduleclass = 'system' moduleclass = 'system'
easyblock = 'SystemBundle' easyblock = 'SystemBundle'
name = 'impi-settings' name = 'psmpi-settings'
version = '2021' version = '5.4'
versionsuffix = 'large-job-mpi' versionsuffix = 'plain'
homepage = '' homepage = ''
description = """This is a module to load the IntelMPI configuration for large scale MPI jobs""" description = 'This is a module to load the ParaStationMPI configuration. It relies on the defaults.'
site_contacts = 'd.alvarez@fz-juelich.de' site_contacts = 'd.alvarez@fz-juelich.de'
...@@ -14,12 +14,8 @@ toolchain = SYSTEM ...@@ -14,12 +14,8 @@ toolchain = SYSTEM
source_urls = [] source_urls = []
sources = [] sources = []
modextravars = { modextravars = {
'UCX_TLS': 'dc_x,sm,self', 'PSP_HARD_ABORT': '1',
'I_MPI_PMI_VALUE_LENGTH_MAX': '900',
# Needed for PSM and harmless for InfiniBand. For ParaStation it is set on the pscom module
'HFI_NO_CPUAFFINITY': 'YES',
} }
moduleclass = 'system' moduleclass = 'system'
# Easyconfig: settings-only bundle that generates a module selecting the UCX
# transports (UCX_TLS) to use at runtime: DC plus the CUDA-aware transports.
easyblock = 'SystemBundle'
name = 'UCX-settings'
version = 'DC-CUDA'
homepage = ''
# Fixed typos in the original description ("load the set UCX", "CUWA-aware").
description = 'This is a module to set UCX to use DC as the transport layer, together with the CUDA-aware transports.'
site_contacts = 'd.alvarez@fz-juelich.de'
# No toolchain and no sources: the module only sets environment variables.
toolchain = SYSTEM
source_urls = []
sources = []
modextravars = {
    # dc_x (DC) transport, loopback/shared-memory, and the CUDA transports.
    'UCX_TLS': 'dc_x,self,sm,cuda_ipc,gdr_copy,cuda_copy',
}
moduleclass = 'system'
# Easyconfig: settings-only bundle that generates a module selecting the UCX
# transports (UCX_TLS) to use at runtime: DC, without CUDA transports.
easyblock = 'SystemBundle'
name = 'UCX-settings'
version = 'DC'
homepage = ''
# Fixed garbled wording in the original description ("load the set UCX").
description = 'This is a module to set UCX to use DC as the transport layer.'
site_contacts = 'd.alvarez@fz-juelich.de'
# No toolchain and no sources: the module only sets environment variables.
toolchain = SYSTEM
source_urls = []
sources = []
modextravars = {
    # dc_x (DC) transport plus loopback and shared-memory.
    'UCX_TLS': 'dc_x,self,sm',
}
moduleclass = 'system'
# Easyconfig: settings-only bundle that generates a module selecting the UCX
# transports (UCX_TLS) to use at runtime: RC plus the CUDA-aware transports.
easyblock = 'SystemBundle'
name = 'UCX-settings'
version = 'RC-CUDA'
homepage = ''
# Fixed typos in the original description ("load the set UCX", "CUWA-aware").
description = 'This is a module to set UCX to use RC as the transport layer, together with the CUDA-aware transports.'
site_contacts = 'd.alvarez@fz-juelich.de'
# No toolchain and no sources: the module only sets environment variables.
toolchain = SYSTEM
source_urls = []
sources = []
modextravars = {
    # rc_x (RC) transport, loopback/shared-memory, and the CUDA transports.
    'UCX_TLS': 'rc_x,self,sm,cuda_ipc,gdr_copy,cuda_copy',
}
moduleclass = 'system'
# Easyconfig: settings-only bundle that generates a module selecting the UCX
# transports (UCX_TLS) to use at runtime: RC, without CUDA transports.
easyblock = 'SystemBundle'
name = 'UCX-settings'
version = 'RC'
homepage = ''
# Fixed garbled wording in the original description ("load the set UCX").
description = 'This is a module to set UCX to use RC as the transport layer.'
site_contacts = 'd.alvarez@fz-juelich.de'
# No toolchain and no sources: the module only sets environment variables.
toolchain = SYSTEM
source_urls = []
sources = []
modextravars = {
    # rc_x (RC) transport plus loopback and shared-memory.
    'UCX_TLS': 'rc_x,self,sm',
}
moduleclass = 'system'
# Easyconfig: settings-only bundle that generates a module selecting the UCX
# transports (UCX_TLS) to use at runtime: UD plus the CUDA-aware transports.
easyblock = 'SystemBundle'
name = 'UCX-settings'
version = 'UD-CUDA'
homepage = ''
# Fixed typos in the original description ("load the set UCX", "CUWA-aware").
description = 'This is a module to set UCX to use UD as the transport layer, together with the CUDA-aware transports.'
site_contacts = 'd.alvarez@fz-juelich.de'
# No toolchain and no sources: the module only sets environment variables.
toolchain = SYSTEM
source_urls = []
sources = []
modextravars = {
    # ud_x (UD) transport, loopback/shared-memory, and the CUDA transports.
    'UCX_TLS': 'ud_x,self,sm,cuda_ipc,gdr_copy,cuda_copy',
}
moduleclass = 'system'
# Easyconfig: settings-only bundle that generates a module selecting the UCX
# transports (UCX_TLS) to use at runtime: UD, without CUDA transports.
easyblock = 'SystemBundle'
name = 'UCX-settings'
version = 'UD'
homepage = ''
# Fixed garbled wording in the original description ("load the set UCX").
description = 'This is a module to set UCX to use UD as the transport layer.'
site_contacts = 'd.alvarez@fz-juelich.de'
# No toolchain and no sources: the module only sets environment variables.
toolchain = SYSTEM
source_urls = []
sources = []
modextravars = {
    # ud_x (UD) transport plus loopback and shared-memory.
    'UCX_TLS': 'ud_x,self,sm',
}
moduleclass = 'system'
...@@ -16,8 +16,6 @@ source_urls = [] ...@@ -16,8 +16,6 @@ source_urls = []
sources = [] sources = []
modextravars = { modextravars = {
'SLURM_MPI_TYPE': 'pspmix', 'SLURM_MPI_TYPE': 'pspmix',
'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
'UCX_MEMTYPE_CACHE': 'n',
'OMPI_MCA_mca_base_component_show_load_errors': '1', 'OMPI_MCA_mca_base_component_show_load_errors': '1',
'OMPI_MCA_mpi_param_check': '1', 'OMPI_MCA_mpi_param_check': '1',
'OMPI_MCA_mpi_show_handle_leaks': '1', 'OMPI_MCA_mpi_show_handle_leaks': '1',
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment