From a0675c9465c8ad1caea2a5eb28335ddec9be070d Mon Sep 17 00:00:00 2001 From: Damian Alvarez <swmanage@jwlogin04.juwels> Date: Tue, 25 May 2021 19:02:01 +0200 Subject: [PATCH] Set PSP_HARD_ABORT so that psmpi aborts via PMI and does not leave processes hanging. Works with psmpi >= 5.4.9-1 --- .../p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb | 1 + Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb | 1 + Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-UCX-plain.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb | 1 + Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-mt-UCX-plain.eb | 1 + Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb | 3 +++ Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-plain.eb | 3 +++ .../p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-plain.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-CUDA.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb | 1 + .../jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb | 1 + .../jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-CUDA.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb | 1 + .../p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb | 1 + 21 files changed, 25 insertions(+) diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb index fe249b456..0e7bbd0c9 100644 --- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb +++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb @@ -31,6 +31,7 @@ 
modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'y', } diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb index 4a10705c3..7ea994d41 100644 --- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb +++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb @@ -18,6 +18,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'n', } diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-UCX-plain.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-UCX-plain.eb index 6ce4d0b15..f60132e1f 100644 --- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-UCX-plain.eb +++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-UCX-plain.eb @@ -17,6 +17,7 @@ sources = [] modextravars = { 'PSP_OPENIB': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', } moduleclass = 'system' diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb index fc428a303..bb1186440 100644 --- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb +++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb @@ -31,6 +31,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'y', } diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb index dba5738bf..879c26739 100644 --- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb +++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb @@ -18,6 +18,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 
'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'n', } diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-UCX-plain.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-UCX-plain.eb index 5d04d2f5e..ba74304a9 100644 --- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-UCX-plain.eb +++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-UCX-plain.eb @@ -17,6 +17,7 @@ sources = [] modextravars = { 'PSP_OPENIB': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', } moduleclass = 'system' diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb index 9e7e165df..f6bc981a3 100644 --- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb +++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb @@ -14,5 +14,8 @@ toolchain = SYSTEM source_urls = [] sources = [] +modextravars = { + 'PSP_HARD_ABORT': '1', +} moduleclass = 'system' diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-plain.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-plain.eb index bdb810c78..6e5f31e73 100644 --- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-plain.eb +++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-plain.eb @@ -14,5 +14,8 @@ toolchain = SYSTEM source_urls = [] sources = [] +modextravars = { + 'PSP_HARD_ABORT': '1', +} moduleclass = 'system' diff --git a/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb b/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb index b7bedd599..8ee1cafa0 100644 --- a/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb +++ b/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb @@ -17,6 +17,7 @@ sources = [] modextravars = { 'PSP_READAHEAD': '4096', + 'PSP_HARD_ABORT': '1', 'HFI_NO_CPUAFFINITY': '1', } diff --git 
a/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb b/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb index 98b3a08e2..c364c4846 100644 --- a/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb +++ b/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb @@ -17,6 +17,7 @@ sources = [] modextravars = { 'PSP_READAHEAD': '4096', + 'PSP_HARD_ABORT': '1', 'HFI_NO_CPUAFFINITY': '1', } diff --git a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb index 0bfc29b0b..4c0c2ddc4 100644 --- a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb +++ b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb @@ -31,6 +31,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'y', 'UCX_MAX_RNDV_RAILS': '1', diff --git a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb index 4c2462d24..77094b637 100644 --- a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb +++ b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb @@ -18,6 +18,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'n', 'UCX_MAX_RNDV_RAILS': '1', diff --git a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb index 7e249ae8e..af53fdc9c 100644 --- a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb +++ 
b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb @@ -31,6 +31,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'y', 'UCX_MAX_RNDV_RAILS': '1', diff --git a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb index 05015464a..c5b260f0b 100644 --- a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb +++ b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb @@ -18,6 +18,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'n', 'UCX_MAX_RNDV_RAILS': '1', diff --git a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb index 3622c587c..b35b112ac 100644 --- a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb +++ b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb @@ -31,6 +31,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'y', 'UCX_NET_DEVICES': 'mlx5_1:1', diff --git a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb index de15dc6db..da1f9a94b 100644 --- a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb +++ b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb @@ -18,6 +18,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'n', 
'UCX_NET_DEVICES': 'mlx5_1:1', diff --git a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb index 521259762..a37f3ef07 100644 --- a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb +++ b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb @@ -15,6 +15,7 @@ source_urls = [] sources = [] modextravars = { + 'PSP_HARD_ABORT': '1', 'UCX_NET_DEVICES': 'mlx5_1:1', } diff --git a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb index 0bfc29b0b..4c0c2ddc4 100644 --- a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb +++ b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb @@ -31,6 +31,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'y', 'UCX_MAX_RNDV_RAILS': '1', diff --git a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb index 4c2462d24..77094b637 100644 --- a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb +++ b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb @@ -18,6 +18,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'n', 'UCX_MAX_RNDV_RAILS': '1', diff --git a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb index 7e249ae8e..af53fdc9c 100644 --- 
a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb +++ b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb @@ -31,6 +31,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'y', 'UCX_MAX_RNDV_RAILS': '1', diff --git a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb index 05015464a..c5b260f0b 100644 --- a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb +++ b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb @@ -18,6 +18,7 @@ modextravars = { 'PSP_CUDA': '1', 'PSP_SHM': '0', 'PSP_UCP': '1', + 'PSP_HARD_ABORT': '1', 'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy', 'UCX_MEMTYPE_CACHE': 'n', 'UCX_MAX_RNDV_RAILS': '1', -- GitLab