From a0675c9465c8ad1caea2a5eb28335ddec9be070d Mon Sep 17 00:00:00 2001
From: Damian Alvarez <swmanage@jwlogin04.juwels>
Date: Tue, 25 May 2021 19:02:01 +0200
Subject: [PATCH] Tell psmpi to abort via PMI so that processes are not left
 hanging. Works with psmpi >= 5.4.9-1

---
 .../p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb    | 1 +
 Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb        | 1 +
 Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-UCX-plain.eb   | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb | 1 +
 Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb     | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-mt-UCX-plain.eb        | 1 +
 Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb    | 3 +++
 Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-plain.eb       | 3 +++
 .../p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb            | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-plain.eb               | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb    | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-CUDA.eb                | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb             | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb    | 1 +
 .../jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb  | 1 +
 .../jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb    | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-CUDA.eb                | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb | 1 +
 .../p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb             | 1 +
 21 files changed, 25 insertions(+)

diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
index fe249b456..0e7bbd0c9 100644
--- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
+++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
@@ -31,6 +31,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'y',
 }
diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
index 4a10705c3..7ea994d41 100644
--- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
+++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
@@ -18,6 +18,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'n',
 }
diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-UCX-plain.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-UCX-plain.eb
index 6ce4d0b15..f60132e1f 100644
--- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-UCX-plain.eb
+++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-UCX-plain.eb
@@ -17,6 +17,7 @@ sources = []
 modextravars = {
     'PSP_OPENIB': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
 }
 
 moduleclass = 'system'
diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb
index fc428a303..bb1186440 100644
--- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb
+++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb
@@ -31,6 +31,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'y',
 }
diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb
index dba5738bf..879c26739 100644
--- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb
+++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb
@@ -18,6 +18,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'n',
 }
diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-UCX-plain.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-UCX-plain.eb
index 5d04d2f5e..ba74304a9 100644
--- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-UCX-plain.eb
+++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-UCX-plain.eb
@@ -17,6 +17,7 @@ sources = []
 modextravars = {
     'PSP_OPENIB': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
 }
 
 moduleclass = 'system'
diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb
index 9e7e165df..f6bc981a3 100644
--- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb
+++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb
@@ -14,5 +14,8 @@ toolchain = SYSTEM
 source_urls = []
 
 sources = []
+modextravars = {
+    'PSP_HARD_ABORT': '1',
+}
 
 moduleclass = 'system'
diff --git a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-plain.eb b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-plain.eb
index bdb810c78..6e5f31e73 100644
--- a/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-plain.eb
+++ b/Golden_Repo/p/psmpi-settings/psmpi-settings-5.4-plain.eb
@@ -14,5 +14,8 @@ toolchain = SYSTEM
 source_urls = []
 
 sources = []
+modextravars = {
+    'PSP_HARD_ABORT': '1',
+}
 
 moduleclass = 'system'
diff --git a/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb b/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb
index b7bedd599..8ee1cafa0 100644
--- a/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb
+++ b/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-plain.eb
@@ -17,6 +17,7 @@ sources = []
 
 modextravars = {
     'PSP_READAHEAD': '4096',
+    'PSP_HARD_ABORT': '1',
     'HFI_NO_CPUAFFINITY': '1',
 }
 
diff --git a/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb b/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb
index 98b3a08e2..c364c4846 100644
--- a/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb
+++ b/Overlays/jurecabooster_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb
@@ -17,6 +17,7 @@ sources = []
 
 modextravars = {
     'PSP_READAHEAD': '4096',
+    'PSP_HARD_ABORT': '1',
     'HFI_NO_CPUAFFINITY': '1',
 }
 
diff --git a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
index 0bfc29b0b..4c0c2ddc4 100644
--- a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
+++ b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
@@ -31,6 +31,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'y',
     'UCX_MAX_RNDV_RAILS': '1',
diff --git a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
index 4c2462d24..77094b637 100644
--- a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
+++ b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
@@ -18,6 +18,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'n',
     'UCX_MAX_RNDV_RAILS': '1',
diff --git a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb
index 7e249ae8e..af53fdc9c 100644
--- a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb
+++ b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb
@@ -31,6 +31,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'y',
     'UCX_MAX_RNDV_RAILS': '1',
diff --git a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb
index 05015464a..c5b260f0b 100644
--- a/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb
+++ b/Overlays/jurecadc_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb
@@ -18,6 +18,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'n',
     'UCX_MAX_RNDV_RAILS': '1',
diff --git a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
index 3622c587c..b35b112ac 100644
--- a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
+++ b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
@@ -31,6 +31,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'y',
     'UCX_NET_DEVICES': 'mlx5_1:1',
diff --git a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
index de15dc6db..da1f9a94b 100644
--- a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
+++ b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
@@ -18,6 +18,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'n',
     'UCX_NET_DEVICES': 'mlx5_1:1',
diff --git a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb
index 521259762..a37f3ef07 100644
--- a/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb
+++ b/Overlays/jusuf_overlay/p/psmpi-settings/psmpi-settings-5.4-plain.eb
@@ -15,6 +15,7 @@ source_urls = []
 
 sources = []
 modextravars = {
+    'PSP_HARD_ABORT': '1',
     'UCX_NET_DEVICES': 'mlx5_1:1',
 }
 
diff --git a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
index 0bfc29b0b..4c0c2ddc4 100644
--- a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
+++ b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA-low-latency.eb
@@ -31,6 +31,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'y',
     'UCX_MAX_RNDV_RAILS': '1',
diff --git a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
index 4c2462d24..77094b637 100644
--- a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
+++ b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-CUDA.eb
@@ -18,6 +18,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'n',
     'UCX_MAX_RNDV_RAILS': '1',
diff --git a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb
index 7e249ae8e..af53fdc9c 100644
--- a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb
+++ b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA-low-latency.eb
@@ -31,6 +31,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'y',
     'UCX_MAX_RNDV_RAILS': '1',
diff --git a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb
index 05015464a..c5b260f0b 100644
--- a/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb
+++ b/Overlays/juwelsbooster_overlay/p/psmpi-settings/psmpi-settings-5.4-mt-CUDA.eb
@@ -18,6 +18,7 @@ modextravars = {
     'PSP_CUDA': '1',
     'PSP_SHM': '0',
     'PSP_UCP': '1',
+    'PSP_HARD_ABORT': '1',
     'UCX_TLS': 'rc_x,cuda_ipc,gdr_copy,self,sm,cuda_copy',
     'UCX_MEMTYPE_CACHE': 'n',
     'UCX_MAX_RNDV_RAILS': '1',
-- 
GitLab