From 9b4b18a4b1b3e728c8161d3a43ada67e7c046fa5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yannik=20M=C3=BCller?= <y.mueller@fz-juelich.de>
Date: Tue, 12 Sep 2023 16:59:35 +0200
Subject: [PATCH] Added native portals support Based on portals4 using ptl_put
 (RDMA) three kernels semi-, uni- and bi-directional were added. Debug
 options were cleaned up HAVE_XX was unified for CUDA

---
 .gitignore                    |  11 ++
 benchmark/.gitignore          |  10 --
 benchmark/Makefile            | 186 ++++++++++++----------
 benchmark/benchmark.cc        |  86 ++++-------
 benchmark/benchmark.h         |   6 +-
 benchmark/cmdline.cc          |  18 +--
 benchmark/error.cc            |  20 ++-
 benchmark/error.h             |  14 +-
 benchmark/gpu_nvidia.h        |   4 +-
 benchmark/linktest.cc         |  18 +--
 benchmark/memory.cc           |  10 +-
 benchmark/memory.h            |   8 +-
 benchmark/memory_multi.cc     |  10 +-
 benchmark/output_sion.cc      |   5 +-
 benchmark/portals4_macros.h   |  18 +++
 benchmark/vcluster.cc         | 154 ++++++++++---------
 benchmark/vcluster.h          |  33 ++--
 benchmark/vcluster_cuda.cc    |   2 +-
 benchmark/vcluster_helper.cc  |   7 +-
 benchmark/vcluster_mpi.cc     |   7 +-
 benchmark/vcluster_portals.cc | 280 ++++++++++++++++++++++++++++++++++
 benchmark/vcluster_portals.h  | 101 ++++++++++++
 benchmark/vcluster_tcp.cc     |  12 +-
 exampleBuild.sh               |   2 +-
 test/Default.xml              |  37 +++--
 test/LayerTest.xml            |  25 ++-
 test/LinktestMain.xml         |  95 +++++++-----
 test/execute_base.sbatch      |   4 +-
 28 files changed, 843 insertions(+), 340 deletions(-)
 delete mode 100644 benchmark/.gitignore
 create mode 100644 benchmark/portals4_macros.h
 create mode 100644 benchmark/vcluster_portals.cc
 create mode 100644 benchmark/vcluster_portals.h

diff --git a/.gitignore b/.gitignore
index 78a13cd..ff11029 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,14 @@
 *.egg-info
 install/
 **.ipynb_checkpoints/
+# Ignore generated source files
+benchmark/cuda_kernels.cc
+# Ignore executables
+**/linktest
+**/linktest.mpi
+**/linktest.tcp
+**/linktest.cuda
+**/linktest.psm2
+**/linktest.ucp
+**/linktest.ibverbs
+**/linktest.portals
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
deleted file mode 100644
index fe3166c..0000000
--- a/benchmark/.gitignore
+++ /dev/null
@@ -1,10 +0,0 @@
-# Ignore generated source files
-cuda_kernels.cc
-# Ignore executables
-**/linktest
-**/linktest.mpi
-**/linktest.tcp
-**/linktest.cuda
-**/linktest.psm2
-**/linktest.ucp
-**/linktest.ibverbs
\ No newline at end of file
diff --git a/benchmark/Makefile b/benchmark/Makefile
index f28a140..7a27cfc 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -6,21 +6,41 @@
 #**                                                                         **
 #**  See the file COPYRIGHT in the package base directory for details       **
 #****************************************************************************/
-PREFIX    = /usr/local/bin
 
-USE_POSIX                 = 1
+# DEFAULTS
+# =========================================
+PREFIX    = /usr/local/bin
 
-HAVE_SION                 = 1
-HAVE_MPI                  = 1
-HAVE_MINIPMI              = 0
-HAVE_TCP                  = 1
-HAVE_IBVERBS              = 0
-HAVE_PSM2                 = 0
-HAVE_CUDA                 = 0
-HAVE_UCP                  = 0
+USE_POSIX     = 1
+HAVE_SION     = 1
+HAVE_MPI      = 1
+HAVE_MINIPMI  = 0
+HAVE_TCP      = 1
+HAVE_IBVERBS  = 0
+HAVE_PSM2     = 0
+HAVE_CUDA     = 0
+HAVE_UCP      = 0
+HAVE_PORTALS  = 0
 
 FSANITIZE = address
 
+SYSTEM   = generic
+GIT_HASH := $(shell git rev-parse --verify HEAD)
+GIT_HASH_SHORT := $(shell git rev-parse --verify --short HEAD)
+CXX       = mpicxx
+CXXFLAGS = -std=c++17 -Wall -g -rdynamic
+CPPFLAGS =	-D_GNU_SOURCE \
+			-DLINKTEST_LINUX=1 \
+			-DLINKTEST_SYSTEM="\"$(SYSTEM)\"" \
+			-DGIT_HASH=\"$(GIT_HASH)\" \
+			-DGIT_HASH_SHORT=\"$(GIT_HASH_SHORT)\"
+LD       = $(CXX)
+LDFLAGS  =
+LIBS     =
+# =========================================
+
+# Handle Dependencies
+# =========================================
 ifeq (1, $(HAVE_IBVERBS))
 	HAVE_MINIPMI = 1
 	HAVE_TCP     = 1
@@ -37,8 +57,12 @@ ifeq (1, $(HAVE_UCP))
 	HAVE_MINIPMI = 1
 	HAVE_TCP     = 1
 endif
+ifeq (1, $(HAVE_PORTALS))
+	HAVE_MINIPMI = 1
+	HAVE_TCP     = 1
+endif
 
-ifdef V
+ifdef VERBOSE
 $(info USE_POSIX    = $(USE_POSIX))
 $(info HAVE_SION    = $(HAVE_SION))
 $(info HAVE_MPI     = $(HAVE_MPI))
@@ -48,61 +72,13 @@ $(info HAVE_IBVERBS = $(HAVE_IBVERBS))
 $(info HAVE_PSM2    = $(HAVE_PSM2))
 $(info HAVE_CUDA    = $(HAVE_CUDA))
 $(info HAVE_UCP     = $(HAVE_UCP))
+$(info HAVE_PORTALS = $(HAVE_PORTALS))
 endif
+# =========================================
 
-SYSTEM   = generic
-GIT_HASH = $(shell git rev-parse --verify HEAD)
-GIT_HASH_SHORT= $(shell git rev-parse --verify --short HEAD)
-CC       = mpicxx
-CFLAGS   = -std=c++17 -Wall
-CPPFLAGS = -D_GNU_SOURCE -DLINKTEST_LINUX=1 -DLINKTEST_SYSTEM="\"$(SYSTEM)\"" -DGIT_HASH=\"$(GIT_HASH)\" -DGIT_HASH_SHORT=\"$(GIT_HASH_SHORT)\"
-LD       = $(CC)
-LDFLAGS  = 
-LIBS     =
-
-# Use POSIX
-ifeq (1, ${USE_POSIX})
-	CPPFLAGS += -D__USE_POSIX
-endif
-
-# SIONlib Options
-ifeq (1, $(HAVE_SION))
-#	CFLAGS   +=
-	CPPFLAGS += -D_FILE_OFFSET_BITS=64 -DUSE_SION=1 $(shell sionconfig --64 --gcc --cflags --mpi)
-#	LDFLAGS  +=
-	LIBS     += $(shell sionconfig --64 --gcc --libs --mpi)
-endif
-
-# MINIPMI Options
-ifeq (1, $(HAVE_MINIPMI))
-#	CFLAGS   +=
-	CPPFLAGS += -Iminipmi -DHAVE_MINIPMI=1
-	LDFLAGS  += -Lminipmi
-	LIBS     += -lminipmi
-endif
-
-# UCP Options
-ifeq (1, $(HAVE_UCP))
-#	CFLAGS   +=
-#	CPPFLAGS +=
-#	LDFLAGS  +=
-#	LIBS     +=
-endif
-
-# CUDA Options
-ifeq (1, $(HAVE_CUDA))
-	CU       = nvcc
-	CUARCH   = 
-	ifeq (, $(CUARCH))
-$(error CUARCH is not set)
-	endif
-	CUFLAGS  = --gpu-architecture $(CUARCH)
-#	CFLAGS   +=
-	CPPFLAGS += -I$(CUDA)/include -DHAVE_CUDA=1
-	LDFLAGS  += -L$(CUDA)/lib
-	LIBS     += -lcuda -lcudart
-endif
 
+# DEFINE EXECUTABLES
+# =========================================
 linktest-versions = 
 ifeq (1, $(HAVE_MPI))
 	linktest-versions += linktest.mpi
@@ -120,15 +96,22 @@ ifeq (1, $(HAVE_MINIPMI))
 	ifeq (1, $(HAVE_UCP))
 		linktest-versions += linktest.ucp
 	endif
+	ifeq (1, $(HAVE_PORTALS))
+		linktest-versions += linktest.portals
+	endif
 	ifeq (1, $(HAVE_CUDA))
 		linktest-versions += linktest.cuda
 	endif
 endif
 
-ifdef V
+ifdef VERBOSE
 $(info linktest-versions = $(linktest-versions))
 endif
+# =========================================
+
 
+# DEFINE OBJECT FILES AND FLAGS
+# =========================================
 linktest-obj = linktest.o \
                system.o \
                benchmark.o \
@@ -152,13 +135,16 @@ linktest-obj = linktest.o \
 
 ifeq (1, $(HAVE_MPI))
 	linktest-obj += vcluster_mpi.o
-	CFLAGS       += -DHAVE_VCLUSTER_MPI=1
+	CPPFLAGS     += -DHAVE_VCLUSTER_MPI=1
 endif
 ifeq (1, $(HAVE_TCP))
 	linktest-obj += vcluster_tcp.o
-	CFLAGS       += -DHAVE_VCLUSTER_TCP=1
+	CPPFLAGS     += -DHAVE_VCLUSTER_TCP=1
 endif
 ifeq (1, $(HAVE_MINIPMI))
+	CPPFLAGS += -Iminipmi -DHAVE_MINIPMI=1
+	LDFLAGS  += -Lminipmi
+	LIBS     += -lminipmi
 	ifeq (1, $(HAVE_IBVERBS))
 		linktest-obj += vcluster_ibverbs.o \
 		                ibverbs_mr.o \
@@ -166,39 +152,70 @@ ifeq (1, $(HAVE_MINIPMI))
 		                ibverbs_cq.o \
 		                ibverbs_pd.o \
 		                ibverbs_ctx.o
-		CFLAGS       += -DHAVE_VCLUSTER_IBVERBS=1 -DIBVERBS_SEND_INLINE=1
+		CPPFLAGS     += -DHAVE_VCLUSTER_IBVERBS=1 -DIBVERBS_SEND_INLINE=1
 		LIBS         += -libverbs
 	endif
 	ifeq (1, $(HAVE_PSM2))
 		linktest-obj += vcluster_psm2.o
-		CFLAGS       += -DHAVE_VCLUSTER_PSM2=1
+		CPPFLAGS     += -DHAVE_VCLUSTER_PSM2=1
 		LIBS         += -lpsm2
 	endif
 	ifeq (1, $(HAVE_UCP))
 		linktest-obj += vcluster_ucp.o
-		CFLAGS       += -DHAVE_VCLUSTER_UCP=1
+		CPPFLAGS     += -DHAVE_VCLUSTER_UCP=1
 		LIBS         += -lucp
 	endif
+	ifeq (1, $(HAVE_PORTALS))
+		linktest-obj += vcluster_portals.o
+		CPPFLAGS     += -DHAVE_VCLUSTER_PORTALS=1 
+		LDFLAGS      += -Lportals
+		LIBS         += -lportals
+	endif
 	ifeq (1, $(HAVE_CUDA))
 		linktest-obj += vcluster_cuda.o \
 		                cuda_kernels.o \
 		                gpu_nvidia.o \
 		                memory_cuda.o
-		CFLAGS       += -DHAVE_VCLUSTER_CUDA=1
+		CU            = nvcc
+		CUARCH        = 
+		ifeq (, $(CUARCH))
+$(error CUARCH is not set)
+		endif
+		CUFLAGS       = --gpu-architecture $(CUARCH) -DHAVE_VCLUSTER_CUDA=1
+		CPPFLAGS     += -I$(CUDA)/include -DHAVE_VCLUSTER_CUDA=1
+		LDFLAGS      += -L$(CUDA)/lib
+		LIBS         += -lcuda -lcudart
 	endif
 endif
 
 ifeq (1, $(HAVE_SION))
 	linktest-obj += vcluster_sion_generic_adapter.o
+	CPPFLAGS     += -D_FILE_OFFSET_BITS=64 -DUSE_SION=1 $(shell sionconfig --64 --gcc --cflags --mpi)
+	LIBS         += $(shell sionconfig --64 --gcc --libs --mpi)
+endif
+
+ifeq (1, ${USE_POSIX})
+	CPPFLAGS += -D__USE_POSIX
+endif
+
+ifdef VERBOSE
+$(info linktest-obj = $(linktest-obj))
+$(info CXXFLAGS = $(CXXFLAGS))
+$(info CPPFLAGS = $(CPPFLAGS))
+$(info LDFLAGS = $(LDFLAGS))
+$(info LIBS = $(LIBS))
 endif
+# =========================================
 
-ifdef V
-	Q =
+# DEFINE MAKE RULES
+# =========================================
+ifdef VERBOSE
+	QUIET =
 else
-	Q = @
+	QUIET = @
 endif
 
-link = $(Q)ln -s linktest linktest.$(1)
+link = $(QUIET)ln -s linktest linktest.$(1)
 
 SYMB_EXE := $(shell find . -type l -iname "linktest.*")
 
@@ -209,36 +226,36 @@ all: optimized
 compile: linktest $(linktest-versions)
 
 .PHONY: optimized
-optimized: CFLAGS += -O3
+optimized: CXXFLAGS += -O3
 optimized: compile
 
 .PHONY: debug
-debug: CFLAGS += -O0 -g
+debug: CXXFLAGS += -O0 -g
 debug: compile
 
 .PHONY: sanitized
 sanitized: debug
-sanitized: CFLAGS   += -fsanitize=$(FSANITIZE) -static-libasan -fno-omit-frame-pointer
+sanitized: CXXFLAGS += -fsanitize=$(FSANITIZE) -static-libasan -fno-omit-frame-pointer
 sanitized: LDFLAGS  += -fsanitize=$(FSANITIZE) -static-libasan
 sanitized: compile
 
 memory_cuda.cc: cuda_kernels.cc
 
 %.o: %.cc
-	@echo " "CC $@
-	$(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@	
+	@echo " "CXX $@
+	$(QUIET)$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@	
 
 %.cubin: %.cu
 	@echo " "CU $@
-	$(Q)$(CU) $(CUFLAGS) --cubin $< -o $@
+	$(QUIET)$(CU) $(CUFLAGS) --cubin $< -o $@
 
 %.cc: %.cubin convert.py
 	@echo " "CONVERT $@
-	$(Q)python3 convert.py $< $@ $(basename $@)
+	$(QUIET)python3 convert.py $< $@ $(basename $@)
 
 linktest: $(linktest-obj)
 	@echo " "LD $@
-	$(Q)$(LD) $(LDFLAGS) $^ $(LIBS) -o $@
+	$(QUIET)$(LD) $(LDFLAGS) $^ $(LIBS) -o $@
 
 linktest.tcp: linktest
 	@echo " "LN $@
@@ -260,6 +277,10 @@ linktest.ucp: linktest
 	@echo " "LN $@
 	$(call link,ucp)
 
+linktest.portals: linktest
+	@echo " "LN $@
+	$(call link,portals)
+
 linktest.cuda: linktest
 	@echo " "LN $@
 	$(call link,cuda)
@@ -274,3 +295,4 @@ install: linktest $(linktest-versions)
 	for f in $^ ; do              \
 		cp -d $$f $(PREFIX)/$$f ; \
 	done
+# =========================================
\ No newline at end of file
diff --git a/benchmark/benchmark.cc b/benchmark/benchmark.cc
index d913e12..444c346 100644
--- a/benchmark/benchmark.cc
+++ b/benchmark/benchmark.cc
@@ -19,6 +19,9 @@
 #include "environ.h"
 #include "format_units.h"
 #include "format_print.h"
+#if HAVE_VCLUSTER_PORTALS == 1
+#include "vcluster_portals.h"
+#endif
 #include <cstdlib>
 #include <cstdio>
 #include <cstring>
@@ -99,7 +102,7 @@ int Benchmark::kernel(const int from, const int to, double* const time_per_msg,
         return cl->linktest_kbipingpong(from, to, *buf1, *buf2, args, time_per_msg);
     } else if (args->do_unidir) {
         if (args->use_multi_buf) {
-            auto M=((args->num_msg>args->num_warmup_msg)?args->num_msg:args->num_warmup_msg);
+            auto M = std::max(args->num_msg, args->num_warmup_msg);
             if(args->num_multi_buf==M){
                 return cl->linktest_kUniDirMultiBuf(from, to, *buf_multi, *buf2, args, time_per_msg, doBarrier);
             }else{
@@ -438,35 +441,10 @@ int Benchmark::work_pingpong_parallel(const int partner,const int sign, double*
     auto to   = (sign < 0) ? rank() : partner;
 
     barrier();
-#ifdef DEBUG_KERNEL_SYNCHRONIZATION
-    std::unique_ptr<StopwatchI> rootWatch = Stopwatchfactory::getRootWatch(rank());
-    duration_t tBeforeBarrier;
-    rootWatch->start();
-#endif
     EXEC_NOFAIL(kernel(from, to, &tmp1, true));
-#ifdef DEBUG_KERNEL_SYNCHRONIZATION
-    rootWatch->stop();
-    tBeforeBarrier=rootWatch->getDuration();
-#endif
     barrier();
-#ifdef DEBUG_KERNEL_SYNCHRONIZATION
-    rootWatch->stop();
-    printTimingIfRoot(rank(), "[Kernel A->B Before Barrier]", tBeforeBarrier          );
-    printTimingIfRoot(rank(), "[Kernel A->B After  Barrier]", rootWatch->getDuration());
-    barrier(); //Additional barrier to reduce desynchronization due to printing
-    rootWatch->start();
-#endif
     EXEC_NOFAIL(kernel(to, from, &tmp2, true));
-#ifdef DEBUG_KERNEL_SYNCHRONIZATION
-    rootWatch->stop();
-    tBeforeBarrier=rootWatch->getDuration();
-#endif
     barrier();
-#ifdef DEBUG_KERNEL_SYNCHRONIZATION
-    rootWatch->stop();
-    printTimingIfRoot(rank(), "[Kernel B->A Before Barrier]", tBeforeBarrier          );
-    printTimingIfRoot(rank(), "[Kernel B->A After  Barrier]", rootWatch->getDuration());
-#endif
 
     *time_per_msg = (sign > 0) ? tmp1 : tmp2;
 
@@ -479,38 +457,28 @@ int Benchmark::work_pingpong_parallel(const int partner,const int sign, double*
 int Benchmark::work_pingpong_serial(const int partner, double* const time_per_msg){
     for (auto i = 0; i < size(); ++i) {
         barrier();
-
         if (i == rank()) {
             EXEC_NOFAIL(kernel(rank(), partner, time_per_msg, false));
-            if ( 0 != i ){
-                EXEC_NOFAIL(cl->send(0, &partner     , 1));
-                EXEC_NOFAIL(cl->send(0,  time_per_msg, 1));
-            }
+            std::printf("%6d->%6d: %ss (%sB/s)\n",
+                i, 
+                partner,
+                UnitPrefix::SI_prefix(*time_per_msg, 12).get(), 
+                UnitPrefix::IEC_prefix(args->len_msg / *time_per_msg, 10).get()
+            );
         } else if (i == partner) {
             EXEC_NOFAIL(kernel(partner, rank(), nullptr, false));
-        }
-
-        if (rank() == 0){
-            int buddy;
-            double time;
-            if(likely(0!=i)){
-                EXEC_NOFAIL(cl->recv(i, &buddy, 1));
-                EXEC_NOFAIL(cl->recv(i, &time , 1));
-            }else{
-                buddy=partner;
-                time =*time_per_msg;
+        } else {
+            #if HAVE_VCLUSTER_PORTALS == 1
+            if(cl->nameRef() == VirtualClusterPortals::NAME) {
+                // 2 per kernel, warmup kernel and measuring kernel
+                barrier();
+                barrier();
+                barrier();
+                barrier();
             }
-            std::printf("%6d->%6d: %ss (%sB/s) (l=%d)\n",
-                i, 
-                buddy,
-                UnitPrefix::SI_prefix(time, 12).get(), 
-                UnitPrefix::IEC_prefix(args->len_msg/time, 10).get(), 
-                i
-            );
-            std::fflush(stdout);
+            #endif
         }
     }
-
     barrier();
 
     return SUCCESS;
@@ -533,7 +501,6 @@ int Benchmark::work_pingpong(const int step,double* const min_time,double* const
     } else {
         EXEC_IFFAIL(work_pingpong_serial(partner, &time_per_msg), fatal("work_pingpong_serial failed."));
     }
-
     stats->accesspattern[partner] = step + 1; // In SION file steps are numbered starting with 1.
     stats->ptimings[partner] = time_per_msg;
 
@@ -649,14 +616,23 @@ int Benchmark::gather_slow_pairs(struct slow_pair* const sp,const int n){
 }
 
 int Benchmark::retest_one_slow_pair(const int from,const int to, double* const time){
+    debug("Benchmark::retest_one_slow_pair(%d, %d)",from, to);
     double tv;
-
     barrier();
 
     if ((from == rank()) || (to == rank())) {
         EXEC_NOFAIL(kernel(from, to, &tv, false));
+    } else { //TODO remove this hack: separate MemoryBuffer preparation from kernels and ban all barriers from kernels
+        #if HAVE_VCLUSTER_PORTALS == 1
+        if(cl->nameRef() == VirtualClusterPortals::NAME) {
+            // 2 per kernel, warmup kernel and measuring kernel
+            barrier();
+            barrier();
+            barrier();
+            barrier();
+        }
+        #endif
     }
-
     barrier();
 
     if (0 != from) {
@@ -827,7 +803,7 @@ int Benchmark::init() {
             alloc.reset(new PosixMemAlignedAllocator());
             break;
         case(AllocatorCUDA):
-            #if HAVE_CUDA == 1
+            #if HAVE_VCLUSTER_CUDA == 1
                 if(cl->rank()==0){info("Using CUDA memory allocator"); std::fflush(stdout);}
                 gpudev.reset(new cuda::GpuDevice(System::singleton()->closest_gpu_device()));
                 gpuctx.reset(new cuda::GpuContext(gpudev.get()));
diff --git a/benchmark/benchmark.h b/benchmark/benchmark.h
index 2c68f8f..22bfe18 100644
--- a/benchmark/benchmark.h
+++ b/benchmark/benchmark.h
@@ -15,8 +15,9 @@
 #include <random>
 #include "stats.h"
 #include "slow_pairs.h"
+#include "error.h"
 
-#if HAVE_CUDA == 1
+#if HAVE_VCLUSTER_CUDA == 1
     #include "gpu_nvidia.h"
 #endif
 
@@ -38,6 +39,7 @@ namespace linktest{
             Benchmark() = default;
             Benchmark(const Benchmark&) = delete;
             Benchmark(Benchmark&&) = delete;
+            ~Benchmark() = default;
             int  main_cmdline();
             int  benchmark(); // Run the main benchmark
             [[nodiscard]] int                    rank()           const;
@@ -50,7 +52,7 @@ namespace linktest{
             void barrier() const;
             const struct linktest_args* args;
             std::unique_ptr<VirtualCluster> cl;
-            #if HAVE_CUDA == 1
+            #if HAVE_VCLUSTER_CUDA == 1
                 std::unique_ptr<cuda::GpuDevice>  gpudev;
                 std::unique_ptr<cuda::GpuContext> gpuctx; // Declaration order important! MemoryBuffer~ needs to be called before before GpuContext~
             #endif
diff --git a/benchmark/cmdline.cc b/benchmark/cmdline.cc
index 0baea8e..6c3ec9b 100644
--- a/benchmark/cmdline.cc
+++ b/benchmark/cmdline.cc
@@ -467,7 +467,7 @@ static bool special_cmdline_args(const std::string& prog, const std::vector<std:
         }
     }
 
-        return false;
+    return false;
 }
 
 static bool arg_match(const std::string& arg, const Argument& argdef){
@@ -661,13 +661,13 @@ const struct linktest_args* parse_cmdline_args(int argc, char **argv){
     }
 
     if(cmdline_args.alloc_typ==AllocatorCUDA){
-        #if HAVE_CUDA == 1
+        #if HAVE_VCLUSTER_CUDA == 1
         #else
             fatal("Requested CUDA memory-allocator type, but compiled without CUDA support.");
         #endif
     }else{
         if(cmdline_args.do_use_gpus||cmdline_args.virtual_cluster_implementation=="cuda"){
-            #if HAVE_CUDA == 1
+            #if HAVE_VCLUSTER_CUDA == 1
                 if(cmdline_args.alloc_typ==AllocatorDefault){
                     cmdline_args.alloc_typ=AllocatorCUDA;
                 } else {
@@ -782,11 +782,11 @@ void print_cmdline_usage(const std::string& prog)
         }
 
         std::string modeList = "[";
-        if(VirtualCluster::impls[0] != nullptr) {
-                for(auto i=0;VirtualCluster::impls[i];i++) {
-                        modeList = modeList + VirtualCluster::impls[i] + ", ";
-                }
-                modeList.erase(modeList.size()-2);
+        if(VirtualCluster::impls.size() > 0) {
+            for(const auto& name: VirtualCluster::impls) {
+                modeList = modeList + name + ", ";
+            }
+            modeList.erase(modeList.size()-2);
         }
         modeList += "]";
         std::fprintf(stderr,
@@ -821,7 +821,7 @@ void print_cmdline_args(const struct linktest_args* args){
             case(AllocatorPOSIXAlignedMalloc):
                 return "posix_memalign";
             case(AllocatorCUDA):
-                #if HAVE_CUDA == 1
+                #if HAVE_VCLUSTER_CUDA == 1
                     return "CUDA";
                 #else
                     return "No CUDA";
diff --git a/benchmark/error.cc b/benchmark/error.cc
index aaaae93..8915d20 100644
--- a/benchmark/error.cc
+++ b/benchmark/error.cc
@@ -28,54 +28,66 @@ static void report(const char* prefix, const char* file,
 
 void linktest_fatal(const char* file, const char* func, long line, const char* fmt, ...)
 {
+    #if REPORT_LEVEL >= REPORT_FATAL
     va_list vl;
 
     va_start(vl, fmt);
     report("fatal: ", file, func, line, fmt, vl);
     va_end(vl);
 
-    std::fflush(NULL);
+    std::fflush(nullptr);
+    #endif
     std::terminate();
 }
 
 void linktest_error(const char* file, const char* func, long line, const char* fmt, ...)
 {
+    #if REPORT_LEVEL >= REPORT_ERROR
     va_list vl;
 
     va_start(vl, fmt);
     report("error: ", file, func, line, fmt, vl);
     va_end(vl);
 
-    std::fflush(NULL);
+    std::fflush(nullptr);
+    #endif
 }
 
 void linktest_warn(const char* file, const char* func, long line, const char* fmt, ...)
 {
+    #if REPORT_LEVEL >= REPORT_WARN
     va_list vl;
 
     va_start(vl, fmt);
     report("warning: ", file, func, line, fmt, vl);
     va_end(vl);
 
-    std::fflush(NULL);
+    std::fflush(nullptr);
+    #endif
 }
 
 void linktest_info(const char* file, const char* func, long line, const char* fmt, ...)
 {
+    #if REPORT_LEVEL >= REPORT_INFO
     va_list vl;
 
     va_start(vl, fmt);
     report("info: ", file, func, line, fmt, vl);
     va_end(vl);
+
+    std::fflush(nullptr);
+    #endif
 }
 
 void linktest_debug(const char* file, const char* func, long line, const char* fmt, ...)
 {
+    #if REPORT_LEVEL >= REPORT_DEBUG
     va_list vl;
 
     va_start(vl, fmt);
     report("debug: ", file, func, line, fmt, vl);
     va_end(vl);
 
-    std::fflush(NULL);
+    std::fflush(nullptr);
+    #endif
 }
\ No newline at end of file
diff --git a/benchmark/error.h b/benchmark/error.h
index 92160ac..e1db667 100644
--- a/benchmark/error.h
+++ b/benchmark/error.h
@@ -9,6 +9,17 @@
 #ifndef LINKTEST_ERROR_H
 #define LINKTEST_ERROR_H
 
+#define REPORT_NONE 0
+#define REPORT_FATAL 1
+#define REPORT_ERROR 2
+#define REPORT_WARN 3
+#define REPORT_INFO 4
+#define REPORT_DEBUG 5
+
+#ifndef REPORT_LEVEL
+#define REPORT_LEVEL REPORT_WARN
+#endif
+
 constexpr int SUCCESS = 0;
 constexpr int ERROR   = 1;
 
@@ -24,10 +35,11 @@ void linktest_debug(const char* file, const char* func, long line, const char* f
  * The names are pretty generic so we have to be careful to avoid naming conflicts
  * that result in hard to understand compiler errors.
  */
+// NOLINTBEGIN
 #define fatal(fmt, ...) linktest_fatal(__FILE__, __func__, __LINE__, fmt, ## __VA_ARGS__)
 #define error(fmt, ...) linktest_error(__FILE__, __func__, __LINE__, fmt, ## __VA_ARGS__)
 #define warn(fmt, ...) linktest_warn(__FILE__, __func__, __LINE__, fmt, ## __VA_ARGS__)
 #define info(fmt, ...) linktest_info(__FILE__, __func__, __LINE__, fmt, ## __VA_ARGS__)
 #define debug(fmt, ...) linktest_debug(__FILE__, __func__, __LINE__, fmt, ## __VA_ARGS__)
-
+// NOLINTEND
 #endif
\ No newline at end of file
diff --git a/benchmark/gpu_nvidia.h b/benchmark/gpu_nvidia.h
index fbf77a9..0d3b386 100644
--- a/benchmark/gpu_nvidia.h
+++ b/benchmark/gpu_nvidia.h
@@ -9,8 +9,8 @@
 #ifndef LINKTEST_GPU_NVIDIA_H
 #define LINKTEST_GPU_NVIDIA_H
 
-#if 1 != HAVE_CUDA
-#error gpu_nvidia can only compile with HAVE_CUDA=1
+#if 1 != HAVE_VCLUSTER_CUDA
+#error gpu_nvidia can only compile with HAVE_VCLUSTER_CUDA=1
 #endif
 
 #include "config.h"
diff --git a/benchmark/linktest.cc b/benchmark/linktest.cc
index 02d8623..1968a08 100644
--- a/benchmark/linktest.cc
+++ b/benchmark/linktest.cc
@@ -16,13 +16,14 @@
 #include "system.h"
 #include <thread>
 #include <memory>
+#include <iostream>
 
 void print_linktest_version()
 {
-        std::fprintf(stderr, "LinkTest (version %d.%d.%d)\n",
-                VERSION_MAJOR,
-                VERSION_MINOR,
-                VERSION_PATCH);
+    std::fprintf(stderr, "LinkTest (version %d.%d.%d)\n",
+        VERSION_MAJOR,
+        VERSION_MINOR,
+        VERSION_PATCH);
 }
 
 /* Errors are propagated up the backtrace as far as possible until
@@ -60,11 +61,7 @@ int main(int argc, char *argv[]){
 
         {
             /* Determine Virtual Cluster Type */
-            const auto name=get_vcluster_impl_name(argv,cmdline_args->virtual_cluster_implementation.c_str());
-            if (unlikely(!name)){
-                error("Failed to determine virtual-cluster implementation.");
-                return ERROR;
-            }
+            const auto name=VirtualCluster::get_vcluster_impl_name(argv,cmdline_args->virtual_cluster_implementation);
             
             /* Create Virtual Cluster */
             bench.cl.reset(VirtualCluster::factory(name));
@@ -94,7 +91,6 @@ int main(int argc, char *argv[]){
         error("Failed to execute benchmark.");
         return ERROR;
     }
-
     /* Finalize Benchmark */
     if (unlikely(bench.cl->finalize())) {
         error("Failed to finalize communication operations.");
@@ -107,5 +103,5 @@ int main(int argc, char *argv[]){
      * until the very end.
      */
 
-     return SUCCESS;
+    return SUCCESS;
 }
diff --git a/benchmark/memory.cc b/benchmark/memory.cc
index e67dd17..a9245b5 100644
--- a/benchmark/memory.cc
+++ b/benchmark/memory.cc
@@ -8,7 +8,7 @@
 ****************************************************************************/
 #include "memory.h"
 #include "compiler.h"
-#if HAVE_CUDA == 1
+#if HAVE_VCLUSTER_CUDA == 1
     #include "memory_cuda.h"
     #include "gpu_nvidia.h"
 #endif
@@ -89,7 +89,7 @@ void MemoryBuffer::memory_copy(MemoryBuffer& dst, MemoryBuffer& src){
 }
 
 MemoryBuffer MemoryBuffer::wrap(void* p, std::size_t len, AddressSpace::ID addr_space_id){
-    return MemoryBuffer(p, len, addr_space_id);
+    return {p, len, addr_space_id};
 }
 
 void MemoryBuffer::fill(){
@@ -109,7 +109,7 @@ void MemoryBuffer::fill(){
             if(use_mt()){
                 throw std::runtime_error("Not Implemented!");
             }else{
-                #if HAVE_CUDA == 1
+                #if HAVE_VCLUSTER_CUDA == 1
                     linktest::cuda::fill<char>(linktest::cuda::GpuContext::singleton(),
                                                pointer<char>(),
                                                pointer<char>() + len(), (char )0xff);
@@ -147,7 +147,7 @@ int MemoryBuffer::check(){
             }
             break;
         case AddressSpace::ID::CudaDeviceLocal:
-            #if HAVE_CUDA == 1
+            #if HAVE_VCLUSTER_CUDA == 1
                 throw std::runtime_error("check() called for local CUDA address space");
             #else
                 throw std::runtime_error("check() called on a CUDA address space but LinkTest was compiled without CUDA support");
@@ -231,7 +231,7 @@ int PosixMemAlignedAllocator::free(void* p, std::size_t len){
     return SUCCESS;
 }
 
-#if HAVE_CUDA == 1
+#if HAVE_VCLUSTER_CUDA == 1
     CudaDeviceAllocator::CudaDeviceAllocator(linktest::cuda::GpuContext* ctx):ctx_(ctx){}
     AddressSpace::ID CudaDeviceAllocator::address_space_id() const{
         return AddressSpace::ID::CudaDeviceLocal;
diff --git a/benchmark/memory.h b/benchmark/memory.h
index aa08943..1e15a67 100644
--- a/benchmark/memory.h
+++ b/benchmark/memory.h
@@ -14,7 +14,7 @@
 #include <cstdint>
 #include <unistd.h>
 
-#if HAVE_CUDA == 1
+#if HAVE_VCLUSTER_CUDA == 1
 	namespace linktest{
 		namespace cuda{
 			class Allocator;
@@ -129,7 +129,7 @@ class PosixMemAlignedAllocator : public Allocator {
 		size_t       pgsize_ = sysconf(_SC_PAGESIZE);
 };
 
-#if HAVE_CUDA == 1
+#if HAVE_VCLUSTER_CUDA == 1
 // A memory allocation on a GPU
 class CudaDeviceAllocator : public Allocator{
 	public:
@@ -165,8 +165,8 @@ class MemoryBuffer{
 
 		MemoryBuffer(const MemoryBuffer& other)             = delete;
 		MemoryBuffer& operator=(const MemoryBuffer& other)  = delete;
-		MemoryBuffer(MemoryBuffer&& other)                  = delete;
-		MemoryBuffer& operator=(const MemoryBuffer&& other) = delete;
+		MemoryBuffer(MemoryBuffer&& other)                  = default;
+		MemoryBuffer& operator=(MemoryBuffer&& other)       = default;
 
 		/* Wrap an existing pointer into a memory buffer. We do not know the allocator
 		 * and hence have to ingore it. This is acceptable since the allocator is not
diff --git a/benchmark/memory_multi.cc b/benchmark/memory_multi.cc
index ab85a0f..488486e 100644
--- a/benchmark/memory_multi.cc
+++ b/benchmark/memory_multi.cc
@@ -8,7 +8,7 @@
 ****************************************************************************/
 #include "memory_multi.h"
 #include "compiler.h"
-#if HAVE_CUDA == 1
+#if HAVE_VCLUSTER_CUDA == 1
 	#include "memory_cuda.h"
 	#include "gpu_nvidia.h"
 #endif
@@ -91,13 +91,13 @@ void MemoryBufferMulti::fill(){
 			break;
 		}case AddressSpace::ID::CudaDeviceLocal:{
 			if(use_mt()){
-				#if HAVE_CUDA == 1
+				#if HAVE_VCLUSTER_CUDA == 1
 					throw std::runtime_error("fill() for multiple buffers in CUDA address spaces not yet implemented");
 				#else
 					throw std::runtime_error("fill() called on CUDA address space but linktest was compiled without CUDA");
 				#endif
 			}else{
-				#if HAVE_CUDA == 1
+				#if HAVE_VCLUSTER_CUDA == 1
 					throw std::runtime_error("fill() for multiple buffers in CUDA address spaces not yet implemented");
 				#else
 					throw std::runtime_error("fill() called on CUDA address space but linktest was compiled without CUDA");
@@ -146,13 +146,13 @@ int MemoryBufferMulti::check(std::size_t* buffer, std::size_t* byte){
 			break;
 		}case AddressSpace::ID::CudaDeviceLocal:{
 			if(use_mt()){
-					#if HAVE_CUDA == 1
+					#if HAVE_VCLUSTER_CUDA == 1
 					throw std::runtime_error("check(buffer,byte) for multiple buffers in CUDA address spaces not yet implemented");
 				#else
 					throw std::runtime_error("check(buffer,byte) called on CUDA address space but linktest was compiled without CUDA");
 				#endif
 			}else{
-				#if HAVE_CUDA == 1
+				#if HAVE_VCLUSTER_CUDA == 1
 					throw std::runtime_error("check(buffer,byte) for multiple buffers in CUDA address spaces not yet implemented");
 				#else
 					throw std::runtime_error("check(buffer,byte) called on CUDA address space but linktest was compiled without CUDA");
diff --git a/benchmark/output_sion.cc b/benchmark/output_sion.cc
index f671e1f..0748a50 100644
--- a/benchmark/output_sion.cc
+++ b/benchmark/output_sion.cc
@@ -199,7 +199,7 @@ static int linktest_output_sion_funnelled_root(VirtualCluster* cl,
             return ERROR;
         }
     }
-
+    debug("linktest_output_sion_funnelled_root->barrier");
     EXEC_NOFAIL(cl->barrier());
 
     printTimingIfRoot(cl->rank(), "[sioncollect]", std::chrono::duration<double>(walltime() - begin));
@@ -326,11 +326,13 @@ int linktest_output_sion_parallel(VirtualCluster* cl,
     };
 
     auto sion_api = create_and_register_api(args->virtual_cluster_implementation);
+    debug("linktest_output_sion_parallel 1->barrier");
     cl->barrier();
 
     char* buffer;
     long long sz;
     EXEC_IFFAIL(linktest_output_sion_collect_local_data(cl, args, statsVec, &buffer, &sz), error("linktest_output_sion_collect_local_data failed."); return ERROR);
+    debug("linktest_output_sion_parallel 2->barrier");
     cl->barrier();
 
     auto filename = args->output.c_str();
@@ -362,6 +364,7 @@ int linktest_output_sion_parallel(VirtualCluster* cl,
         &fp, //fileptr
         &newfname //newfname
     );
+    debug("linktest_output_sion_parallel 3->barrier");
     cl->barrier();
     rootWatch->stop();
     printTiming("[sionopen]");
diff --git a/benchmark/portals4_macros.h b/benchmark/portals4_macros.h
new file mode 100644
index 0000000..91fb342
--- /dev/null
+++ b/benchmark/portals4_macros.h
@@ -0,0 +1,18 @@
+#ifndef LINKTEST_PORTALS4MACROS_H
+#define LINKTEST_PORTALS4MACROS_H
+// NOLINTBEGIN
+#define CHECK_RETURNVAL(x) do { int ret; /* evaluate x once; abort with a diagnostic unless it yields PTL_OK or PTL_IGNORED */ \
+    switch (ret = (x)) { \
+        case PTL_IGNORED: \
+        case PTL_OK: break; \
+        case PTL_FAIL: fprintf(stderr, "=> %s returned PTL_FAIL (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \
+        case PTL_NO_SPACE: fprintf(stderr, "=> %s returned PTL_NO_SPACE (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \
+        case PTL_ARG_INVALID: fprintf(stderr, "=> %s returned PTL_ARG_INVALID (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \
+        case PTL_NO_INIT: fprintf(stderr, "=> %s returned PTL_NO_INIT (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \
+        case PTL_PT_IN_USE: fprintf(stderr, "=> %s returned PTL_PT_IN_USE (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \
+        case PTL_IN_USE: fprintf(stderr, "=> %s returned PTL_IN_USE (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \
+        default: fprintf(stderr, "=> %s returned failcode %i (line %u)\n", #x, ret, (unsigned int)__LINE__); abort(); break; \
+    } } while (0)
+// NOLINTEND
+
+#endif //PORTALS4MACROS
\ No newline at end of file
diff --git a/benchmark/vcluster.cc b/benchmark/vcluster.cc
index c493241..90d40b3 100644
--- a/benchmark/vcluster.cc
+++ b/benchmark/vcluster.cc
@@ -25,6 +25,9 @@
 #if HAVE_VCLUSTER_UCP == 1
 #include "vcluster_ucp.h"
 #endif
+#if HAVE_VCLUSTER_PORTALS == 1
+#include "vcluster_portals.h"
+#endif
 #if HAVE_VCLUSTER_CUDA == 1
 #include "vcluster_cuda.h"
 #endif
@@ -238,7 +241,7 @@ int VirtualCluster::linktest_kpingpong(const int from, const int to,
                                        const struct linktest_args* const args,
                                        double* const time){
     int err;
-
+    debug("VirtualCluster::linktest_kpingpong from %d to %d", from, to);
     // Warmup
     if(args->num_warmup_msg!=0){ //No use doing this if there are no warm-up messages.
         double dummy;
@@ -247,7 +250,6 @@ int VirtualCluster::linktest_kpingpong(const int from, const int to,
                       &dummy);
         if(unlikely(err))return ERROR;
     }
-
     err=kpingpong(from,to,buf,args->num_msg,time);
     if(unlikely(err))return err;
     if(args->check_buffers){
@@ -393,6 +395,30 @@ int VirtualCluster::linktest_kbipingpong(const int from, const int to,
     return SUCCESS;
 }
 
+const std::vector<std::string> VirtualCluster::impls = {
+    #if 1 == HAVE_VCLUSTER_TCP
+    "tcp",
+    #endif
+    #if 1 == HAVE_VCLUSTER_MPI
+    "mpi",
+    #endif
+    #if 1 == HAVE_VCLUSTER_IBVERBS
+    "ibverbs",
+    #endif
+    #if 1 == HAVE_VCLUSTER_PSM2
+    "psm2",
+    #endif
+    #if 1 == HAVE_VCLUSTER_UCP
+    "ucp",
+    #endif
+    #if 1 == HAVE_VCLUSTER_PORTALS
+    VirtualClusterPortals::NAME,
+    #endif
+    #if 1 == HAVE_VCLUSTER_CUDA
+    "cuda",
+    #endif
+};
+
 VirtualCluster*  VirtualCluster::factory(const std::string& name){
 #if 1 == HAVE_VCLUSTER_TCP
     if ("tcp" == name) {
@@ -419,6 +445,11 @@ VirtualCluster*  VirtualCluster::factory(const std::string& name){
         return new VirtualClusterUCP(name);
     } else
 #endif
+#if 1 == HAVE_VCLUSTER_PORTALS
+    if (VirtualClusterPortals::NAME == name) {
+        return new VirtualClusterPortals();
+    } else
+#endif
 #if 1 == HAVE_VCLUSTER_CUDA
     if ("cuda" == name) {
         return new VirtualClusterCUDA(name);
@@ -444,102 +475,85 @@ void VirtualClusterWithHelper::set_helper_pointer(VirtualCluster* helper)
 int VirtualClusterWithHelper::rank()
 {
     if (unlikely(!helper_))
-        throw;  // Simply returning -1 will result in complicated bugs
+        fatal("rank() called on a VirtualClusterWithHelper with undefined helper");
     return helper_->rank();
 }
 
 int VirtualClusterWithHelper::size()
 {
     if (unlikely(!helper_))
-        throw;  // Simply returning -1 will result in complicated bugs
+        fatal("size() called on a VirtualClusterWithHelper with undefined helper");
     return helper_->size();
 }
 
 int VirtualClusterWithHelper::send(int dst, MemoryBuffer& buf)
 {
     if (unlikely(!helper_))
-        return -1;
+        fatal("send() called on a VirtualClusterWithHelper with undefined helper");
     return helper_->send(dst, buf);
 }
 
 int VirtualClusterWithHelper::recv(int src, MemoryBuffer& buf)
 {
     if (unlikely(!helper_))
-        return -1;
+        fatal("recv() called on a VirtualClusterWithHelper with undefined helper");
     return helper_->recv(src, buf);
 }
 
-const char* VirtualCluster::impls[] = 
-    {
-        #if 1 == HAVE_VCLUSTER_TCP
-        "tcp",
-        #endif
-        #if 1 == HAVE_VCLUSTER_MPI
-        "mpi",
-        #endif
-        #if 1 == HAVE_VCLUSTER_IBVERBS
-        "ibverbs",
-        #endif
-        #if 1 == HAVE_VCLUSTER_PSM2
-        "psm2",
-        #endif
-        #if 1 == HAVE_VCLUSTER_UCP
-        "ucp",
-        #endif
-        #if 1 == HAVE_VCLUSTER_CUDA
-        "cuda",
-        #endif
-        nullptr
-    };
-
-const char* get_vcluster_impl_name(char** argv, const char* name)
+const std::string& VirtualCluster::get_vcluster_impl_name(char** argv, const std::string& name)
 {
-    /* Check If Virtual-Cluster Implementation Given In 'name' */
-    if(name[0]){ //Check if 'name' is a null string
-    for(auto i=0;VirtualCluster::impls[i];i++){ //Loop over possible virtual cluster implementations
-        /* Loop termination is handled by the fact that the last 'VirtualCluster::impls' is
-         * null pointer.
-         */
-        if (!strcmp(VirtualCluster::impls[i],name)){ //Compare command-line implementation to possible implementations
-        return VirtualCluster::impls[i];
-        }
+    std::string requestedImpl;
+
+    // check 'name'
+    if(name != "") {
+        requestedImpl = name;
     }
-    // If this point is reached an unknown/unsupported implementation was encountered.
-    error("Unknown/Unsupported command-line implementation encountered.");
-    return(NULL);
+
+    // check executable extension: text after the last '.' of the file name (a '.' in the directory part, e.g. "./", is ignored)
+    const std::string executableName(argv[0]);
+    const auto slash = executableName.rfind('/');
+    const auto pos = executableName.rfind('.');
+    if(pos != std::string::npos && (slash == std::string::npos || pos > slash)) {
+        requestedImpl = executableName.substr(pos + 1);
     }
 
-    /* Check Executable Extension For Virtual-Cluster Implementation */
-    // Determine suffix start
-    int i=std::strlen(argv[0])-1;
-    if(likely(i>=0)){
-    int suffix_offset=-1; //Integer indicating suffix start
-    while(true){
-        if(unlikely(argv[0][i]=='.')){
-        suffix_offset=i; //Identify suffix start.
-        break;
-        }
-        if(i--==0) break;
+    // Check environment
+    const char* envName;
+    if(read_environ_str(LINKTEST_ENVIRON_PREFIX "VCLUSTER_IMPL", &envName) == SUCCESS) {
+        requestedImpl = {envName};
     }
-    if(likely(suffix_offset>=0)){ //Check if suffix was encountered
-        // Compare suffix to supported virtual cluster implementations
-        for(auto i=0;VirtualCluster::impls[i];i++){ //Loop over possible virtual cluster implementations
-        /* Loop termination is handled by the fact that the last 'VirtualCluster::impls' is
-         * null pointer.
-         */
-        if (unlikely(!strcmp(VirtualCluster::impls[i],&(argv[0][suffix_offset+1])))){ //Compare suffix to possible implementations
-            return VirtualCluster::impls[i]; //Return identified virtual-cluster implementation
-        }
+
+    #if 1 == HAVE_VCLUSTER_TCP
+    debug("HAVE_VCLUSTER_TCP == 1");
+    #endif
+    #if 1 == HAVE_VCLUSTER_MPI
+    debug("HAVE_VCLUSTER_MPI == 1");
+    #endif
+    #if 1 == HAVE_VCLUSTER_IBVERBS
+    debug("HAVE_VCLUSTER_IBVERBS == 1");
+    #endif
+    #if 1 == HAVE_VCLUSTER_PSM2
+    debug("HAVE_VCLUSTER_PSM2 == 1");
+    #endif
+    #if 1 == HAVE_VCLUSTER_UCP
+    debug("HAVE_VCLUSTER_UCP == 1");
+    #endif
+    #if 1 == HAVE_VCLUSTER_PORTALS
+    debug("HAVE_VCLUSTER_PORTALS == 1");
+    #endif
+    #if 1 == HAVE_VCLUSTER_CUDA
+    debug("HAVE_VCLUSTER_CUDA == 1");
+    #endif
+    debug("requestedImpl = %s", requestedImpl.c_str());
+
+    for(const auto& impl : VirtualCluster::impls) {
+        if(impl == requestedImpl) {
+            return impl;
         }
-    }else if(unlikely(suffix_offset<-1)){ //This should not happen logically!
-        error("Internal Error: Unexpected negative executable-name-suffix offset encountered.");
-        return(NULL);
-    }
     }
-
-    /* Check If Environment Specifies Virtual-Cluster Implementation */
-    if(unlikely(read_environ_str(LINKTEST_ENVIRON_PREFIX "VCLUSTER_IMPL",&name))) return NULL;
-    return name;
+    
+    error("Unknown/Unsupported virtual-cluster implementation requested: '%s'", requestedImpl.c_str());
+    return VirtualCluster::impls.at(0); // only reached if error() returns; falls back to the first compiled-in implementation
 }
 
 int VirtualCluster::write_parallel(const linktest_args* args, const std::vector<LinktestStats>& statsVec)
diff --git a/benchmark/vcluster.h b/benchmark/vcluster.h
index 751c4fe..b516073 100644
--- a/benchmark/vcluster.h
+++ b/benchmark/vcluster.h
@@ -16,7 +16,6 @@
 #include <string>
 #include <memory>
 
-
 struct linktest_args;
 
 /* A virtual view of our cluster. The VirtualCluster
@@ -275,12 +274,11 @@ public:
     virtual int write_parallel(const linktest_args* args, const std::vector<LinktestStats>& statsVec);
     virtual int write_funnelled(const linktest_args* args, const std::vector<LinktestStats>& statsVec);
 
-    /* Given the name of the vcluster implementation create an instance. This
-        * function accesses an internal database to map the name of the implementation
-        * to a function that creates the instance. In order for this to work, the
-        * implementation needs to be properly registered (see linktest_vcluster.c).
-        */
-    static const char* impls[];
+    /** \brief List of supported transport protocols (built at compile time) */
+    static const std::vector<std::string> impls;
+    /** \return name of the requested transport protocol aka the virtual cluster implementation */
+    static const std::string& get_vcluster_impl_name(char** argv, const std::string& name);
+    /** \return pointer to requested VirtualCluster implementation */
     static VirtualCluster* factory(const std::string& name);
 
 private:
@@ -296,13 +294,14 @@ private:
           std::shared_ptr<int[]> hostLocalRanks_;
 };
 
-/* Since a full implementation of send()/recv() logic on top of some transport layer is
+/* VirtualClusterWithHelper 
+ * delegates calls to rank(), size(), send() and recv() to another VirtualCluster (the helper)
+ * executes calls to benchmark kernels directly
+ *
+ * Since a full implementation of send()/recv() logic on top of some transport layer is
  * more complicated than the logic required for the implementation of the communication
- * in kpingpong() it make sense to use a different VirtualCluster for the management 
- * communication than for the actual benchmark. 
- * VirtualClusterWithHelper allows derived classes to easily re-use another VirtualCluster
- * instance. We do not use inheritance since the helper logic is not really an "is-a"
- * relation.
+ * in our kernels it makes sense to use a different VirtualCluster for the management
+ * communication than for the actual benchmark.
  */
 class VirtualClusterWithHelper : public VirtualCluster {
 
@@ -320,7 +319,7 @@ public:
     int                    recv(int src, MemoryBuffer& buf) override;
 
 protected:
-    void    set_helper_pointer(VirtualCluster* helper);
+    void set_helper_pointer(VirtualCluster* helper);
     VirtualCluster*         helper_;
 
 };
@@ -338,12 +337,6 @@ enum vcluster_reduce_op {
     SUM_DOUBLE
 };
 
-/* Get the name of the vcluster implementation to be used. The implementation
- * can be chosen by means of argv[0] or an environment variable set by the
- * spawner.
- */
-const char* get_vcluster_impl_name(char** argv, const char* name);
-
 template<typename T>
 int VirtualCluster::send(const int dst,const T* const vals,const int len){
     auto tmp = MemoryBuffer::wrap<T>(const_cast<T*>(vals), len, AddressSpace::ID::Local);
diff --git a/benchmark/vcluster_cuda.cc b/benchmark/vcluster_cuda.cc
index 57ccc21..970d892 100644
--- a/benchmark/vcluster_cuda.cc
+++ b/benchmark/vcluster_cuda.cc
@@ -16,7 +16,7 @@
 #include "error.h"
 #include "output_sion.h"
 #include "pmi.h"
-#if HAVE_CUDA == 1
+#if HAVE_VCLUSTER_CUDA == 1
 #include "gpu_nvidia.h"
 #endif
 #include <cassert>
diff --git a/benchmark/vcluster_helper.cc b/benchmark/vcluster_helper.cc
index c284fa9..b977cf1 100644
--- a/benchmark/vcluster_helper.cc
+++ b/benchmark/vcluster_helper.cc
@@ -32,11 +32,16 @@ std::string determineHostname(){
 int determineCPUID(){
     return (std::int32_t)sched_getcpu();
 }
-
+#ifdef DEBUG_BARRIER
+static int barrierCounter=1;
+#endif
 int vcluster_helper_barrier(VirtualCluster* cl){
     /* We do not actually send data but we still need to have
      * a non-NULL buffer pointer
      */
+    #ifdef DEBUG_BARRIER
+    debug("vcluster_helper_barrier %d", barrierCounter++);
+    #endif
     char sp = 0;
     MemoryBuffer buf = MemoryBuffer::wrap<char>(&sp, 0, AddressSpace::ID::Local);
 
diff --git a/benchmark/vcluster_mpi.cc b/benchmark/vcluster_mpi.cc
index b481925..e7efc78 100644
--- a/benchmark/vcluster_mpi.cc
+++ b/benchmark/vcluster_mpi.cc
@@ -133,9 +133,14 @@ int VirtualClusterMPI::recv(int src, MemoryBuffer& buf)
                   src, 0, world_,
                   MPI_STATUS_IGNORE));
 }
-
+#ifdef DEBUG_BARRIER
+static int counter = 1;
+#endif
 int VirtualClusterMPI::barrier()
 {
+    #ifdef DEBUG_BARRIER
+    debug("VirtualClusterMPI::barrier %d",counter++);
+    #endif
     return _mpi_(MPI_Barrier(world_));
 }
 
diff --git a/benchmark/vcluster_portals.cc b/benchmark/vcluster_portals.cc
new file mode 100644
index 0000000..f408683
--- /dev/null
+++ b/benchmark/vcluster_portals.cc
@@ -0,0 +1,280 @@
+/****************************************************************************
+**  LinkTest                                                               **
+*****************************************************************************
+**  Copyright (c) 2008-2022                                                **
+**  Forschungszentrum Juelich, Juelich Supercomputing Centre               **
+**                                                                         **
+**  See the file COPYRIGHT in the package base directory for details       **
+****************************************************************************/
+#include "vcluster_portals.h"
+#include "portals4_macros.h"
+#include "memory.h"
+#include "error.h"
+#include "timing.h"
+#include "stopwatch.h"
+#include <assert.h>
+
+std::vector<ptl_process_t> VirtualClusterPortals::getPhysicalFromRank() {
+    ptl_process_t physId;
+    CHECK_RETURNVAL( PtlGetPhysId(mni_handle, &physId) );
+
+    debug("PMI Rank=%d, Hostname=%10s, Portals NID=%d PID=%d",
+        rank(),
+        hostname().c_str(),
+        physId.phys.nid,
+        physId.phys.pid);
+    
+    std::vector<ptl_process_t> physicalFromRank(size());
+    gather(0, physicalFromRank.data(), &physId, 1);
+    bcast(0, physicalFromRank.data(), size());
+
+    if(physicalFromRank.at(rank()).phys.nid != physId.phys.nid) fatal("Failed to broadcast physicalFromRank");
+    if(physicalFromRank.at(rank()).phys.pid != physId.phys.pid) fatal("Failed to broadcast physicalFromRank");
+
+    return physicalFromRank;
+}
+
+int VirtualClusterPortals::init() // sets up the MPI helper, then the Portals NI, rank map, event queue and portal table entry
+{
+    set_helper_pointer(VirtualCluster::factory("mpi"));
+    EXEC_NOFAIL(helper_->init());
+    // From here on rank(), size(), send() and recv() are served by the MPI helper
+    if(PTL_MAJOR_VERSION != 4 || PTL_MINOR_VERSION != 0) {
+        warn("Portals versions other than 4.0 may not be supported");
+    }
+    CHECK_RETURNVAL( PtlInit() );
+    CHECK_RETURNVAL( PtlNIInit(
+        PTL_IFACE_DEFAULT, // Manual 3.3.5: "Check README"
+        PTL_NI_LOGICAL | PTL_NI_MATCHING, // Logical => using ranks, Matching => using send/recv semantics
+        PTL_PID_ANY,
+        nullptr,// &mni_limits_desired
+        &mni_limits_actual,
+        &mni_handle));
+    // Logical addressing needs the rank -> physical id table collected from all ranks
+    auto physicalFromRank = getPhysicalFromRank();
+    CHECK_RETURNVAL( PtlSetMap(mni_handle, physicalFromRank.size(), physicalFromRank.data()) );
+    const auto DEFAULT_OPTIONS = 0;
+    CHECK_RETURNVAL( PtlEQAlloc(mni_handle, 1000, &pt_eq_handle) );
+    CHECK_RETURNVAL( PtlPTAlloc(mni_handle, DEFAULT_OPTIONS, pt_eq_handle, PTL_PT_ANY, &pt_index) );
+
+    return SUCCESS;
+}
+
+int VirtualClusterPortals::finalize()
+{
+    debug("VirtualClusterPortals::finalize()");
+    CHECK_RETURNVAL( PtlPTFree(mni_handle, pt_index) );
+    CHECK_RETURNVAL( PtlNIFini(mni_handle) );
+    PtlFini();
+    EXEC_NOFAIL(helper_->finalize());
+    return SUCCESS;
+}
+
+void  VirtualClusterPortals::prepareSendStructs(const MemoryBuffer& buf) {
+    debug("VirtualClusterPortals::prepareSendStructs(%p)", buf.p());
+    md.start  = buf.p();
+    md.length = buf.len();
+    md.options   = PTL_MD_EVENT_CT_ACK;
+    md.eq_handle = PTL_EQ_NONE;   // i.e. don't queue send events
+    CHECK_RETURNVAL( PtlCTAlloc(mni_handle, &md.ct_handle) ); 
+    CHECK_RETURNVAL( PtlMDBind(mni_handle, &md, &md_handle) ); // Bind memory descriptor
+}
+
+void VirtualClusterPortals::prepareRecvStructs(const MemoryBuffer& buf) {
+    debug("VirtualClusterPortals::prepareRecvStructs(%p)", buf.p());
+    me.start  = buf.p();
+    me.length = buf.len();
+    me.uid    = PTL_UID_ANY;
+    me.match_id.rank = PTL_RANK_ANY;
+    me.match_bits    = MATCH_BITS;
+    me.ignore_bits   = IGNORE_BITS;
+    me.options = (PTL_ME_OP_PUT | PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_COMM_DISABLE ); // React to puts, count communication events, do not generate full communication events
+    CHECK_RETURNVAL( PtlCTAlloc(mni_handle, &me.ct_handle));
+    CHECK_RETURNVAL( PtlMEAppend(mni_handle, pt_index, &me, PTL_PRIORITY_LIST, nullptr, &me_handle));
+    ptl_event_t event;
+    CHECK_RETURNVAL( PtlEQWait(pt_eq_handle, &event) ); // TODO allow PTL_EQ_DROPPED 
+    if (! (event.type == PTL_EVENT_LINK && event.ni_fail_type == PTL_NI_OK)) {
+        // TODO Check for overflow/dropped events
+        error("PtlMEAppend failed");
+    }
+}
+
+ptl_size_t VirtualClusterPortals::getSendCounter() { // reads the MD counting event into send_ct and returns its success count
+    debug("VirtualClusterPortals::getSendCounter()");
+    CHECK_RETURNVAL( PtlCTGet(md.ct_handle, &send_ct) );
+    debug("Send (MD): success %llu - failure %llu", (unsigned long long)send_ct.success, (unsigned long long)send_ct.failure); // ptl_size_t is 64-bit, so %d would be wrong
+    if(send_ct.failure > 0) {
+        error("Failed operation on MD");
+    }
+    return send_ct.success;
+}
+
+ptl_size_t VirtualClusterPortals::getRecvCounter() { // reads the ME counting event into recv_ct and returns its success count
+    debug("VirtualClusterPortals::getRecvCounter()");
+    CHECK_RETURNVAL( PtlCTGet(me.ct_handle, &recv_ct) );
+    debug("Recv (ME): success %llu - failure %llu", (unsigned long long)recv_ct.success, (unsigned long long)recv_ct.failure); // ptl_size_t is 64-bit, so %d would be wrong
+    if(recv_ct.failure > 0) {
+        error("Failed operation on ME");
+    }
+    return recv_ct.success;
+}
+
+void VirtualClusterPortals::recvMessages(const unsigned long num_msg, const unsigned long counter_start) {
+    debug("VirtualClusterPortals::recvMessages(%lu, %lu)", num_msg, counter_start);
+    CHECK_RETURNVAL( PtlCTWait(me.ct_handle, counter_start + num_msg, &recv_ct) );
+}
+
+void VirtualClusterPortals::sendMessages(const int to, MemoryBuffer& buf, const unsigned long num_msg, const unsigned long counter_start) {
+    debug("VirtualClusterPortals::sendMessages(%d, %p, %lu, %lu)", to, buf.p(), num_msg, counter_start); // argument order must match the format string
+    const ptl_size_t localOffset = 0;
+    const ptl_size_t remoteOffset = 0;
+    const ptl_hdr_data_t header_data = 0;
+    ptl_process_t target;
+    target.rank = to;
+    // Issue num_msg puts of buf.len() bytes to rank 'to', then wait until the MD counter reaches counter_start + num_msg
+    for(unsigned long n = 1; n <= num_msg; n++) {
+        CHECK_RETURNVAL( PtlPut(md_handle, localOffset, buf.len(), PTL_CT_ACK_REQ, target, pt_index, MATCH_BITS, remoteOffset, nullptr, header_data) );
+    }
+    CHECK_RETURNVAL( PtlCTWait(md.ct_handle, counter_start + num_msg, &send_ct) );
+}
+
+void VirtualClusterPortals::releaseRecvStructs() {
+    debug("releaseRecvStructs()");
+    CHECK_RETURNVAL( PtlMEUnlink(me_handle) );
+    CHECK_RETURNVAL( PtlCTFree(me.ct_handle) );
+};
+
+void VirtualClusterPortals::releaseSendStructs() {
+    debug("releaseSendStructs()");
+    CHECK_RETURNVAL( PtlMDRelease(md_handle) );
+    CHECK_RETURNVAL( PtlCTFree(md.ct_handle) );
+};
+
+int VirtualClusterPortals::kpingpong(const int from, const int to, MemoryBuffer& buf, const int num_msg, double* const timing)
+{
+    bool isSender = rank() == from;
+    bool isReceiver = rank() == to;
+    auto watch = Stopwatchfactory::getRankWatch(rank(), from);
+    prepareSendStructs(buf);
+    prepareRecvStructs(buf);
+    auto sendCounterBeforeKernel = getSendCounter();
+    auto recvCounterBeforeKernel = getRecvCounter();
+    
+    barrier();
+
+    if(isSender) {
+        watch->start();
+        sendMessages(to, buf, num_msg, sendCounterBeforeKernel);
+        recvMessages(num_msg, recvCounterBeforeKernel);
+        watch->stop();
+    }
+    if(isReceiver) {
+        watch->start();
+        recvMessages(num_msg, recvCounterBeforeKernel);
+        sendMessages(from, buf, num_msg, sendCounterBeforeKernel);
+        watch->stop();
+    }
+
+    barrier();
+
+    getSendCounter();
+    getRecvCounter();
+    releaseSendStructs();
+    releaseRecvStructs();
+    if (timing != nullptr)
+    {
+        *timing = watch->getDuration().count() / (2*num_msg);
+    }
+
+    return SUCCESS;
+}
+
+int VirtualClusterPortals::kUniDir(
+    const int from, const int to,
+    MemoryBuffer &buf1, MemoryBuffer &buf2,
+    const int num_msg, double *const timing,
+    const bool /*doBarrier*/)
+{
+    bool isSender = rank() == from;
+    bool isReceiver = rank() == to;
+    auto watch = Stopwatchfactory::getRankWatch(rank(), from);
+    prepareSendStructs(buf1);
+    prepareRecvStructs(buf2);
+    auto sendCounterBeforeKernel = getSendCounter();
+    auto recvCounterBeforeKernel = getRecvCounter();
+
+    barrier();
+
+    if(isSender) {
+        watch->start();
+        sendMessages(to, buf1, num_msg, sendCounterBeforeKernel);
+        recvMessages(1, recvCounterBeforeKernel);
+        watch->stop();
+    }
+    if(isReceiver) {
+        watch->start();
+        recvMessages(num_msg, recvCounterBeforeKernel);
+        sendMessages(from, buf1, 1, sendCounterBeforeKernel);
+        watch->stop();
+    }
+
+    barrier();
+
+    getSendCounter();
+    getRecvCounter();
+    releaseSendStructs();
+    releaseRecvStructs();
+    if (timing != nullptr)
+    {
+        *timing = watch->getDuration().count() / num_msg;
+    }
+
+    return SUCCESS;
+
+}
+
+int VirtualClusterPortals::kbipingpong(
+    const int from, const int to,
+    MemoryBuffer& buf1, MemoryBuffer& buf2,
+    const int num_msg, double* const timing)
+{
+    int partner;
+    if(rank() == from) {
+        partner = to;
+    }
+    if(rank() == to) {
+        partner = from;
+    }
+    auto watch = Stopwatchfactory::getRankWatch(rank(), from);
+    prepareSendStructs(buf1);
+    prepareRecvStructs(buf2);
+    auto sendCounterBeforeKernel = getSendCounter();
+    auto recvCounterBeforeKernel = getRecvCounter();
+
+    barrier();
+
+    watch->start();
+    sendMessages(partner, buf1, num_msg, sendCounterBeforeKernel);
+    recvMessages(num_msg, recvCounterBeforeKernel);
+    watch->stop();
+
+    barrier();
+
+    getSendCounter();
+    getRecvCounter();
+    releaseSendStructs();
+    releaseRecvStructs();
+    if (timing != nullptr)
+    {
+        *timing = watch->getDuration().count() / (2.0 * num_msg);
+    }
+
+    return SUCCESS;
+
+}
+
+
+VirtualClusterPortals::VirtualClusterPortals()
+: VirtualClusterWithHelper(VirtualClusterPortals::NAME)
+{
+}
diff --git a/benchmark/vcluster_portals.h b/benchmark/vcluster_portals.h
new file mode 100644
index 0000000..0a00c3d
--- /dev/null
+++ b/benchmark/vcluster_portals.h
@@ -0,0 +1,101 @@
+/****************************************************************************
+**  LinkTest                                                               **
+*****************************************************************************
+**  Copyright (c) 2008-2022                                                **
+**  Forschungszentrum Juelich, Juelich Supercomputing Centre               **
+**                                                                         **
+**  See the file COPYRIGHT in the package base directory for details       **
+****************************************************************************/
+#ifndef LINKTEST_VCLUSTER_PORTALS_H
+#define LINKTEST_VCLUSTER_PORTALS_H
+
+extern "C" {
+#include <portals4.h>
+}
+#include <vector>
+#include "vcluster.h"
+
+// VirtualCluster implementation based on Portals 4
+class VirtualClusterPortals : public VirtualClusterWithHelper
+{
+
+public:
+    static constexpr char NAME[] = "portals";
+    VirtualClusterPortals();
+    int init() override;
+    int finalize() override;
+
+    int kpingpong(const int from, const int to, MemoryBuffer& buf, 
+                          const int num_msg, double* const timing) override;
+
+    int kUniDir(const int from, const int to,
+                        MemoryBuffer& buf1, MemoryBuffer& buf2,
+                        const int num_msg, double* const timing,
+                        const bool doBarrier) override;
+                        
+    int kUniDirMultiBuf(const int from,const int to,
+                                MemoryBufferMulti& buf_multi, MemoryBuffer& buf2,
+                                const int num_msg, double* const timing,
+                                const bool doBarrier) override {
+        throw("Not Implemented"); // Use PTL_ME_MANAGE_LOCAL
+    };
+    int kUniDirLimitedMultiBuf(const int from,const int to,
+                                       MemoryBufferMulti& buf_multi, MemoryBuffer& buf2,
+                                       const int num_msg, double* const timing,
+                                       const bool doBarrier) override {
+        throw("Not Implemented");
+    };
+
+    int kbipingpong(const int from, const int to,
+                            MemoryBuffer& buf1, MemoryBuffer& buf2,
+                            const int num_msg, double* const timing) override;
+
+private:
+    // matching (send/recv) Network Interface (ni)
+    ptl_ni_limits_t mni_limits_desired;
+    ptl_ni_limits_t mni_limits_actual;
+    ptl_handle_ni_t mni_handle;
+
+    /** @brief Portal Table (PT) Index */
+    ptl_pt_index_t  pt_index;
+    /** @brief PT Event Queue Handle  */
+    ptl_handle_eq_t pt_eq_handle;
+
+    const ptl_match_bits_t MATCH_BITS = 1; // TODO when/how to use these?
+    const ptl_match_bits_t IGNORE_BITS = ~0; // ignore all bits
+
+    // Sender
+    /** @brief Memory Descriptor (MD) */
+    ptl_md_t md;
+    /** @brief MD Handle */
+    ptl_handle_md_t md_handle;
+    /** @brief Send Counter */
+    ptl_ct_event_t send_ct;
+    /** @brief Prepare portals data structures on sender side (MD) */
+    void prepareSendStructs(const MemoryBuffer& buf);
+    /** @brief Read current send (MD) counter value */
+    ptl_size_t getSendCounter();
+    /** @brief Free portals data structures on sender side (MD) */
+    void releaseSendStructs();
+
+    // Receiver
+    /** @brief Match List Entry (ME) */
+    ptl_me_t  me;
+    /** @brief ME Handle */
+    ptl_handle_me_t me_handle;
+    /** @brief Receive Counter */
+    ptl_ct_event_t recv_ct;
+    /** @brief Prepare portals data structures on receiver side (ME) */
+    void prepareRecvStructs(const MemoryBuffer& buf);
+    /** @brief Read current recv (ME) counter value */
+    ptl_size_t getRecvCounter();
+    /** @brief Free portals data structures on receiver side (ME) */
+    void releaseRecvStructs();
+
+    void sendMessages(const int to, MemoryBuffer& buf, const unsigned long num_msg, const unsigned long counter_start);
+    void recvMessages(const unsigned long num_msg, const unsigned long counter_start);
+
+    std::vector<ptl_process_t> getPhysicalFromRank();
+};
+
+#endif
diff --git a/benchmark/vcluster_tcp.cc b/benchmark/vcluster_tcp.cc
index 494d4c4..3dfeac9 100644
--- a/benchmark/vcluster_tcp.cc
+++ b/benchmark/vcluster_tcp.cc
@@ -113,12 +113,14 @@ int VirtualClusterTCP::read_tcp_environ_rank_and_size()
 
 #if 1 == HAVE_MINIPMI
     EXEC_IFFAIL(minipmi_get_size(pmi_, &size), error("minipmi_get_size() failed."); return ERROR);
-
     EXEC_IFFAIL(minipmi_get_rank(pmi_, &rank), error("minipmi_get_rank() failed."); return ERROR);
 
     rank_ = rank;
     size_ = size;
 
+    #if defined(DEBUG_MINIPMI)
+    info("PMI rank: %d, PMI size: %d", rank, size);
+    #endif
     return SUCCESS;
 #else
     EXEC_IFFAIL(read_environ_int(TCP_ENVIRON_PREFIX "SIZE", &size), error("Failed to read environment variable " TCP_ENVIRON_PREFIX "SIZE"); return ERROR);
@@ -599,16 +601,14 @@ int VirtualClusterTCP::init()
 {
     auto ret = linktest_minipmi_context_borrow(&pmi_);
 #if 1 == HAVE_MINIPMI
-    if (unlikely(ret)) {
+    if (ret != SUCCESS) {
         error("linktest_minipmi_context_borrow() failed.");
-        return ERROR;
+        return ret;
     }
 #endif
 
     EXEC_NOFAIL(read_tcp_environ());
-
     EXEC_NOFAIL(read_tcp_environ_rank_and_size());
-
     EXEC_NOFAIL(connect_to_all());
 
     disable_nagles_algorithm();
@@ -665,7 +665,7 @@ int VirtualClusterTCP::send(int dst, MemoryBuffer& buf)
     }
 
     if (unlikely((dst < 0) || (dst >= size_ ))) {
-        error("Invalid rank.");
+        error("Invalid rank %d: expected 0 <= rank < %d", dst, size_);
         return ERROR;
     }
 
diff --git a/exampleBuild.sh b/exampleBuild.sh
index 00ac73b..07eac17 100755
--- a/exampleBuild.sh
+++ b/exampleBuild.sh
@@ -23,7 +23,7 @@ export CPATH=$CPATH:~/.local/include/;
 mkdir -p install;
 cd benchmark;
 make clean
-make -j HAVE_TCP=1 HAVE_IBVERBS=1 HAVE_UCP=1 PREFIX=../install install;
+make -j 12 HAVE_TCP=1 HAVE_IBVERBS=1 HAVE_UCP=1 PREFIX=../install install;
 make clean
 cd ..;
 # Install linktest-report
diff --git a/test/Default.xml b/test/Default.xml
index c0abbfe..734aaad 100644
--- a/test/Default.xml
+++ b/test/Default.xml
@@ -29,11 +29,11 @@
     <parameter name="DefaultCompiler">GCC</parameter>
     <parameter name="Compiler" tag="!noCompileRunTest">GCC,Intel,NVHPC</parameter>
     <parameter name="Compiler" tag="noCompileRunTest">${DefaultCompiler}</parameter>
-    <parameter name="DefaultMPI">OpenMPI</parameter>
+    <parameter name="DefaultMPI">ParaStationMPI</parameter>
     <parameter name="MPI" mode="python" tag="!noCompileRunTest">
-        {
+        "ParaStationMPI" if "${System_Name}" == "deep" else {
         "GCC":   "ParaStationMPI,OpenMPI",
-        "Intel": "ParaStationMPI,OpenMPI,IntelMPI",
+        "Intel": "IntelMPI",
         "NVHPC": "ParaStationMPI,OpenMPI"
         }[ "${Compiler}" ]
     </parameter>
@@ -46,11 +46,13 @@
         }[ "${Compiler}" ]
     </parameter>
     <parameter name="WithCUDA">("${CUDA}" == "CUDA")</parameter>
-    <parameter name="Stack">$Compiler $MPI</parameter>
-    <parameter name="Default_Stack">$DefaultCompiler $DefaultMPI</parameter>
+    <parameter name="WithCUDATxt" mode="python">"Yes" if ${WithCUDA} else "No"</parameter>
+    <parameter name="Stack">${Compiler}_${MPI}</parameter>
+    <parameter name="StackWithCuda">${Stack}_${CUDA}</parameter>
+    <parameter name="Default_Stack">${DefaultCompiler}_${DefaultMPI}</parameter>
     <parameter name="Unload_CUDA" mode="python">"CUDA" if "${Compiler} ${MPI} ${CUDA} " == "Intel IntelMPI " else ""</parameter>
     <parameter name="Transport_Layer_Settings"  mode="python">
-        "" if not ${WithCUDA} else {
+        "" if not ${WithCUDA} or "${System_Name}" == "deep" else {
             "ParaStationMPI": "mpi-settings/CUDA",
             "OpenMPI":        "UCX-settings/RC-CUDA",
             "IntelMPI":       ""
@@ -68,7 +70,13 @@
     </parameter>
 </parameterset>
 <parameterset name="Slurm"> <!-- depends on Linktest_Args, System and Environment parameters -->
-    <parameter name="Account">cstao</parameter>
+    <parameter name="Account" mode="python">
+        {
+            "juwels": "cstao",
+            "jurecadc": "cstao",
+            "deep": "deepsea"
+        }["${System_Name}"]
+    </parameter>
     <parameter name="Partition" mode="python">
         {
             "juwels": {
@@ -78,14 +86,20 @@
             "jurecadc": {
                 False: "dc-cpu-devel",
                 True : "dc-gpu-devel"
+            },
+            "deep": {
+                False: "dp-cn",
+                True : "dp-esb"
             }
         }["${System_Name}"][ ${WithGPUs} ]
     </parameter>
     <parameter name="Max_WallClock_Time">00:01:00</parameter>
     <parameter name="Number_Of_Nodes" mode="python">1 if "${Messaging_Layer}" == "cuda" else 2</parameter>
-    <parameter name="Number_Of_Tasks_Per_Node">4</parameter>
+    <parameter name="Number_Of_Tasks_Per_Node" mode="python">
+        "1" if (${WithGPUs} and "${System_Name}" == "deep") else "4"
+    </parameter>
     <parameter name="Number_Of_Cores_Per_Task">1</parameter>
-    <parameter name="Gres" mode="python">"#SBATCH --gres=gpu:4" if ${WithGPUs} else ""</parameter>
+    <parameter name="Gres" mode="python">"#SBATCH --gres=gpu:${Number_Of_Tasks_Per_Node}" if ${WithGPUs} else ""</parameter>
     <parameter name="SRUN_Arguments" mode="python">
         "" if "${Messaging_Layer}" == "mpi" else {
             "ParaStationMPI": "--mpi=pspmi",
@@ -98,7 +112,8 @@
     <parameter name="CuArch" mode="python">
         {
             "juwels":   "sm_70",
-            "jurecadc": "sm_80"
+            "jurecadc": "sm_80",
+            "deep":     "sm_70",
         }[ "${System_Name}" ]
     </parameter>
     <parameter name="Enable_Layer" mode="python">
@@ -113,7 +128,7 @@
             "":     ""
         }[ "${CUDA}" ]
     </parameter>
-    <parameter name="Make">make -j ${Enable_Layer} ${DefineCuArch}</parameter>
+    <parameter name="Make">make -j24 ${Enable_Layer} ${DefineCuArch}</parameter>
 </parameterset>
 <parameterset name="Misc"> <!-- depends on Linktest_Args parameters -->
     <parameter name="Report_Name">linktest_${Messaging_Layer}_${Number_Of_Nodes}nx${Number_Of_Tasks_Per_Node}c</parameter>
diff --git a/test/LayerTest.xml b/test/LayerTest.xml
index 5ca1410..4ab7775 100644
--- a/test/LayerTest.xml
+++ b/test/LayerTest.xml
@@ -4,8 +4,29 @@
     <parameter name="Messaging_Layer" mode="python">
     {
     "juwels": "ibverbs,ucp,tcp,cuda",
-    "jurecadc": "ibverbs,ucp,tcp,cuda"  <!-- TODO add psm2 which is available only on jureca booster which shares login node -->
+    "jurecadc": "ibverbs,ucp,tcp,cuda",
+    "deep": "ibverbs,ucp,tcp,cuda,portals"
     }[ "${System_Name}" ]
-    </parameter> <!-- Options: mpi,ibverbs,psm2,cuda,ucp,tcp -->
+    </parameter> <!-- Options: mpi,ibverbs,psm2,cuda,ucp,portals,tcp -->
+    
+</parameterset>
+<parameterset name="Slurm" init_with="Default.xml"> 	
+    <parameter name="Partition" mode="python">
+    "dp-bxi" if "${Messaging_Layer}" == "portals" else {
+        "juwels": {
+            False: "devel",
+            True : "develgpus"
+        },
+        "jurecadc": {
+            False: "dc-cpu-devel",
+            True : "dc-gpu-devel"
+        },
+        "deep": {
+            False: "dp-cn",
+            True : "dp-esb"
+        }
+    }["${System_Name}"][ ${WithGPUs} ]
+    </parameter> <!-- portals runs always use dp-bxi; otherwise the partition is selected by System_Name and WithGPUs -->
+    
 </parameterset>
 </jube>
\ No newline at end of file
diff --git a/test/LinktestMain.xml b/test/LinktestMain.xml
index a55d84c..cf67976 100644
--- a/test/LinktestMain.xml
+++ b/test/LinktestMain.xml
@@ -3,6 +3,18 @@
 	<benchmark name="JSC Linktest Test Suite" outpath="runs">
 		<comment>Testing compilation and common usages of JSC Linktest</comment>
 
+		<parameterset name="JUBE_Extra">
+			<parameter name="JUBE_REPORT_LAST_CMD" update_mode="step">
+				if [ $? -eq 0 ]; then
+					touch "${jube_wp_abspath}/ready";
+				else
+					echo "${jube_step_name} failed" >> "${jube_wp_abspath}/error";
+				fi
+			</parameter>
+		</parameterset>
+
+		
+
 		<fileset name="Sources">
 			<copy>../benchmark</copy>
 		</fileset>
@@ -49,30 +61,26 @@
 			<sub source="§SRUN_ARGS§"                dest="${SRUN_Arguments}" />
 		</substituteset>
 
-		<step name="Compile" procs="9" tag="!(noLayerTest+noModeTest+noCompileTest)" suffix="${Stack}">
-			<use>Sources</use>
+		<step name="Compile" procs="9" tag="!(noLayerTest+noModeTest+noCompileTest)" suffix="${Stack}_${CUDA}">
+			<use>JUBE_Extra,Sources</use>
 			<use from="Default.xml">System, Environment, Build</use>
 			<do done_file="ready" error_file="error" tag="!dryRun">
 				set -x
 				$Load_Modules
 				cd benchmark
 				$Make
-				if [ $? -eq 0 ]; then
-					touch ../ready;
-				else
-					echo "Linktest compile failed" >> ../error;
-				fi
+				$JUBE_REPORT_LAST_CMD
 				set +x
 			</do>
 			<do done_file="ready" error_file="error" tag="dryRun">
 				echo "Assume succesful compile"
-				touch ready
+				$JUBE_REPORT_LAST_CMD
 			</do>
 		</step>
 
 		<step name="LayerTest" depend="Compile" active="'$Stack' == '$Default_Stack' and ${WithCUDA} == ${WithGPUs}" suffix="${Messaging_Layer}" tag="!noLayerTest">
-			<use from="LayerTest.xml">Linktest_Args</use>
-			<use from="Default.xml">System, Environment, Slurm, Misc</use>
+			<use from="LayerTest.xml">Linktest_Args, Slurm</use>
+			<use from="Default.xml">System, Environment, Misc</use>
 			<use>ExecutionScript</use>
 			<use>SubstituteInputParameters</use>
 			<do done_file="ready" error_file="error" tag="!dryRun">sbatch execute.sbatch</do>
@@ -88,7 +96,7 @@
 
 		<step name="CompileLinktestReport" active="'$Stack' == '$Default_Stack'" tag="!noLinktestReportTest">
 			<use from="Default.xml">Environment</use>
-			<use>ReportSources</use>
+			<use>JUBE_Extra,ReportSources</use>
 			<do done_file="ready" error_file="error">
 				set -x
 				$Load_Modules
@@ -96,17 +104,13 @@
 				python3 -m venv venvLinktest
 				source venvLinktest/bin/activate
 				pip install ./python
-				if [ $? -eq 0 ]; then
-					touch ready;
-				else
-					echo "linktest-report compile failed" >> error;
-				fi
+				$JUBE_REPORT_LAST_CMD
 				deactivate
 				set +x
 			</do>
 		</step>
 
-		<step name="CompileRunTest" procs="9" depend="Compile" active="${WithCUDA} == ${WithGPUs}" suffix="${Stack}_${CUDA}" tag="!noCompileRunTest">
+		<step name="CompileRunTest" procs="9" depend="Compile" active="${WithCUDA} == ${WithGPUs}" suffix="${StackWithCuda}" tag="!noCompileRunTest">
 			<use from="CompileRunTest.xml">Linktest_Args</use>
 			<use from="Default.xml">System, Environment, Slurm, Misc</use>
 			<use>ExecutionScript</use>
@@ -115,16 +119,13 @@
 		</step>
 
 		<step name="LinktestReportTest" procs="7" depend="ModeTest,CompileLinktestReport" active="$No_Sion_File == 0" suffix="${Mode}" tag="!(noLinktestReportTest|noModeTest)">
+			<use>JUBE_Extra</use>
 			<do done_file="ready" error_file="error" tag="!dryRun">
 				set -x
 				$Load_Modules
 				source CompileLinktestReport/venvLinktest/bin/activate
 				linktest-report -i ModeTest/${Report_Name}.sion -o report.pdf
-				if [ $? -eq 0 ]; then
-					touch ready;
-				else
-					echo "python-report run failed" >> error;
-				fi
+				$JUBE_REPORT_LAST_CMD
 				deactivate
 				set +x
 			</do>
@@ -143,32 +144,46 @@
 			<pattern name="Options">\+ srun .*?\.sion (.*?)\n</pattern>
 		</patternset>
 
-		<patternset name="errorFilePatterns">
-			<pattern name="error_msg">.*</pattern>
+		<patternset name="genericPatterns">
+			<pattern name="all">.*</pattern>
 		</patternset>
 
 		<!-- Analyse -->
+		<analyser name="analyseCompiles">
+			<analyse step="Compile">
+				<file use="genericPatterns">error</file>
+				<file use="genericPatterns">ready</file>
+			</analyse>
+		</analyser>
+
 		<analyser name="analyseRuns">
-			<analyse step="LayerTest" tag="!noLayerTest">
+			<analyse step="Compile">
+				<file use="genericPatterns">error</file>
+				<file use="genericPatterns">ready</file>
+			</analyse>
+			<analyse step="CompileRunTest" tag="!noCompileRunTest">
 				<file use="LinktestOutPatterns">linktest.log</file>
 				<file use="LinktestErrPatterns">linktest.error</file>
-				<file use="errorFilePatterns">error</file>
+				<file use="genericPatterns">error</file>
+				<file use="genericPatterns">ready</file>
 			</analyse>
-			<analyse step="ModeTest" tag="!noModeTest">
+			<analyse step="LayerTest" tag="!noLayerTest">
 				<file use="LinktestOutPatterns">linktest.log</file>
 				<file use="LinktestErrPatterns">linktest.error</file>
-				<file use="errorFilePatterns">error</file>
+				<file use="genericPatterns">error</file>
+				<file use="genericPatterns">ready</file>
 			</analyse>
-			<analyse step="CompileRunTest" tag="!noCompileRunTest">
+			<analyse step="ModeTest" tag="!noModeTest">
 				<file use="LinktestOutPatterns">linktest.log</file>
 				<file use="LinktestErrPatterns">linktest.error</file>
-				<file use="errorFilePatterns">error</file>
+				<file use="genericPatterns">error</file>
+				<file use="genericPatterns">ready</file>
 			</analyse>
 		</analyser>
 
 		<analyser name="analyseReports">
 			<analyse step="LinktestReportTest" tag="!(noLinktestReportTest|noModeTest)">
-				<file use="errorFilePatterns">error</file>
+				<file use="genericPatterns">error</file>
 			</analyse>
 		</analyser>
 
@@ -191,15 +206,27 @@
 		</result>
 		<result>
 			<use>analyseRuns,analyseReports</use>
-			<table name="ErrorResult" style="pretty" sort="jube_step_name">
+			<table name="RunErrors" style="pretty" sort="jube_step_name,Compiler,MPI,Transport_Layer_Settings,WithCUDATxt,Messaging_Layer,SRUN_Arguments,Options">
 				<column title="Test">jube_step_name</column>
 				<column title="Compiler">Compiler</column>
 				<column title="MPI">MPI</column>
-				<column title="Setting">Transport_Layer_Settings</column>
+				<column title="MPI Settings">Transport_Layer_Settings</column>
+				<column title="CUDA">WithCUDATxt</column>
 				<column title="Layer">Messaging_Layer</column>
 				<column title="Srun Args">SRUN_Arguments</column>
 				<column title="Options">Options</column>
-				<column title="Errors">error_msg</column>
+				<column title="Errors">all</column>
+			</table>
+		</result>
+		<result>
+			<use>analyseCompiles</use>
+			<table name="CompileErrors" style="pretty" sort="jube_step_name,Compiler,MPI,CUDA">
+				<column title="Test">jube_step_name</column>
+				<column title="Compiler">Compiler</column>
+				<column title="MPI">MPI</column>
+				<column title="MPI Settings">Transport_Layer_Settings</column>
+				<column title="CUDA">WithCUDATxt</column>
+				<column title="Errors">all</column>
 			</table>
 		</result>
 
diff --git a/test/execute_base.sbatch b/test/execute_base.sbatch
index e89c7dd..21cfb76 100644
--- a/test/execute_base.sbatch
+++ b/test/execute_base.sbatch
@@ -50,7 +50,7 @@ fi
 if [ §NUM_RANDOMIZE_TASKS§ -ne 0 ]; then
 	args+=" --num-randomize-tasks §NUM_RANDOMIZE_TASKS§"
 fi
-if [ §HOSTNAME_GROUPING§ ]; then
+if [ §HOSTNAME_GROUPING§ -ne 0 ]; then
 	args+=" --group-processes-by-hostname"
 fi
 set -x # echos commands before executing
@@ -61,7 +61,7 @@ srun --ntasks=${SLURM_NTASKS} \
 
 # Indicate Success to jube
 if [ $? -ne 0 ]; then
-	echo "linktest run failed" >> error;
+	echo "LinkTest run failed" >> error;
 else
 	touch ready;
 fi
-- 
GitLab