From 9b4b18a4b1b3e728c8161d3a43ada67e7c046fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yannik=20M=C3=BCller?= <y.mueller@fz-juelich.de> Date: Tue, 12 Sep 2023 16:59:35 +0200 Subject: [PATCH] Added native portals support Based on portals4 using ptl_put (RDMA), three kernels semi-, uni- and bi-directional were added. Debug options were cleaned up HAVE_XX was unified for CUDA --- .gitignore | 11 ++ benchmark/.gitignore | 10 -- benchmark/Makefile | 186 ++++++++++++---------- benchmark/benchmark.cc | 86 ++++------- benchmark/benchmark.h | 6 +- benchmark/cmdline.cc | 18 +-- benchmark/error.cc | 20 ++- benchmark/error.h | 14 +- benchmark/gpu_nvidia.h | 4 +- benchmark/linktest.cc | 18 +-- benchmark/memory.cc | 10 +- benchmark/memory.h | 8 +- benchmark/memory_multi.cc | 10 +- benchmark/output_sion.cc | 5 +- benchmark/portals4_macros.h | 18 +++ benchmark/vcluster.cc | 154 ++++++++++--------- benchmark/vcluster.h | 33 ++-- benchmark/vcluster_cuda.cc | 2 +- benchmark/vcluster_helper.cc | 7 +- benchmark/vcluster_mpi.cc | 7 +- benchmark/vcluster_portals.cc | 280 ++++++++++++++++++++++++++++++++++ benchmark/vcluster_portals.h | 101 ++++++++++++ benchmark/vcluster_tcp.cc | 12 +- exampleBuild.sh | 2 +- test/Default.xml | 37 +++-- test/LayerTest.xml | 25 ++- test/LinktestMain.xml | 95 +++++++----- test/execute_base.sbatch | 4 +- 28 files changed, 843 insertions(+), 340 deletions(-) delete mode 100644 benchmark/.gitignore create mode 100644 benchmark/portals4_macros.h create mode 100644 benchmark/vcluster_portals.cc create mode 100644 benchmark/vcluster_portals.h diff --git a/.gitignore b/.gitignore index 78a13cd..ff11029 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,14 @@ *.egg-info install/ **.ipynb_checkpoints/ +# Ignore generated source files +benchmark/cuda_kernels.cc +# Ignore executables +**/linktest +**/linktest.mpi +**/linktest.tcp +**/linktest.cuda +**/linktest.psm2 +**/linktest.ucp +**/linktest.ibverbs +**/linktest.portals diff --git a/benchmark/.gitignore 
b/benchmark/.gitignore deleted file mode 100644 index fe3166c..0000000 --- a/benchmark/.gitignore +++ /dev/null @@ -1,10 +0,0 @@ -# Ignore generated source files -cuda_kernels.cc -# Ignore executables -**/linktest -**/linktest.mpi -**/linktest.tcp -**/linktest.cuda -**/linktest.psm2 -**/linktest.ucp -**/linktest.ibverbs \ No newline at end of file diff --git a/benchmark/Makefile b/benchmark/Makefile index f28a140..7a27cfc 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -6,21 +6,41 @@ #** ** #** See the file COPYRIGHT in the package base directory for details ** #****************************************************************************/ -PREFIX = /usr/local/bin -USE_POSIX = 1 +# DEFAULTS +# ========================================= +PREFIX = /usr/local/bin -HAVE_SION = 1 -HAVE_MPI = 1 -HAVE_MINIPMI = 0 -HAVE_TCP = 1 -HAVE_IBVERBS = 0 -HAVE_PSM2 = 0 -HAVE_CUDA = 0 -HAVE_UCP = 0 +USE_POSIX = 1 +HAVE_SION = 1 +HAVE_MPI = 1 +HAVE_MINIPMI = 0 +HAVE_TCP = 1 +HAVE_IBVERBS = 0 +HAVE_PSM2 = 0 +HAVE_CUDA = 0 +HAVE_UCP = 0 +HAVE_PORTALS = 0 FSANITIZE = address +SYSTEM = generic +GIT_HASH = $(shell git rev-parse --verify HEAD) +GIT_HASH_SHORT= $(shell git rev-parse --verify --short HEAD) +CXX = mpicxx +CXXFLAGS = -std=c++17 -Wall -g -rdynamic +CPPFLAGS = -D_GNU_SOURCE \ + -DLINKTEST_LINUX=1 \ + -DLINKTEST_SYSTEM="\"$(SYSTEM)\"" \ + -DGIT_HASH=\"$(GIT_HASH)\" \ + -DGIT_HASH_SHORT=\"$(GIT_HASH_SHORT)\" +LD = $(CXX) +LDFLAGS = +LIBS = +# ========================================= + +# Handle Dependencies +# ========================================= ifeq (1, $(HAVE_IBVERBS)) HAVE_MINIPMI = 1 HAVE_TCP = 1 @@ -37,8 +57,12 @@ ifeq (1, $(HAVE_UCP)) HAVE_MINIPMI = 1 HAVE_TCP = 1 endif +ifeq (1, $(HAVE_PORTALS)) + HAVE_MINIPMI = 1 + HAVE_TCP = 1 +endif -ifdef V +ifdef VERBOSE $(info USE_POSIX = $(USE_POSIX)) $(info HAVE_SION = $(HAVE_SION)) $(info HAVE_MPI = $(HAVE_MPI)) @@ -48,61 +72,13 @@ $(info HAVE_IBVERBS = $(HAVE_IBVERBS)) $(info HAVE_PSM2 = $(HAVE_PSM2)) $(info 
HAVE_CUDA = $(HAVE_CUDA)) $(info HAVE_UCP = $(HAVE_UCP)) +$(info HAVE_PORTALS = $(HAVE_PORTALS)) endif +# ========================================= -SYSTEM = generic -GIT_HASH = $(shell git rev-parse --verify HEAD) -GIT_HASH_SHORT= $(shell git rev-parse --verify --short HEAD) -CC = mpicxx -CFLAGS = -std=c++17 -Wall -CPPFLAGS = -D_GNU_SOURCE -DLINKTEST_LINUX=1 -DLINKTEST_SYSTEM="\"$(SYSTEM)\"" -DGIT_HASH=\"$(GIT_HASH)\" -DGIT_HASH_SHORT=\"$(GIT_HASH_SHORT)\" -LD = $(CC) -LDFLAGS = -LIBS = - -# Use POSIX -ifeq (1, ${USE_POSIX}) - CPPFLAGS += -D__USE_POSIX -endif - -# SIONlib Options -ifeq (1, $(HAVE_SION)) -# CFLAGS += - CPPFLAGS += -D_FILE_OFFSET_BITS=64 -DUSE_SION=1 $(shell sionconfig --64 --gcc --cflags --mpi) -# LDFLAGS += - LIBS += $(shell sionconfig --64 --gcc --libs --mpi) -endif - -# MINIPMI Options -ifeq (1, $(HAVE_MINIPMI)) -# CFLAGS += - CPPFLAGS += -Iminipmi -DHAVE_MINIPMI=1 - LDFLAGS += -Lminipmi - LIBS += -lminipmi -endif - -# UCP Options -ifeq (1, $(HAVE_UCP)) -# CFLAGS += -# CPPFLAGS += -# LDFLAGS += -# LIBS += -endif - -# CUDA Options -ifeq (1, $(HAVE_CUDA)) - CU = nvcc - CUARCH = - ifeq (, $(CUARCH)) -$(error CUARCH is not set) - endif - CUFLAGS = --gpu-architecture $(CUARCH) -# CFLAGS += - CPPFLAGS += -I$(CUDA)/include -DHAVE_CUDA=1 - LDFLAGS += -L$(CUDA)/lib - LIBS += -lcuda -lcudart -endif +# DEFINE EXECUTABLES +# ========================================= linktest-versions = ifeq (1, $(HAVE_MPI)) linktest-versions += linktest.mpi @@ -120,15 +96,22 @@ ifeq (1, $(HAVE_MINIPMI)) ifeq (1, $(HAVE_UCP)) linktest-versions += linktest.ucp endif + ifeq (1, $(HAVE_PORTALS)) + linktest-versions += linktest.portals + endif ifeq (1, $(HAVE_CUDA)) linktest-versions += linktest.cuda endif endif -ifdef V +ifdef VERBOSE $(info linktest-versions = $(linktest-versions)) endif +# ========================================= + +# DEFINE OBJECT FILES AND FLAGS +# ========================================= linktest-obj = linktest.o \ system.o \ benchmark.o \ @@ -152,13 
+135,16 @@ linktest-obj = linktest.o \ ifeq (1, $(HAVE_MPI)) linktest-obj += vcluster_mpi.o - CFLAGS += -DHAVE_VCLUSTER_MPI=1 + CPPFLAGS += -DHAVE_VCLUSTER_MPI=1 endif ifeq (1, $(HAVE_TCP)) linktest-obj += vcluster_tcp.o - CFLAGS += -DHAVE_VCLUSTER_TCP=1 + CPPFLAGS += -DHAVE_VCLUSTER_TCP=1 endif ifeq (1, $(HAVE_MINIPMI)) + CPPFLAGS += -Iminipmi -DHAVE_MINIPMI=1 + LDFLAGS += -Lminipmi + LIBS += -lminipmi ifeq (1, $(HAVE_IBVERBS)) linktest-obj += vcluster_ibverbs.o \ ibverbs_mr.o \ @@ -166,39 +152,70 @@ ifeq (1, $(HAVE_MINIPMI)) ibverbs_cq.o \ ibverbs_pd.o \ ibverbs_ctx.o - CFLAGS += -DHAVE_VCLUSTER_IBVERBS=1 -DIBVERBS_SEND_INLINE=1 + CPPFLAGS += -DHAVE_VCLUSTER_IBVERBS=1 -DIBVERBS_SEND_INLINE=1 LIBS += -libverbs endif ifeq (1, $(HAVE_PSM2)) linktest-obj += vcluster_psm2.o - CFLAGS += -DHAVE_VCLUSTER_PSM2=1 + CPPFLAGS += -DHAVE_VCLUSTER_PSM2=1 LIBS += -lpsm2 endif ifeq (1, $(HAVE_UCP)) linktest-obj += vcluster_ucp.o - CFLAGS += -DHAVE_VCLUSTER_UCP=1 + CPPFLAGS += -DHAVE_VCLUSTER_UCP=1 LIBS += -lucp endif + ifeq (1, $(HAVE_PORTALS)) + linktest-obj += vcluster_portals.o + CPPFLAGS += -DHAVE_VCLUSTER_PORTALS=1 + LDFLAGS += -Lportals + LIBS += -lportals + endif ifeq (1, $(HAVE_CUDA)) linktest-obj += vcluster_cuda.o \ cuda_kernels.o \ gpu_nvidia.o \ memory_cuda.o - CFLAGS += -DHAVE_VCLUSTER_CUDA=1 + CU = nvcc + CUARCH = + ifeq (, $(CUARCH)) +$(error CUARCH is not set) + endif + CUFLAGS = --gpu-architecture $(CUARCH) -DHAVE_VCLUSTER_CUDA=1 + CPPFLAGS += -I$(CUDA)/include -DHAVE_VCLUSTER_CUDA=1 + LDFLAGS += -L$(CUDA)/lib + LIBS += -lcuda -lcudart endif endif ifeq (1, $(HAVE_SION)) linktest-obj += vcluster_sion_generic_adapter.o + CPPFLAGS += -D_FILE_OFFSET_BITS=64 -DUSE_SION=1 $(shell sionconfig --64 --gcc --cflags --mpi) + LIBS += $(shell sionconfig --64 --gcc --libs --mpi) +endif + +ifeq (1, ${USE_POSIX}) + CPPFLAGS += -D__USE_POSIX +endif + +ifdef VERBOSE +$(info linktest-obj = $(linktest-obj)) +$(info CXXFLAGS = $(CXXFLAGS)) +$(info CPPFLAGS = $(CPPFLAGS)) +$(info 
LDFLAGS = $(LDFLAGS)) +$(info LIBS = $(LIBS)) endif +# ========================================= -ifdef V - Q = +# DEFINE MAKE RULES +# ========================================= +ifdef VERBOSE + QUIET = else - Q = @ + QUIET = @ endif -link = $(Q)ln -s linktest linktest.$(1) +link = $(QUIET)ln -s linktest linktest.$(1) SYMB_EXE := $(shell find . -type l -iname "linktest.*") @@ -209,36 +226,36 @@ all: optimized compile: linktest $(linktest-versions) .PHONY: optimized -optimized: CFLAGS += -O3 +optimized: CXXFLAGS += -O3 optimized: compile .PHONY: debug -debug: CFLAGS += -O0 -g +debug: CXXFLAGS += -O0 -g debug: compile .PHONY: sanitized sanitized: debug -sanitized: CFLAGS += -fsanitize=$(FSANITIZE) -static-libasan -fno-omit-frame-pointer +sanitized: CXXFLAGS += -fsanitize=$(FSANITIZE) -static-libasan -fno-omit-frame-pointer sanitized: LDFLAGS += -fsanitize=$(FSANITIZE) -static-libasan sanitized: compile memory_cuda.cc: cuda_kernels.cc %.o: %.cc - @echo " "CC $@ - $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@ + @echo " "CXX $@ + $(QUIET)$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ %.cubin: %.cu @echo " "CU $@ - $(Q)$(CU) $(CUFLAGS) --cubin $< -o $@ + $(QUIET)$(CU) $(CUFLAGS) --cubin $< -o $@ %.cc: %.cubin convert.py @echo " "CONVERT $@ - $(Q)python3 convert.py $< $@ $(basename $@) + $(QUIET)python3 convert.py $< $@ $(basename $@) linktest: $(linktest-obj) @echo " "LD $@ - $(Q)$(LD) $(LDFLAGS) $^ $(LIBS) -o $@ + $(QUIET)$(LD) $(LDFLAGS) $^ $(LIBS) -o $@ linktest.tcp: linktest @echo " "LN $@ @@ -260,6 +277,10 @@ linktest.ucp: linktest @echo " "LN $@ $(call link,ucp) +linktest.portals: linktest + @echo " "LN $@ + $(call link,portals) + linktest.cuda: linktest @echo " "LN $@ $(call link,cuda) @@ -274,3 +295,4 @@ install: linktest $(linktest-versions) for f in $^ ; do \ cp -d $$f $(PREFIX)/$$f ; \ done +# ========================================= \ No newline at end of file diff --git a/benchmark/benchmark.cc b/benchmark/benchmark.cc index d913e12..444c346 100644 --- 
a/benchmark/benchmark.cc +++ b/benchmark/benchmark.cc @@ -19,6 +19,9 @@ #include "environ.h" #include "format_units.h" #include "format_print.h" +#if HAVE_VCLUSTER_PORTALS == 1 +#include "vcluster_portals.h" +#endif #include <cstdlib> #include <cstdio> #include <cstring> @@ -99,7 +102,7 @@ int Benchmark::kernel(const int from, const int to, double* const time_per_msg, return cl->linktest_kbipingpong(from, to, *buf1, *buf2, args, time_per_msg); } else if (args->do_unidir) { if (args->use_multi_buf) { - auto M=((args->num_msg>args->num_warmup_msg)?args->num_msg:args->num_warmup_msg); + auto M = std::max(args->num_msg, args->num_warmup_msg); if(args->num_multi_buf==M){ return cl->linktest_kUniDirMultiBuf(from, to, *buf_multi, *buf2, args, time_per_msg, doBarrier); }else{ @@ -438,35 +441,10 @@ int Benchmark::work_pingpong_parallel(const int partner,const int sign, double* auto to = (sign < 0) ? rank() : partner; barrier(); -#ifdef DEBUG_KERNEL_SYNCHRONIZATION - std::unique_ptr<StopwatchI> rootWatch = Stopwatchfactory::getRootWatch(rank()); - duration_t tBeforeBarrier; - rootWatch->start(); -#endif EXEC_NOFAIL(kernel(from, to, &tmp1, true)); -#ifdef DEBUG_KERNEL_SYNCHRONIZATION - rootWatch->stop(); - tBeforeBarrier=rootWatch->getDuration(); -#endif barrier(); -#ifdef DEBUG_KERNEL_SYNCHRONIZATION - rootWatch->stop(); - printTimingIfRoot(rank(), "[Kernel A->B Before Barrier]", tBeforeBarrier ); - printTimingIfRoot(rank(), "[Kernel A->B After Barrier]", rootWatch->getDuration()); - barrier(); //Additional barrier to reduce desynchronization due to printing - rootWatch->start(); -#endif EXEC_NOFAIL(kernel(to, from, &tmp2, true)); -#ifdef DEBUG_KERNEL_SYNCHRONIZATION - rootWatch->stop(); - tBeforeBarrier=rootWatch->getDuration(); -#endif barrier(); -#ifdef DEBUG_KERNEL_SYNCHRONIZATION - rootWatch->stop(); - printTimingIfRoot(rank(), "[Kernel B->A Before Barrier]", tBeforeBarrier ); - printTimingIfRoot(rank(), "[Kernel B->A After Barrier]", rootWatch->getDuration()); -#endif 
*time_per_msg = (sign > 0) ? tmp1 : tmp2; @@ -479,38 +457,28 @@ int Benchmark::work_pingpong_parallel(const int partner,const int sign, double* int Benchmark::work_pingpong_serial(const int partner, double* const time_per_msg){ for (auto i = 0; i < size(); ++i) { barrier(); - if (i == rank()) { EXEC_NOFAIL(kernel(rank(), partner, time_per_msg, false)); - if ( 0 != i ){ - EXEC_NOFAIL(cl->send(0, &partner , 1)); - EXEC_NOFAIL(cl->send(0, time_per_msg, 1)); - } + std::printf("%6d->%6d: %ss (%sB/s)\n", + i, + partner, + UnitPrefix::SI_prefix(*time_per_msg, 12).get(), + UnitPrefix::IEC_prefix(args->len_msg / *time_per_msg, 10).get() + ); } else if (i == partner) { EXEC_NOFAIL(kernel(partner, rank(), nullptr, false)); - } - - if (rank() == 0){ - int buddy; - double time; - if(likely(0!=i)){ - EXEC_NOFAIL(cl->recv(i, &buddy, 1)); - EXEC_NOFAIL(cl->recv(i, &time , 1)); - }else{ - buddy=partner; - time =*time_per_msg; + } else { + #if HAVE_VCLUSTER_PORTALS == 1 + if(cl->nameRef() == VirtualClusterPortals::NAME) { + // 2 per kernel, warmup kernel and measuring kernel + barrier(); + barrier(); + barrier(); + barrier(); } - std::printf("%6d->%6d: %ss (%sB/s) (l=%d)\n", - i, - buddy, - UnitPrefix::SI_prefix(time, 12).get(), - UnitPrefix::IEC_prefix(args->len_msg/time, 10).get(), - i - ); - std::fflush(stdout); + #endif } } - barrier(); return SUCCESS; @@ -533,7 +501,6 @@ int Benchmark::work_pingpong(const int step,double* const min_time,double* const } else { EXEC_IFFAIL(work_pingpong_serial(partner, &time_per_msg), fatal("work_pingpong_serial failed.")); } - stats->accesspattern[partner] = step + 1; // In SION file steps are numbered starting with 1. 
stats->ptimings[partner] = time_per_msg; @@ -649,14 +616,23 @@ int Benchmark::gather_slow_pairs(struct slow_pair* const sp,const int n){ } int Benchmark::retest_one_slow_pair(const int from,const int to, double* const time){ + debug("Benchmark::retest_one_slow_pair(%d, %d)",from, to); double tv; - barrier(); if ((from == rank()) || (to == rank())) { EXEC_NOFAIL(kernel(from, to, &tv, false)); + } else { //TODO remove this hack: separate MemoryBuffer preparation from the kernels and ban all barriers from kernels + #if HAVE_VCLUSTER_PORTALS == 1 + if(cl->nameRef() == VirtualClusterPortals::NAME) { + // 2 per kernel, warmup kernel and measuring kernel + barrier(); + barrier(); + barrier(); + barrier(); + } + #endif } - barrier(); if (0 != from) { @@ -827,7 +803,7 @@ int Benchmark::init() { alloc.reset(new PosixMemAlignedAllocator()); break; case(AllocatorCUDA): - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 if(cl->rank()==0){info("Using CUDA memory allocator"); std::fflush(stdout);} gpudev.reset(new cuda::GpuDevice(System::singleton()->closest_gpu_device())); gpuctx.reset(new cuda::GpuContext(gpudev.get())); diff --git a/benchmark/benchmark.h b/benchmark/benchmark.h index 2c68f8f..22bfe18 100644 --- a/benchmark/benchmark.h +++ b/benchmark/benchmark.h @@ -15,8 +15,9 @@ #include <random> #include "stats.h" #include "slow_pairs.h" +#include "error.h" -#if HAVE_CUDA == 1 +#if HAVE_VCLUSTER_CUDA == 1 #include "gpu_nvidia.h" #endif @@ -38,6 +39,7 @@ namespace linktest{ Benchmark() = default; Benchmark(const Benchmark&) = delete; Benchmark(Benchmark&&) = delete; + ~Benchmark() = default; int main_cmdline(); int benchmark(); // Run the main benchmark [[nodiscard]] int rank() const; @@ -50,7 +52,7 @@ namespace linktest{ void barrier() const; const struct linktest_args* args; std::unique_ptr<VirtualCluster> cl; - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 std::unique_ptr<cuda::GpuDevice> gpudev; std::unique_ptr<cuda::GpuContext> gpuctx; // Declaration order important! 
MemoryBuffer~ needs to be called before before GpuContext~ #endif diff --git a/benchmark/cmdline.cc b/benchmark/cmdline.cc index 0baea8e..6c3ec9b 100644 --- a/benchmark/cmdline.cc +++ b/benchmark/cmdline.cc @@ -467,7 +467,7 @@ static bool special_cmdline_args(const std::string& prog, const std::vector<std: } } - return false; + return false; } static bool arg_match(const std::string& arg, const Argument& argdef){ @@ -661,13 +661,13 @@ const struct linktest_args* parse_cmdline_args(int argc, char **argv){ } if(cmdline_args.alloc_typ==AllocatorCUDA){ - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 #else fatal("Requested CUDA memory-allocator type, but compiled without CUDA support."); #endif }else{ if(cmdline_args.do_use_gpus||cmdline_args.virtual_cluster_implementation=="cuda"){ - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 if(cmdline_args.alloc_typ==AllocatorDefault){ cmdline_args.alloc_typ=AllocatorCUDA; } else { @@ -782,11 +782,11 @@ void print_cmdline_usage(const std::string& prog) } std::string modeList = "["; - if(VirtualCluster::impls[0] != nullptr) { - for(auto i=0;VirtualCluster::impls[i];i++) { - modeList = modeList + VirtualCluster::impls[i] + ", "; - } - modeList.erase(modeList.size()-2); + if(VirtualCluster::impls.size() > 0) { + for(const auto& name: VirtualCluster::impls) { + modeList = modeList + name + ", "; + } + modeList.erase(modeList.size()-2); } modeList += "]"; std::fprintf(stderr, @@ -821,7 +821,7 @@ void print_cmdline_args(const struct linktest_args* args){ case(AllocatorPOSIXAlignedMalloc): return "posix_memalign"; case(AllocatorCUDA): - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 return "CUDA"; #else return "No CUDA"; diff --git a/benchmark/error.cc b/benchmark/error.cc index aaaae93..8915d20 100644 --- a/benchmark/error.cc +++ b/benchmark/error.cc @@ -28,54 +28,66 @@ static void report(const char* prefix, const char* file, void linktest_fatal(const char* file, const char* func, long line, const char* fmt, ...) 
{ + #if REPORT_LEVEL >= REPORT_FATAL va_list vl; va_start(vl, fmt); report("fatal: ", file, func, line, fmt, vl); va_end(vl); - std::fflush(NULL); + std::fflush(nullptr); + #endif std::terminate(); } void linktest_error(const char* file, const char* func, long line, const char* fmt, ...) { + #if REPORT_LEVEL >= REPORT_ERROR va_list vl; va_start(vl, fmt); report("error: ", file, func, line, fmt, vl); va_end(vl); - std::fflush(NULL); + std::fflush(nullptr); + #endif } void linktest_warn(const char* file, const char* func, long line, const char* fmt, ...) { + #if REPORT_LEVEL >= REPORT_WARN va_list vl; va_start(vl, fmt); report("warning: ", file, func, line, fmt, vl); va_end(vl); - std::fflush(NULL); + std::fflush(nullptr); + #endif } void linktest_info(const char* file, const char* func, long line, const char* fmt, ...) { + #if REPORT_LEVEL >= REPORT_INFO va_list vl; va_start(vl, fmt); report("info: ", file, func, line, fmt, vl); va_end(vl); + + std::fflush(nullptr); + #endif } void linktest_debug(const char* file, const char* func, long line, const char* fmt, ...) { + #if REPORT_LEVEL >= REPORT_DEBUG va_list vl; va_start(vl, fmt); report("debug: ", file, func, line, fmt, vl); va_end(vl); - std::fflush(NULL); + std::fflush(nullptr); + #endif } \ No newline at end of file diff --git a/benchmark/error.h b/benchmark/error.h index 92160ac..e1db667 100644 --- a/benchmark/error.h +++ b/benchmark/error.h @@ -9,6 +9,17 @@ #ifndef LINKTEST_ERROR_H #define LINKTEST_ERROR_H +#define REPORT_NONE 0 +#define REPORT_FATAL 1 +#define REPORT_ERROR 2 +#define REPORT_WARN 3 +#define REPORT_INFO 4 +#define REPORT_DEBUG 5 + +#ifndef REPORT_LEVEL +#define REPORT_LEVEL REPORT_WARN +#endif + constexpr int SUCCESS = 0; constexpr int ERROR = 1; @@ -24,10 +35,11 @@ void linktest_debug(const char* file, const char* func, long line, const char* f * The names are pretty generic so we have to be careful to avoid naming conflicts * that result in hard to understand compiler errors. 
*/ +// NOLINTBEGIN #define fatal(fmt, ...) linktest_fatal(__FILE__, __func__, __LINE__, fmt, ## __VA_ARGS__) #define error(fmt, ...) linktest_error(__FILE__, __func__, __LINE__, fmt, ## __VA_ARGS__) #define warn(fmt, ...) linktest_warn(__FILE__, __func__, __LINE__, fmt, ## __VA_ARGS__) #define info(fmt, ...) linktest_info(__FILE__, __func__, __LINE__, fmt, ## __VA_ARGS__) #define debug(fmt, ...) linktest_debug(__FILE__, __func__, __LINE__, fmt, ## __VA_ARGS__) - +// NOLINTEND #endif \ No newline at end of file diff --git a/benchmark/gpu_nvidia.h b/benchmark/gpu_nvidia.h index fbf77a9..0d3b386 100644 --- a/benchmark/gpu_nvidia.h +++ b/benchmark/gpu_nvidia.h @@ -9,8 +9,8 @@ #ifndef LINKTEST_GPU_NVIDIA_H #define LINKTEST_GPU_NVIDIA_H -#if 1 != HAVE_CUDA -#error gpu_nvidia can only compile with HAVE_CUDA=1 +#if 1 != HAVE_VCLUSTER_CUDA +#error gpu_nvidia can only compile with HAVE_VCLUSTER_CUDA=1 #endif #include "config.h" diff --git a/benchmark/linktest.cc b/benchmark/linktest.cc index 02d8623..1968a08 100644 --- a/benchmark/linktest.cc +++ b/benchmark/linktest.cc @@ -16,13 +16,14 @@ #include "system.h" #include <thread> #include <memory> +#include <iostream> void print_linktest_version() { - std::fprintf(stderr, "LinkTest (version %d.%d.%d)\n", - VERSION_MAJOR, - VERSION_MINOR, - VERSION_PATCH); + std::fprintf(stderr, "LinkTest (version %d.%d.%d)\n", + VERSION_MAJOR, + VERSION_MINOR, + VERSION_PATCH); } /* Errors are propagated up the backtrace as far as possible until @@ -60,11 +61,7 @@ int main(int argc, char *argv[]){ { /* Determine Virtual Cluster Type */ - const auto name=get_vcluster_impl_name(argv,cmdline_args->virtual_cluster_implementation.c_str()); - if (unlikely(!name)){ - error("Failed to determine virtual-cluster implementation."); - return ERROR; - } + const auto name=VirtualCluster::get_vcluster_impl_name(argv,cmdline_args->virtual_cluster_implementation); /* Create Virtual Cluster */ bench.cl.reset(VirtualCluster::factory(name)); @@ -94,7 +91,6 @@ int 
main(int argc, char *argv[]){ error("Failed to execute benchmark."); return ERROR; } - /* Finalize Benchmark */ if (unlikely(bench.cl->finalize())) { error("Failed to finalize communication operations."); @@ -107,5 +103,5 @@ int main(int argc, char *argv[]){ * until the very end. */ - return SUCCESS; + return SUCCESS; } diff --git a/benchmark/memory.cc b/benchmark/memory.cc index e67dd17..a9245b5 100644 --- a/benchmark/memory.cc +++ b/benchmark/memory.cc @@ -8,7 +8,7 @@ ****************************************************************************/ #include "memory.h" #include "compiler.h" -#if HAVE_CUDA == 1 +#if HAVE_VCLUSTER_CUDA == 1 #include "memory_cuda.h" #include "gpu_nvidia.h" #endif @@ -89,7 +89,7 @@ void MemoryBuffer::memory_copy(MemoryBuffer& dst, MemoryBuffer& src){ } MemoryBuffer MemoryBuffer::wrap(void* p, std::size_t len, AddressSpace::ID addr_space_id){ - return MemoryBuffer(p, len, addr_space_id); + return {p, len, addr_space_id}; } void MemoryBuffer::fill(){ @@ -109,7 +109,7 @@ void MemoryBuffer::fill(){ if(use_mt()){ throw std::runtime_error("Not Implemented!"); }else{ - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 linktest::cuda::fill<char>(linktest::cuda::GpuContext::singleton(), pointer<char>(), pointer<char>() + len(), (char )0xff); @@ -147,7 +147,7 @@ int MemoryBuffer::check(){ } break; case AddressSpace::ID::CudaDeviceLocal: - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 throw std::runtime_error("check() called for local CUDA address space"); #else throw std::runtime_error("check() called on a CUDA address space but LinkTest was compiled without CUDA support"); @@ -231,7 +231,7 @@ int PosixMemAlignedAllocator::free(void* p, std::size_t len){ return SUCCESS; } -#if HAVE_CUDA == 1 +#if HAVE_VCLUSTER_CUDA == 1 CudaDeviceAllocator::CudaDeviceAllocator(linktest::cuda::GpuContext* ctx):ctx_(ctx){} AddressSpace::ID CudaDeviceAllocator::address_space_id() const{ return AddressSpace::ID::CudaDeviceLocal; diff --git a/benchmark/memory.h 
b/benchmark/memory.h index aa08943..1e15a67 100644 --- a/benchmark/memory.h +++ b/benchmark/memory.h @@ -14,7 +14,7 @@ #include <cstdint> #include <unistd.h> -#if HAVE_CUDA == 1 +#if HAVE_VCLUSTER_CUDA == 1 namespace linktest{ namespace cuda{ class Allocator; @@ -129,7 +129,7 @@ class PosixMemAlignedAllocator : public Allocator { size_t pgsize_ = sysconf(_SC_PAGESIZE); }; -#if HAVE_CUDA == 1 +#if HAVE_VCLUSTER_CUDA == 1 // A memory allocation on a GPU class CudaDeviceAllocator : public Allocator{ public: @@ -165,8 +165,8 @@ class MemoryBuffer{ MemoryBuffer(const MemoryBuffer& other) = delete; MemoryBuffer& operator=(const MemoryBuffer& other) = delete; - MemoryBuffer(MemoryBuffer&& other) = delete; - MemoryBuffer& operator=(const MemoryBuffer&& other) = delete; + MemoryBuffer(MemoryBuffer&& other) = default; + MemoryBuffer& operator=(MemoryBuffer&& other) = default; /* Wrap an existing pointer into a memory buffer. We do not know the allocator * and hence have to ingore it. This is acceptable since the allocator is not diff --git a/benchmark/memory_multi.cc b/benchmark/memory_multi.cc index ab85a0f..488486e 100644 --- a/benchmark/memory_multi.cc +++ b/benchmark/memory_multi.cc @@ -8,7 +8,7 @@ ****************************************************************************/ #include "memory_multi.h" #include "compiler.h" -#if HAVE_CUDA == 1 +#if HAVE_VCLUSTER_CUDA == 1 #include "memory_cuda.h" #include "gpu_nvidia.h" #endif @@ -91,13 +91,13 @@ void MemoryBufferMulti::fill(){ break; }case AddressSpace::ID::CudaDeviceLocal:{ if(use_mt()){ - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 throw std::runtime_error("fill() for multiple buffers in CUDA address spaces not yet implemented"); #else throw std::runtime_error("fill() called on CUDA address space but linktest was compiled without CUDA"); #endif }else{ - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 throw std::runtime_error("fill() for multiple buffers in CUDA address spaces not yet implemented"); #else throw 
std::runtime_error("fill() called on CUDA address space but linktest was compiled without CUDA"); @@ -146,13 +146,13 @@ int MemoryBufferMulti::check(std::size_t* buffer, std::size_t* byte){ break; }case AddressSpace::ID::CudaDeviceLocal:{ if(use_mt()){ - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 throw std::runtime_error("check(buffer,byte) for multiple buffers in CUDA address spaces not yet implemented"); #else throw std::runtime_error("check(buffer,byte) called on CUDA address space but linktest was compiled without CUDA"); #endif }else{ - #if HAVE_CUDA == 1 + #if HAVE_VCLUSTER_CUDA == 1 throw std::runtime_error("check(buffer,byte) for multiple buffers in CUDA address spaces not yet implemented"); #else throw std::runtime_error("check(buffer,byte) called on CUDA address space but linktest was compiled without CUDA"); diff --git a/benchmark/output_sion.cc b/benchmark/output_sion.cc index f671e1f..0748a50 100644 --- a/benchmark/output_sion.cc +++ b/benchmark/output_sion.cc @@ -199,7 +199,7 @@ static int linktest_output_sion_funnelled_root(VirtualCluster* cl, return ERROR; } } - + debug("linktest_output_sion_funnelled_root->barrier"); EXEC_NOFAIL(cl->barrier()); printTimingIfRoot(cl->rank(), "[sioncollect]", std::chrono::duration<double>(walltime() - begin)); @@ -326,11 +326,13 @@ int linktest_output_sion_parallel(VirtualCluster* cl, }; auto sion_api = create_and_register_api(args->virtual_cluster_implementation); + debug("linktest_output_sion_parallel 1->barrier"); cl->barrier(); char* buffer; long long sz; EXEC_IFFAIL(linktest_output_sion_collect_local_data(cl, args, statsVec, &buffer, &sz), error("linktest_output_sion_collect_local_data failed."); return ERROR); + debug("linktest_output_sion_parallel 2->barrier"); cl->barrier(); auto filename = args->output.c_str(); @@ -362,6 +364,7 @@ int linktest_output_sion_parallel(VirtualCluster* cl, &fp, //fileptr &newfname //newfname ); + debug("linktest_output_sion_parallel 3->barrier"); cl->barrier(); 
rootWatch->stop(); printTiming("[sionopen]"); diff --git a/benchmark/portals4_macros.h b/benchmark/portals4_macros.h new file mode 100644 index 0000000..91fb342 --- /dev/null +++ b/benchmark/portals4_macros.h @@ -0,0 +1,18 @@ +#ifndef LINKTEST_PORTALS4MACROS_H +#define LINKTEST_PORTALS4MACROS_H +// NOLINTBEGIN +#define CHECK_RETURNVAL(x) do { int ret; \ + switch (ret = x) { \ + case PTL_IGNORED: \ + case PTL_OK: break; \ + case PTL_FAIL: fprintf(stderr, "=> %s returned PTL_FAIL (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \ + case PTL_NO_SPACE: fprintf(stderr, "=> %s returned PTL_NO_SPACE (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \ + case PTL_ARG_INVALID: fprintf(stderr, "=> %s returned PTL_ARG_INVALID (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \ + case PTL_NO_INIT: fprintf(stderr, "=> %s returned PTL_NO_INIT (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \ + case PTL_PT_IN_USE: fprintf(stderr, "=> %s returned PTL_PT_IN_USE (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \ + case PTL_IN_USE: fprintf(stderr, "=> %s returned PTL_IN_USE (line %u)\n", #x, (unsigned int)__LINE__); abort(); break; \ + default: fprintf(stderr, "=> %s returned failcode %i (line %u)\n", #x, ret, (unsigned int)__LINE__); abort(); break; \ + } } while (0) +// NOLINTEND + +#endif //PORTALS4MACROS \ No newline at end of file diff --git a/benchmark/vcluster.cc b/benchmark/vcluster.cc index c493241..90d40b3 100644 --- a/benchmark/vcluster.cc +++ b/benchmark/vcluster.cc @@ -25,6 +25,9 @@ #if HAVE_VCLUSTER_UCP == 1 #include "vcluster_ucp.h" #endif +#if HAVE_VCLUSTER_PORTALS == 1 +#include "vcluster_portals.h" +#endif #if HAVE_VCLUSTER_CUDA == 1 #include "vcluster_cuda.h" #endif @@ -238,7 +241,7 @@ int VirtualCluster::linktest_kpingpong(const int from, const int to, const struct linktest_args* const args, double* const time){ int err; - + debug("VirtualCluster::linktest_kpingpong from %d to %d", from, to); // Warmup 
if(args->num_warmup_msg!=0){ //No use doing this if there are no warm-up messages. double dummy; @@ -247,7 +250,6 @@ int VirtualCluster::linktest_kpingpong(const int from, const int to, &dummy); if(unlikely(err))return ERROR; } - err=kpingpong(from,to,buf,args->num_msg,time); if(unlikely(err))return err; if(args->check_buffers){ @@ -393,6 +395,30 @@ int VirtualCluster::linktest_kbipingpong(const int from, const int to, return SUCCESS; } +const std::vector<std::string> VirtualCluster::impls = { + #if 1 == HAVE_VCLUSTER_TCP + "tcp", + #endif + #if 1 == HAVE_VCLUSTER_MPI + "mpi", + #endif + #if 1 == HAVE_VCLUSTER_IBVERBS + "ibverbs", + #endif + #if 1 == HAVE_VCLUSTER_PSM2 + "psm2", + #endif + #if 1 == HAVE_VCLUSTER_UCP + "ucp", + #endif + #if 1 == HAVE_VCLUSTER_PORTALS + VirtualClusterPortals::NAME, + #endif + #if 1 == HAVE_VCLUSTER_CUDA + "cuda", + #endif +}; + VirtualCluster* VirtualCluster::factory(const std::string& name){ #if 1 == HAVE_VCLUSTER_TCP if ("tcp" == name) { @@ -419,6 +445,11 @@ VirtualCluster* VirtualCluster::factory(const std::string& name){ return new VirtualClusterUCP(name); } else #endif +#if 1 == HAVE_VCLUSTER_PORTALS + if (VirtualClusterPortals::NAME == name) { + return new VirtualClusterPortals(); + } else +#endif #if 1 == HAVE_VCLUSTER_CUDA if ("cuda" == name) { return new VirtualClusterCUDA(name); @@ -444,102 +475,85 @@ void VirtualClusterWithHelper::set_helper_pointer(VirtualCluster* helper) int VirtualClusterWithHelper::rank() { if (unlikely(!helper_)) - throw; // Simply returning -1 will result in complicated bugs + fatal("rank() called on a VirtualClusterWithHelper with undefined helper"); return helper_->rank(); } int VirtualClusterWithHelper::size() { if (unlikely(!helper_)) - throw; // Simply returning -1 will result in complicated bugs + fatal("size() called on a VirtualClusterWithHelper with undefined helper"); return helper_->size(); } int VirtualClusterWithHelper::send(int dst, MemoryBuffer& buf) { if (unlikely(!helper_)) - return 
-1; + fatal("send() called on a VirtualClusterWithHelper with undefined helper"); return helper_->send(dst, buf); } int VirtualClusterWithHelper::recv(int src, MemoryBuffer& buf) { if (unlikely(!helper_)) - return -1; + fatal("recv() called on a VirtualClusterWithHelper with undefined helper"); return helper_->recv(src, buf); } -const char* VirtualCluster::impls[] = - { - #if 1 == HAVE_VCLUSTER_TCP - "tcp", - #endif - #if 1 == HAVE_VCLUSTER_MPI - "mpi", - #endif - #if 1 == HAVE_VCLUSTER_IBVERBS - "ibverbs", - #endif - #if 1 == HAVE_VCLUSTER_PSM2 - "psm2", - #endif - #if 1 == HAVE_VCLUSTER_UCP - "ucp", - #endif - #if 1 == HAVE_VCLUSTER_CUDA - "cuda", - #endif - nullptr - }; - -const char* get_vcluster_impl_name(char** argv, const char* name) +const std::string& VirtualCluster::get_vcluster_impl_name(char** argv, const std::string& name) { - /* Check If Virtual-Cluster Implementation Given In 'name' */ - if(name[0]){ //Check if 'name' is a null string - for(auto i=0;VirtualCluster::impls[i];i++){ //Loop over possible virtual cluster implementations - /* Loop termination is handled by the fact that the last 'VirtualCluster::impls' is - * null pointer. - */ - if (!strcmp(VirtualCluster::impls[i],name)){ //Compare command-line implementation to possible implementations - return VirtualCluster::impls[i]; - } + std::string requestedImpl; + + // check 'name' + if(name != "") { + requestedImpl = name; } - // If this point is reached an unknown/unsupported implementation was encountered. 
- error("Unknown/Unsupported command-line implementation encountered."); - return(NULL); + + // check executable extension: the implementation name is the text after the last dot of the file name + std::string executableName(argv[0]); + const std::string dot{"."}; + auto pos = executableName.rfind(dot); // last dot; the first one may belong to the path ("./linktest.mpi") + if(pos != std::string::npos && executableName.find('/', pos) == std::string::npos) { // a dot followed by '/' is part of the directory, not a suffix + requestedImpl = executableName.substr(pos + 1); // skip the dot so the suffix can equal an entry of impls } - /* Check Executable Extension For Virtual-Cluster Implementation */ - // Determine suffix start - int i=std::strlen(argv[0])-1; - if(likely(i>=0)){ - int suffix_offset=-1; //Integer indicating suffix start - while(true){ - if(unlikely(argv[0][i]=='.')){ - suffix_offset=i; //Identify suffix start. - break; - } - if(i--==0) break; + // Check environment + const char* envName; + if(read_environ_str(LINKTEST_ENVIRON_PREFIX "VCLUSTER_IMPL", &envName) == SUCCESS) { + requestedImpl = {envName}; } - if(likely(suffix_offset>=0)){ //Check if suffix was encountered - // Compare suffix to supported virtual cluster implementations - for(auto i=0;VirtualCluster::impls[i];i++){ //Loop over possible virtual cluster implementations - /* Loop termination is handled by the fact that the last 'VirtualCluster::impls' is - * null pointer.
- */ - if (unlikely(!strcmp(VirtualCluster::impls[i],&(argv[0][suffix_offset+1])))){ //Compare suffix to possible implementations - return VirtualCluster::impls[i]; //Return identified virtual-cluster implementation - } + + #if 1 == HAVE_VCLUSTER_TCP + debug("HAVE_VCLUSTER_TCP == 1"); + #endif + #if 1 == HAVE_VCLUSTER_MPI + debug("HAVE_VCLUSTER_MPI == 1"); + #endif + #if 1 == HAVE_VCLUSTER_IBVERBS + debug("HAVE_VCLUSTER_IBVERBS == 1"); + #endif + #if 1 == HAVE_VCLUSTER_PSM2 + debug("HAVE_VCLUSTER_PSM2 == 1"); + #endif + #if 1 == HAVE_VCLUSTER_UCP + debug("HAVE_VCLUSTER_UCP == 1"); + #endif + #if 1 == HAVE_VCLUSTER_PORTALS + debug("HAVE_VCLUSTER_PORTALS == 1"); + #endif + #if 1 == HAVE_VCLUSTER_CUDA + debug("HAVE_VCLUSTER_CUDA == 1"); + #endif + debug("requestedImpl = %s", requestedImpl.c_str()); + + for(const auto& impl : VirtualCluster::impls) { + if(impl == requestedImpl) { + return impl; } - }else if(unlikely(suffix_offset<-1)){ //This should not happen logically! - error("Internal Error: Unexpected negative executable-name-suffix offset encountered."); - return(NULL); - } } - - /* Check If Environment Specifies Virtual-Cluster Implementation */ - if(unlikely(read_environ_str(LINKTEST_ENVIRON_PREFIX "VCLUSTER_IMPL",&name))) return NULL; - return name; + + error("Unknown/Unsupported virtual-cluster implementation requested: '%s'", requestedImpl.c_str()); + return VirtualCluster::impls.at(0); // fallback return to suppress the missing-return warning; NOTE(review): reached if error() returns - confirm } int VirtualCluster::write_parallel(const linktest_args* args, const std::vector<LinktestStats>& statsVec) diff --git a/benchmark/vcluster.h b/benchmark/vcluster.h index 751c4fe..b516073 100644 --- a/benchmark/vcluster.h +++ b/benchmark/vcluster.h @@ -16,7 +16,6 @@ #include <string> #include <memory> - struct linktest_args; /* A virtual view of our cluster.
The VirtualCluster @@ -275,12 +274,11 @@ public: virtual int write_parallel(const linktest_args* args, const std::vector<LinktestStats>& statsVec); virtual int write_funnelled(const linktest_args* args, const std::vector<LinktestStats>& statsVec); - /* Given the name of the vcluster implementation create an instance. This - * function accesses an internal database to map the name of the implementation - * to a function that creates the instance. In order for this to work, the - * implementation needs to be properly registered (see linktest_vcluster.c). - */ - static const char* impls[]; + /** \brief List of supported transport protocol (build at compile time) */ + static const std::vector<std::string> impls; + /** \return name of the requested transport protocol aka the virtual cluster imlplementation */ + static const std::string& get_vcluster_impl_name(char** argv, const std::string& name); + /** \return pointer to requested VirtualCluster implementation */ static VirtualCluster* factory(const std::string& name); private: @@ -296,13 +294,14 @@ private: std::shared_ptr<int[]> hostLocalRanks_; }; -/* Since a full implementation of send()/recv() logic on top of some transport layer is +/* VirtualClusterWithHelper + * delegates calls to rank(), size(), send() and recv() to another VirtualCluster (the helper) + * executes calls to benchmark kernels directly + * + * Since a full implementation of send()/recv() logic on top of some transport layer is * more complicated than the logic required for the implementation of the communication - * in kpingpong() it make sense to use a different VirtualCluster for the management - * communication than for the actual benchmark. - * VirtualClusterWithHelper allows derived classes to easily re-use another VirtualCluster - * instance. We do not use inheritance since the helper logic is not really an "is-a" - * relation. 
+ * in our kernels it make sense to use a different VirtualCluster for the management + * communication than for the actual benchmark. */ class VirtualClusterWithHelper : public VirtualCluster { @@ -320,7 +319,7 @@ public: int recv(int src, MemoryBuffer& buf) override; protected: - void set_helper_pointer(VirtualCluster* helper); + void set_helper_pointer(VirtualCluster* helper); VirtualCluster* helper_; }; @@ -338,12 +337,6 @@ enum vcluster_reduce_op { SUM_DOUBLE }; -/* Get the name of the vcluster implementation to be used. The implementation - * can be chosen by means of argv[0] or an environment variable set by the - * spawner. - */ -const char* get_vcluster_impl_name(char** argv, const char* name); - template<typename T> int VirtualCluster::send(const int dst,const T* const vals,const int len){ auto tmp = MemoryBuffer::wrap<T>(const_cast<T*>(vals), len, AddressSpace::ID::Local); diff --git a/benchmark/vcluster_cuda.cc b/benchmark/vcluster_cuda.cc index 57ccc21..970d892 100644 --- a/benchmark/vcluster_cuda.cc +++ b/benchmark/vcluster_cuda.cc @@ -16,7 +16,7 @@ #include "error.h" #include "output_sion.h" #include "pmi.h" -#if HAVE_CUDA == 1 +#if HAVE_VCLUSTER_CUDA == 1 #include "gpu_nvidia.h" #endif #include <cassert> diff --git a/benchmark/vcluster_helper.cc b/benchmark/vcluster_helper.cc index c284fa9..b977cf1 100644 --- a/benchmark/vcluster_helper.cc +++ b/benchmark/vcluster_helper.cc @@ -32,11 +32,16 @@ std::string determineHostname(){ int determineCPUID(){ return (std::int32_t)sched_getcpu(); } - +#ifdef DEBUG_BARRIER +static int barrierCounter=1; +#endif int vcluster_helper_barrier(VirtualCluster* cl){ /* We do not actually send data but we still need to have * a non-NULL buffer pointer */ + #ifdef DEBUG_BARRIER + debug("vcluster_helper_barrier %d", barrierCounter++); + #endif char sp = 0; MemoryBuffer buf = MemoryBuffer::wrap<char>(&sp, 0, AddressSpace::ID::Local); diff --git a/benchmark/vcluster_mpi.cc b/benchmark/vcluster_mpi.cc index b481925..e7efc78 
100644 --- a/benchmark/vcluster_mpi.cc +++ b/benchmark/vcluster_mpi.cc @@ -133,9 +133,14 @@ int VirtualClusterMPI::recv(int src, MemoryBuffer& buf) src, 0, world_, MPI_STATUS_IGNORE)); } - +#ifdef DEBUG_BARRIER +static int counter = 1; +#endif int VirtualClusterMPI::barrier() { + #ifdef DEBUG_BARRIER + debug("VirtualClusterMPI::barrier %d",counter++); + #endif return _mpi_(MPI_Barrier(world_)); } diff --git a/benchmark/vcluster_portals.cc b/benchmark/vcluster_portals.cc new file mode 100644 index 0000000..f408683 --- /dev/null +++ b/benchmark/vcluster_portals.cc @@ -0,0 +1,280 @@ +/**************************************************************************** +** LinkTest ** +***************************************************************************** +** Copyright (c) 2008-2022 ** +** Forschungszentrum Juelich, Juelich Supercomputing Centre ** +** ** +** See the file COPYRIGHT in the package base directory for details ** +****************************************************************************/ +#include "vcluster_portals.h" +#include "portals4_macros.h" +#include "memory.h" +#include "error.h" +#include "timing.h" +#include "stopwatch.h" +#include <assert.h> + +std::vector<ptl_process_t> VirtualClusterPortals::getPhysicalFromRank() { + ptl_process_t physId; + CHECK_RETURNVAL( PtlGetPhysId(mni_handle, &physId) ); + + debug("PMI Rank=%d, Hostname=%10s, Portals NID=%d PID=%d", + rank(), + hostname().c_str(), + physId.phys.nid, + physId.phys.pid); + + std::vector<ptl_process_t> physicalFromRank(size()); + gather(0, physicalFromRank.data(), &physId, 1); + bcast(0, physicalFromRank.data(), size()); + + if(physicalFromRank.at(rank()).phys.nid != physId.phys.nid) fatal("Failed to broadcast physicalFromRank"); + if(physicalFromRank.at(rank()).phys.pid != physId.phys.pid) fatal("Failed to broadcast physicalFromRank"); + + return physicalFromRank; +} + +int VirtualClusterPortals::init() +{ + set_helper_pointer(VirtualCluster::factory("mpi")); + 
EXEC_NOFAIL(helper_->init()); + + if(PTL_MAJOR_VERSION != 4 || PTL_MINOR_VERSION != 0) { + warn("Portals versions other than 4.0 may not be suppported"); + } + CHECK_RETURNVAL( PtlInit() ); + CHECK_RETURNVAL( PtlNIInit( + PTL_IFACE_DEFAULT, // Manual 3.3.5: "Check README" + PTL_NI_LOGICAL | PTL_NI_MATCHING, // Logical => using ranks, Matching => using send/recv semantics + PTL_PID_ANY, + nullptr,// &mni_limits_desired + &mni_limits_actual, + &mni_handle)); + + auto physicalFromRank = getPhysicalFromRank(); + CHECK_RETURNVAL( PtlSetMap(mni_handle, physicalFromRank.size(), physicalFromRank.data()) ); + const auto DEFAULT_OPTIONS = 0; + CHECK_RETURNVAL( PtlEQAlloc(mni_handle, 1000, &pt_eq_handle) ); + CHECK_RETURNVAL( PtlPTAlloc(mni_handle, DEFAULT_OPTIONS, pt_eq_handle, PTL_PT_ANY, &pt_index) ); + + return SUCCESS; +} + +int VirtualClusterPortals::finalize() +{ + debug("VirtualClusterPortals::finalize()"); + CHECK_RETURNVAL( PtlPTFree(mni_handle, pt_index) ); + CHECK_RETURNVAL( PtlNIFini(mni_handle) ); + PtlFini(); + EXEC_NOFAIL(helper_->finalize()); + return SUCCESS; +} + +void VirtualClusterPortals::prepareSendStructs(const MemoryBuffer& buf) { + debug("VirtualClusterPortals::prepareSendStructs(%p)", buf.p()); + md.start = buf.p(); + md.length = buf.len(); + md.options = PTL_MD_EVENT_CT_ACK; + md.eq_handle = PTL_EQ_NONE; // i.e. 
don't queue send events + CHECK_RETURNVAL( PtlCTAlloc(mni_handle, &md.ct_handle) ); + CHECK_RETURNVAL( PtlMDBind(mni_handle, &md, &md_handle) ); // Bind memory descriptor +} + +void VirtualClusterPortals::prepareRecvStructs(const MemoryBuffer& buf) { + debug("VirtualClusterPortals::prepareRecvStructs(%p)", buf.p()); + me.start = buf.p(); + me.length = buf.len(); + me.uid = PTL_UID_ANY; + me.match_id.rank = PTL_RANK_ANY; + me.match_bits = MATCH_BITS; + me.ignore_bits = IGNORE_BITS; + me.options = (PTL_ME_OP_PUT | PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_COMM_DISABLE ); // React to puts, count communication events, do not generate full communication events + CHECK_RETURNVAL( PtlCTAlloc(mni_handle, &me.ct_handle)); + CHECK_RETURNVAL( PtlMEAppend(mni_handle, pt_index, &me, PTL_PRIORITY_LIST, nullptr, &me_handle)); + ptl_event_t event; + CHECK_RETURNVAL( PtlEQWait(pt_eq_handle, &event) ); // TODO allow PTL_EQ_DROPPED + if (! (event.type == PTL_EVENT_LINK && event.ni_fail_type == PTL_NI_OK)) { + // TODO Check for overflow/dropped events + error("PtlMEAppend failed"); + } +} + +ptl_size_t VirtualClusterPortals::getSendCounter() { + debug("VirtualClusterPortals::getSendCounter()"); + CHECK_RETURNVAL( PtlCTGet(md.ct_handle, &send_ct) ); + debug("Send (MD): success %d - failure %d", send_ct.success, send_ct.failure); + if(send_ct.failure > 0) { + error("Failed operation on MD"); + } + return send_ct.success; +} + +ptl_size_t VirtualClusterPortals::getRecvCounter() { + debug("VirtualClusterPortals::getRecvCounter()"); + CHECK_RETURNVAL( PtlCTGet(me.ct_handle, &recv_ct) ); + debug("Recv (ME): success %d - failure %d", recv_ct.success, recv_ct.failure); + if(recv_ct.failure > 0) { + error("Failed operation on ME"); + } + return recv_ct.success; +} + +void VirtualClusterPortals::recvMessages(const unsigned long num_msg, const unsigned long counter_start) { + debug("VirtualClusterPortals::recvMessages(%lu, %lu)", num_msg, counter_start); + CHECK_RETURNVAL( PtlCTWait(me.ct_handle, 
counter_start + num_msg, &recv_ct) ); +} + +void VirtualClusterPortals::sendMessages(const int to, MemoryBuffer& buf, const unsigned long num_msg, const unsigned long counter_start) { + debug("VirtualClusterPortals::sendMessages(%d, %p, %lu, %lu)", to, buf.p(), num_msg, counter_start); // argument order must match the format: %d, %p, %lu, %lu + const ptl_size_t localOffset = 0; + const ptl_size_t remoteOffset = 0; + const ptl_hdr_data_t header_data = 0; + ptl_process_t target; + target.rank = to; + + for(unsigned long n = 1; n <= num_msg; n++) { + CHECK_RETURNVAL( PtlPut(md_handle, localOffset, buf.len(), PTL_CT_ACK_REQ, target, pt_index, MATCH_BITS, remoteOffset, nullptr, header_data) ); + } + CHECK_RETURNVAL( PtlCTWait(md.ct_handle, counter_start + num_msg, &send_ct) ); +} + +void VirtualClusterPortals::releaseRecvStructs() { + debug("releaseRecvStructs()"); + CHECK_RETURNVAL( PtlMEUnlink(me_handle) ); + CHECK_RETURNVAL( PtlCTFree(me.ct_handle) ); +}; + +void VirtualClusterPortals::releaseSendStructs() { + debug("releaseSendStructs()"); + CHECK_RETURNVAL( PtlMDRelease(md_handle) ); + CHECK_RETURNVAL( PtlCTFree(md.ct_handle) ); +}; + +int VirtualClusterPortals::kpingpong(const int from, const int to, MemoryBuffer& buf, const int num_msg, double* const timing) +{ + bool isSender = rank() == from; + bool isReceiver = rank() == to; + auto watch = Stopwatchfactory::getRankWatch(rank(), from); + prepareSendStructs(buf); + prepareRecvStructs(buf); + auto sendCounterBeforeKernel = getSendCounter(); + auto recvCounterBeforeKernel = getRecvCounter(); + + barrier(); + + if(isSender) { + watch->start(); + sendMessages(to, buf, num_msg, sendCounterBeforeKernel); + recvMessages(num_msg, recvCounterBeforeKernel); + watch->stop(); + } + if(isReceiver) { + watch->start(); + recvMessages(num_msg, recvCounterBeforeKernel); + sendMessages(from, buf, num_msg, sendCounterBeforeKernel); + watch->stop(); + } + + barrier(); + + getSendCounter(); + getRecvCounter(); + releaseSendStructs(); + releaseRecvStructs(); + if (timing != nullptr) + { +
*timing = watch->getDuration().count() / (2*num_msg); + } + + return SUCCESS; +} + +int VirtualClusterPortals::kUniDir( + const int from, const int to, + MemoryBuffer &buf1, MemoryBuffer &buf2, + const int num_msg, double *const timing, + const bool /*doBarrier*/) +{ + bool isSender = rank() == from; + bool isReceiver = rank() == to; + auto watch = Stopwatchfactory::getRankWatch(rank(), from); + prepareSendStructs(buf1); + prepareRecvStructs(buf2); + auto sendCounterBeforeKernel = getSendCounter(); + auto recvCounterBeforeKernel = getRecvCounter(); + + barrier(); + + if(isSender) { + watch->start(); + sendMessages(to, buf1, num_msg, sendCounterBeforeKernel); + recvMessages(1, recvCounterBeforeKernel); + watch->stop(); + } + if(isReceiver) { + watch->start(); + recvMessages(num_msg, recvCounterBeforeKernel); + sendMessages(from, buf1, 1, sendCounterBeforeKernel); + watch->stop(); + } + + barrier(); + + getSendCounter(); + getRecvCounter(); + releaseSendStructs(); + releaseRecvStructs(); + if (timing != nullptr) + { + *timing = watch->getDuration().count() / num_msg; + } + + return SUCCESS; + +} + +int VirtualClusterPortals::kbipingpong( + const int from, const int to, + MemoryBuffer& buf1, MemoryBuffer& buf2, + const int num_msg, double* const timing) +{ + int partner; + if(rank() == from) { + partner = to; + } + if(rank() == to) { + partner = from; + } + auto watch = Stopwatchfactory::getRankWatch(rank(), from); + prepareSendStructs(buf1); + prepareRecvStructs(buf2); + auto sendCounterBeforeKernel = getSendCounter(); + auto recvCounterBeforeKernel = getRecvCounter(); + + barrier(); + + watch->start(); + sendMessages(partner, buf1, num_msg, sendCounterBeforeKernel); + recvMessages(num_msg, recvCounterBeforeKernel); + watch->stop(); + + barrier(); + + getSendCounter(); + getRecvCounter(); + releaseSendStructs(); + releaseRecvStructs(); + if (timing != nullptr) + { + *timing = watch->getDuration().count() / (2.0 * num_msg); + } + + return SUCCESS; + +} + + 
+VirtualClusterPortals::VirtualClusterPortals() +: VirtualClusterWithHelper(VirtualClusterPortals::NAME) +{ +} diff --git a/benchmark/vcluster_portals.h b/benchmark/vcluster_portals.h new file mode 100644 index 0000000..0a00c3d --- /dev/null +++ b/benchmark/vcluster_portals.h @@ -0,0 +1,101 @@ +/**************************************************************************** +** LinkTest ** +***************************************************************************** +** Copyright (c) 2008-2022 ** +** Forschungszentrum Juelich, Juelich Supercomputing Centre ** +** ** +** See the file COPYRIGHT in the package base directory for details ** +****************************************************************************/ +#ifndef LINKTEST_VCLUSTER_PORTALS_H +#define LINKTEST_VCLUSTER_PORTALS_H + +extern "C" { +#include <portals4.h> +} +#include <vector> +#include "vcluster.h" + +// VirtualCluster implementation based on a Portals 4 +class VirtualClusterPortals : public VirtualClusterWithHelper +{ + +public: + static constexpr char NAME[] = "portals"; + VirtualClusterPortals(); + int init() override; + int finalize() override; + + int kpingpong(const int from, const int to, MemoryBuffer& buf, + const int num_msg, double* const timing) override; + + int kUniDir(const int from, const int to, + MemoryBuffer& buf1, MemoryBuffer& buf2, + const int num_msg, double* const timing, + const bool doBarrier) override; + + int kUniDirMultiBuf(const int from,const int to, + MemoryBufferMulti& buf_multi, MemoryBuffer& buf2, + const int num_msg, double* const timing, + const bool doBarrier) override { + throw("Not Implemented"); // Use PTL_ME_MANAGE_LOCAL + }; + int kUniDirLimitedMultiBuf(const int from,const int to, + MemoryBufferMulti& buf_multi, MemoryBuffer& buf2, + const int num_msg, double* const timing, + const bool doBarrier) override { + throw("Not Implemented"); + }; + + int kbipingpong(const int from, const int to, + MemoryBuffer& buf1, MemoryBuffer& buf2, + const int num_msg, 
double* const timing) override; + +private: + // matching (send/recv) Network Interface (ni) + ptl_ni_limits_t mni_limits_desired; + ptl_ni_limits_t mni_limits_actual; + ptl_handle_ni_t mni_handle; + + /** @brief Portal Table (PT) Index */ + ptl_pt_index_t pt_index; + /** @brief PT Event Queue Handle */ + ptl_handle_eq_t pt_eq_handle; + + const ptl_match_bits_t MATCH_BITS = 1; // TODO when/how to use these? + const ptl_match_bits_t IGNORE_BITS = ~0; // ignore all bits + + // Sender + /** @brief Memory Descriptor (MD) */ + ptl_md_t md; + /** @brief MD Handle */ + ptl_handle_md_t md_handle; + /** @brief Send Counter */ + ptl_ct_event_t send_ct; + /** @brief Prepare portals data structures on sender side (MD) */ + void prepareSendStructs(const MemoryBuffer& buf); + /** @brief Read current send (MD) counter value */ + ptl_size_t getSendCounter(); + /** @brief Free portals data structures on sender side (MD) */ + void releaseSendStructs(); + + // Receiver + /** @brief Match List Entry (ME) */ + ptl_me_t me; + /** @brief ME Handle */ + ptl_handle_me_t me_handle; + /** @brief Receive Counter */ + ptl_ct_event_t recv_ct; + /** @brief Prepare portals data structures on receiver side (ME) */ + void prepareRecvStructs(const MemoryBuffer& buf); + /** @brief Read current recv (ME) counter value */ + ptl_size_t getRecvCounter(); + /** @brief Free portals data structures on receiver side (ME) */ + void releaseRecvStructs(); + + void sendMessages(const int to, MemoryBuffer& buf, const unsigned long num_msg, const unsigned long counter_start); + void recvMessages(const unsigned long num_msg, const unsigned long counter_start); + + std::vector<ptl_process_t> getPhysicalFromRank(); +}; + +#endif diff --git a/benchmark/vcluster_tcp.cc b/benchmark/vcluster_tcp.cc index 494d4c4..3dfeac9 100644 --- a/benchmark/vcluster_tcp.cc +++ b/benchmark/vcluster_tcp.cc @@ -113,12 +113,14 @@ int VirtualClusterTCP::read_tcp_environ_rank_and_size() #if 1 == HAVE_MINIPMI 
EXEC_IFFAIL(minipmi_get_size(pmi_, &size), error("minipmi_get_size() failed."); return ERROR); - EXEC_IFFAIL(minipmi_get_rank(pmi_, &rank), error("minipmi_get_rank() failed."); return ERROR); rank_ = rank; size_ = size; + #if defined(DEBUG_MINIPMI) + info("PMI rank: %d, PMI size: %d", rank, size); + #endif return SUCCESS; #else EXEC_IFFAIL(read_environ_int(TCP_ENVIRON_PREFIX "SIZE", &size), error("Failed to read environment variable " TCP_ENVIRON_PREFIX "SIZE"); return ERROR); @@ -599,16 +601,14 @@ int VirtualClusterTCP::init() { auto ret = linktest_minipmi_context_borrow(&pmi_); #if 1 == HAVE_MINIPMI - if (unlikely(ret)) { + if (ret != SUCCESS) { error("linktest_minipmi_context_borrow() failed."); - return ERROR; + return ret; } #endif EXEC_NOFAIL(read_tcp_environ()); - EXEC_NOFAIL(read_tcp_environ_rank_and_size()); - EXEC_NOFAIL(connect_to_all()); disable_nagles_algorithm(); @@ -665,7 +665,7 @@ int VirtualClusterTCP::send(int dst, MemoryBuffer& buf) } if (unlikely((dst < 0) || (dst >= size_ ))) { - error("Invalid rank."); + error("Invalid rank: 0 < %d < %d", dst, size_); return ERROR; } diff --git a/exampleBuild.sh b/exampleBuild.sh index 00ac73b..07eac17 100755 --- a/exampleBuild.sh +++ b/exampleBuild.sh @@ -23,7 +23,7 @@ export CPATH=$CPATH:~/.local/include/; mkdir -p install; cd benchmark; make clean -make -j HAVE_TCP=1 HAVE_IBVERBS=1 HAVE_UCP=1 PREFIX=../install install; +make -j 12 HAVE_TCP=1 HAVE_IBVERBS=1 HAVE_UCP=1 PREFIX=../install install; make clean cd ..; # Install linktest-report diff --git a/test/Default.xml b/test/Default.xml index c0abbfe..734aaad 100644 --- a/test/Default.xml +++ b/test/Default.xml @@ -29,11 +29,11 @@ <parameter name="DefaultCompiler">GCC</parameter> <parameter name="Compiler" tag="!noCompileRunTest">GCC,Intel,NVHPC</parameter> <parameter name="Compiler" tag="noCompileRunTest">${DefaultCompiler}</parameter> - <parameter name="DefaultMPI">OpenMPI</parameter> + <parameter name="DefaultMPI">ParaStationMPI</parameter> <parameter 
name="MPI" mode="python" tag="!noCompileRunTest"> - { + "ParaStationMPI" if "${System_Name}" == "deep" else { "GCC": "ParaStationMPI,OpenMPI", - "Intel": "ParaStationMPI,OpenMPI,IntelMPI", + "Intel": "IntelMPI", "NVHPC": "ParaStationMPI,OpenMPI" }[ "${Compiler}" ] </parameter> @@ -46,11 +46,13 @@ }[ "${Compiler}" ] </parameter> <parameter name="WithCUDA">("${CUDA}" == "CUDA")</parameter> - <parameter name="Stack">$Compiler $MPI</parameter> - <parameter name="Default_Stack">$DefaultCompiler $DefaultMPI</parameter> + <parameter name="WithCUDATxt" mode="python">"Yes" if ${WithCUDA} else "No"</parameter> + <parameter name="Stack">${Compiler}_${MPI}</parameter> + <parameter name="StackWithCuda">${Stack}_${CUDA}</parameter> + <parameter name="Default_Stack">${DefaultCompiler}_${DefaultMPI}</parameter> <parameter name="Unload_CUDA" mode="python">"CUDA" if "${Compiler} ${MPI} ${CUDA} " == "Intel IntelMPI " else ""</parameter> <parameter name="Transport_Layer_Settings" mode="python"> - "" if not ${WithCUDA} else { + "" if not ${WithCUDA} or "${System_Name}" == "deep" else { "ParaStationMPI": "mpi-settings/CUDA", "OpenMPI": "UCX-settings/RC-CUDA", "IntelMPI": "" @@ -68,7 +70,13 @@ </parameter> </parameterset> <parameterset name="Slurm"> <!-- depends on Linktest_Args, System and Environment parameters --> - <parameter name="Account">cstao</parameter> + <parameter name="Account" mode="python"> + { + "juwels": "cstao", + "jurecadc": "cstao", + "deep": "deepsea" + }["${System_Name}"] + </parameter> <parameter name="Partition" mode="python"> { "juwels": { @@ -78,14 +86,20 @@ "jurecadc": { False: "dc-cpu-devel", True : "dc-gpu-devel" + }, + "deep": { + False: "dp-cn", + True : "dp-esb" } }["${System_Name}"][ ${WithGPUs} ] </parameter> <parameter name="Max_WallClock_Time">00:01:00</parameter> <parameter name="Number_Of_Nodes" mode="python">1 if "${Messaging_Layer}" == "cuda" else 2</parameter> - <parameter name="Number_Of_Tasks_Per_Node">4</parameter> + <parameter 
name="Number_Of_Tasks_Per_Node" mode="python"> + "1" if (${WithGPUs} and "${System_Name}" == "deep") else "4" + </parameter> <parameter name="Number_Of_Cores_Per_Task">1</parameter> - <parameter name="Gres" mode="python">"#SBATCH --gres=gpu:4" if ${WithGPUs} else ""</parameter> + <parameter name="Gres" mode="python">"#SBATCH --gres=gpu:${Number_Of_Tasks_Per_Node}" if ${WithGPUs} else ""</parameter> <parameter name="SRUN_Arguments" mode="python"> "" if "${Messaging_Layer}" == "mpi" else { "ParaStationMPI": "--mpi=pspmi", @@ -98,7 +112,8 @@ <parameter name="CuArch" mode="python"> { "juwels": "sm_70", - "jurecadc": "sm_80" + "jurecadc": "sm_80", + "deep": "sm_70", }[ "${System_Name}" ] </parameter> <parameter name="Enable_Layer" mode="python"> @@ -113,7 +128,7 @@ "": "" }[ "${CUDA}" ] </parameter> - <parameter name="Make">make -j ${Enable_Layer} ${DefineCuArch}</parameter> + <parameter name="Make">make -j24 ${Enable_Layer} ${DefineCuArch}</parameter> </parameterset> <parameterset name="Misc"> <!-- depends on Linktest_Args parameters --> <parameter name="Report_Name">linktest_${Messaging_Layer}_${Number_Of_Nodes}nx${Number_Of_Tasks_Per_Node}c</parameter> diff --git a/test/LayerTest.xml b/test/LayerTest.xml index 5ca1410..4ab7775 100644 --- a/test/LayerTest.xml +++ b/test/LayerTest.xml @@ -4,8 +4,29 @@ <parameter name="Messaging_Layer" mode="python"> { "juwels": "ibverbs,ucp,tcp,cuda", - "jurecadc": "ibverbs,ucp,tcp,cuda" <!-- TODO add psm2 which is available only on jureca booster which shares login node --> + "jurecadc": "ibverbs,ucp,tcp,cuda", + "deep": "ibverbs,ucp,tcp,cuda,portals" }[ "${System_Name}" ] - </parameter> <!-- Options: mpi,ibverbs,psm2,cuda,ucp,tcp --> + </parameter> <!-- Options: mpi,ibverbs,psm2,cuda,ucp,portals,tcp --> + +</parameterset> +<parameterset name="Slurm" init_with="Default.xml"> + <parameter name="Partition" mode="python"> + "dp-bxi" if "${Messaging_Layer}" == "portals" else { + "juwels": { + False: "devel", + True : "develgpus" + }, + 
"jurecadc": { + False: "dc-cpu-devel", + True : "dc-gpu-devel" + }, + "deep": { + False: "dp-cn", + True : "dp-esb" + } + }["${System_Name}"][ ${WithGPUs} ] + </parameter> <!-- portals runs use the "dp-bxi" partition; every other layer uses the per-system mapping above --> + + </parameterset> </jube> \ No newline at end of file diff --git a/test/LinktestMain.xml b/test/LinktestMain.xml index a55d84c..cf67976 100644 --- a/test/LinktestMain.xml +++ b/test/LinktestMain.xml @@ -3,6 +3,18 @@ <benchmark name="JSC Linktest Test Suite" outpath="runs"> <comment>Testing compilation and common usages of JSC Linktest</comment> + <parameterset name="JUBE_Extra"> + <parameter name="JUBE_REPORT_LAST_CMD" update_mode="step"> + if [ $? -eq 0 ]; then + touch "${jube_wp_abspath}/ready"; + else + echo "${jube_step_name} failed" >> "${jube_wp_abspath}/error"; + fi + </parameter> + </parameterset> + + + <fileset name="Sources"> <copy>../benchmark</copy> </fileset> @@ -49,30 +61,26 @@ <sub source="§SRUN_ARGS§" dest="${SRUN_Arguments}" /> </substituteset> - <step name="Compile" procs="9" tag="!(noLayerTest+noModeTest+noCompileTest)" suffix="${Stack}"> - <use>Sources</use> + <step name="Compile" procs="9" tag="!(noLayerTest+noModeTest+noCompileTest)" suffix="${Stack}_${CUDA}"> + <use>JUBE_Extra,Sources</use> <use from="Default.xml">System, Environment, Build</use> <do done_file="ready" error_file="error" tag="!dryRun"> set -x $Load_Modules cd benchmark $Make - if [ $?
-eq 0 ]; then - touch ../ready; - else - echo "Linktest compile failed" >> ../error; - fi + $JUBE_REPORT_LAST_CMD set +x </do> <do done_file="ready" error_file="error" tag="dryRun"> echo "Assume succesful compile" - touch ready + $JUBE_REPORT_LAST_CMD </do> </step> <step name="LayerTest" depend="Compile" active="'$Stack' == '$Default_Stack' and ${WithCUDA} == ${WithGPUs}" suffix="${Messaging_Layer}" tag="!noLayerTest"> - <use from="LayerTest.xml">Linktest_Args</use> - <use from="Default.xml">System, Environment, Slurm, Misc</use> + <use from="LayerTest.xml">Linktest_Args, Slurm</use> + <use from="Default.xml">System, Environment, Misc</use> <use>ExecutionScript</use> <use>SubstituteInputParameters</use> <do done_file="ready" error_file="error" tag="!dryRun">sbatch execute.sbatch</do> @@ -88,7 +96,7 @@ <step name="CompileLinktestReport" active="'$Stack' == '$Default_Stack'" tag="!noLinktestReportTest"> <use from="Default.xml">Environment</use> - <use>ReportSources</use> + <use>JUBE_Extra,ReportSources</use> <do done_file="ready" error_file="error"> set -x $Load_Modules @@ -96,17 +104,13 @@ python3 -m venv venvLinktest source venvLinktest/bin/activate pip install ./python - if [ $? 
-eq 0 ]; then - touch ready; - else - echo "linktest-report compile failed" >> error; - fi + $JUBE_REPORT_LAST_CMD deactivate set +x </do> </step> - <step name="CompileRunTest" procs="9" depend="Compile" active="${WithCUDA} == ${WithGPUs}" suffix="${Stack}_${CUDA}" tag="!noCompileRunTest"> + <step name="CompileRunTest" procs="9" depend="Compile" active="${WithCUDA} == ${WithGPUs}" suffix="${StackWithCuda}" tag="!noCompileRunTest"> <use from="CompileRunTest.xml">Linktest_Args</use> <use from="Default.xml">System, Environment, Slurm, Misc</use> <use>ExecutionScript</use> @@ -115,16 +119,13 @@ </step> <step name="LinktestReportTest" procs="7" depend="ModeTest,CompileLinktestReport" active="$No_Sion_File == 0" suffix="${Mode}" tag="!(noLinktestReportTest|noModeTest)"> + <use>JUBE_Extra</use> <do done_file="ready" error_file="error" tag="!dryRun"> set -x $Load_Modules source CompileLinktestReport/venvLinktest/bin/activate linktest-report -i ModeTest/${Report_Name}.sion -o report.pdf - if [ $? 
-eq 0 ]; then - touch ready; - else - echo "python-report run failed" >> error; - fi + $JUBE_REPORT_LAST_CMD deactivate set +x </do> @@ -143,32 +144,46 @@ <pattern name="Options">\+ srun .*?\.sion (.*?)\n</pattern> </patternset> - <patternset name="errorFilePatterns"> - <pattern name="error_msg">.*</pattern> + <patternset name="genericPatterns"> + <pattern name="all">.*</pattern> </patternset> <!-- Analyse --> + <analyser name="analyseCompiles"> + <analyse step="Compile"> + <file use="genericPatterns">error</file> + <file use="genericPatterns">ready</file> + </analyse> + </analyser> + <analyser name="analyseRuns"> - <analyse step="LayerTest" tag="!noLayerTest"> + <analyse step="Compile"> + <file use="genericPatterns">error</file> + <file use="genericPatterns">ready</file> + </analyse> + <analyse step="CompileRunTest" tag="!noCompileRunTest"> <file use="LinktestOutPatterns">linktest.log</file> <file use="LinktestErrPatterns">linktest.error</file> - <file use="errorFilePatterns">error</file> + <file use="genericPatterns">error</file> + <file use="genericPatterns">ready</file> </analyse> - <analyse step="ModeTest" tag="!noModeTest"> + <analyse step="LayerTest" tag="!noLayerTest"> <file use="LinktestOutPatterns">linktest.log</file> <file use="LinktestErrPatterns">linktest.error</file> - <file use="errorFilePatterns">error</file> + <file use="genericPatterns">error</file> + <file use="genericPatterns">ready</file> </analyse> - <analyse step="CompileRunTest" tag="!noCompileRunTest"> + <analyse step="ModeTest" tag="!noModeTest"> <file use="LinktestOutPatterns">linktest.log</file> <file use="LinktestErrPatterns">linktest.error</file> - <file use="errorFilePatterns">error</file> + <file use="genericPatterns">error</file> + <file use="genericPatterns">ready</file> </analyse> </analyser> <analyser name="analyseReports"> <analyse step="LinktestReportTest" tag="!(noLinktestReportTest|noModeTest)"> - <file use="errorFilePatterns">error</file> + <file 
use="genericPatterns">error</file> </analyse> </analyser> @@ -191,15 +206,27 @@ </result> <result> <use>analyseRuns,analyseReports</use> - <table name="ErrorResult" style="pretty" sort="jube_step_name"> + <table name="RunErrors" style="pretty" sort="jube_step_name,Compiler,MPI,Transport_Layer_Settings,WithCUDATxt,Messaging_Layer,SRUN_Arguments,Options"> <column title="Test">jube_step_name</column> <column title="Compiler">Compiler</column> <column title="MPI">MPI</column> - <column title="Setting">Transport_Layer_Settings</column> + <column title="MPI Settings">Transport_Layer_Settings</column> + <column title="CUDA">WithCUDATxt</column> <column title="Layer">Messaging_Layer</column> <column title="Srun Args">SRUN_Arguments</column> <column title="Options">Options</column> - <column title="Errors">error_msg</column> + <column title="Errors">all</column> + </table> + </result> + <result> + <use>analyseCompiles</use> + <table name="CompileErrors" style="pretty" sort="jube_step_name,Compiler,MPI,CUDA"> + <column title="Test">jube_step_name</column> + <column title="Compiler">Compiler</column> + <column title="MPI">MPI</column> + <column title="MPI Settings">Transport_Layer_Settings</column> + <column title="CUDA">WithCUDATxt</column> + <column title="Errors">all</column> </table> </result> diff --git a/test/execute_base.sbatch b/test/execute_base.sbatch index e89c7dd..21cfb76 100644 --- a/test/execute_base.sbatch +++ b/test/execute_base.sbatch @@ -50,7 +50,7 @@ fi if [ §NUM_RANDOMIZE_TASKS§ -ne 0 ]; then args+=" --num-randomize-tasks §NUM_RANDOMIZE_TASKS§" fi -if [ §HOSTNAME_GROUPING§ ]; then +if [ §HOSTNAME_GROUPING§ -ne 0 ]; then args+=" --group-processes-by-hostname" fi set -x # echos commands before executing @@ -61,7 +61,7 @@ srun --ntasks=${SLURM_NTASKS} \ # Indicate Success to jube if [ $? -ne 0 ]; then - echo "linktest run failed" >> error; + echo "LinkTest run failed" >> error; else touch ready; fi -- GitLab