From b4e19d9bf862bc4696b62a6a0e77d1e0270ac704 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yannik=20M=C3=BCller?= <y.mueller@fz-juelich.de>
Date: Wed, 21 Feb 2024 16:29:30 +0100
Subject: [PATCH] Adding CI/CD pipeline together with recent bugfixes

---
 .gitlab-ci.yml                | 25 ++++++++++++++++++++
 benchmark/benchmark.cc        | 44 +++++++++++++++++------------------
 benchmark/gpu_nvidia.cc       | 11 +--------
 benchmark/memusage.cc         |  2 +-
 benchmark/utils.cc            |  2 +-
 benchmark/vcluster.cc         |  2 +-
 benchmark/vcluster_ibverbs.cc |  5 ++--
 test/Default.xml              | 10 +++++++-
 test/LinktestMain.xml         | 29 +++++++----------------
 test/execute_base.sbatch      |  6 ++++-
 10 files changed, 77 insertions(+), 59 deletions(-)
 create mode 100644 .gitlab-ci.yml

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..26a6d0a
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,25 @@
+workflow:
+  rules:
+    # Run only for merge requests that target the default branch.
+    - if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == $CI_DEFAULT_BRANCH
+
+stages:
+  - test
+
+juwels-test-job:
+  stage: test
+  tags: [jacamar,shell,juwels,login]
+  script:
+    - module load JUBE
+    - jube-autorun test/LinktestMain.xml
+    - '[ $(jube result test/runs/ | grep -cF "Compile failed") -le 1 ]' # Known Issues: Intel + IntelMPI
+    - '[ $(jube result test/runs/ | grep -cF "Cleanup failed") -le 4 ]' # Known Issues: GCC/NVHPC + ParaStationMPI + CUDA, Layertest ibverbs/CUDA
+
+jureca-test-job:
+  stage: test
+  tags: [jacamar,shell,jureca,login]
+  script:
+    - module load JUBE
+    - jube-autorun test/LinktestMain.xml
+    - '[ $(jube result test/runs/ | grep -cF "Compile failed") -le 1 ]' # Known Issues: Intel + IntelMPI
+    - '[ $(jube result test/runs/ | grep -cF "Cleanup failed") -le 4 ]' # Known Issues: GCC/NVHPC + ParaStationMPI + CUDA, Layertest ibverbs/CUDA
diff --git a/benchmark/benchmark.cc b/benchmark/benchmark.cc
index 444c346..88ae35f 100644
--- a/benchmark/benchmark.cc
+++ b/benchmark/benchmark.cc
@@ -710,28 +710,28 @@ int Benchmark::handle_slow_pairs(const int iter){
     const int n = args->max_stest;
 
     if(n){ //It makes no sense to run the serial tests suit if no tests are to be run
-    std::unique_ptr<StopwatchI> rootWatch = Stopwatchfactory::getRootWatch(rank());
-
-    rootWatch->start();
-    auto sp = new slow_pair[n];
-    auto ret = gather_slow_pairs(sp, n);
-    if (unlikely(ret))
-        goto out;
-
-    rootWatch->stop();
-    printTimingIfRoot(rank(), "[search slow]", rootWatch->getDuration());
-
-    rootWatch->start();
-    ret = retest_slow_pairs(sp, n, iter);
-    if (unlikely(ret)) goto out;
-    rootWatch->stop();
-    printTimingIfRoot(rank(), "[test slow]", rootWatch->getDuration());
-
-    delete[] sp;
-    return SUCCESS;
-out:
-    delete[] sp;
-    return ERROR;
+        std::unique_ptr<StopwatchI> rootWatch = Stopwatchfactory::getRootWatch(rank());
+
+        rootWatch->start();
+        // Buffer of n slow_pair entries. A unique_ptr owns it, so it is released
+        // on every exit path and the goto-based cleanup label with its
+        // duplicated delete[] is no longer needed.
+        const std::unique_ptr<slow_pair[]> sp(new slow_pair[n]);
+        auto ret = gather_slow_pairs(sp.get(), n);
+        if (unlikely(ret))
+            return ERROR;
+
+        rootWatch->stop();
+        printTimingIfRoot(rank(), "[search slow]", rootWatch->getDuration());
+
+        rootWatch->start();
+        ret = retest_slow_pairs(sp.get(), n, iter);
+        if (unlikely(ret))
+            return ERROR;
+        rootWatch->stop();
+        printTimingIfRoot(rank(), "[test slow]", rootWatch->getDuration());
+
+        return SUCCESS;
     }
     return SUCCESS;
 }
diff --git a/benchmark/gpu_nvidia.cc b/benchmark/gpu_nvidia.cc
index a82d30e..043df6c 100644
--- a/benchmark/gpu_nvidia.cc
+++ b/benchmark/gpu_nvidia.cc
@@ -159,15 +159,11 @@ void* GpuContext::memalloc(std::uint64_t len)
         return nullptr;
 
     return reinterpret_cast<void*>(p);
-
-    return nullptr;
-
 }
 
 int GpuContext::memfree(void* p)
 {
-
-    auto err = execute([&p] {
+    return execute([&p] {
         auto err = cuMemFree(reinterpret_cast<CUdeviceptr>(p));
         if (unlikely(CUDA_SUCCESS != err)) {
             report_cuda_error("cuMemFree", err);
@@ -176,11 +172,6 @@ int GpuContext::memfree(void* p)
 
         return SUCCESS;
     });
-
-    return err;
-
-    return -1;
-
 }
 
 GpuContext* GpuContext::singleton()
diff --git a/benchmark/memusage.cc b/benchmark/memusage.cc
index 9d2898b..29e5edf 100644
--- a/benchmark/memusage.cc
+++ b/benchmark/memusage.cc
@@ -28,7 +28,7 @@ std::size_t memusage()
     struct rusage ru;
 
     if (getrusage(RUSAGE_SELF, &ru))
-        return SUCCESS;
+        return 0;
 
     return ru.ru_maxrss;
 }
diff --git a/benchmark/utils.cc b/benchmark/utils.cc
index 37fb5bb..778cb61 100644
--- a/benchmark/utils.cc
+++ b/benchmark/utils.cc
@@ -101,7 +101,7 @@ int collective_print(VirtualCluster* cl, char* str, int len)
 double lat_to_bw(double time, const struct linktest_args* args)
 {
     if (unlikely(time < 1e-16))
-        return SUCCESS;
+        return 0.0; // report zero instead of dividing by a (near-)zero time
     else
         return (args->len_msg * 1.0)/(time * 1024.0 * 1024.0);
 }
diff --git a/benchmark/vcluster.cc b/benchmark/vcluster.cc
index 90d40b3..214bd9e 100644
--- a/benchmark/vcluster.cc
+++ b/benchmark/vcluster.cc
@@ -218,7 +218,7 @@ int VirtualCluster::kUniDirLimitedMultiBuf(const int from,const int to,
 
 uint64_t VirtualCluster::kpingpong_minimal_buffer_overhead() const
 {
-    return SUCCESS;
+    return 0;
 }
 
 int VirtualCluster::kalltoall(MemoryBuffer& buf, int len_msg, int num_msg, double *timing)
diff --git a/benchmark/vcluster_ibverbs.cc b/benchmark/vcluster_ibverbs.cc
index 6efd5b9..5544cc1 100644
--- a/benchmark/vcluster_ibverbs.cc
+++ b/benchmark/vcluster_ibverbs.cc
@@ -32,6 +32,7 @@
 #include <chrono>
 #include <unistd.h>
 #include <malloc.h>
+#include <utility>
 
 #undef  IBVERBS_ENVIRON_PREFIX
 #define IBVERBS_ENVIRON_PREFIX LINKTEST_ENVIRON_PREFIX "IBVERBS_"
@@ -494,7 +495,7 @@ constexpr int maybe_send_inline(const linktest::ibverbs::MemoryRegion* buf)
     return (AddressSpace::ID::Local == buf->address_space_id()) and
            (buf->len() <= LINKTEST_IBVERBS_MAX_INLINE_SZ_) ? IBV_SEND_INLINE : 0;
 #else
-    return SUCCESS;
+    return 0; // no send flags; same "not inline" value as the branch above
 #endif
 }
 
@@ -673,7 +674,7 @@ uint64_t VirtualClusterImpl::kpingpong_minimal_buffer_overhead() const
 #if 1 == IBVERBS_USE_POLL_ON_TAIL
     return sizeof(uint64_t);
 #else
-    return SUCCESS;
+    return 0;
 #endif
 }
 
diff --git a/test/Default.xml b/test/Default.xml
index 734aaad..670e41e 100644
--- a/test/Default.xml
+++ b/test/Default.xml
@@ -2,6 +2,13 @@
 <jube>
 <parameterset name="System">
     <parameter name="System_Name" mode="shell">cat /etc/FZJ/systemname | tr -d '\n'</parameter>
+    <parameter name="NumpyCoreIncludePath" mode="python">
+    {
+        "juwels":   "/p/software/juwels/stages/2024/software/SciPy-bundle/2023.07-gcccoreflexiblas-12.3.0-3.3.1/lib/python3.11/site-packages/numpy/core/include/",
+        "jurecadc": "/p/software/jurecadc/stages/2024/software/SciPy-bundle/2023.07-gcccoreflexiblas-12.3.0-3.3.1/lib/python3.11/site-packages/numpy/core/include/",
+        "deep":     "/p/software/deep/stages/2023/software/SciPy-bundle/2022.05-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages/numpy/core/include/"
+    }["${System_Name}"]
+    </parameter>
 </parameterset>
 <parameterset name="Linktest_Args">
     <parameter name="Messaging_Layer">mpi</parameter> <!-- Options: mpi,tcp,ibverbs,psm2,cuda,ucp -->
@@ -29,7 +36,7 @@
     <parameter name="DefaultCompiler">GCC</parameter>
     <parameter name="Compiler" tag="!noCompileRunTest">GCC,Intel,NVHPC</parameter>
     <parameter name="Compiler" tag="noCompileRunTest">${DefaultCompiler}</parameter>
-    <parameter name="DefaultMPI">ParaStationMPI</parameter>
+    <parameter name="DefaultMPI">OpenMPI</parameter>
     <parameter name="MPI" mode="python" tag="!noCompileRunTest">
         "ParaStationMPI" if "${System_Name}" == "deep" else {
         "GCC":   "ParaStationMPI,OpenMPI",
@@ -59,6 +66,7 @@
         } [ "${MPI}" ]
     </parameter>
     <parameter name="Load_Modules">
+        module load Stages/2023
         module load ${Compiler}
         module load ${MPI}
         module load ${CUDA}
diff --git a/test/LinktestMain.xml b/test/LinktestMain.xml
index cf67976..96bf4c2 100644
--- a/test/LinktestMain.xml
+++ b/test/LinktestMain.xml
@@ -13,8 +13,6 @@
 			</parameter>
 		</parameterset>
 
-		
-
 		<fileset name="Sources">
 			<copy>../benchmark</copy>
 		</fileset>
@@ -95,12 +93,12 @@
 		</step>
 
 		<step name="CompileLinktestReport" active="'$Stack' == '$Default_Stack'" tag="!noLinktestReportTest">
-			<use from="Default.xml">Environment</use>
+			<use from="Default.xml">System,Environment</use>
 			<use>JUBE_Extra,ReportSources</use>
 			<do done_file="ready" error_file="error">
 				set -x
 				$Load_Modules
-				export CPATH=/p/software/juwels/stages/2022/software/SciPy-bundle/2021.10-gcccoremkl-11.2.0-2021.4.0/lib/python3.9/site-packages/numpy/core/include:$CPATH
+				export CPATH=$NumpyCoreIncludePath:$$CPATH
 				python3 -m venv venvLinktest
 				source venvLinktest/bin/activate
 				pip install ./python
@@ -150,38 +148,31 @@
 
 		<!-- Analyse -->
 		<analyser name="analyseCompiles">
-			<analyse step="Compile">
-				<file use="genericPatterns">error</file>
-				<file use="genericPatterns">ready</file>
+			<use>genericPatterns</use>
+			<analyse step="Compile" tag="!(noLayerTest+noModeTest+noCompileTest)">
+				<file>error</file>
+			</analyse>
+			<analyse step="CompileLinktestReport" tag="!noLinktestReportTest">
+				<file>error</file>
 			</analyse>
 		</analyser>
 
 		<analyser name="analyseRuns">
-			<analyse step="Compile">
-				<file use="genericPatterns">error</file>
-				<file use="genericPatterns">ready</file>
-			</analyse>
 			<analyse step="CompileRunTest" tag="!noCompileRunTest">
 				<file use="LinktestOutPatterns">linktest.log</file>
 				<file use="LinktestErrPatterns">linktest.error</file>
 				<file use="genericPatterns">error</file>
-				<file use="genericPatterns">ready</file>
 			</analyse>
 			<analyse step="LayerTest" tag="!noLayerTest">
 				<file use="LinktestOutPatterns">linktest.log</file>
 				<file use="LinktestErrPatterns">linktest.error</file>
 				<file use="genericPatterns">error</file>
-				<file use="genericPatterns">ready</file>
 			</analyse>
 			<analyse step="ModeTest" tag="!noModeTest">
 				<file use="LinktestOutPatterns">linktest.log</file>
 				<file use="LinktestErrPatterns">linktest.error</file>
 				<file use="genericPatterns">error</file>
-				<file use="genericPatterns">ready</file>
 			</analyse>
-		</analyser>
-
-		<analyser name="analyseReports">
 			<analyse step="LinktestReportTest" tag="!(noLinktestReportTest|noModeTest)">
 				<file use="genericPatterns">error</file>
 			</analyse>
@@ -205,12 +196,11 @@
 			</table>
 		</result>
 		<result>
-			<use>analyseRuns,analyseReports</use>
+			<use>analyseRuns</use>
 			<table name="RunErrors" style="pretty" sort="jube_step_name,Compiler,MPI,Transport_Layer_Settings,WithCUDATxt,Messaging_Layer,SRUN_Arguments,Options">
 				<column title="Test">jube_step_name</column>
 				<column title="Compiler">Compiler</column>
 				<column title="MPI">MPI</column>
-				<column title="MPI Settings">Transport_Layer_Settings</column>
 				<column title="CUDA">WithCUDATxt</column>
 				<column title="Layer">Messaging_Layer</column>
 				<column title="Srun Args">SRUN_Arguments</column>
@@ -224,7 +214,6 @@
 				<column title="Test">jube_step_name</column>
 				<column title="Compiler">Compiler</column>
 				<column title="MPI">MPI</column>
-				<column title="MPI Settings">Transport_Layer_Settings</column>
 				<column title="CUDA">WithCUDATxt</column>
 				<column title="Errors">all</column>
 			</table>
diff --git a/test/execute_base.sbatch b/test/execute_base.sbatch
index 21cfb76..c385780 100644
--- a/test/execute_base.sbatch
+++ b/test/execute_base.sbatch
@@ -61,7 +61,11 @@ srun --ntasks=${SLURM_NTASKS} \
 
 # Indicate Success to jube
 if [ $? -ne 0 ]; then
-	echo "LinkTest run failed" >> error;
+	if [ "$(grep -cF 'timings[000] [all]' linktest.error 2>/dev/null)" = "1" ]; then # srun failed, but the timing line was written exactly once
+		echo "Cleanup failed" >> error;
+	else
+		echo "Run failed" >> error;
+	fi
 else
 	touch ready;
 fi
-- 
GitLab