From b4e19d9bf862bc4696b62a6a0e77d1e0270ac704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yannik=20M=C3=BCller?= <y.mueller@fz-juelich.de> Date: Wed, 21 Feb 2024 16:29:30 +0100 Subject: [PATCH] Adding CI/CD pipeline together with recent bugfixes --- .gitlab-ci.yml | 25 ++++++++++++++++++++ benchmark/benchmark.cc | 44 +++++++++++++++++------------------ benchmark/gpu_nvidia.cc | 11 +-------- benchmark/memusage.cc | 2 +- benchmark/utils.cc | 2 +- benchmark/vcluster.cc | 2 +- benchmark/vcluster_ibverbs.cc | 5 ++-- test/Default.xml | 10 +++++++- test/LinktestMain.xml | 29 +++++++---------------- test/execute_base.sbatch | 6 ++++- 10 files changed, 77 insertions(+), 59 deletions(-) create mode 100644 .gitlab-ci.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..26a6d0a --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,25 @@ +workflow: + rules: + - if: $CI_PIPELINE_SOURCE == 'merge_request_event' + - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == $CI_DEFAULT_BRANCH + +stages: + - test + +juwels-test-job: + stage: test + tags: [jacamar,shell,juwels,login] + script: + - module load JUBE + - jube-autorun test/LinktestMain.xml + - '[ $(jube result test/runs/ | grep -cF "Compile failed") -le 1 ]' # Known Issues: Intel + IntelMPI + - '[ $(jube result test/runs/ | grep -cF "Cleanup failed") -le 4 ]' # Known Issues: GCC/NVHPC + ParaStationMPI + CUDA, Layertest ibverbs/CUDA + +jureca-test-job: + stage: test + tags: [jacamar,shell,jureca,login] + script: + - module load JUBE + - jube-autorun test/LinktestMain.xml + - '[ $(jube result test/runs/ | grep -cF "Compile failed") -le 1 ]' # Known Issues: Intel + IntelMPI + - '[ $(jube result test/runs/ | grep -cF "Cleanup failed") -le 4 ]' # Known Issues: GCC/NVHPC + ParaStationMPI + CUDA, Layertest ibverbs/CUDA \ No newline at end of file diff --git a/benchmark/benchmark.cc b/benchmark/benchmark.cc index 444c346..88ae35f 100644 --- a/benchmark/benchmark.cc +++ b/benchmark/benchmark.cc @@ -710,28 +710,28 @@ int Benchmark::handle_slow_pairs(const int iter){ const int n = args->max_stest; if(n){ //It makes no sense to run the serial tests suit if no tests are to be run - std::unique_ptr<StopwatchI> rootWatch = Stopwatchfactory::getRootWatch(rank()); - - rootWatch->start(); - auto sp = new slow_pair[n]; - auto ret = gather_slow_pairs(sp, n); - if (unlikely(ret)) - goto out; - - rootWatch->stop(); - printTimingIfRoot(rank(), "[search slow]", rootWatch->getDuration()); - - rootWatch->start(); - ret = retest_slow_pairs(sp, n, iter); - if (unlikely(ret)) goto out; - rootWatch->stop(); - printTimingIfRoot(rank(), "[test slow]", rootWatch->getDuration()); - - delete[] sp; - return SUCCESS; -out: - delete[] sp; - return ERROR; + std::unique_ptr<StopwatchI> rootWatch = Stopwatchfactory::getRootWatch(rank()); + + rootWatch->start(); + auto sp = new slow_pair[n]; + auto ret = gather_slow_pairs(sp, n); + if (unlikely(ret)) + goto out; + + rootWatch->stop(); + printTimingIfRoot(rank(), "[search slow]", rootWatch->getDuration()); + + rootWatch->start(); + ret = retest_slow_pairs(sp, n, iter); + if (unlikely(ret)) goto out; + rootWatch->stop(); + printTimingIfRoot(rank(), "[test slow]", rootWatch->getDuration()); + + delete[] sp; + return SUCCESS; + out: + delete[] sp; + return ERROR; } return SUCCESS; } diff --git a/benchmark/gpu_nvidia.cc b/benchmark/gpu_nvidia.cc index a82d30e..043df6c 100644 --- a/benchmark/gpu_nvidia.cc +++ b/benchmark/gpu_nvidia.cc @@ -159,15 +159,11 @@ void* GpuContext::memalloc(std::uint64_t len) return nullptr; return reinterpret_cast<void*>(p); - - return nullptr; - } int GpuContext::memfree(void* p) { - - auto err = execute([&p] { + return execute([&p] { auto err = cuMemFree(reinterpret_cast<CUdeviceptr>(p)); if (unlikely(CUDA_SUCCESS != err)) { report_cuda_error("cuMemFree", err); @@ -176,11 +172,6 @@ int GpuContext::memfree(void* p) return SUCCESS; }); - - return err; - - return -1; - } GpuContext* GpuContext::singleton() diff --git a/benchmark/memusage.cc b/benchmark/memusage.cc index 9d2898b..29e5edf 100644 --- a/benchmark/memusage.cc +++ b/benchmark/memusage.cc @@ -28,7 +28,7 @@ std::size_t memusage() struct rusage ru; if (getrusage(RUSAGE_SELF, &ru)) - return SUCCESS; + return 0; return ru.ru_maxrss; } diff --git a/benchmark/utils.cc b/benchmark/utils.cc index 37fb5bb..778cb61 100644 --- a/benchmark/utils.cc +++ b/benchmark/utils.cc @@ -101,7 +101,7 @@ int collective_print(VirtualCluster* cl, char* str, int len) double lat_to_bw(double time, const struct linktest_args* args) { if (unlikely(time < 1e-16)) - return SUCCESS; + return 0.0l; else return (args->len_msg * 1.0)/(time * 1024.0 * 1024.0); } diff --git a/benchmark/vcluster.cc b/benchmark/vcluster.cc index 90d40b3..214bd9e 100644 --- a/benchmark/vcluster.cc +++ b/benchmark/vcluster.cc @@ -218,7 +218,7 @@ int VirtualCluster::kUniDirLimitedMultiBuf(const int from,const int to, uint64_t VirtualCluster::kpingpong_minimal_buffer_overhead() const { - return SUCCESS; + return 0; } int VirtualCluster::kalltoall(MemoryBuffer& buf, int len_msg, int num_msg, double *timing) diff --git a/benchmark/vcluster_ibverbs.cc b/benchmark/vcluster_ibverbs.cc index 6efd5b9..5544cc1 100644 --- a/benchmark/vcluster_ibverbs.cc +++ b/benchmark/vcluster_ibverbs.cc @@ -32,6 +32,7 @@ #include <chrono> #include <unistd.h> #include <malloc.h> +#include <utility> #undef IBVERBS_ENVIRON_PREFIX #define IBVERBS_ENVIRON_PREFIX LINKTEST_ENVIRON_PREFIX "IBVERBS_" @@ -494,7 +495,7 @@ constexpr int maybe_send_inline(const linktest::ibverbs::MemoryRegion* buf) return (AddressSpace::ID::Local == buf->address_space_id()) and (buf->len() <= LINKTEST_IBVERBS_MAX_INLINE_SZ_) ? IBV_SEND_INLINE : 0; #else - return SUCCESS; + return SUCCESS; //TODO Check if return 0 or return SUCCESS is meant here #endif } @@ -673,7 +674,7 @@ uint64_t VirtualClusterImpl::kpingpong_minimal_buffer_overhead() const #if 1 == IBVERBS_USE_POLL_ON_TAIL return sizeof(uint64_t); #else - return SUCCESS; + return 0; #endif } diff --git a/test/Default.xml b/test/Default.xml index 734aaad..670e41e 100644 --- a/test/Default.xml +++ b/test/Default.xml @@ -2,6 +2,13 @@ <jube> <parameterset name="System"> <parameter name="System_Name" mode="shell">cat /etc/FZJ/systemname | tr -d '\n'</parameter> + <parameter name="NumpyCoreIncludePath" mode="python"> + { + "juwels": "/p/software/juwels/stages/2024/software/SciPy-bundle/2023.07-gcccoreflexiblas-12.3.0-3.3.1/lib/python3.11/site-packages/numpy/core/include/", + "jurecadc": "/p/software/jurecadc/stages/2024/software/SciPy-bundle/2023.07-gcccoreflexiblas-12.3.0-3.3.1/lib/python3.11/site-packages/numpy/core/include/", + "deep": "/p/software/deep/stages/2023/software/SciPy-bundle/2022.05-gcccoremkl-11.3.0-2022.1.0/lib/python3.10/site-packages/numpy/core/include/" + }["${System_Name}"] + </parameter> </parameterset> <parameterset name="Linktest_Args"> <parameter name="Messaging_Layer">mpi</parameter> <!-- Options: mpi,tcp,ibverbs,psm2,cuda,ucp --> @@ -29,7 +36,7 @@ <parameter name="DefaultCompiler">GCC</parameter> <parameter name="Compiler" tag="!noCompileRunTest">GCC,Intel,NVHPC</parameter> <parameter name="Compiler" tag="noCompileRunTest">${DefaultCompiler}</parameter> - <parameter name="DefaultMPI">ParaStationMPI</parameter> + <parameter name="DefaultMPI">OpenMPI</parameter> <parameter name="MPI" mode="python" tag="!noCompileRunTest"> "ParaStationMPI" if "${System_Name}" == "deep" else { "GCC": "ParaStationMPI,OpenMPI", @@ -59,6 +66,7 @@ } [ "${MPI}" ] </parameter> <parameter name="Load_Modules"> + module load Stages/2023 module load ${Compiler} module load ${MPI} module load ${CUDA} diff --git a/test/LinktestMain.xml b/test/LinktestMain.xml index cf67976..96bf4c2 100644 --- a/test/LinktestMain.xml +++ b/test/LinktestMain.xml @@ -13,8 +13,6 @@ </parameter> </parameterset> - - <fileset name="Sources"> <copy>../benchmark</copy> </fileset> @@ -95,12 +93,12 @@ </step> <step name="CompileLinktestReport" active="'$Stack' == '$Default_Stack'" tag="!noLinktestReportTest"> - <use from="Default.xml">Environment</use> + <use from="Default.xml">System,Environment</use> <use>JUBE_Extra,ReportSources</use> <do done_file="ready" error_file="error"> set -x $Load_Modules - export CPATH=/p/software/juwels/stages/2022/software/SciPy-bundle/2021.10-gcccoremkl-11.2.0-2021.4.0/lib/python3.9/site-packages/numpy/core/include:$CPATH + export CPATH=$NumpyCoreIncludePath:$$CPATH python3 -m venv venvLinktest source venvLinktest/bin/activate pip install ./python @@ -150,38 +148,31 @@ <!-- Analyse --> <analyser name="analyseCompiles"> - <analyse step="Compile"> - <file use="genericPatterns">error</file> - <file use="genericPatterns">ready</file> + <use>genericPatterns</use> + <analyse step="Compile" tag="!(noLayerTest+noModeTest+noCompileTest)"> + <file>error</file> + </analyse> + <analyse step="CompileLinktestReport" tag="!noLinktestReportTest"> + <file>error</file> </analyse> </analyser> <analyser name="analyseRuns"> - <analyse step="Compile"> - <file use="genericPatterns">error</file> - <file use="genericPatterns">ready</file> - </analyse> <analyse step="CompileRunTest" tag="!noCompileRunTest"> <file use="LinktestOutPatterns">linktest.log</file> <file use="LinktestErrPatterns">linktest.error</file> <file use="genericPatterns">error</file> - <file use="genericPatterns">ready</file> </analyse> <analyse step="LayerTest" tag="!noLayerTest"> <file use="LinktestOutPatterns">linktest.log</file> <file use="LinktestErrPatterns">linktest.error</file> <file use="genericPatterns">error</file> - <file use="genericPatterns">ready</file> </analyse> <analyse step="ModeTest" tag="!noModeTest"> <file use="LinktestOutPatterns">linktest.log</file> <file use="LinktestErrPatterns">linktest.error</file> <file use="genericPatterns">error</file> - <file use="genericPatterns">ready</file> </analyse> - </analyser> - - <analyser name="analyseReports"> <analyse step="LinktestReportTest" tag="!(noLinktestReportTest|noModeTest)"> <file use="genericPatterns">error</file> </analyse> @@ -205,12 +196,11 @@ </table> </result> <result> - <use>analyseRuns,analyseReports</use> + <use>analyseRuns</use> <table name="RunErrors" style="pretty" sort="jube_step_name,Compiler,MPI,Transport_Layer_Settings,WithCUDATxt,Messaging_Layer,SRUN_Arguments,Options"> <column title="Test">jube_step_name</column> <column title="Compiler">Compiler</column> <column title="MPI">MPI</column> - <column title="MPI Settings">Transport_Layer_Settings</column> <column title="CUDA">WithCUDATxt</column> <column title="Layer">Messaging_Layer</column> <column title="Srun Args">SRUN_Arguments</column> @@ -224,7 +214,6 @@ <column title="Test">jube_step_name</column> <column title="Compiler">Compiler</column> <column title="MPI">MPI</column> - <column title="MPI Settings">Transport_Layer_Settings</column> <column title="CUDA">WithCUDATxt</column> <column title="Errors">all</column> </table> diff --git a/test/execute_base.sbatch b/test/execute_base.sbatch index 21cfb76..c385780 100644 --- a/test/execute_base.sbatch +++ b/test/execute_base.sbatch @@ -61,7 +61,11 @@ srun --ntasks=${SLURM_NTASKS} \ # Indicate Success to jube if [ $? -ne 0 ]; then - echo "LinkTest run failed" >> error; + if [ $(cat linktest.error | grep -v grep | grep -cF "timings[000] [all]") -eq 1 ]; then + echo "Cleanup failed" >> error; + else + echo "Run failed" >> error; + fi else touch ready; fi -- GitLab