From 6b7b03a803a3684e3c938350b817ae7890f13051 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yannik=20M=C3=BCller?= <y.mueller@fz-juelich.de>
Date: Wed, 16 Nov 2022 15:13:38 +0100
Subject: [PATCH] Added jube tests

---
 test/.gitignore          |   1 +
 test/CompileRunTest.xml  |   6 ++
 test/Default.xml         | 122 +++++++++++++++++++++++
 test/LayerTest.xml       |  11 +++
 test/LinktestMain.xml    | 207 +++++++++++++++++++++++++++++++++++++++
 test/ModeTest.xml        |  15 +++
 test/README.md           |  23 +++++
 test/execute_base.sbatch |  69 +++++++++++++
 8 files changed, 454 insertions(+)
 create mode 100644 test/.gitignore
 create mode 100644 test/CompileRunTest.xml
 create mode 100644 test/Default.xml
 create mode 100644 test/LayerTest.xml
 create mode 100644 test/LinktestMain.xml
 create mode 100644 test/ModeTest.xml
 create mode 100644 test/README.md
 create mode 100644 test/execute_base.sbatch

diff --git a/test/.gitignore b/test/.gitignore
new file mode 100644
index 0000000..643cb18
--- /dev/null
+++ b/test/.gitignore
@@ -0,0 +1 @@
+runs/
\ No newline at end of file
diff --git a/test/CompileRunTest.xml b/test/CompileRunTest.xml
new file mode 100644
index 0000000..b34d5ff
--- /dev/null
+++ b/test/CompileRunTest.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<jube>
+<parameterset name="Linktest_Args" init_with="Default.xml">
+    <parameter name="Use_Gpu_Memory">0,1</parameter>
+</parameterset>
+</jube>
\ No newline at end of file
diff --git a/test/Default.xml b/test/Default.xml
new file mode 100644
index 0000000..c0abbfe
--- /dev/null
+++ b/test/Default.xml
@@ -0,0 +1,122 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<jube>
+<parameterset name="System">
+    <parameter name="System_Name" mode="shell">cat /etc/FZJ/systemname | tr -d '\n'</parameter>
+</parameterset>
+<parameterset name="Linktest_Args">
+    <parameter name="Messaging_Layer">mpi</parameter> <!-- Options: mpi,tcp,ibverbs,psm2,cuda,ucp -->
+    <parameter name="Number_Of_WarmUp_Messages">10</parameter>
+    <parameter name="Number_Of_Messages">1000</parameter>
+    <parameter name="Message_Size">1024</parameter> <!-- 2^10 -->
+    <parameter name="Number_Of_Slowest">2</parameter> <!-- -1 (default)-->
+    <parameter name="Collect_P_Num">1</parameter> <!-- -1 (default)-->
+    <parameter name="All_To_All">0</parameter> <!--0 (default),1-->
+    <parameter name="Bidirectional">0</parameter> <!--0 (default),1-->
+    <parameter name="Unidirectional">0</parameter> <!--0 (default),1-->
+    <parameter name="Bisection">0</parameter> <!--0 (default),1-->
+    <parameter name="Serial">0</parameter> <!--0 (default),1-->
+    <parameter name="Mix">0</parameter> <!--0 (default),1-->
+    <parameter name="Serial_Tests">0</parameter> <!--0 (default),1-->
+    <parameter name="No_Sion_File">1</parameter> <!--0,1 (default)-->
+    <parameter name="Parallel_IO">0</parameter> <!--0 (default),1-->
+    <parameter name="Use_Gpu_Memory">0</parameter> <!--0 (default),1-->
+    <parameter name="Number_Of_Randomized_Tasks_Iterations">0</parameter> <!--0 (default),1-->
+    <parameter name="Hostname_Grouping">0</parameter> <!--0 (default),1; name must match ${Hostname_Grouping} substituted in LinktestMain.xml-->
+    <parameter name="Additional_Arguments"></parameter>
+    <parameter name="WithGPUs">("${Messaging_Layer}" == "cuda" or ${Use_Gpu_Memory} == 1)</parameter>
+</parameterset>
+<parameterset name="Environment">
+    <parameter name="DefaultCompiler">GCC</parameter>
+    <parameter name="Compiler" tag="!noCompileRunTest">GCC,Intel,NVHPC</parameter>
+    <parameter name="Compiler" tag="noCompileRunTest">${DefaultCompiler}</parameter>
+    <parameter name="DefaultMPI">OpenMPI</parameter>
+    <parameter name="MPI" mode="python" tag="!noCompileRunTest">
+        {
+            "GCC": "ParaStationMPI,OpenMPI",
+            "Intel": "ParaStationMPI,OpenMPI,IntelMPI",
+            "NVHPC": "ParaStationMPI,OpenMPI"
+        }[ "${Compiler}" ]
+    </parameter>
+    <parameter name="MPI" tag="noCompileRunTest">${DefaultMPI}</parameter>
+    <parameter name="CUDA" mode="python">
+        {
+            "GCC": "CUDA,",
+            "Intel": "",
+            "NVHPC": "CUDA"
+        }[ "${Compiler}" ]
+    </parameter>
+    <parameter name="WithCUDA">("${CUDA}" == "CUDA")</parameter>
+    <parameter name="Stack">$Compiler $MPI</parameter>
+    <parameter name="Default_Stack">$DefaultCompiler $DefaultMPI</parameter>
+    <parameter name="Unload_CUDA" mode="python">"CUDA" if "${Compiler} ${MPI} ${CUDA} " == "Intel IntelMPI " else ""</parameter>
+    <parameter name="Transport_Layer_Settings" mode="python">
+        "" if not ${WithCUDA} else {
+            "ParaStationMPI": "mpi-settings/CUDA",
+            "OpenMPI": "UCX-settings/RC-CUDA",
+            "IntelMPI": ""
+        } [ "${MPI}" ]
+    </parameter>
+    <parameter name="Load_Modules">
+        module load ${Compiler}
+        module load ${MPI}
+        module load ${CUDA}
+        module load ${Transport_Layer_Settings}
+        module load SIONlib
+        module unload ${Unload_CUDA}
+        module load SciPy-Stack
+        module list
+    </parameter>
+</parameterset>
+<parameterset name="Slurm"> <!-- depends on Linktest_Args, System and Environment parameters -->
+    <parameter name="Account">cstao</parameter>
+    <parameter name="Partition" mode="python">
+        {
+            "juwels": {
+                False: "devel",
+                True : "develgpus"
+            },
+            "jurecadc": {
+                False: "dc-cpu-devel",
+                True : "dc-gpu-devel"
+            }
+        }["${System_Name}"][ ${WithGPUs} ]
+    </parameter>
+    <parameter name="Max_WallClock_Time">00:01:00</parameter>
+    <parameter name="Number_Of_Nodes" mode="python">1 if "${Messaging_Layer}" == "cuda" else 2</parameter>
+    <parameter name="Number_Of_Tasks_Per_Node">4</parameter>
+    <parameter name="Number_Of_Cores_Per_Task">1</parameter>
+    <parameter name="Gres" mode="python">"#SBATCH --gres=gpu:4" if ${WithGPUs} else ""</parameter>
+    <parameter name="SRUN_Arguments" mode="python">
+        "" if "${Messaging_Layer}" == "mpi" else {
+            "ParaStationMPI": "--mpi=pspmi",
+            "OpenMPI": "--mpi=pmi2",
+            "IntelMPI": ""
+        }["${MPI}"]
+    </parameter>
+</parameterset>
+<parameterset name="Build"> <!-- depends on System and Environment parameters -->
+    <parameter name="CuArch" mode="python">
+        {
+            "juwels": "sm_70",
+            "jurecadc": "sm_80"
+        }[ "${System_Name}" ]
+    </parameter>
+    <parameter name="Enable_Layer" mode="python">
+        {
+            "CUDA": "HAVE_MPI=1 HAVE_TCP=1 HAVE_IBVERBS=1 HAVE_UCP=1 HAVE_CUDA=1",
+            "": "HAVE_MPI=1 HAVE_TCP=1 HAVE_IBVERBS=1 HAVE_UCP=1 HAVE_CUDA=0"
+        }[ "${CUDA}" ]
+    </parameter>
+    <parameter name="DefineCuArch" mode="python">
+        {
+            "CUDA": "CUARCH=${CuArch}",
+            "": ""
+        }[ "${CUDA}" ]
+    </parameter>
+    <parameter name="Make">make -j ${Enable_Layer} ${DefineCuArch}</parameter>
+</parameterset>
+<parameterset name="Misc"> <!-- depends on Linktest_Args parameters -->
+    <parameter name="Report_Name">linktest_${Messaging_Layer}_${Number_Of_Nodes}nx${Number_Of_Tasks_Per_Node}c</parameter>
+    <parameter name="Linktest_Bin">Compile/benchmark/linktest</parameter>
+</parameterset>
+</jube>
diff --git a/test/LayerTest.xml b/test/LayerTest.xml
new file mode 100644
index 0000000..5ca1410
--- /dev/null
+++ b/test/LayerTest.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<jube>
+<parameterset name="Linktest_Args" init_with="Default.xml">
+    <parameter name="Messaging_Layer" mode="python">
+        {
+            "juwels": "ibverbs,ucp,tcp,cuda",
+            "jurecadc": "ibverbs,ucp,tcp,cuda" <!-- TODO add psm2 which is available only on jureca booster which shares login node -->
+        }[ "${System_Name}" ]
+    </parameter> <!-- Options: mpi,ibverbs,psm2,cuda,ucp,tcp -->
+</parameterset>
+</jube>
\ No newline at end of file
diff --git a/test/LinktestMain.xml b/test/LinktestMain.xml
new file mode 100644
index 0000000..a55d84c
--- /dev/null
+++ b/test/LinktestMain.xml
@@ -0,0 +1,207 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<jube>
+    <benchmark name="JSC Linktest Test Suite" outpath="runs">
+        <comment>Testing compilation and common usages of JSC Linktest</comment>
+
+        <fileset name="Sources">
+            <copy>../benchmark</copy>
+        </fileset>
+        <fileset name="AnalysisSources">
+            <copy>../analysis</copy>
+        </fileset>
+        <fileset name="ReportSources">
+            <copy>../python</copy>
+        </fileset>
+        <fileset name="ExecutionScript">
+            <copy>execute_base.sbatch</copy>
+        </fileset>
+
+        <substituteset name="SubstituteInputParameters">
+            <iofile in="execute_base.sbatch" out="execute.sbatch" />
+            <sub source="§SLURM_ACCOUNT§" dest="${Account}" />
+            <sub source="§SLURM_PARTITION§" dest="${Partition}" />
+            <sub source="§SLURM_MAX_WALLCLOCK_TIME§" dest="${Max_WallClock_Time}" />
+            <sub source="§NUM_NODES§" dest="${Number_Of_Nodes}" />
+            <sub source="§NUM_TASKS_PER_NODE§" dest="${Number_Of_Tasks_Per_Node}" />
+            <sub source="§CORES_PER_TASK§" dest="${Number_Of_Cores_Per_Task}" />
+            <sub source="§MESSAGING_LAYER§" dest="${Messaging_Layer}" />
+            <sub source="§NUM_WARM-UP_MESSAGES§" dest="${Number_Of_WarmUp_Messages}" />
+            <sub source="§NUM_MESSAGES§" dest="${Number_Of_Messages}" />
+            <sub source="§MESSAGE_SIZE§" dest="${Message_Size}" />
+            <sub source="§NUM_RANDOMIZE_TASKS§" dest="${Number_Of_Randomized_Tasks_Iterations}" />
+            <sub source="§NUM_SLOWEST§" dest="${Number_Of_Slowest}" />
+            <sub source="§COLLECTPNUM§" dest="${Collect_P_Num}" />
+            <sub source="§ALL_TO_ALL§" dest="${All_To_All}" />
+            <sub source="§BIDIR§" dest="${Bidirectional}" />
+            <sub source="§UNIDIR§" dest="${Unidirectional}" />
+            <sub source="§BISECT§" dest="${Bisection}" />
+            <sub source="§MIX§" dest="${Mix}" />
+            <sub source="§SERIAL_TESTS§" dest="${Serial_Tests}" />
+            <sub source="§NO_SION_FILE§" dest="${No_Sion_File}" />
+            <sub source="§PARALLEL_IO§" dest="${Parallel_IO}" />
+            <sub source="§USE_GPU_MEMORY§" dest="${Use_Gpu_Memory}" />
+            <sub source="§HOSTNAME_GROUPING§" dest="${Hostname_Grouping}" />
+            <sub source="§ADDITIONAL_ARGUMENTS§" dest="${Additional_Arguments}" />
+            <sub source="§REPORT_NAME§" dest="${Report_Name}" />
+            <sub source="§EXEC_BIN§" dest="${Linktest_Bin}" />
+            <sub source="§GRES§" dest="${Gres}" />
+            <sub source="§LOAD_MODULES§" dest="${Load_Modules}" />
+            <sub source="§SRUN_ARGS§" dest="${SRUN_Arguments}" />
+        </substituteset>
+
+        <step name="Compile" procs="9" tag="!(noLayerTest+noModeTest+noCompileRunTest)" suffix="${Stack}"> <!-- skipped only when all three run tests that depend on it are disabled -->
+            <use>Sources</use>
+            <use from="Default.xml">System, Environment, Build</use>
+            <do done_file="ready" error_file="error" tag="!dryRun">
+                set -x
+                $Load_Modules
+                cd benchmark
+                $Make
+                if [ $? -eq 0 ]; then
+                    touch ../ready;
+                else
+                    echo "Linktest compile failed" >> ../error;
+                fi
+                set +x
+            </do>
+            <do done_file="ready" error_file="error" tag="dryRun">
+                echo "Assume succesful compile"
+                touch ready
+            </do>
+        </step>
+
+        <step name="LayerTest" depend="Compile" active="'$Stack' == '$Default_Stack' and ${WithCUDA} == ${WithGPUs}" suffix="${Messaging_Layer}" tag="!noLayerTest">
+            <use from="LayerTest.xml">Linktest_Args</use>
+            <use from="Default.xml">System, Environment, Slurm, Misc</use>
+            <use>ExecutionScript</use>
+            <use>SubstituteInputParameters</use>
+            <do done_file="ready" error_file="error" tag="!dryRun">sbatch execute.sbatch</do>
+        </step>
+
+        <step name="ModeTest" depend="Compile" active="'$Stack' == '$Default_Stack' and ${WithCUDA} == ${WithGPUs}" suffix="${Mode}" tag="!noModeTest">
+            <use from="ModeTest.xml">Linktest_Args</use>
+            <use from="Default.xml">System, Environment, Slurm, Misc</use>
+            <use>ExecutionScript</use>
+            <use>SubstituteInputParameters</use>
+            <do done_file="ready" error_file="error" tag="!dryRun">sbatch execute.sbatch</do>
+        </step>
+
+        <step name="CompileLinktestReport" active="'$Stack' == '$Default_Stack'" tag="!noLinktestReportTest">
+            <use from="Default.xml">Environment</use>
+            <use>ReportSources</use>
+            <do done_file="ready" error_file="error">
+                set -x
+                $Load_Modules
+                export CPATH=/p/software/juwels/stages/2022/software/SciPy-bundle/2021.10-gcccoremkl-11.2.0-2021.4.0/lib/python3.9/site-packages/numpy/core/include:$CPATH
+                python3 -m venv venvLinktest
+                source venvLinktest/bin/activate
+                pip install ./python
+                if [ $? -eq 0 ]; then
+                    touch ready;
+                else
+                    echo "linktest-report compile failed" >> error;
+                fi
+                deactivate
+                set +x
+            </do>
+        </step>
+
+        <step name="CompileRunTest" procs="9" depend="Compile" active="${WithCUDA} == ${WithGPUs}" suffix="${Stack}_${CUDA}" tag="!noCompileRunTest">
+            <use from="CompileRunTest.xml">Linktest_Args</use>
+            <use from="Default.xml">System, Environment, Slurm, Misc</use>
+            <use>ExecutionScript</use>
+            <use>SubstituteInputParameters</use>
+            <do done_file="ready" error_file="error" tag="!dryRun">sbatch execute.sbatch</do>
+        </step>
+
+        <step name="LinktestReportTest" procs="7" depend="ModeTest,CompileLinktestReport" active="$No_Sion_File == 0" suffix="${Mode}" tag="!(noLinktestReportTest|noModeTest)">
+            <do done_file="ready" error_file="error" tag="!dryRun">
+                set -x
+                $Load_Modules
+                source CompileLinktestReport/venvLinktest/bin/activate
+                linktest-report -i ModeTest/${Report_Name}.sion -o report.pdf
+                if [ $? -eq 0 ]; then
+                    touch ready;
+                else
+                    echo "python-report run failed" >> error;
+                fi
+                deactivate
+                set +x
+            </do>
+        </step>
+
+        <patternset name="LinktestOutPatterns">
+            <pattern name="min_time">RESULT: Min Time:\s+(${jube_pat_nfp}\s[n|u|m| |k]s) </pattern>
+            <pattern name="avg_time">RESULT: Avg Time:\s+(${jube_pat_nfp}\s[n|u|m| |k]s) </pattern>
+            <pattern name="max_time">RESULT: Max Time:\s+(${jube_pat_nfp}\s[n|u|m| |k]s) </pattern>
+            <pattern name="max_bw" >RESULT: Min Time:\s+$jube_pat_nfp\s[n|u|m| |k]s \(\s*(${jube_pat_nfp}\s[T|G|M|k| ][i| ]B/s)\)</pattern>
+            <pattern name="avg_bw" >RESULT: Avg Time:\s+$jube_pat_nfp\s[n|u|m| |k]s \(\s*(${jube_pat_nfp}\s[T|G|M|k| ][i| ]B/s)\)</pattern>
+            <pattern name="min_bw" >RESULT: Max Time:\s+$jube_pat_nfp\s[n|u|m| |k]s \(\s*(${jube_pat_nfp}\s[T|G|M|k| ][i| ]B/s)\)</pattern>
+        </patternset>
+
+        <patternset name="LinktestErrPatterns">
+            <pattern name="Options">\+ srun .*?\.sion (.*?)\n</pattern>
+        </patternset>
+
+        <patternset name="errorFilePatterns">
+            <pattern name="error_msg">.*</pattern>
+        </patternset>
+
+        <!-- Analyse -->
+        <analyser name="analyseRuns">
+            <analyse step="LayerTest" tag="!noLayerTest">
+                <file use="LinktestOutPatterns">linktest.log</file>
+                <file use="LinktestErrPatterns">linktest.error</file>
+                <file use="errorFilePatterns">error</file>
+            </analyse>
+            <analyse step="ModeTest" tag="!noModeTest">
+                <file use="LinktestOutPatterns">linktest.log</file>
+                <file use="LinktestErrPatterns">linktest.error</file>
+                <file use="errorFilePatterns">error</file>
+            </analyse>
+            <analyse step="CompileRunTest" tag="!noCompileRunTest">
+                <file use="LinktestOutPatterns">linktest.log</file>
+                <file use="LinktestErrPatterns">linktest.error</file>
+                <file use="errorFilePatterns">error</file>
+            </analyse>
+        </analyser>
+
+        <analyser name="analyseReports">
+            <analyse step="LinktestReportTest" tag="!(noLinktestReportTest|noModeTest)">
+                <file use="errorFilePatterns">error</file>
+            </analyse>
+        </analyser>
+
+        <!-- Results -->
+        <result>
+            <use>analyseRuns</use>
+            <table name="BandwidthResult" style="pretty" sort="jube_step_name" filter="'$jube_step_name' in ['LayerTest','ModeTest','CompileRunTest']">
+                <column title="Test">jube_step_name</column>
+                <column title="Compiler">Compiler</column>
+                <column title="MPI">MPI</column>
+                <column title="Layer">Messaging_Layer</column>
+                <column title="Options">Options</column>
+                <column title="Min Time">min_time</column>
+                <column title="Avg Time">avg_time</column>
+                <column title="Max Time">max_time</column>
+                <column title="Min BW">min_bw</column>
+                <column title="Avg BW">avg_bw</column>
+                <column title="Max BW">max_bw</column>
+            </table>
+        </result>
+        <result>
+            <use>analyseRuns,analyseReports</use>
+            <table name="ErrorResult" style="pretty" sort="jube_step_name">
+                <column title="Test">jube_step_name</column>
+                <column title="Compiler">Compiler</column>
+                <column title="MPI">MPI</column>
+                <column title="Setting">Transport_Layer_Settings</column>
+                <column title="Layer">Messaging_Layer</column>
+                <column title="Srun Args">SRUN_Arguments</column>
+                <column title="Options">Options</column>
+                <column title="Errors">error_msg</column>
+            </table>
+        </result>
+
+    </benchmark>
+</jube>
diff --git a/test/ModeTest.xml b/test/ModeTest.xml
new file mode 100644
index 0000000..98f2f50
--- /dev/null
+++ b/test/ModeTest.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<jube>
+<parameterset name="Linktest_Args" init_with="Default.xml">
+    <parameter name="i" type="int">0,1,2,3,4,5,6,7,8,9,10</parameter>
+    <parameter name="All_To_All" mode="python" type="int">[0,1,0,0,0,0,0,0,0,0,0][$i]</parameter>
+    <parameter name="Bidirectional" mode="python" type="int">[0,0,1,0,0,0,0,0,0,0,0][$i]</parameter>
+    <parameter name="Bisection" mode="python" type="int">[0,0,0,1,0,0,0,0,1,0,0][$i]</parameter>
+    <parameter name="Number_Of_Randomized_Tasks_Iterations" mode="python" type="int">[0,0,0,0,4,0,0,0,0,1,0][$i]</parameter>
+    <parameter name="Unidirectional" mode="python" type="int">[0,0,0,0,0,1,0,0,0,0,0][$i]</parameter>
+    <parameter name="Use_Gpu_Memory" mode="python" type="int">[0,0,0,0,0,0,1,0,0,0,0][$i]</parameter>
+    <parameter name="Hostname_Grouping" mode="python" type="int">[0,0,0,0,0,0,0,1,1,1,0][$i]</parameter>
+    <parameter name="No_Sion_File" mode="python" type="int">[0,0,0,0,0,0,0,0,0,0,1][$i]</parameter>
+    <parameter name="Mode" mode="python">["Semidirektional","All to all","Bidirectional","Bisection","Randomized Semidirektional","Unidirectional","Use GPU Memory","Hostname Grouping","Hostname Bisection","Hostname Proc. Rand.","No SION"][$i]</parameter>
+</parameterset>
+</jube>
diff --git a/test/README.md b/test/README.md
new file mode 100644
index 0000000..df80746
--- /dev/null
+++ b/test/README.md
@@ -0,0 +1,23 @@
+# Linktest - JUBE
+Some simple JUBE tests for testing Linktest.
+
+## Usage
+1. `jube run LinktestMain.xml [--tag Tag...]`
+
+| Tag | Effect |
+|---|---|
+| noCompileRunTest | skip compile tests |
+| noLinktestReportTest | skip python report |
+| noLayerTest | skip layer tests |
+| noModeTest | skip mode tests |
+| dryRun | Only build, no execution |
+
+## Tests
+LinktestMain.xml - Runs the test configurations (see below)
+
+CompileRunTest.xml - Tests various different software stacks (Compilers, MPI)
+
+LayerTest.xml - Tests various different messaging layers (mpi, etc.) -- Not PSM2
+
+ModeTest.xml - Tests various communication setups like all-to-all, Bidirectional, Bisection, ... (MPI Layer)
+
diff --git a/test/execute_base.sbatch b/test/execute_base.sbatch
new file mode 100644
index 0000000..e89c7dd
--- /dev/null
+++ b/test/execute_base.sbatch
@@ -0,0 +1,69 @@
+#!/bin/bash
+#SBATCH --account=§SLURM_ACCOUNT§
+#SBATCH --partition=§SLURM_PARTITION§
+#SBATCH --time=§SLURM_MAX_WALLCLOCK_TIME§
+#SBATCH --output=linktest.log
+#SBATCH --error=linktest.error
+#SBATCH --nodes=§NUM_NODES§
+#SBATCH --ntasks-per-node=§NUM_TASKS_PER_NODE§
+#SBATCH --cpus-per-task=§CORES_PER_TASK§
+§GRES§
+
+§LOAD_MODULES§
+
+args="\
+--mode §MESSAGING_LAYER§ \
+--num-warmup-messages §NUM_WARM-UP_MESSAGES§ \
+--num-messages §NUM_MESSAGES§ \
+--size-messages §MESSAGE_SIZE§ \
+--num-slowest §NUM_SLOWEST§ \
+--output §REPORT_NAME§.sion \
+§ADDITIONAL_ARGUMENTS§"
+
+if [ §ALL_TO_ALL§ -ne 0 ]; then
+    args+=" --all-to-all"
+fi
+if [ §BIDIR§ -ne 0 ]; then
+    args+=" --bidirectional"
+fi
+if [ §UNIDIR§ -ne 0 ]; then
+    args+=" --unidirectional"
+fi
+if [ §BISECT§ -ne 0 ]; then
+    args+=" --bisection"
+fi
+if [ §MIX§ -ne 0 ]; then
+    args+=" --randomize"
+fi
+if [ §SERIAL_TESTS§ -ne 0 ]; then
+    args+=" --serial-tests"
+fi
+if [ §NO_SION_FILE§ -ne 0 ]; then
+    args+=" --no-sion-file"
+fi
+if [ §PARALLEL_IO§ -ne 0 ]; then
+    args+=" --parallel-sion-file"
+fi
+if [ §USE_GPU_MEMORY§ -ne 0 ]; then
+    args+=" --use-gpu-memory"
+fi
+if [ §NUM_RANDOMIZE_TASKS§ -ne 0 ]; then
+    args+=" --num-randomize-tasks §NUM_RANDOMIZE_TASKS§"
+fi
+if [ §HOSTNAME_GROUPING§ -ne 0 ]; then # numeric test: a bare "[ 0 ]" is true because the string is non-empty
+    args+=" --group-processes-by-hostname"
+fi
+set -x # echos commands before executing
+srun --ntasks=${SLURM_NTASKS} \
+    §SRUN_ARGS§ \
+    §EXEC_BIN§ \
+    ${args};
+
+# Indicate Success to jube
+if [ $? -ne 0 ]; then
+    echo "linktest run failed" >> error;
+else
+    touch ready;
+fi
+
+exit 0;
-- 
GitLab