From 7d63e3386c1810c5eeab7019a6802f99d15920ea Mon Sep 17 00:00:00 2001 From: alvarezmallon1 <alvarezmallon1@jpblt-s01-01.jupiter.internal> Date: Thu, 23 Jan 2025 15:49:47 +0100 Subject: [PATCH 1/4] This enables the output during the retest phase to be more informative, so you can see directly which hosts are involved, instead of ranks that give no topological information --- benchmark/benchmark.cc | 9 +++++---- benchmark/vcluster.cc | 18 ++++++++++++++++-- benchmark/vcluster.h | 5 +++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/benchmark/benchmark.cc b/benchmark/benchmark.cc index ddb78ec..0a45937 100644 --- a/benchmark/benchmark.cc +++ b/benchmark/benchmark.cc @@ -659,7 +659,9 @@ int Benchmark::retest_slow_pairs(slow_pair* const sp,const int n,const int iter) const int to = sp[i].to; if (0 == rank()) { - std::printf(" %6d: Retest %6d <-> %6d:", i, from, to); + const char* from_host = cl->hostnamesAndRanks.hostForRank[from].c_str(); + const char* to_host = cl->hostnamesAndRanks.hostForRank[to].c_str(); + std::printf(" %6d: Retest %6d (%s) <-> %6d (%s):", i, from, from_host, to, to_host); std::fflush(stdout); } @@ -839,9 +841,8 @@ int Benchmark::init() { randomNumberEngineSteps = std::mt19937(0); } - if(args->do_group_processes_by_hostname > 0) { - cl->getHostAndLocalRank(); - } + // Run it always to gather hostname information + cl->getHostAndLocalRank(); return SUCCESS; } diff --git a/benchmark/vcluster.cc b/benchmark/vcluster.cc index 15f8dee..c489cfb 100644 --- a/benchmark/vcluster.cc +++ b/benchmark/vcluster.cc @@ -514,7 +514,7 @@ const std::string& VirtualCluster::get_vcluster_impl_name(char** argv, const std const std::string dot{"."}; auto pos = executableName.find(dot); if(pos != std::string::npos) { - requestedImpl = executableName.substr(pos); + requestedImpl = executableName.substr(pos+1); } // Check environment @@ -598,7 +598,10 @@ void VirtualCluster::getHostAndLocalRank(){ for(int i=1;i<size();i++){ int j=uhostnames.size()-1; 
for(;j>=0;j--){ //Iterate backwards as hostnames are likely to repeat - if(hostnames[i]!=uhostnames[j]) continue; + // Compare based on c strings to avoid mismatches due to control characters + if(strcmp(hostnames[i].c_str(), uhostnames[j].c_str()) != 0){ + continue; + } else{ ranks[j].push_back(i); break; @@ -661,6 +664,17 @@ void VirtualCluster::getHostAndLocalRank(){ } } bcast(0, hostLocalRanks_.get(), buf64x4[0]*buf64x4[2]); + + /* + * Fill in the hostnamesAndRanks struct to be queried later on + */ + for(int rank = 0; rank < size(); rank++){ + hostnamesAndRanks.hostForRank.push_back(hostnames[rank]); + } + for(std::vector<std::string>::size_type host=0;host<uhostnames.size();host++){ + hostnamesAndRanks.ranksForHost.push_back(ranks[host]); + } + }else{ gather(0, &tmp32, &tmp32, 1); //Gather hostname lengths send(0, hostname().c_str(), hostnameSize()); //Send hostname diff --git a/benchmark/vcluster.h b/benchmark/vcluster.h index b516073..d2c65e5 100644 --- a/benchmark/vcluster.h +++ b/benchmark/vcluster.h @@ -73,6 +73,11 @@ public: int localRank() ; std::shared_ptr<int[]> hostLocalRanks() ; + struct { + std::vector<std::string> hostForRank; + std::vector<std::vector<int>> ranksForHost; + } hostnamesAndRanks; + /*! 
\brief send wrapped data in buf to rank dst (communication layer undefined) * * The data may or may not be routed through the communication layer of this cluster -- GitLab From fdc9dccae0159179a64363a633a4bfb04d341ede Mon Sep 17 00:00:00 2001 From: alvarezmallon1 <alvarezmallon1@jpblt-s01-01.jupiter.internal> Date: Thu, 23 Jan 2025 15:51:35 +0100 Subject: [PATCH 2/4] This enables testing in a single node --- benchmark/vcluster.cc | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/benchmark/vcluster.cc b/benchmark/vcluster.cc index c489cfb..722d39a 100644 --- a/benchmark/vcluster.cc +++ b/benchmark/vcluster.cc @@ -617,14 +617,25 @@ void VirtualCluster::getHostAndLocalRank(){ std::stringstream buf; for(std::vector<std::string>::size_type i=0;i<uhostnames.size();i++){ buf << uhostnames[i].c_str() << ": " << ranks[i]; - debug("[%3d] Ranks on %s", 0, buf.str().c_str());buf.str(std::string()); + debug("[%3d] Ranks on %s", 0, buf.str().c_str()); + buf.str(std::string()); } #endif /******************************************************/ /* Check that all hosts have the same number of ranks */ /******************************************************/ - if(ranks.size()%2!=0) fatal("An even number of hosts is required!"); - for(std::vector<std::vector<int>>::size_type i=0;i<ranks.size();i++) if(ranks[0].size()!=ranks[i].size()) fatal("Hosts have differing amounts of ranks!"); + // One host is allowed for intranode testing + if(ranks.size()%2 != 0 and ranks.size() != 1){ + fatal("An even number of hosts is required!"); + } + else if(size()%2 != 0){ + fatal("An even number of ranks is required!"); + } + for(std::vector<std::vector<int>>::size_type i=0;i<ranks.size();i++){ + if(ranks[0].size()!=ranks[i].size()){ + fatal("Hosts have differing amounts of ranks!"); + } + } /**************************************/ /* Broadcast information to all ranks */ -- GitLab From c83889351f8065119f217cf2420b9dd0ed742fa9 Mon Sep 17 00:00:00 2001 From: 
alvarezmallon1 <alvarezmallon1@jpblt-s01-01.jupiter.internal> Date: Thu, 23 Jan 2025 15:53:01 +0100 Subject: [PATCH 3/4] Build example for JEDI --- exampleBuildJEDI.sh | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100755 exampleBuildJEDI.sh diff --git a/exampleBuildJEDI.sh b/exampleBuildJEDI.sh new file mode 100755 index 0000000..57075dd --- /dev/null +++ b/exampleBuildJEDI.sh @@ -0,0 +1,40 @@ +#!/bin/bash +############################################################################# +## LinkTest ## +############################################################################# +## Copyright (c) 2008-2021 ## +## Forschungszentrum Juelich, Juelich Supercomputing Centre ## +## ## +## See the file COPYRIGHT in the package base directory for details ## +############################################################################# + + +# The example uses a system that supports MPI, TCP, UCX and IBVerbs. +# Minipmi is already installed in ~/.local + +# Set-Up Environment +ml GCC OpenMPI SciPy-Stack SIONlib; + +# Use locallly installed minipmi, needed since we build with UCX and IBVerbs support +export LIBRARY_PATH=$LIBRARY_PATH:~/.local/lib/; +export CPATH=$CPATH:~/.local/include/; + +# Install linktest in folder install +mkdir -p install_jedi; +cd benchmark; +make clean +make -j HAVE_SION=1 HAVE_TCP=0 HAVE_UCP=0 PREFIX=../install_jedi install; +cd ..; +# Install linktest-report +# FIX for JSC Systems +#export CPATH=$EBROOTSCIPYMINBUNDLE/lib/python3*/site-packages/numpy/core/include:$CPATH +#cd install_jedi; +#python3 -m venv linktest-report-venv; +#source linktest-report-venv/bin/activate +#cd ../python; +#python3 -m pip install .; #TODO: Add --use-feature=in-tree-build if using pip 21.0.X to 21.2.X (default from 21.3 onwards) +#deactivate; +#cd ..; +# Notice that we close the virtual environment, since this script is likely not sourced +# To use python-report one has to source linktest-report-venv/bin/activate again +# To 
uninstall: pip uninstall linktest, or remove the virtual environment completely -- GitLab From 55ec26005ce4535adafd8ec078d87079de6df270 Mon Sep 17 00:00:00 2001 From: alvarezmallon1 <alvarezmallon1@jpblt-s01-01.jupiter.internal> Date: Thu, 23 Jan 2025 15:55:05 +0100 Subject: [PATCH 4/4] To build also the report tools --- exampleBuildJEDI.sh | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/exampleBuildJEDI.sh b/exampleBuildJEDI.sh index 57075dd..3356cd4 100755 --- a/exampleBuildJEDI.sh +++ b/exampleBuildJEDI.sh @@ -15,10 +15,6 @@ # Set-Up Environment ml GCC OpenMPI SciPy-Stack SIONlib; -# Use locallly installed minipmi, needed since we build with UCX and IBVerbs support -export LIBRARY_PATH=$LIBRARY_PATH:~/.local/lib/; -export CPATH=$CPATH:~/.local/include/; - # Install linktest in folder install mkdir -p install_jedi; cd benchmark; @@ -26,15 +22,14 @@ make clean make -j HAVE_SION=1 HAVE_TCP=0 HAVE_UCP=0 PREFIX=../install_jedi install; cd ..; # Install linktest-report -# FIX for JSC Systems -#export CPATH=$EBROOTSCIPYMINBUNDLE/lib/python3*/site-packages/numpy/core/include:$CPATH -#cd install_jedi; -#python3 -m venv linktest-report-venv; -#source linktest-report-venv/bin/activate -#cd ../python; -#python3 -m pip install .; #TODO: Add --use-feature=in-tree-build if using pip 21.0.X to 21.2.X (default from 21.3 onwards) -#deactivate; -#cd ..; +export CPATH=$EBROOTSCIPYMINBUNDLE/lib/python3*/site-packages/numpy/core/include:$CPATH +cd install_jedi; +python3 -m venv linktest-report-venv; +source linktest-report-venv/bin/activate +cd ../python; +python3 -m pip install .; #TODO: Add --use-feature=in-tree-build if using pip 21.0.X to 21.2.X (default from 21.3 onwards) +deactivate; +cd ..; # Notice that we close the virtual environment, since this script is likely not sourced # To use python-report one has to source linktest-report-venv/bin/activate again # To uninstall: pip uninstall linktest, or remove the virtual environment 
completely -- GitLab