// system.cc
/****************************************************************************
** LinkTest **
*****************************************************************************
** Copyright (c) 2008-2022 **
** Forschungszentrum Juelich, Juelich Supercomputing Centre **
** **
** See the file COPYRIGHT in the package base directory for details **
****************************************************************************/
#include "config.h"
#include "compiler.h"
#include "error.h"
#include "environ.h"
#include "system.h"
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <stdexcept>
#include <string>
namespace linktest
{
/* Return the short name of the local host (everything in front of the first
 * '.') with `suffix` appended.
 *
 * Throws std::runtime_error if gethostname() fails, because the buffer
 * content cannot be trusted in that case.
 */
static std::string hostname(const std::string& suffix)
{
    // The buffer is zero-initialised and one byte larger than the size handed
    // to gethostname(): POSIX leaves it unspecified whether a truncated name
    // is null-terminated, so the spare byte guarantees termination.
    char name[256] = {};
    if (0 != gethostname(name, sizeof(name) - 1)) {
        throw std::runtime_error("gethostname() failed");
    }

    // Get only the hostname part of the string
    if (char* dot = std::strchr(name, '.')) {
        *dot = '\0';
    }

    return std::string(name) + suffix;
}
/* Use the hostname of the node to retrieve the IP to which other hosts should
 * connect.
 *
 * The name that is resolved is hostname(suffix). Only AF_INET / SOCK_STREAM /
 * IPPROTO_TCP results are requested and the first entry returned by
 * getaddrinfo() is copied into *addr.
 *
 * If the lookup fails, fatal() is called with the resolver's error text and
 * *addr is left zeroed apart from sin_family = AF_INET.
 */
static void retrieve_addr_from_hostname(const std::string& suffix,
                                        linktest::IpAddr* addr)
{
    std::memset(addr, 0, sizeof(*addr));
    addr->sin_family = AF_INET;

    struct addrinfo hints;
    std::memset(&hints, 0, sizeof(hints));
    hints.ai_flags    = AI_CANONNAME;
    hints.ai_family   = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_protocol = IPPROTO_TCP;

    // Resolve the node name once so the lookup and the diagnostics agree.
    const std::string node = hostname(suffix);

    struct addrinfo* ailist = nullptr;
    const int rc = getaddrinfo(node.c_str(), nullptr, &hints, &ailist);
    if (0 != rc) {
        fatal("getaddrinfo(\"%s\") failed: %s",
              node.c_str(), gai_strerror(rc));
        return;
    }
    if (nullptr == ailist) {
        fatal("getaddrinfo(\"%s\") returned nullptr",
              node.c_str());
        return;
    }

    // Only the first result is used; the hints restrict results to AF_INET.
    *addr = *reinterpret_cast<const linktest::IpAddr*>(ailist->ai_addr);
    freeaddrinfo(ailist);
}
/* File-local pointer to the System instance handed out by
 * System::singleton(). It stays nullptr until System::store_singleton() is
 * called.
 */
static System* _system_singleton{nullptr};

/* Record `system` as the instance returned by System::singleton(). A
 * previously stored pointer is overwritten without being deleted.
 */
void System::store_singleton(System* system)
{
    _system_singleton = system;
}
/* Return the pointer last passed to System::store_singleton(), or nullptr if
 * nothing has been stored yet.
 */
System* System::singleton()
{
    System* const current = _system_singleton;
    return current;
}
void System::setup_singleton_instance()
{
if (std::string("JUWELS Booster") == std::string(LINKTEST_SYSTEM)) {
store_singleton(new linktest::System_JUWELS_Booster);
} else
if (std::string("JURECA-DC GPU") == std::string(LINKTEST_SYSTEM)) {
store_singleton(new linktest::System_JURECA_DC_GPU);
} else {
store_singleton(new linktest::GenericSystem);
}
}
/* Read the environment variable LINKTEST_ENVIRON_PREFIX + k into `val`.
 *
 * `val` is only assigned when read_environ_str() returns 0 and hands back a
 * non-null pointer; otherwise it keeps its previous content.
 */
void read_environ_string(const std::string& k, std::string& val)
{
    const std::string key = std::string(LINKTEST_ENVIRON_PREFIX) + k;

    // Start from nullptr so a helper that reports success without writing the
    // pointer cannot make us construct a std::string from garbage.
    const char* p = nullptr;
    const auto ret = read_environ_str(key.c_str(), &p);
    if ((0 == ret) && (nullptr != p)) {
        val = p;
    }
}
/* Set up a generic system description.
 *
 * The HCA port starts at 1 and the GPU index at 0; the HCA name, HCA port,
 * GPU index and node name suffix are then looked up in the environment under
 * the LINKTEST_ENVIRON_PREFIX namespace. The return values of
 * read_environ_int() are not inspected here.
 */
GenericSystem::GenericSystem()
    : closest_hca_port_(1),
      closest_gpu_(0)
{
    // read_environ_string() prepends LINKTEST_ENVIRON_PREFIX on its own,
    // read_environ_int() needs the full variable name.
    read_environ_string("SYSTEM_HCA_NAME", closest_hca_name_);
    read_environ_int(LINKTEST_ENVIRON_PREFIX "SYSTEM_HCA_PORT",
                     &closest_hca_port_);
    read_environ_int(LINKTEST_ENVIRON_PREFIX "SYSTEM_GPU",
                     &closest_gpu_);
    read_environ_string("SYSTEM_NODENAME_SUFFIX", nodename_suffix_);
}
/* Return true when a non-empty HCA name has been stored. */
bool GenericSystem::has_closest_hca() const
{
    return !closest_hca_name_.empty();
}
/* Copy the stored HCA name into `name`, replacing its previous content. */
void GenericSystem::closest_hca_name(std::string& name)
{
    name.assign(closest_hca_name_);
}
/* Return the stored HCA port number. */
int GenericSystem::closest_hca_port()
{
    const int port = closest_hca_port_;
    return port;
}
/* Return the stored GPU device index. */
int GenericSystem::closest_gpu_device()
{
    const int gpu = closest_gpu_;
    return gpu;
}
/* Fill *addr with the address obtained by resolving the local host name with
 * the configured node name suffix appended. Always returns SUCCESS; lookup
 * failures are reported inside retrieve_addr_from_hostname().
 */
int GenericSystem::ip_address(IpAddr* addr)
{
    const std::string& suffix = nodename_suffix_;
    retrieve_addr_from_hostname(suffix, addr);

    return SUCCESS;
}
/* Describe a node with several GPUs and HCAs where the device assignment is
 * derived from the node-local rank.
 *
 * local_rank_to_gpu: GPU index to use for each local rank.
 * local_rank_to_hca: index N of the "mlx5_N" device to use for each local rank.
 *
 * The local rank is read from the (unprefixed) environment variable
 * MPI_LOCALRANKID. A read failure is logged; an out-of-range rank makes
 * verify_local_rank_() throw std::runtime_error.
 */
MultiGpuMultiHcaSystem::MultiGpuMultiHcaSystem(
    const std::array<int, kNumGpus>& local_rank_to_gpu,
    const std::array<int, kNumGpus>& local_rank_to_hca)
    : local_rank_(-1),
      local_rank_to_gpu_(local_rank_to_gpu),
      local_rank_to_hca_(local_rank_to_hca)
{
    const auto err = read_environ_int("MPI_LOCALRANKID", &local_rank_);
    if (unlikely(err)) {
        error("Failed to read \"MPI_LOCALRANKID\" from environment");
    }

    // Must run before local_rank_ is used as an array index below.
    verify_local_rank_();

    const auto hca_index = local_rank_to_hca_[local_rank_];
    char hca_name[64];
    std::snprintf(hca_name, sizeof(hca_name), "mlx5_%d", hca_index);

    closest_hca_name_ = hca_name;
    closest_hca_port_ = 1;
    closest_gpu_      = local_rank_to_gpu_[local_rank_];
}
int MultiGpuMultiHcaSystem::change_environ()
{
char buf[64];
std::snprintf(buf, sizeof(buf), "%s:%d",
closest_hca_name_.c_str(), closest_hca_port_);
setenv("UCX_MAX_RNDV_RAILS", "1", 0);
setenv("UCX_NET_DEVICES", buf, 0);
return SUCCESS;
}
/* Check that local_rank_ lies in [0, kNumGpus). Otherwise log the offending
 * value and throw std::runtime_error.
 */
void MultiGpuMultiHcaSystem::verify_local_rank_()
{
    const bool in_range = (local_rank_ >= 0) && (local_rank_ < kNumGpus);
    if (unlikely(!in_range)) {
        error("Invalid local rank = %d", local_rank_);
        throw std::runtime_error("Invalid local rank");
    }
}
/* JUWELS Booster: local rank r uses GPU r and HCA mlx5_r (r = 0..3). The node
 * name suffix is fixed to "i".
 */
System_JUWELS_Booster::System_JUWELS_Booster()
    : MultiGpuMultiHcaSystem(
          {0, 1, 2, 3},  // local rank -> GPU
          {0, 1, 2, 3})  // local rank -> HCA
{
    nodename_suffix_ = "i";
}
/* JURECA-DC GPU: local rank r uses GPU r, while the HCA indices are swapped
 * pairwise (ranks 0,1,2,3 -> mlx5_1, mlx5_0, mlx5_3, mlx5_2). The node name
 * suffix is fixed to "i".
 */
System_JURECA_DC_GPU::System_JURECA_DC_GPU()
    : MultiGpuMultiHcaSystem(
          {0, 1, 2, 3},  // local rank -> GPU
          {1, 0, 3, 2})  // local rank -> HCA
{
    nodename_suffix_ = "i";
}
}