Select Git revision
SystemSettings.h
system.h 4.71 KiB
/****************************************************************************
** LinkTest **
*****************************************************************************
** Copyright (c) 2008-2022 **
** Forschungszentrum Juelich, Juelich Supercomputing Centre **
** **
** See the file COPYRIGHT in the package base directory for details **
****************************************************************************/
#ifndef LINKTEST_SYSTEM_H
#define LINKTEST_SYSTEM_H
#include <string>
#include <array>
#include <initializer_list>
#include "utils.h"
#include "error.h"
namespace linktest
{
/* Representation of the system on which the benchmark is executed. Used for
* programmatic customizations as long as we cannot handle them through
* configuration files or application-internal scripting.
*/
class System
{
public:
    /* This class is a polymorphic base that is passed around as `System*`
     * (see `store_singleton()` / `singleton()`). The virtual destructor
     * makes destroying a derived object through a base pointer well-defined.
     */
    virtual ~System() = default;

    /* Accessors for the process-wide System instance. The implementations
     * are not part of this header.
     * NOTE(review): ownership of the pointer passed to `store_singleton()`
     * is not visible from here -- confirm against the implementation.
     */
    static void store_singleton(System* system);
    static System* singleton();
    static void setup_singleton_instance();

    /* Callback function that allows to modify the process environment
     * at an early stage in the application. The callback is executed
     * before the first VirtualCluster instance is created.
     *
     * The default implementation does nothing and returns `SUCCESS`.
     */
    virtual int change_environ() { return SUCCESS; }

    /* Returns `true` if the calling process should use a specific
     * InfiniBand network device.
     */
    [[nodiscard]] virtual bool has_closest_hca() const = 0;

    /* Name of the closest InfiniBand network device for the calling
     * process. The result is passed back through `name`.
     * NOTE(review): this comment previously stated that the function
     * "returns -1" if no preference can be provided, but the function
     * returns `void` -- confirm how "no preference" is signalled
     * (possibly via `has_closest_hca()`).
     */
    virtual void closest_hca_name(std::string& name) = 0;

    /* InfiniBand network device port to use. By default the first port
     * is used.
     */
    [[nodiscard]] virtual int closest_hca_port() = 0;

    // Retrieve the closest GPU device for the calling process.
    [[nodiscard]] virtual int closest_gpu_device() = 0;

    /* Get an IP address of the local host that can be reached from other
     * nodes in the system. Some systems have an hierarchical infrastructure
     * and not all IP subnets are globally routed. In this case we need to
     * pick the right IP.
     *
     * The address is passed back through `addr`.
     * NOTE(review): the `int` return value is presumably a status code
     * such as `SUCCESS` -- confirm against the implementation.
     */
    [[nodiscard]] virtual int ip_address(IpAddr* addr) = 0;
};
/* A generic HPC system. The values for the choice of HCA and GPU can be
* modified via the environment by setting
* - `LINKTEST_SYSTEM_HCA_NAME` (e.g. to `mlx5_0`)
* - `LINKTEST_SYSTEM_HCA_PORT` (e.g. to `1`)
* - `LINKTEST_SYSTEM_GPU` (e.g. to `0`)
* The variable `LINKTEST_SYSTEM_NODENAME_SUFFIX` can be set to allow the code
* to resolve the right IP to use for the setup of the IP-based out-of-band
* communication channel.
*/
class GenericSystem : public System
{
public:
    // Instances are neither copyable nor movable.
    GenericSystem(const GenericSystem&) = delete;
    GenericSystem(GenericSystem&&) = delete;

    GenericSystem();

    // Implementation of the HCA-related part of the System interface.
    [[nodiscard]] bool has_closest_hca() const override;
    void closest_hca_name(std::string& name) override;
    [[nodiscard]] int closest_hca_port() override;

    // Implementation of the GPU-related part of the System interface.
    [[nodiscard]] int closest_gpu_device() override;

    // Implementation of the IP-related part of the System interface.
    [[nodiscard]] int ip_address(IpAddr* addr) override;

protected:
    /* State shared with derived classes. How these values are populated
     * is defined by the constructor, which is not part of this header.
     */
    std::string closest_hca_name_;  // InfiniBand device name
    int closest_hca_port_;          // InfiniBand device port
    int closest_gpu_;               // GPU device
    std::string nodename_suffix_;   // node name suffix
};
// System with multiple GPUs and one HCA per GPU
class MultiGpuMultiHcaSystem : public GenericSystem
{
public:
static const int kNumGpus = 4;
MultiGpuMultiHcaSystem(const std::array<int, kNumGpus>& local_rank_to_gpu, const std::array<int, kNumGpus>& local_rank_to_hca);
MultiGpuMultiHcaSystem(const MultiGpuMultiHcaSystem&) = delete;
MultiGpuMultiHcaSystem(MultiGpuMultiHcaSystem&&) = delete;
virtual int change_environ();
private:
// The rank on the node
int local_rank_;
// Mapping of the local rank to the nearest GPU
std::array<int, kNumGpus> local_rank_to_gpu_;
// Mapping of the local rank to the right HCA
std::array<int, kNumGpus> local_rank_to_hca_;
void verify_local_rank_();
};
// The JUWELS Booster system at JSC
class System_JUWELS_Booster : public MultiGpuMultiHcaSystem
{
public:
    // Instances are neither copyable nor movable.
    System_JUWELS_Booster(const System_JUWELS_Booster&) = delete;
    System_JUWELS_Booster(System_JUWELS_Booster&&) = delete;

    /* The default constructor is defined outside of this header; the base
     * class requires the two local-rank mapping tables at construction.
     */
    System_JUWELS_Booster();
};
// The JURECA-DC system at JSC
class System_JURECA_DC_GPU : public MultiGpuMultiHcaSystem
{
public:
    // Instances are neither copyable nor movable.
    System_JURECA_DC_GPU(const System_JURECA_DC_GPU&) = delete;
    System_JURECA_DC_GPU(System_JURECA_DC_GPU&&) = delete;

    /* The default constructor is defined outside of this header; the base
     * class requires the two local-rank mapping tables at construction.
     */
    System_JURECA_DC_GPU();
};
}
#endif