gda: add check for active interfaces when selecting the GDA backend (#327)

* gda: add check for active interfaces when selecting the GDA backend

* fix __func__ maco in rocshmem_ctx_pe_quiet

* gda: switch to more generic RDMA NIC term in has_active_ib_interface

* gda: add active MLX5 and Pensando vendor ID checks for backend selection
This commit is contained in:
Kutovoi, Vadim
2025-12-01 20:49:25 +00:00
committed by GitHub
vanhempi ba77bdd9a6
commit 29000a5644
3 muutettua tiedostoa jossa 137 lisäystä ja 9 poistoa
+110 -8
Näytä tiedosto
@@ -547,9 +547,103 @@ GDAProvider GDABackend::requested_provider() {
return GDAProvider::UNSET;
}
/* Currently we only check whether we can dlopen a Direct Verbs library.
* We might need to extend this logic to check whether we have interfaces that
* can use those DV libraries
/* Check if a device's vendor ID matches the expected vendor for a given provider.
* Returns true if the device matches, false otherwise.
*/
bool GDABackend::device_matches_provider_vendor(GDAProvider provider,
const struct ibv_device_attr &device_attr,
const char *device_name) {
uint32_t expected_vendor_id = 0;
const char *vendor_name = nullptr;
switch (provider) {
case GDAProvider::BNXT:
expected_vendor_id = GDA_BNXT_VENDOR_ID;
vendor_name = "BNXT/Broadcom";
break;
case GDAProvider::IONIC:
expected_vendor_id = GDA_IONIC_VENDOR_ID;
vendor_name = "IONIC/Pensando";
break;
case GDAProvider::MLX5:
expected_vendor_id = GDA_MLX5_VENDOR_ID;
vendor_name = "MLX5/Mellanox";
break;
case GDAProvider::UNSET:
// UNSET accepts any vendor
return true;
default:
return true;
}
if (device_attr.vendor_id != expected_vendor_id) {
DPRINTF("Skipping device %s with vendor_id=0x%04x (not %s)\n",
device_name, device_attr.vendor_id, vendor_name);
return false;
}
return true;
}
/* Check whether there are active InfiniBand/RDMA interfaces available.
* Verifies the device vendor matches the requested provider to avoid selecting
* the wrong NIC when multiple vendors are present.
* Returns true if at least one active port is found on a matching device.
*/
bool GDABackend::has_active_ib_interface(GDAProvider provider) {
struct ibv_device **device_list = nullptr;
int num_devices = 0;
bool has_active = false;
device_list = ibv.get_device_list(&num_devices);
if (!device_list || num_devices == 0) {
DPRINTF("No RDMA NIC devices found\n");
return false;
}
for (int i = 0; i < num_devices && !has_active; i++) {
struct ibv_context *context = ibv.open_device(device_list[i]);
if (!context) {
continue;
}
struct ibv_device_attr device_attr;
if (ibv.query_device(context, &device_attr) == 0) {
// Check if device vendor matches the provider
if (!device_matches_provider_vendor(provider, device_attr,
ibv.get_device_name(device_list[i]))) {
ibv.close_device(context);
continue;
}
for (int port = 1; port <= device_attr.phys_port_cnt; ++port) {
struct ibv_port_attr port_attr;
if (ibv.query_port(context, port, &port_attr) == 0) {
if (port_attr.state == IBV_PORT_ACTIVE) {
DPRINTF("Found active RDMA NIC port %d on device %s (vendor_id=0x%04x, state=%d, phys_state=%d)\n",
port, ibv.get_device_name(device_list[i]),
device_attr.vendor_id, port_attr.state, port_attr.phys_state);
has_active = true;
break;
}
}
}
}
ibv.close_device(context);
}
ibv.free_device_list(device_list);
if (!has_active) {
DPRINTF("No active InfiniBand ports found on any device\n");
}
return has_active;
}
/* Check whether we can dlopen a Direct Verbs library and verify that
* there are active InfiniBand/RDMA interfaces available to use.
*/
int GDABackend::backend_can_run() {
void *handle{nullptr};
@@ -561,7 +655,10 @@ int GDABackend::backend_can_run() {
handle = bnxt_dv_dlopen();
if (handle) {
dlclose(handle);
return ROCSHMEM_SUCCESS;
if (has_active_ib_interface(GDAProvider::BNXT)) {
return ROCSHMEM_SUCCESS;
}
DPRINTF("BNXT DV library found but no active InfiniBand interface available\n");
}
}
#endif //defined(GDA_BNXT)
@@ -572,7 +669,10 @@ int GDABackend::backend_can_run() {
handle = ionic_dv_dlopen();
if (handle) {
dlclose(handle);
return ROCSHMEM_SUCCESS;
if (has_active_ib_interface(GDAProvider::IONIC)) {
return ROCSHMEM_SUCCESS;
}
DPRINTF("IONIC DV library found but no active InfiniBand interface available\n");
}
}
#endif //defined(GDA_IONIC)
@@ -583,7 +683,10 @@ int GDABackend::backend_can_run() {
handle = mlx5_dv_dlopen();
if (handle) {
dlclose(handle);
return ROCSHMEM_SUCCESS;
if (has_active_ib_interface(GDAProvider::MLX5)) {
return ROCSHMEM_SUCCESS;
}
DPRINTF("MLX5 DV library found but no active InfiniBand interface available\n");
}
}
#endif //defined(GDA_MLX5)
@@ -893,11 +996,10 @@ void GDABackend::validate_ib_device() {
CHECK_ZERO(err, "ibv_query_device");
if (gda_provider == GDAProvider::BNXT) {
const uint32_t bnxt_vendor_id = 0x14E4;
const std::set<uint32_t> supported_bnxt_part_ids = { 0x1760 /* BCM57608 */};
const char min_supported_bnxt_fw_ver[12] = "233.2.104.0";
if (bnxt_vendor_id != device_attr.vendor_id) {
if (device_attr.vendor_id != GDA_BNXT_VENDOR_ID) {
printf("%s GDAProvider::BNXT requested but an invalid device is selected\n", debug_str.c_str());
exit(1);
}
+26
Näytä tiedosto
@@ -55,6 +55,10 @@ enum GDAProvider {
MLX5
};
inline constexpr uint32_t GDA_IONIC_VENDOR_ID = 0x1DD8;
inline constexpr uint32_t GDA_MLX5_VENDOR_ID = 0x15B3;
inline constexpr uint32_t GDA_BNXT_VENDOR_ID = 0x14E4;
class GDABackend : public Backend {
private:
typedef struct dest_info {
@@ -131,6 +135,28 @@ class GDABackend : public Backend {
*/
virtual ~GDABackend();
/**
* @brief Check if a device's vendor ID matches the expected vendor for a provider
*
* @param provider The GDA provider to check against
* @param device_attr The device attributes containing the vendor ID
* @param device_name The device name (for debug messages)
* @return true if the device vendor matches the provider, false otherwise
*/
static bool device_matches_provider_vendor(GDAProvider provider,
const struct ibv_device_attr &device_attr,
const char *device_name);
/**
* @brief Check if there are active InfiniBand/RDMA interfaces available
* that match the specified provider's vendor ID.
*
* @param provider The GDA provider to check for (BNXT, IONIC, or MLX5)
* @return true if at least one active port on a matching vendor device is found,
* false otherwise
*/
static bool has_active_ib_interface(GDAProvider provider);
/**
* @brief Verify whether GDA Backend could run
*
+1 -1
Näytä tiedosto
@@ -498,7 +498,7 @@ __device__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx) {
}
__device__ void rocshmem_ctx_pe_quiet(rocshmem_ctx_t ctx, const int *target_pes, size_t npes) {
GPU_DPRINTF("Function: %s (ctx=%zd)\n", __FUNC__, ctx.ctx_opaque);
GPU_DPRINTF("Function: %s (ctx=%zd)\n", __func__, ctx.ctx_opaque);
ContextTy *internal_ctx = get_internal_ctx(ctx);