gda: add check for active interfaces when selecting the GDA backend (#327)
* gda: add check for active interfaces when selecting the GDA backend * fix __func__ macro in rocshmem_ctx_pe_quiet * gda: switch to more generic RDMA NIC term in has_active_ib_interface * gda: add active MLX5 and Pensando vendor ID checks for backend selection
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
ba77bdd9a6
Коммит
29000a5644
@@ -547,9 +547,103 @@ GDAProvider GDABackend::requested_provider() {
|
||||
return GDAProvider::UNSET;
|
||||
}
|
||||
|
||||
/* Currently we only check whether we can dlopen a Direct Verbs library.
|
||||
* We might need to extend this logic to check whether we have interfaces that
|
||||
* can use those DV libraries
|
||||
/* Check if a device's vendor ID matches the expected vendor for a given provider.
|
||||
* Returns true if the device matches, false otherwise.
|
||||
*/
|
||||
bool GDABackend::device_matches_provider_vendor(GDAProvider provider,
|
||||
const struct ibv_device_attr &device_attr,
|
||||
const char *device_name) {
|
||||
uint32_t expected_vendor_id = 0;
|
||||
const char *vendor_name = nullptr;
|
||||
|
||||
switch (provider) {
|
||||
case GDAProvider::BNXT:
|
||||
expected_vendor_id = GDA_BNXT_VENDOR_ID;
|
||||
vendor_name = "BNXT/Broadcom";
|
||||
break;
|
||||
case GDAProvider::IONIC:
|
||||
expected_vendor_id = GDA_IONIC_VENDOR_ID;
|
||||
vendor_name = "IONIC/Pensando";
|
||||
break;
|
||||
case GDAProvider::MLX5:
|
||||
expected_vendor_id = GDA_MLX5_VENDOR_ID;
|
||||
vendor_name = "MLX5/Mellanox";
|
||||
break;
|
||||
case GDAProvider::UNSET:
|
||||
// UNSET accepts any vendor
|
||||
return true;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
|
||||
if (device_attr.vendor_id != expected_vendor_id) {
|
||||
DPRINTF("Skipping device %s with vendor_id=0x%04x (not %s)\n",
|
||||
device_name, device_attr.vendor_id, vendor_name);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Check whether there are active InfiniBand/RDMA interfaces available.
|
||||
* Verifies the device vendor matches the requested provider to avoid selecting
|
||||
* the wrong NIC when multiple vendors are present.
|
||||
* Returns true if at least one active port is found on a matching device.
|
||||
*/
|
||||
bool GDABackend::has_active_ib_interface(GDAProvider provider) {
|
||||
struct ibv_device **device_list = nullptr;
|
||||
int num_devices = 0;
|
||||
bool has_active = false;
|
||||
|
||||
device_list = ibv.get_device_list(&num_devices);
|
||||
if (!device_list || num_devices == 0) {
|
||||
DPRINTF("No RDMA NIC devices found\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_devices && !has_active; i++) {
|
||||
struct ibv_context *context = ibv.open_device(device_list[i]);
|
||||
if (!context) {
|
||||
continue;
|
||||
}
|
||||
|
||||
struct ibv_device_attr device_attr;
|
||||
if (ibv.query_device(context, &device_attr) == 0) {
|
||||
// Check if device vendor matches the provider
|
||||
if (!device_matches_provider_vendor(provider, device_attr,
|
||||
ibv.get_device_name(device_list[i]))) {
|
||||
ibv.close_device(context);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int port = 1; port <= device_attr.phys_port_cnt; ++port) {
|
||||
struct ibv_port_attr port_attr;
|
||||
if (ibv.query_port(context, port, &port_attr) == 0) {
|
||||
if (port_attr.state == IBV_PORT_ACTIVE) {
|
||||
DPRINTF("Found active RDMA NIC port %d on device %s (vendor_id=0x%04x, state=%d, phys_state=%d)\n",
|
||||
port, ibv.get_device_name(device_list[i]),
|
||||
device_attr.vendor_id, port_attr.state, port_attr.phys_state);
|
||||
has_active = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ibv.close_device(context);
|
||||
}
|
||||
|
||||
ibv.free_device_list(device_list);
|
||||
|
||||
if (!has_active) {
|
||||
DPRINTF("No active InfiniBand ports found on any device\n");
|
||||
}
|
||||
|
||||
return has_active;
|
||||
}
|
||||
|
||||
/* Check whether we can dlopen a Direct Verbs library and verify that
|
||||
* there are active InfiniBand/RDMA interfaces available to use.
|
||||
*/
|
||||
int GDABackend::backend_can_run() {
|
||||
void *handle{nullptr};
|
||||
@@ -561,7 +655,10 @@ int GDABackend::backend_can_run() {
|
||||
handle = bnxt_dv_dlopen();
|
||||
if (handle) {
|
||||
dlclose(handle);
|
||||
return ROCSHMEM_SUCCESS;
|
||||
if (has_active_ib_interface(GDAProvider::BNXT)) {
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
DPRINTF("BNXT DV library found but no active InfiniBand interface available\n");
|
||||
}
|
||||
}
|
||||
#endif //defined(GDA_BNXT)
|
||||
@@ -572,7 +669,10 @@ int GDABackend::backend_can_run() {
|
||||
handle = ionic_dv_dlopen();
|
||||
if (handle) {
|
||||
dlclose(handle);
|
||||
return ROCSHMEM_SUCCESS;
|
||||
if (has_active_ib_interface(GDAProvider::IONIC)) {
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
DPRINTF("IONIC DV library found but no active InfiniBand interface available\n");
|
||||
}
|
||||
}
|
||||
#endif //defined(GDA_IONIC)
|
||||
@@ -583,7 +683,10 @@ int GDABackend::backend_can_run() {
|
||||
handle = mlx5_dv_dlopen();
|
||||
if (handle) {
|
||||
dlclose(handle);
|
||||
return ROCSHMEM_SUCCESS;
|
||||
if (has_active_ib_interface(GDAProvider::MLX5)) {
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
DPRINTF("MLX5 DV library found but no active InfiniBand interface available\n");
|
||||
}
|
||||
}
|
||||
#endif //defined(GDA_MLX5)
|
||||
@@ -893,11 +996,10 @@ void GDABackend::validate_ib_device() {
|
||||
CHECK_ZERO(err, "ibv_query_device");
|
||||
|
||||
if (gda_provider == GDAProvider::BNXT) {
|
||||
const uint32_t bnxt_vendor_id = 0x14E4;
|
||||
const std::set<uint32_t> supported_bnxt_part_ids = { 0x1760 /* BCM57608 */};
|
||||
const char min_supported_bnxt_fw_ver[12] = "233.2.104.0";
|
||||
|
||||
if (bnxt_vendor_id != device_attr.vendor_id) {
|
||||
if (device_attr.vendor_id != GDA_BNXT_VENDOR_ID) {
|
||||
printf("%s GDAProvider::BNXT requested but an invalid device is selected\n", debug_str.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@@ -55,6 +55,10 @@ enum GDAProvider {
|
||||
MLX5
|
||||
};
|
||||
|
||||
inline constexpr uint32_t GDA_IONIC_VENDOR_ID = 0x1DD8;
|
||||
inline constexpr uint32_t GDA_MLX5_VENDOR_ID = 0x15B3;
|
||||
inline constexpr uint32_t GDA_BNXT_VENDOR_ID = 0x14E4;
|
||||
|
||||
class GDABackend : public Backend {
|
||||
private:
|
||||
typedef struct dest_info {
|
||||
@@ -131,6 +135,28 @@ class GDABackend : public Backend {
|
||||
*/
|
||||
virtual ~GDABackend();
|
||||
|
||||
/**
|
||||
* @brief Check if a device's vendor ID matches the expected vendor for a provider
|
||||
*
|
||||
* @param provider The GDA provider to check against
|
||||
* @param device_attr The device attributes containing the vendor ID
|
||||
* @param device_name The device name (for debug messages)
|
||||
* @return true if the device vendor matches the provider, false otherwise
|
||||
*/
|
||||
static bool device_matches_provider_vendor(GDAProvider provider,
|
||||
const struct ibv_device_attr &device_attr,
|
||||
const char *device_name);
|
||||
|
||||
/**
|
||||
* @brief Check if there are active InfiniBand/RDMA interfaces available
|
||||
* that match the specified provider's vendor ID.
|
||||
*
|
||||
* @param provider The GDA provider to check for (BNXT, IONIC, or MLX5)
|
||||
* @return true if at least one active port on a matching vendor device is found,
|
||||
* false otherwise
|
||||
*/
|
||||
static bool has_active_ib_interface(GDAProvider provider);
|
||||
|
||||
/**
|
||||
* @brief Verify whether GDA Backend could run
|
||||
*
|
||||
|
||||
@@ -498,7 +498,7 @@ __device__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx) {
|
||||
}
|
||||
|
||||
__device__ void rocshmem_ctx_pe_quiet(rocshmem_ctx_t ctx, const int *target_pes, size_t npes) {
|
||||
GPU_DPRINTF("Function: %s (ctx=%zd)\n", __FUNC__, ctx.ctx_opaque);
|
||||
GPU_DPRINTF("Function: %s (ctx=%zd)\n", __func__, ctx.ctx_opaque);
|
||||
|
||||
ContextTy *internal_ctx = get_internal_ctx(ctx);
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user