diff --git a/src/gda/backend_gda.cpp b/src/gda/backend_gda.cpp index 2a6d7a7cf3..82bf5cfb28 100644 --- a/src/gda/backend_gda.cpp +++ b/src/gda/backend_gda.cpp @@ -547,9 +547,103 @@ GDAProvider GDABackend::requested_provider() { return GDAProvider::UNSET; } -/* Currently we only check whether we can dlopen a Direct Verbs library. - * We might need to extend this logic to check whether we have interfaces that - * can use those DV libraries +/* Check if a device's vendor ID matches the expected vendor for a given provider. + * Returns true if the device matches, false otherwise. + */ +bool GDABackend::device_matches_provider_vendor(GDAProvider provider, + const struct ibv_device_attr &device_attr, + const char *device_name) { + uint32_t expected_vendor_id = 0; + const char *vendor_name = nullptr; + + switch (provider) { + case GDAProvider::BNXT: + expected_vendor_id = GDA_BNXT_VENDOR_ID; + vendor_name = "BNXT/Broadcom"; + break; + case GDAProvider::IONIC: + expected_vendor_id = GDA_IONIC_VENDOR_ID; + vendor_name = "IONIC/Pensando"; + break; + case GDAProvider::MLX5: + expected_vendor_id = GDA_MLX5_VENDOR_ID; + vendor_name = "MLX5/Mellanox"; + break; + case GDAProvider::UNSET: + // UNSET accepts any vendor + return true; + default: + return true; + } + + if (device_attr.vendor_id != expected_vendor_id) { + DPRINTF("Skipping device %s with vendor_id=0x%04x (not %s)\n", + device_name, device_attr.vendor_id, vendor_name); + return false; + } + + return true; +} + +/* Check whether there are active InfiniBand/RDMA interfaces available. + * Verifies the device vendor matches the requested provider to avoid selecting + * the wrong NIC when multiple vendors are present. + * Returns true if at least one active port is found on a matching device. + */ +bool GDABackend::has_active_ib_interface(GDAProvider provider) { + struct ibv_device **device_list = nullptr; + int num_devices = 0; + bool has_active = false; + + device_list = ibv.get_device_list(&num_devices); + if (!device_list || num_devices == 0) { + DPRINTF("No RDMA NIC devices found\n"); + return false; + } + + for (int i = 0; i < num_devices && !has_active; i++) { + struct ibv_context *context = ibv.open_device(device_list[i]); + if (!context) { + continue; + } + + struct ibv_device_attr device_attr; + if (ibv.query_device(context, &device_attr) == 0) { + // Check if device vendor matches the provider + if (!device_matches_provider_vendor(provider, device_attr, + ibv.get_device_name(device_list[i]))) { + ibv.close_device(context); + continue; + } + + for (int port = 1; port <= device_attr.phys_port_cnt; ++port) { + struct ibv_port_attr port_attr; + if (ibv.query_port(context, port, &port_attr) == 0) { + if (port_attr.state == IBV_PORT_ACTIVE) { + DPRINTF("Found active RDMA NIC port %d on device %s (vendor_id=0x%04x, state=%d, phys_state=%d)\n", + port, ibv.get_device_name(device_list[i]), + device_attr.vendor_id, port_attr.state, port_attr.phys_state); + has_active = true; + break; + } + } + } + } + + ibv.close_device(context); + } + + ibv.free_device_list(device_list); + + if (!has_active) { + DPRINTF("No active InfiniBand ports found on any device\n"); + } + + return has_active; +} + +/* Check whether we can dlopen a Direct Verbs library and verify that + * there are active InfiniBand/RDMA interfaces available to use. */ int GDABackend::backend_can_run() { void *handle{nullptr}; @@ -561,7 +655,10 @@ int GDABackend::backend_can_run() { handle = bnxt_dv_dlopen(); if (handle) { dlclose(handle); - return ROCSHMEM_SUCCESS; + if (has_active_ib_interface(GDAProvider::BNXT)) { + return ROCSHMEM_SUCCESS; + } + DPRINTF("BNXT DV library found but no active InfiniBand interface available\n"); } } #endif //defined(GDA_BNXT) @@ -572,7 +669,10 @@ int GDABackend::backend_can_run() { handle = ionic_dv_dlopen(); if (handle) { dlclose(handle); - return ROCSHMEM_SUCCESS; + if (has_active_ib_interface(GDAProvider::IONIC)) { + return ROCSHMEM_SUCCESS; + } + DPRINTF("IONIC DV library found but no active InfiniBand interface available\n"); } } #endif //defined(GDA_IONIC) @@ -583,7 +683,10 @@ int GDABackend::backend_can_run() { handle = mlx5_dv_dlopen(); if (handle) { dlclose(handle); - return ROCSHMEM_SUCCESS; + if (has_active_ib_interface(GDAProvider::MLX5)) { + return ROCSHMEM_SUCCESS; + } + DPRINTF("MLX5 DV library found but no active InfiniBand interface available\n"); } } #endif //defined(GDA_MLX5) @@ -893,11 +996,10 @@ void GDABackend::validate_ib_device() { CHECK_ZERO(err, "ibv_query_device"); if (gda_provider == GDAProvider::BNXT) { - const uint32_t bnxt_vendor_id = 0x14E4; const std::set supported_bnxt_part_ids = { 0x1760 /* BCM57608 */}; const char min_supported_bnxt_fw_ver[12] = "233.2.104.0"; - if (bnxt_vendor_id != device_attr.vendor_id) { + if (device_attr.vendor_id != GDA_BNXT_VENDOR_ID) { printf("%s GDAProvider::BNXT requested but an invalid device is selected\n", debug_str.c_str()); exit(1); } diff --git a/src/gda/backend_gda.hpp b/src/gda/backend_gda.hpp index f2bc96a74e..990e2401eb 100644 --- a/src/gda/backend_gda.hpp +++ b/src/gda/backend_gda.hpp @@ -55,6 +55,10 @@ enum GDAProvider { MLX5 }; +inline constexpr uint32_t GDA_IONIC_VENDOR_ID = 0x1DD8; +inline constexpr uint32_t GDA_MLX5_VENDOR_ID = 0x15B3; +inline constexpr uint32_t GDA_BNXT_VENDOR_ID = 0x14E4; + class GDABackend : public Backend { private: typedef struct dest_info { @@ -131,6 +135,28 @@ class GDABackend : public Backend { */ virtual ~GDABackend(); + /** + * @brief Check if a device's vendor ID matches the expected vendor for a provider + * + * @param provider The GDA provider to check against + * @param device_attr The device attributes containing the vendor ID + * @param device_name The device name (for debug messages) + * @return true if the device vendor matches the provider, false otherwise + */ + static bool device_matches_provider_vendor(GDAProvider provider, + const struct ibv_device_attr &device_attr, + const char *device_name); + + /** + * @brief Check if there are active InfiniBand/RDMA interfaces available + * that match the specified provider's vendor ID. + * + * @param provider The GDA provider to check for (BNXT, IONIC, or MLX5) + * @return true if at least one active port on a matching vendor device is found, + * false otherwise + */ + static bool has_active_ib_interface(GDAProvider provider); + /** * @brief Verify whether GDA Backend could run * diff --git a/src/rocshmem_gpu.cpp b/src/rocshmem_gpu.cpp index b0eea85859..858bf499de 100644 --- a/src/rocshmem_gpu.cpp +++ b/src/rocshmem_gpu.cpp @@ -498,7 +498,7 @@ __device__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx) { } __device__ void rocshmem_ctx_pe_quiet(rocshmem_ctx_t ctx, const int *target_pes, size_t npes) { - GPU_DPRINTF("Function: %s (ctx=%zd)\n", __FUNC__, ctx.ctx_opaque); + GPU_DPRINTF("Function: %s (ctx=%zd)\n", __func__, ctx.ctx_opaque); ContextTy *internal_ctx = get_internal_ctx(ctx);