From 6cad766d4e952203fa560001519bc49ade065eab Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 8 Jan 2026 13:40:11 -0500 Subject: [PATCH] dlclosing the dvlib may leave libibverbs in a broken state (#381) * Error out when IPC gets selected when it is impossible to run it. * Use RTLD_LAZY when dlopening * Do not dlclose libbnxt/ionic/mlx5.so as that breaks libibverbs [ROCm/rocshmem commit: 47f6fa6267661cae0afd524c83049edbf3a10dda] --- projects/rocshmem/src/gda/backend_gda.cpp | 7 ++++--- projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp | 4 ++-- projects/rocshmem/src/gda/ionic/backend_gda_ionic.cpp | 4 ++-- projects/rocshmem/src/gda/mlx5/backend_gda_mlx5.cpp | 2 +- projects/rocshmem/src/ipc/backend_ipc.cpp | 6 +++++- projects/rocshmem/src/mpi_instance.cpp | 2 +- 6 files changed, 15 insertions(+), 10 deletions(-) diff --git a/projects/rocshmem/src/gda/backend_gda.cpp b/projects/rocshmem/src/gda/backend_gda.cpp index 4f47d0af32..daa7c2f28b 100644 --- a/projects/rocshmem/src/gda/backend_gda.cpp +++ b/projects/rocshmem/src/gda/backend_gda.cpp @@ -603,6 +603,7 @@ bool GDABackend::has_active_ib_interface(GDAProvider provider) { } for (int i = 0; i < num_devices && !has_active; i++) { + DPRINTF("ibv.open device[%d] of %d\n", i, num_devices); struct ibv_context *context = ibv.open_device(device_list[i]); if (!context) { continue; @@ -659,7 +660,7 @@ int GDABackend::backend_can_run() { handle = bnxt_dv_dlopen(); if (handle) { auto ret = has_active_ib_interface(GDAProvider::BNXT); - dlclose(handle); +// dlclose(handle); //TODO: unloading the lib crashes the next call to ibv_open_device if (ret) return ROCSHMEM_SUCCESS; DPRINTF("BNXT DV library found but no active InfiniBand interface available\n"); } @@ -672,7 +673,7 @@ int GDABackend::backend_can_run() { handle = ionic_dv_dlopen(); if (handle) { auto ret = has_active_ib_interface(GDAProvider::IONIC); - dlclose(handle); +// dlclose(handle); //TODO: unloading the lib crashes the next call to ibv_open_device if (ret) return ROCSHMEM_SUCCESS; DPRINTF("IONIC DV library found but no active InfiniBand interface available\n"); } @@ -685,7 +686,7 @@ int GDABackend::backend_can_run() { handle = mlx5_dv_dlopen(); if (handle) { auto ret = has_active_ib_interface(GDAProvider::MLX5); - dlclose(handle); +// dlclose(handle); //TODO: unloading the lib crashes the next call to ibv_open_device if (ret) return ROCSHMEM_SUCCESS; DPRINTF("MLX5 DV library found but no active InfiniBand interface available\n"); } diff --git a/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp b/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp index 80c67c2143..04f79ede05 100644 --- a/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp +++ b/projects/rocshmem/src/gda/bnxt/backend_gda_bnxt.cpp @@ -264,10 +264,10 @@ void GDABackend::bnxt_create_qps(int sq_length) { void* GDABackend::bnxt_dv_dlopen() { void* dv_handle{nullptr}; - dv_handle = dlopen("libbnxt_re.so", RTLD_NOW); + dv_handle = dlopen("libbnxt_re.so", RTLD_LAZY); if (!dv_handle) { // Try hard-coded PATH - dv_handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW); + dv_handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_LAZY); if (!dv_handle) { DPRINTF("Could not open libbnxt_re.so. Returning\n"); } diff --git a/projects/rocshmem/src/gda/ionic/backend_gda_ionic.cpp b/projects/rocshmem/src/gda/ionic/backend_gda_ionic.cpp index dfa56d62bf..98bd31dca8 100644 --- a/projects/rocshmem/src/gda/ionic/backend_gda_ionic.cpp +++ b/projects/rocshmem/src/gda/ionic/backend_gda_ionic.cpp @@ -132,10 +132,10 @@ void GDABackend::ionic_setup_parent_domain(struct ibv_parent_domain_init_attr* p void* GDABackend::ionic_dv_dlopen() { void* dv_handle{nullptr}; - dv_handle = dlopen("libionic.so", RTLD_NOW); + dv_handle = dlopen("libionic.so", RTLD_LAZY); if (!dv_handle) { // Try hard-coded PATH - dv_handle = dlopen("/usr/local/lib/libionic.so", RTLD_NOW); + dv_handle = dlopen("/usr/local/lib/libionic.so", RTLD_LAZY); if (!dv_handle) { DPRINTF("Could not open libionic.so. Returning\n"); } diff --git a/projects/rocshmem/src/gda/mlx5/backend_gda_mlx5.cpp b/projects/rocshmem/src/gda/mlx5/backend_gda_mlx5.cpp index 5052808162..696208859d 100644 --- a/projects/rocshmem/src/gda/mlx5/backend_gda_mlx5.cpp +++ b/projects/rocshmem/src/gda/mlx5/backend_gda_mlx5.cpp @@ -29,7 +29,7 @@ namespace rocshmem { void* GDABackend::mlx5_dv_dlopen() { void* dv_handle{nullptr}; - dv_handle = dlopen("libmlx5.so", RTLD_NOW); + dv_handle = dlopen("libmlx5.so", RTLD_LAZY); if (!dv_handle) { DPRINTF("Could not open libmlx5.so. Returning\n"); } diff --git a/projects/rocshmem/src/ipc/backend_ipc.cpp b/projects/rocshmem/src/ipc/backend_ipc.cpp index 2e073e9c2f..1e732c8d68 100644 --- a/projects/rocshmem/src/ipc/backend_ipc.cpp +++ b/projects/rocshmem/src/ipc/backend_ipc.cpp @@ -72,7 +72,11 @@ IPCBackend::IPCBackend(MPI_Comm comm): Backend(comm) { * Check if num_pes == ipcImpl.shm_size) * All the PEs must be with in a node for IPC conduit */ - assert(num_pes == ipcImpl.shm_size); + if(num_pes != ipcImpl.shm_size) { + fprintf(stderr, "rocSHMEM: IPC Backend selected but some PEs are non-local. This is not a supported configuration.\n" + " The GDA and RO backends mix off-node -and- IPC on-node communication as needed.\n"); + exit(1); + } /* Initialize the host interface */ host_interface = std::make_shared(hdp_proxy_.get(), diff --git a/projects/rocshmem/src/mpi_instance.cpp b/projects/rocshmem/src/mpi_instance.cpp index d1cbe21f2a..7255d62213 100644 --- a/projects/rocshmem/src/mpi_instance.cpp +++ b/projects/rocshmem/src/mpi_instance.cpp @@ -42,7 +42,7 @@ int MPIInstance::mpilib_dl_init() { if (mpilib_handle_ != nullptr) return ROCSHMEM_SUCCESS; - mpilib_handle_ = dlopen("libmpi.so", RTLD_NOW); + mpilib_handle_ = dlopen("libmpi.so", RTLD_LAZY); if (!mpilib_handle_) { printf("Could not open libmpi.so. Returning\n"); return ROCSHMEM_ERROR;