diff --git a/src/gda/backend_gda.cpp b/src/gda/backend_gda.cpp index 4f47d0af32..daa7c2f28b 100644 --- a/src/gda/backend_gda.cpp +++ b/src/gda/backend_gda.cpp @@ -603,6 +603,7 @@ bool GDABackend::has_active_ib_interface(GDAProvider provider) { } for (int i = 0; i < num_devices && !has_active; i++) { + DPRINTF("ibv.open device[%d] of %d\n", i, num_devices); struct ibv_context *context = ibv.open_device(device_list[i]); if (!context) { continue; @@ -659,7 +660,7 @@ int GDABackend::backend_can_run() { handle = bnxt_dv_dlopen(); if (handle) { auto ret = has_active_ib_interface(GDAProvider::BNXT); - dlclose(handle); +// dlclose(handle); //TODO: unloading the lib crashes the next call to ibv_open_device if (ret) return ROCSHMEM_SUCCESS; DPRINTF("BNXT DV library found but no active InfiniBand interface available\n"); } @@ -672,7 +673,7 @@ int GDABackend::backend_can_run() { handle = ionic_dv_dlopen(); if (handle) { auto ret = has_active_ib_interface(GDAProvider::IONIC); - dlclose(handle); +// dlclose(handle); //TODO: unloading the lib crashes the next call to ibv_open_device if (ret) return ROCSHMEM_SUCCESS; DPRINTF("IONIC DV library found but no active InfiniBand interface available\n"); } @@ -685,7 +686,7 @@ int GDABackend::backend_can_run() { handle = mlx5_dv_dlopen(); if (handle) { auto ret = has_active_ib_interface(GDAProvider::MLX5); - dlclose(handle); +// dlclose(handle); //TODO: unloading the lib crashes the next call to ibv_open_device if (ret) return ROCSHMEM_SUCCESS; DPRINTF("MLX5 DV library found but no active InfiniBand interface available\n"); } diff --git a/src/gda/bnxt/backend_gda_bnxt.cpp b/src/gda/bnxt/backend_gda_bnxt.cpp index 80c67c2143..04f79ede05 100644 --- a/src/gda/bnxt/backend_gda_bnxt.cpp +++ b/src/gda/bnxt/backend_gda_bnxt.cpp @@ -264,10 +264,10 @@ void GDABackend::bnxt_create_qps(int sq_length) { void* GDABackend::bnxt_dv_dlopen() { void* dv_handle{nullptr}; - dv_handle = dlopen("libbnxt_re.so", RTLD_NOW); + dv_handle = dlopen("libbnxt_re.so", RTLD_LAZY); if (!dv_handle) { // Try hard-coded PATH - dv_handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW); + dv_handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_LAZY); if (!dv_handle) { DPRINTF("Could not open libbnxt_re.so. Returning\n"); } diff --git a/src/gda/ionic/backend_gda_ionic.cpp b/src/gda/ionic/backend_gda_ionic.cpp index dfa56d62bf..98bd31dca8 100644 --- a/src/gda/ionic/backend_gda_ionic.cpp +++ b/src/gda/ionic/backend_gda_ionic.cpp @@ -132,10 +132,10 @@ void GDABackend::ionic_setup_parent_domain(struct ibv_parent_domain_init_attr* p void* GDABackend::ionic_dv_dlopen() { void* dv_handle{nullptr}; - dv_handle = dlopen("libionic.so", RTLD_NOW); + dv_handle = dlopen("libionic.so", RTLD_LAZY); if (!dv_handle) { // Try hard-coded PATH - dv_handle = dlopen("/usr/local/lib/libionic.so", RTLD_NOW); + dv_handle = dlopen("/usr/local/lib/libionic.so", RTLD_LAZY); if (!dv_handle) { DPRINTF("Could not open libionic.so. Returning\n"); } diff --git a/src/gda/mlx5/backend_gda_mlx5.cpp b/src/gda/mlx5/backend_gda_mlx5.cpp index 5052808162..696208859d 100644 --- a/src/gda/mlx5/backend_gda_mlx5.cpp +++ b/src/gda/mlx5/backend_gda_mlx5.cpp @@ -29,7 +29,7 @@ namespace rocshmem { void* GDABackend::mlx5_dv_dlopen() { void* dv_handle{nullptr}; - dv_handle = dlopen("libmlx5.so", RTLD_NOW); + dv_handle = dlopen("libmlx5.so", RTLD_LAZY); if (!dv_handle) { DPRINTF("Could not open libmlx5.so. Returning\n"); } diff --git a/src/ipc/backend_ipc.cpp b/src/ipc/backend_ipc.cpp index 2e073e9c2f..1e732c8d68 100644 --- a/src/ipc/backend_ipc.cpp +++ b/src/ipc/backend_ipc.cpp @@ -72,7 +72,11 @@ IPCBackend::IPCBackend(MPI_Comm comm): Backend(comm) { * Check if num_pes == ipcImpl.shm_size) * All the PEs must be with in a node for IPC conduit */ - assert(num_pes == ipcImpl.shm_size); + if(num_pes != ipcImpl.shm_size) { + fprintf(stderr, "rocSHMEM: IPC Backend selected but some PEs are non-local. This is not a supported configuration.\n" + " The GDA and RO backends mix off-node -and- IPC on-node communication as needed.\n"); + exit(1); + } /* Initialize the host interface */ host_interface = std::make_shared(hdp_proxy_.get(), diff --git a/src/mpi_instance.cpp b/src/mpi_instance.cpp index d1cbe21f2a..7255d62213 100644 --- a/src/mpi_instance.cpp +++ b/src/mpi_instance.cpp @@ -42,7 +42,7 @@ int MPIInstance::mpilib_dl_init() { if (mpilib_handle_ != nullptr) return ROCSHMEM_SUCCESS; - mpilib_handle_ = dlopen("libmpi.so", RTLD_NOW); + mpilib_handle_ = dlopen("libmpi.so", RTLD_LAZY); if (!mpilib_handle_) { printf("Could not open libmpi.so. Returning\n"); return ROCSHMEM_ERROR;