dlclosing the dvlib may leave libibverbs in a broken state (#381)
* Error out when the IPC backend is selected but cannot run.
* Use RTLD_LAZY when dlopening.
* Do not dlclose libbnxt/ionic/mlx5.so, as that breaks libibverbs.
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
e47cff7f45
Коммит
47f6fa6267
@@ -603,6 +603,7 @@ bool GDABackend::has_active_ib_interface(GDAProvider provider) {
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_devices && !has_active; i++) {
|
||||
DPRINTF("ibv.open device[%d] of %d\n", i, num_devices);
|
||||
struct ibv_context *context = ibv.open_device(device_list[i]);
|
||||
if (!context) {
|
||||
continue;
|
||||
@@ -659,7 +660,7 @@ int GDABackend::backend_can_run() {
|
||||
handle = bnxt_dv_dlopen();
|
||||
if (handle) {
|
||||
auto ret = has_active_ib_interface(GDAProvider::BNXT);
|
||||
dlclose(handle);
|
||||
// dlclose(handle); //TODO: unloading the lib crashes the next call to ibv_open_device
|
||||
if (ret) return ROCSHMEM_SUCCESS;
|
||||
DPRINTF("BNXT DV library found but no active InfiniBand interface available\n");
|
||||
}
|
||||
@@ -672,7 +673,7 @@ int GDABackend::backend_can_run() {
|
||||
handle = ionic_dv_dlopen();
|
||||
if (handle) {
|
||||
auto ret = has_active_ib_interface(GDAProvider::IONIC);
|
||||
dlclose(handle);
|
||||
// dlclose(handle); //TODO: unloading the lib crashes the next call to ibv_open_device
|
||||
if (ret) return ROCSHMEM_SUCCESS;
|
||||
DPRINTF("IONIC DV library found but no active InfiniBand interface available\n");
|
||||
}
|
||||
@@ -685,7 +686,7 @@ int GDABackend::backend_can_run() {
|
||||
handle = mlx5_dv_dlopen();
|
||||
if (handle) {
|
||||
auto ret = has_active_ib_interface(GDAProvider::MLX5);
|
||||
dlclose(handle);
|
||||
// dlclose(handle); //TODO: unloading the lib crashes the next call to ibv_open_device
|
||||
if (ret) return ROCSHMEM_SUCCESS;
|
||||
DPRINTF("MLX5 DV library found but no active InfiniBand interface available\n");
|
||||
}
|
||||
|
||||
@@ -264,10 +264,10 @@ void GDABackend::bnxt_create_qps(int sq_length) {
|
||||
|
||||
void* GDABackend::bnxt_dv_dlopen() {
|
||||
void* dv_handle{nullptr};
|
||||
dv_handle = dlopen("libbnxt_re.so", RTLD_NOW);
|
||||
dv_handle = dlopen("libbnxt_re.so", RTLD_LAZY);
|
||||
if (!dv_handle) {
|
||||
// Try hard-coded PATH
|
||||
dv_handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_NOW);
|
||||
dv_handle = dlopen("/usr/local/lib/libbnxt_re.so", RTLD_LAZY);
|
||||
if (!dv_handle) {
|
||||
DPRINTF("Could not open libbnxt_re.so. Returning\n");
|
||||
}
|
||||
|
||||
@@ -132,10 +132,10 @@ void GDABackend::ionic_setup_parent_domain(struct ibv_parent_domain_init_attr* p
|
||||
|
||||
void* GDABackend::ionic_dv_dlopen() {
|
||||
void* dv_handle{nullptr};
|
||||
dv_handle = dlopen("libionic.so", RTLD_NOW);
|
||||
dv_handle = dlopen("libionic.so", RTLD_LAZY);
|
||||
if (!dv_handle) {
|
||||
// Try hard-coded PATH
|
||||
dv_handle = dlopen("/usr/local/lib/libionic.so", RTLD_NOW);
|
||||
dv_handle = dlopen("/usr/local/lib/libionic.so", RTLD_LAZY);
|
||||
if (!dv_handle) {
|
||||
DPRINTF("Could not open libionic.so. Returning\n");
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@ namespace rocshmem {
|
||||
|
||||
void* GDABackend::mlx5_dv_dlopen() {
|
||||
void* dv_handle{nullptr};
|
||||
dv_handle = dlopen("libmlx5.so", RTLD_NOW);
|
||||
dv_handle = dlopen("libmlx5.so", RTLD_LAZY);
|
||||
if (!dv_handle) {
|
||||
DPRINTF("Could not open libmlx5.so. Returning\n");
|
||||
}
|
||||
|
||||
@@ -72,7 +72,11 @@ IPCBackend::IPCBackend(MPI_Comm comm): Backend(comm) {
|
||||
* Check if num_pes == ipcImpl.shm_size)
|
||||
* All the PEs must be with in a node for IPC conduit
|
||||
*/
|
||||
assert(num_pes == ipcImpl.shm_size);
|
||||
if(num_pes != ipcImpl.shm_size) {
|
||||
fprintf(stderr, "rocSHMEM: IPC Backend selected but some PEs are non-local. This is not a supported configuration.\n"
|
||||
" The GDA and RO backends mix off-node -and- IPC on-node communication as needed.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* Initialize the host interface */
|
||||
host_interface = std::make_shared<HostInterface>(hdp_proxy_.get(),
|
||||
|
||||
@@ -42,7 +42,7 @@ int MPIInstance::mpilib_dl_init() {
|
||||
if (mpilib_handle_ != nullptr)
|
||||
return ROCSHMEM_SUCCESS;
|
||||
|
||||
mpilib_handle_ = dlopen("libmpi.so", RTLD_NOW);
|
||||
mpilib_handle_ = dlopen("libmpi.so", RTLD_LAZY);
|
||||
if (!mpilib_handle_) {
|
||||
printf("Could not open libmpi.so. Returning\n");
|
||||
return ROCSHMEM_ERROR;
|
||||
|
||||
Ссылка в новой задаче
Block a user