Functional tests without MPI support (#343)

* Let functional tests build without external MPI

* Fix error conditions when using uuid startup with internal MPI

* Do not abort when libibverbs is not found if GDA is not being used

* Enabled RO functional test initialized with TEST_UUID

* Reduce load time for RO backend_can_run and prevent mpilib_dl_close
from crashing

* Fix case TEST_UUID=1, ROCSHMEM_BACKEND='' (autoloading gda)
This commit is contained in:
Aurelien Bouteiller
2025-12-08 11:46:16 -05:00
zatwierdzone przez GitHub
rodzic baaf8091b5
commit c99bc21e10
10 zmienionych plików z 81 dodań i 56 usunięć
-1
Wyświetl plik
@@ -165,7 +165,6 @@ if (NOT BUILD_TESTS_ONLY)
set(HAVE_EXTERNAL_MPI ON)
else()
set(HAVE_EXTERNAL_MPI OFF)
set(BUILD_FUNCTIONAL_TESTS OFF)
set(BUILD_UNIT_TESTS OFF)
endif()
+10 -9
Wyświetl plik
@@ -29,6 +29,7 @@
#include <cassert>
#include "backend_gda.hpp"
#include "ibv_wrapper.hpp"
#include "envvar.hpp"
#include "gda_team.hpp"
#include "mpi_instance.hpp"
@@ -649,15 +650,17 @@ int GDABackend::backend_can_run() {
void *handle{nullptr};
GDAProvider requested = requested_provider();
/* Basic verbs? */
if (!ibv.is_initialized) return ROCSHMEM_ERROR;
/* Try opening bnxt DV libraries */
#if defined(GDA_BNXT)
if (requested == GDAProvider::UNSET || requested == GDAProvider::BNXT) {
handle = bnxt_dv_dlopen();
if (handle) {
auto ret = has_active_ib_interface(GDAProvider::BNXT);
dlclose(handle);
if (has_active_ib_interface(GDAProvider::BNXT)) {
return ROCSHMEM_SUCCESS;
}
if (ret) return ROCSHMEM_SUCCESS;
DPRINTF("BNXT DV library found but no active InfiniBand interface available\n");
}
}
@@ -668,10 +671,9 @@ int GDABackend::backend_can_run() {
if (requested == GDAProvider::UNSET || requested == GDAProvider::IONIC) {
handle = ionic_dv_dlopen();
if (handle) {
auto ret = has_active_ib_interface(GDAProvider::IONIC);
dlclose(handle);
if (has_active_ib_interface(GDAProvider::IONIC)) {
return ROCSHMEM_SUCCESS;
}
if (ret) return ROCSHMEM_SUCCESS;
DPRINTF("IONIC DV library found but no active InfiniBand interface available\n");
}
}
@@ -682,10 +684,9 @@ int GDABackend::backend_can_run() {
if (requested == GDAProvider::UNSET || requested == GDAProvider::MLX5) {
handle = mlx5_dv_dlopen();
if (handle) {
auto ret = has_active_ib_interface(GDAProvider::MLX5);
dlclose(handle);
if (has_active_ib_interface(GDAProvider::MLX5)) {
return ROCSHMEM_SUCCESS;
}
if (ret) return ROCSHMEM_SUCCESS;
DPRINTF("MLX5 DV library found but no active InfiniBand interface available\n");
}
}
+1 -1
Wyświetl plik
@@ -26,6 +26,7 @@
#define LIBRARY_SRC_GDA_BACKEND_HPP_
#include <dlfcn.h>
#include <infiniband/verbs.h>
#include "backend_bc.hpp"
#include "containers/free_list_impl.hpp"
@@ -36,7 +37,6 @@
#include "queue_pair.hpp"
#include "bootstrap/bootstrap.hpp"
#include "debug_gda.hpp"
#include "ibv_wrapper.hpp"
#include "gda/ionic/provider_gda_ionic.hpp"
#include "gda/bnxt/provider_gda_bnxt.hpp"
#include "gda/mlx5/provider_gda_mlx5.hpp"
+6 -4
Wyświetl plik
@@ -42,19 +42,21 @@ IBVWrapper::IBVWrapper() {
ibv_handle = dlopen("/usr/lib/x86_64-linux-gnu/libibverbs.so", RTLD_NOW);
if (!ibv_handle) {
DPRINTF("Could not open libibverbs. Returning\n");
exit(1);
DPRINTF("Could not open libibverbs. Disabled.\n");
return;
}
}
err = init_function_table();
if (err != ROCSHMEM_SUCCESS) {
DPRINTF("Could not construct InfiniBand Verbs function table \n");
exit(1);
DPRINTF("Could not construct InfiniBand Verbs function table. Disabled.\n");
return;
}
is_initialized = true;
}
IBVWrapper::~IBVWrapper() {
is_initialized = false;
if (ibv_handle != nullptr) {
dlclose(ibv_handle);
}
+2
Wyświetl plik
@@ -38,6 +38,8 @@ class IBVWrapper {
explicit IBVWrapper();
virtual ~IBVWrapper();
bool is_initialized{false};
struct ibv_device** get_device_list(int *num_devices);
void free_device_list(struct ibv_device **list);
+3
Wyświetl plik
@@ -138,6 +138,9 @@ void MPIInstance::mpilib_dl_close() {
MPIInstance::MPIInstance(MPI_Comm comm) {
int is_init{0};
assert (nullptr != mpilib_handle_);
mpilib_ftable_.Initialized(&is_init);
if (!is_init) {
+14
Wyświetl plik
@@ -33,6 +33,7 @@
#include <cstdlib>
#include <memory>
#include <thread> // NOLINT
#include <dlfcn.h>
#include "rocshmem/rocshmem.hpp"
#include "atomic_return.hpp"
@@ -131,6 +132,19 @@ ROBackend::ROBackend(MPI_Comm comm)
*done_init = 1;
}
/**
 * Probe whether the RO backend is usable in this process.
 *
 * For now the sole criterion is that a shared object named "libmpi.so"
 * can be opened with dlopen(). The library is opened lazily and the handle
 * is released again before returning, so this call does not keep the
 * library loaded on its own.
 *
 * Returns ROCSHMEM_SUCCESS when the library could be opened and
 * ROCSHMEM_ERROR (after printing a message) when it could not.
 */
int ROBackend::backend_can_run() {
  void *mpi_lib = dlopen("libmpi.so", RTLD_LAZY);

  if (nullptr != mpi_lib) {
    //TODO dlsym MPI_Get_library_version and verify compat when HAVE_EXTERNAL_MPI is undef
    dlclose(mpi_lib);
    return ROCSHMEM_SUCCESS;
  }

  printf("Could not open libmpi.so. Returning\n");
  return ROCSHMEM_ERROR;
}
void ROBackend::setup_ctxs() {
CHECK_HIP(hipMalloc(&ctx_array, sizeof(ROContext) * envvar::max_num_contexts));
for (size_t i = 0; i < envvar::max_num_contexts; i++) {
+8
Wyświetl plik
@@ -72,6 +72,14 @@ class ROBackend : public Backend {
*/
virtual ~ROBackend();
/**
 * @brief Verify whether the RO backend could run.
 *
 * Static probe: it takes no arguments and can be called without an
 * ROBackend instance.
 *
 * @return ROCSHMEM_SUCCESS if the RO backend can most likely be used,
 *         ROCSHMEM_ERROR otherwise.
 */
static int backend_can_run(void);
/**
* @brief Abort the application.
*
+34 -35
Wyświetl plik
@@ -107,7 +107,7 @@ static BackendType select_backend_type() {
DPRINTF("GDABackend::backend_can_run returned success\n");
return BackendType::GDA_BACKEND;
}
if (MPIInstance::mpilib_dl_init() == ROCSHMEM_SUCCESS) {
if (ROBackend::backend_can_run() == ROCSHMEM_SUCCESS) {
DPRINTF("MPIInstance could dl_init MPI library\n");
return BackendType::RO_BACKEND;
}
@@ -130,6 +130,11 @@ static BackendType select_backend_type() {
int ret;
ret = MPIInstance::mpilib_dl_init();
if (ret != ROCSHMEM_SUCCESS) {
fprintf(stderr, "Could not initialize MPI library. This initialization method of "
"rocSHMEM requires MPI library to be loaded at runtime. Aborting.\n");
exit(1);
}
mpi_instance = new MPIInstance(comm);
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
@@ -155,10 +160,6 @@ static BackendType select_backend_type() {
CHECK_HIP(hipHostMalloc(&backend, sizeof(GDABackend)));
backend = new (backend) GDABackend(comm);
#elif defined(USE_RO)
if (ret != ROCSHMEM_SUCCESS) {
printf("Could not initialize MPI library. RO conduit requires MPI library to be loaded at runtime. Aborting\n");
abort();
}
CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend)));
backend = new (backend) ROBackend(comm);
#elif defined(USE_IPC)
@@ -167,7 +168,8 @@ static BackendType select_backend_type() {
#endif
if (!backend) {
abort();
printf("No Backend could be initialized! Aborting.\n");
exit(1);
}
}
@@ -177,14 +179,16 @@ static BackendType select_backend_type() {
int ret;
ret = MPIInstance::mpilib_dl_init();
if (ret == ROCSHMEM_SUCCESS) {
printf("Could not initialize MPI library. This initialization method of "
"rocSHMEM requires MPI library to be loaded at runtime. Aborting\n");
abort();
if (ret != ROCSHMEM_SUCCESS) {
fprintf(stderr, "Could not initialize MPI library. This initialization method of "
"rocSHMEM requires MPI library to be loaded at runtime. Aborting.\n");
exit(1);
}
mpilib_ftable_.Initialized(&initialized);
if (!initialized) {
if (initialized) {
mpilib_ftable_.Comm_size (MPI_COMM_WORLD, &world_size);
} else {
// This is an Open MPI specific solution to retrieve the number of
// processes that have been started, value can be checked before MPI_Init
char *value = getenv("OMPI_COMM_WORLD_SIZE");
@@ -194,13 +198,11 @@ static BackendType select_backend_type() {
if (world_size != nranks) {
// This solution will require MPI_Sessions. This is planned for the
// future, but is not supported in the current version.
fprintf (stderr, "Unsupported configuration to initialize rocSHMEM. Please "
"initialize the MPI library using MPI_Init first, if you want to "
"initialize rocSHMEM with a subset of the processes\n");
abort();
fprintf(stderr, "Unsupported configuration to initialize rocSHMEM. Please "
"initialize the MPI library using MPI_Init first, if you want to "
"initialize rocSHMEM with a subset of the processes\n");
exit(1);
}
} else {
mpilib_ftable_.Comm_size (MPI_COMM_WORLD, &world_size);
}
if (world_size == nranks) {
@@ -252,11 +254,8 @@ static BackendType select_backend_type() {
backend = new (backend) GDABackend(bootstrap);
break;
case BackendType::RO_BACKEND:
/* Not sure whether this is a valid configuration. Will leave it in for now */
DPRINTF("Initializing RO backend with TCP bootstrapping\n");
mpi_instance = new MPIInstance(MPI_COMM_WORLD);
CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend)));
backend = new (backend) ROBackend(MPI_COMM_WORLD);
library_init_subcomm(bootstr, bootstr->getNranks(), bootstr->getRank());
break;
case BackendType::IPC_BACKEND:
DPRINTF("Initializing IPC backend with TCP bootstrapping\n");
@@ -268,23 +267,15 @@ static BackendType select_backend_type() {
CHECK_HIP(hipHostMalloc(&backend, sizeof(GDABackend)));
backend = new (backend) GDABackend(bootstrap);
#elif defined(USE_RO)
/* Not sure whether this is a valid configuration. Will leave it in for now */
int ret;
ret = MPIInstance::mpilib_dl_init();
if (ret != MPI_SUCCESS) {
printf("RO Backend requires MPI library to be initialized, even when using uniqueId initializations!\n");
abort();
}
mpi_instance = new MPIInstance(MPI_COMM_WORLD);
CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend)));
backend = new (backend) ROBackend(MPI_COMM_WORLD);
library_init_subcomm(bootstr, bootstr->getNranks(), bootstr->getRank());
#elif defined(USE_IPC)
CHECK_HIP(hipHostMalloc(&backend, sizeof(IPCBackend)));
backend = new (backend) IPCBackend(bootstrap);
#endif
if (!backend) {
abort();
printf("No Backend could be initialized! Aborting.\n");
exit(1);
}
}
@@ -318,7 +309,7 @@ static BackendType select_backend_type() {
if (envvar::uniqueid_with_mpi) {
library_init_subcomm(bootstr, attr->nranks, attr->rank);
} else {
library_init (bootstr);
library_init(bootstr);
}
}
@@ -367,7 +358,12 @@ static BackendType select_backend_type() {
#endif
/**
 * @brief Host-side initialization entry point that uses MPI_COMM_WORLD.
 *
 * Loads the MPI library at runtime through MPIInstance::mpilib_dl_init().
 * If that call does not return ROCSHMEM_SUCCESS, an explanatory message is
 * written to stderr and the process terminates with exit status 1.
 * Otherwise initialization is forwarded to library_init(MPI_COMM_WORLD).
 */
[[maybe_unused]] __host__ void rocshmem_init() {
// NOTE(review): this bare call discards its status and is repeated, with
// checking, on the next line. It looks like the superseded pre-change line
// of this diff hunk -- confirm and drop it.
MPIInstance::mpilib_dl_init();
// Checked load: without a usable MPI library this init path cannot proceed.
auto ret = MPIInstance::mpilib_dl_init();
if (ret != ROCSHMEM_SUCCESS) {
fprintf(stderr, "Could not initialize MPI library. This initialization method of "
"rocSHMEM requires MPI library to be loaded at runtime. Aborting.\n");
exit(1);
}
// Hand over to the communicator-based initialization using the world communicator.
library_init(MPI_COMM_WORLD);
}
@@ -458,11 +454,14 @@ __host__ void * rocshmem_ptr(const void * dest, int pe){
backend->~Backend();
CHECK_HIP(hipHostFree(backend));
if (bootstr == nullptr)
if (mpi_instance != nullptr)
delete mpi_instance;
if (bootstr != nullptr)
delete bootstr;
//TODO This crashes
//MPIInstance::mpilib_dl_close();
}
__host__ void rocshmem_query_thread(int *provided) {
+3 -6
Wyświetl plik
@@ -172,6 +172,9 @@ int main(int argc, char *argv[]) {
char key[] = "rocshmem-uuid";
pmix_bcast(&uid, sizeof(rocshmem_uniqueid_t), key, 0);
// Close PMIx before potentially doing MPI_Init inside rocshmem_init
PMIx_Finalize(NULL, 0);
ret = rocshmem_set_attr_uniqueid_args(rank, nranks, &uid, &attr);
if (ret != ROCSHMEM_SUCCESS) {
std::cout << rank << ": Error in rocshmem_set_attr_uniqueid_args. Aborting.\n";
@@ -224,11 +227,5 @@ int main(int argc, char *argv[]) {
*/
rocshmem_finalize();
#ifdef HAVE_PMIX
if (test_uuid) {
PMIx_Finalize(NULL, 0);
}
#endif
return 0;
}