Functional tests without MPI support (#343)
* Let functional tests build without external MPI * Fix error conditions when using uuid startup with internal MPI * Do not abort if libibverbs is not found but not using GDA * Enabled RO functional test initialized with TEST_UUID * Reduce load time for ro backend_can_run and prevent mpilib_dlclose crashing * Fix case TEST_UUID=1, ROCSHMEM_BACKEND='' (autoloading gda)
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
baaf8091b5
commit
c99bc21e10
@@ -165,7 +165,6 @@ if (NOT BUILD_TESTS_ONLY)
|
||||
set(HAVE_EXTERNAL_MPI ON)
|
||||
else()
|
||||
set(HAVE_EXTERNAL_MPI OFF)
|
||||
set(BUILD_FUNCTIONAL_TESTS OFF)
|
||||
set(BUILD_UNIT_TESTS OFF)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include <cassert>
|
||||
|
||||
#include "backend_gda.hpp"
|
||||
#include "ibv_wrapper.hpp"
|
||||
#include "envvar.hpp"
|
||||
#include "gda_team.hpp"
|
||||
#include "mpi_instance.hpp"
|
||||
@@ -649,15 +650,17 @@ int GDABackend::backend_can_run() {
|
||||
void *handle{nullptr};
|
||||
GDAProvider requested = requested_provider();
|
||||
|
||||
/* Basic verbs? */
|
||||
if (!ibv.is_initialized) return ROCSHMEM_ERROR;
|
||||
|
||||
/* Try opening bnxt DV libraries */
|
||||
#if defined(GDA_BNXT)
|
||||
if (requested == GDAProvider::UNSET || requested == GDAProvider::BNXT) {
|
||||
handle = bnxt_dv_dlopen();
|
||||
if (handle) {
|
||||
auto ret = has_active_ib_interface(GDAProvider::BNXT);
|
||||
dlclose(handle);
|
||||
if (has_active_ib_interface(GDAProvider::BNXT)) {
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
if (ret) return ROCSHMEM_SUCCESS;
|
||||
DPRINTF("BNXT DV library found but no active InfiniBand interface available\n");
|
||||
}
|
||||
}
|
||||
@@ -668,10 +671,9 @@ int GDABackend::backend_can_run() {
|
||||
if (requested == GDAProvider::UNSET || requested == GDAProvider::IONIC) {
|
||||
handle = ionic_dv_dlopen();
|
||||
if (handle) {
|
||||
auto ret = has_active_ib_interface(GDAProvider::IONIC);
|
||||
dlclose(handle);
|
||||
if (has_active_ib_interface(GDAProvider::IONIC)) {
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
if (ret) return ROCSHMEM_SUCCESS;
|
||||
DPRINTF("IONIC DV library found but no active InfiniBand interface available\n");
|
||||
}
|
||||
}
|
||||
@@ -682,10 +684,9 @@ int GDABackend::backend_can_run() {
|
||||
if (requested == GDAProvider::UNSET || requested == GDAProvider::MLX5) {
|
||||
handle = mlx5_dv_dlopen();
|
||||
if (handle) {
|
||||
auto ret = has_active_ib_interface(GDAProvider::MLX5);
|
||||
dlclose(handle);
|
||||
if (has_active_ib_interface(GDAProvider::MLX5)) {
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
if (ret) return ROCSHMEM_SUCCESS;
|
||||
DPRINTF("MLX5 DV library found but no active InfiniBand interface available\n");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#define LIBRARY_SRC_GDA_BACKEND_HPP_
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <infiniband/verbs.h>
|
||||
|
||||
#include "backend_bc.hpp"
|
||||
#include "containers/free_list_impl.hpp"
|
||||
@@ -36,7 +37,6 @@
|
||||
#include "queue_pair.hpp"
|
||||
#include "bootstrap/bootstrap.hpp"
|
||||
#include "debug_gda.hpp"
|
||||
#include "ibv_wrapper.hpp"
|
||||
#include "gda/ionic/provider_gda_ionic.hpp"
|
||||
#include "gda/bnxt/provider_gda_bnxt.hpp"
|
||||
#include "gda/mlx5/provider_gda_mlx5.hpp"
|
||||
|
||||
@@ -42,19 +42,21 @@ IBVWrapper::IBVWrapper() {
|
||||
ibv_handle = dlopen("/usr/lib/x86_64-linux-gnu/libibverbs.so", RTLD_NOW);
|
||||
|
||||
if (!ibv_handle) {
|
||||
DPRINTF("Could not open libibverbs. Returning\n");
|
||||
exit(1);
|
||||
DPRINTF("Could not open libibverbs. Disabled.\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
err = init_function_table();
|
||||
if (err != ROCSHMEM_SUCCESS) {
|
||||
DPRINTF("Could not construct InfiniBand Verbs function table \n");
|
||||
exit(1);
|
||||
DPRINTF("Could not construct InfiniBand Verbs function table. Disabled.\n");
|
||||
return;
|
||||
}
|
||||
is_initialized = true;
|
||||
}
|
||||
|
||||
IBVWrapper::~IBVWrapper() {
|
||||
is_initialized = false;
|
||||
if (ibv_handle != nullptr) {
|
||||
dlclose(ibv_handle);
|
||||
}
|
||||
|
||||
@@ -38,6 +38,8 @@ class IBVWrapper {
|
||||
explicit IBVWrapper();
|
||||
virtual ~IBVWrapper();
|
||||
|
||||
bool is_initialized{false};
|
||||
|
||||
struct ibv_device** get_device_list(int *num_devices);
|
||||
void free_device_list(struct ibv_device **list);
|
||||
|
||||
|
||||
@@ -138,6 +138,9 @@ void MPIInstance::mpilib_dl_close() {
|
||||
|
||||
MPIInstance::MPIInstance(MPI_Comm comm) {
|
||||
int is_init{0};
|
||||
|
||||
assert (nullptr != mpilib_handle_);
|
||||
|
||||
mpilib_ftable_.Initialized(&is_init);
|
||||
|
||||
if (!is_init) {
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <thread> // NOLINT
|
||||
#include <dlfcn.h>
|
||||
|
||||
#include "rocshmem/rocshmem.hpp"
|
||||
#include "atomic_return.hpp"
|
||||
@@ -131,6 +132,19 @@ ROBackend::ROBackend(MPI_Comm comm)
|
||||
*done_init = 1;
|
||||
}
|
||||
|
||||
/* Currently we only check whether we can dlopen an MPI library.
|
||||
*/
|
||||
int ROBackend::backend_can_run() {
|
||||
auto handle = dlopen("libmpi.so", RTLD_LAZY);
|
||||
if (!handle) {
|
||||
printf("Could not open libmpi.so. Returning\n");
|
||||
return ROCSHMEM_ERROR;
|
||||
}
|
||||
//TODO dlsym MPI_Get_library_version and verify compat when HAVE_EXTERNAL_MPI is undef
|
||||
dlclose(handle);
|
||||
return ROCSHMEM_SUCCESS;
|
||||
}
|
||||
|
||||
void ROBackend::setup_ctxs() {
|
||||
CHECK_HIP(hipMalloc(&ctx_array, sizeof(ROContext) * envvar::max_num_contexts));
|
||||
for (size_t i = 0; i < envvar::max_num_contexts; i++) {
|
||||
|
||||
@@ -72,6 +72,14 @@ class ROBackend : public Backend {
|
||||
*/
|
||||
virtual ~ROBackend();
|
||||
|
||||
/**
|
||||
* @brief Verify whether RO Backend could run
|
||||
*
|
||||
* @return ROSCHMEM_SUCCESS if RO backend can most likely be used
|
||||
* ROCSHMEM_ERROR otherwise
|
||||
*/
|
||||
static int backend_can_run(void);
|
||||
|
||||
/**
|
||||
* @brief Abort the application.
|
||||
*
|
||||
|
||||
+34
-35
@@ -107,7 +107,7 @@ static BackendType select_backend_type() {
|
||||
DPRINTF("GDABackend::backend_can_run returned success\n");
|
||||
return BackendType::GDA_BACKEND;
|
||||
}
|
||||
if (MPIInstance::mpilib_dl_init() == ROCSHMEM_SUCCESS) {
|
||||
if (ROBackend::backend_can_run() == ROCSHMEM_SUCCESS) {
|
||||
DPRINTF("MPIInstance could dl_init MPI library\n");
|
||||
return BackendType::RO_BACKEND;
|
||||
}
|
||||
@@ -130,6 +130,11 @@ static BackendType select_backend_type() {
|
||||
|
||||
int ret;
|
||||
ret = MPIInstance::mpilib_dl_init();
|
||||
if (ret != ROCSHMEM_SUCCESS) {
|
||||
fprintf(stderr, "Could not initialize MPI library. This initialization method of "
|
||||
"rocSHMEM requires MPI library to be loaded at runtime. Aborting.\n");
|
||||
exit(1);
|
||||
}
|
||||
mpi_instance = new MPIInstance(comm);
|
||||
|
||||
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
|
||||
@@ -155,10 +160,6 @@ static BackendType select_backend_type() {
|
||||
CHECK_HIP(hipHostMalloc(&backend, sizeof(GDABackend)));
|
||||
backend = new (backend) GDABackend(comm);
|
||||
#elif defined(USE_RO)
|
||||
if (ret != ROCSHMEM_SUCCESS) {
|
||||
printf("Could not initialize MPI library. RO conduit requires MPI library to be loaded at runtime. Aborting\n");
|
||||
abort();
|
||||
}
|
||||
CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend)));
|
||||
backend = new (backend) ROBackend(comm);
|
||||
#elif defined(USE_IPC)
|
||||
@@ -167,7 +168,8 @@ static BackendType select_backend_type() {
|
||||
#endif
|
||||
|
||||
if (!backend) {
|
||||
abort();
|
||||
printf("No Backend could be initialized! Aborting.\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -177,14 +179,16 @@ static BackendType select_backend_type() {
|
||||
|
||||
int ret;
|
||||
ret = MPIInstance::mpilib_dl_init();
|
||||
if (ret == ROCSHMEM_SUCCESS) {
|
||||
printf("Could not initialize MPI library. This initialization method of "
|
||||
"rocSHMEM requires MPI library to be loaded at runtime. Aborting\n");
|
||||
abort();
|
||||
if (ret != ROCSHMEM_SUCCESS) {
|
||||
fprintf(stderr, "Could not initialize MPI library. This initialization method of "
|
||||
"rocSHMEM requires MPI library to be loaded at runtime. Aborting.\n");
|
||||
exit(1);
|
||||
}
|
||||
mpilib_ftable_.Initialized(&initialized);
|
||||
|
||||
if (!initialized) {
|
||||
if (initialized) {
|
||||
mpilib_ftable_.Comm_size (MPI_COMM_WORLD, &world_size);
|
||||
} else {
|
||||
// This is an Open MPI specific solution to retrieve the number of
|
||||
// processes that have been started, value can be checked before MPI_Init
|
||||
char *value = getenv("OMPI_COMM_WORLD_SIZE");
|
||||
@@ -194,13 +198,11 @@ static BackendType select_backend_type() {
|
||||
if (world_size != nranks) {
|
||||
// This solution will require MPI_Sessions. This is planned for the
|
||||
// future, but is not supported in the current version.
|
||||
fprintf (stderr, "Unsupported configuration to initialize rocSHMEM. Please "
|
||||
"initialize the MPI library using MPI_Init first, if you want to "
|
||||
"initialize rocSHMEM with a subset of the processes\n");
|
||||
abort();
|
||||
fprintf(stderr, "Unsupported configuration to initialize rocSHMEM. Please "
|
||||
"initialize the MPI library using MPI_Init first, if you want to "
|
||||
"initialize rocSHMEM with a subset of the processes\n");
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
mpilib_ftable_.Comm_size (MPI_COMM_WORLD, &world_size);
|
||||
}
|
||||
|
||||
if (world_size == nranks) {
|
||||
@@ -252,11 +254,8 @@ static BackendType select_backend_type() {
|
||||
backend = new (backend) GDABackend(bootstrap);
|
||||
break;
|
||||
case BackendType::RO_BACKEND:
|
||||
/* Not sure whether this is a valid configuration. Will leave it in for now */
|
||||
DPRINTF("Initializing RO backend with TCP bootstrapping\n");
|
||||
mpi_instance = new MPIInstance(MPI_COMM_WORLD);
|
||||
CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend)));
|
||||
backend = new (backend) ROBackend(MPI_COMM_WORLD);
|
||||
library_init_subcomm(bootstr, bootstr->getNranks(), bootstr->getRank());
|
||||
break;
|
||||
case BackendType::IPC_BACKEND:
|
||||
DPRINTF("Initializing IPC backend with TCP bootstrapping\n");
|
||||
@@ -268,23 +267,15 @@ static BackendType select_backend_type() {
|
||||
CHECK_HIP(hipHostMalloc(&backend, sizeof(GDABackend)));
|
||||
backend = new (backend) GDABackend(bootstrap);
|
||||
#elif defined(USE_RO)
|
||||
/* Not sure whether this is a valid configuration. Will leave it in for now */
|
||||
int ret;
|
||||
ret = MPIInstance::mpilib_dl_init();
|
||||
if (ret != MPI_SUCCESS) {
|
||||
printf("RO Backend requires MPI library to be initialized, even when using uniqueId initializations!\n");
|
||||
abort();
|
||||
}
|
||||
mpi_instance = new MPIInstance(MPI_COMM_WORLD);
|
||||
CHECK_HIP(hipHostMalloc(&backend, sizeof(ROBackend)));
|
||||
backend = new (backend) ROBackend(MPI_COMM_WORLD);
|
||||
library_init_subcomm(bootstr, bootstr->getNranks(), bootstr->getRank());
|
||||
#elif defined(USE_IPC)
|
||||
CHECK_HIP(hipHostMalloc(&backend, sizeof(IPCBackend)));
|
||||
backend = new (backend) IPCBackend(bootstrap);
|
||||
#endif
|
||||
|
||||
if (!backend) {
|
||||
abort();
|
||||
printf("No Backend could be initialized! Aborting.\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -318,7 +309,7 @@ static BackendType select_backend_type() {
|
||||
if (envvar::uniqueid_with_mpi) {
|
||||
library_init_subcomm(bootstr, attr->nranks, attr->rank);
|
||||
} else {
|
||||
library_init (bootstr);
|
||||
library_init(bootstr);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -367,7 +358,12 @@ static BackendType select_backend_type() {
|
||||
#endif
|
||||
|
||||
[[maybe_unused]] __host__ void rocshmem_init() {
|
||||
MPIInstance::mpilib_dl_init();
|
||||
auto ret = MPIInstance::mpilib_dl_init();
|
||||
if (ret != ROCSHMEM_SUCCESS) {
|
||||
fprintf(stderr, "Could not initialize MPI library. This initialization method of "
|
||||
"rocSHMEM requires MPI library to be loaded at runtime. Aborting.\n");
|
||||
exit(1);
|
||||
}
|
||||
library_init(MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
@@ -458,11 +454,14 @@ __host__ void * rocshmem_ptr(const void * dest, int pe){
|
||||
backend->~Backend();
|
||||
CHECK_HIP(hipHostFree(backend));
|
||||
|
||||
if (bootstr == nullptr)
|
||||
if (mpi_instance != nullptr)
|
||||
delete mpi_instance;
|
||||
|
||||
if (bootstr != nullptr)
|
||||
delete bootstr;
|
||||
|
||||
//TODO This crashes
|
||||
//MPIInstance::mpilib_dl_close();
|
||||
}
|
||||
|
||||
__host__ void rocshmem_query_thread(int *provided) {
|
||||
|
||||
@@ -172,6 +172,9 @@ int main(int argc, char *argv[]) {
|
||||
char key[] = "rocshmem-uuid";
|
||||
pmix_bcast(&uid, sizeof(rocshmem_uniqueid_t), key, 0);
|
||||
|
||||
// Close PMIx before potentially doing MPI_Init inside rocshmem_init
|
||||
PMIx_Finalize(NULL, 0);
|
||||
|
||||
ret = rocshmem_set_attr_uniqueid_args(rank, nranks, &uid, &attr);
|
||||
if (ret != ROCSHMEM_SUCCESS) {
|
||||
std::cout << rank << ": Error in rocshmem_set_attr_uniqueid_args. Aborting.\n";
|
||||
@@ -224,11 +227,5 @@ int main(int argc, char *argv[]) {
|
||||
*/
|
||||
rocshmem_finalize();
|
||||
|
||||
#ifdef HAVE_PMIX
|
||||
if (test_uuid) {
|
||||
PMIx_Finalize(NULL, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user