From 2b2f23f42d7d0cb18594de1314f613da809a56b2 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Tue, 14 Feb 2023 22:31:54 +0000 Subject: [PATCH] auto-detect and enable MPI --- CMakeLists.txt | 50 ++++++++++++++++++++++++++++++++++++++++++-------- README.md | 16 ++++++++++++++++ 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 539a1eae2b..f440060946 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,33 @@ # ######################################################################## # Copyright 2022 Advanced Micro Devices, Inc. # ######################################################################## +macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so) + find_program(MPI_MPICXX ${mpi_compiler}) + if (MPI_MPICXX) + message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}") + if (${CMAKE_VERSION} VERSION_LESS "3.20.0") + get_filename_component(mpi.tmpdir ${MPI_MPICXX} DIRECTORY) + get_filename_component(mpi_base_dir ${mpi.tmpdir} DIRECTORY) + else() + cmake_path(GET MPI_MPICXX PARENT_PATH mpi.tmpdir) + cmake_path(GET mpi.tmpdir PARENT_PATH mpi_base_dir) + endif() + find_file(MPI_H mpi.h PATHS ${mpi_base_dir} PATH_SUFFIXES include include/x86_64-linux-gnu ${ARGN} REQUIRED) + if (${CMAKE_VERSION} VERSION_LESS "3.20.0") + get_filename_component(mpi_inc_dir ${MPI_H} DIRECTORY) + else() + cmake_path(GET MPI_H PARENT_PATH mpi_inc_dir) + endif() + message ("-- mpi.h is in ${mpi_inc_dir}") + find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu REQUIRED) + message ("-- libmpi is ${MPI_LIB}") + add_definitions(-DMPI_SUPPORT) + include_directories(${mpi_inc_dir}) + link_libraries(${MPI_LIB}) + else() + message ("-- ${mpi_compiler} not found") + endif() +endmacro() cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR) @@ -30,8 +57,7 @@ include(ROCMCheckTargetIds) include(ROCMClients) # Build variables -option(USE_MPI "Build RCCL-tests with MPI support. 
Requires the MPI path to be set.") -set(MPI_PATH "" CACHE PATH "Path to MPI installation") +option(NO_MPI "Build RCCL-tests without MPI support.") ## Get default GPU targets using rocm_check_target_ids rocm_check_target_ids( DEFAULT_AMDGPU_TARGETS @@ -39,13 +65,21 @@ rocm_check_target_ids( ) set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.") -# Find the MPI package if we're using MPI -if (USE_MPI) - if(NOT MPI_PATH STREQUAL "") - set(MPI_HOME "${MPI_PATH}") +if (NOT NO_MPI) + # Check for MPICH first + check_mpi(mpicxx.mpich libmpich.a libmpich.so include/x86_64-linux-gnu/mpich) + + # Check for MPI in general. If we find mpicxx, we don't know whether it's + # MPICH or another MPI implementation + if (NOT MPI_MPICXX) + check_mpi(mpicxx libmpi.a libmpi.so) endif() - find_package(MPI REQUIRED MODULE) - add_definitions(-DOMPI_SKIP_MPICXX -DMPI_SUPPORT) + + if (NOT MPI_MPICXX) + message ("-- no MPI library found") + endif() +else() + message ("-- MPI support explicitely disabled") endif() set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component diff --git a/README.md b/README.md index c2847232e6..0a88c5d384 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,22 @@ RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If y $ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl ``` +RCCL tests can also be built using cmake. A typical sequence will be: + +```shell +$ mkdir build +$ cd build +$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl .. +$ make +``` + +When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e. not using the install.sh script), since cmake will check +for cmake target and config files that are created during the RCCL build. + +Using the cmake method also has the advantage that the build is automatically checking for MPI installations, i.e. 
it is not necessary to explicitly request +MPI builds. A user can explicitly disable MPI builds by adding the -DNO_MPI=1 flag to the cmake command line. + + ## Usage RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).