Re-enabled MSCCL++ (#1325)
* Added restrictions around calling MSCCL++ collectives (#1281) * Added restriction to non-zero 32-byte multiple message sizes to MSCCL++ AllGather. * Renamed and refactored some mscclpp types. * Only transmit the MSCCL++ unique id for non-split comm init. For splitting comm, it has already been transmitted. Instead, save the MSCCL++ communicator in child communicators when calling `ncclCommSplit`. Only destroy MSCCL++ communicators when no RCCL communicators remain that use it. Also improved trace logging. * Disable MSCCL++ when using managed memory buffers as it isn't supported. * Added datatype and op constraints for MSCCL++ AllReduce. * Added documentation on MSCCL++ restrictions to the README. * [BUILD] Support custom CMake flags in MSCCLPP (#1275) * [BUILD] Support custom CMAKE_PREFIX_PATH in MSCCLPP Signed-off-by: nileshnegi <Nilesh.Negi@amd.com> * [BUILD] CMake flags to support build-id in MSCCLPP Signed-off-by: nileshnegi <Nilesh.Negi@amd.com> * [BUILD] Fix CMake warnings in MSCCLPP build Signed-off-by: nileshnegi <Nilesh.Negi@amd.com> * Wrapped all cmake arguments passed to mscclpp to remove empty arguments and properly format them. --------- Signed-off-by: nileshnegi <Nilesh.Negi@amd.com> Co-authored-by: Corey Derochie <corey.derochie@amd.com> * Link to libmscclpp_nccl statically (#1282) * Switched mscclpp_nccl to static linking. Added a build step to rename the NCCL API functions. * Undid separation of building libmscclpp_nccl from building librccl with MSCCL++ integration. With a static build, it's either fully enabled or fully disabled. * `nm` isn't always available in docker containers due to being stripped down. Removed use of `nm` in `cmake` and hard-coded the output into mscclpp_nccl_syms.txt. * Removed IBVerbs dependency for integrating with MSCCL++ (#1313) * Renamed `RCCL_ENABLE_MSCCLPP` to `RCCL_MSCCLPP_ENABLE` to conform to MSCCL. Set `RCCL_MSCCLPP_ENABLE` to 1 by default if `ENABLE_MSCCLPP` is defined, or 0 otherwise. 
Added a log warning if `RCCL_MSCCLPP_ENABLE` is set to 1 but `ENABLE_MSCCLPP` is not defined. (#1294) * Include mscclpp as a git submodule (#1314) * Added the desired mscclpp commit as a git submodule. * Added step to automatically checkout the mscclpp submodule if it isn't already present, in case the user forgot to clone recursively. * Added instruction to README to clone using --recurse-submodules to get the mscclpp submodule. * Enabled MSCCL++ feature build. --------- Signed-off-by: nileshnegi <Nilesh.Negi@amd.com> Co-authored-by: Nilesh M Negi <Nilesh.Negi@amd.com>
Este commit está contenido en:
cometido por
GitHub
padre
4856309413
commit
736a705875
@@ -0,0 +1,4 @@
|
||||
[submodule "ext-src/mscclpp"]
|
||||
path = ext-src/mscclpp
|
||||
url = https://github.com/microsoft/mscclpp.git
|
||||
ignore = dirty
|
||||
+9
-19
@@ -26,8 +26,7 @@ option(BUILD_SHARED_LIBS "Build as shared library"
|
||||
option(BUILD_TESTS "Build unit test programs" OFF)
|
||||
option(COLLTRACE "Collective Trace Option" ON)
|
||||
option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON)
|
||||
option(ENABLE_MSCCLPP "Enable MSCCL++" OFF)
|
||||
option(BUILD_MSCCLPP "Build MSCCL++" OFF)
|
||||
option(ENABLE_MSCCLPP "Enable MSCCL++" ON)
|
||||
option(ENABLE_IFC "Enable indirect function call" OFF)
|
||||
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
|
||||
option(ROCTX "Enable ROCTX" OFF)
|
||||
@@ -288,19 +287,10 @@ if (HAVE_KERNARG_PRELOAD)
|
||||
endif()
|
||||
|
||||
## Disable building MSCCL++ if the build environment is invalid
|
||||
if (BUILD_MSCCLPP)
|
||||
if (NOT ENABLE_MSCCLPP)
|
||||
set(BUILD_MSCCLPP OFF)
|
||||
message(WARNING "ENABLE_MSCCLPP not set; disabling MSCCL++ build")
|
||||
else()
|
||||
find_package(IBVerbs)
|
||||
if (NOT IBVerbs_FOUND)
|
||||
set(BUILD_MSCCLPP OFF)
|
||||
message(WARNING "IBVerbs not found; disabling MSCCL++ build")
|
||||
elseif(NOT "gfx942" IN_LIST GPU_TARGETS)
|
||||
set(BUILD_MSCCLPP OFF)
|
||||
message(WARNING "Can only build MSCCL++ for gfx942; disabling MSCCL++ build")
|
||||
endif()
|
||||
if (ENABLE_MSCCLPP)
|
||||
if(NOT "gfx942" IN_LIST GPU_TARGETS)
|
||||
set(ENABLE_MSCCLPP OFF)
|
||||
message(WARNING "Can only build MSCCL++ for gfx942; disabling MSCCL++ build")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -765,6 +755,9 @@ target_link_libraries(rccl INTERFACE hip::host)
|
||||
target_link_libraries(rccl PRIVATE hip::device)
|
||||
target_link_libraries(rccl PRIVATE dl)
|
||||
target_link_libraries(rccl PRIVATE ${ROCM_SMI_LIBRARIES})
|
||||
if(ENABLE_MSCCLPP)
|
||||
target_link_libraries(rccl PRIVATE mscclpp_nccl)
|
||||
endif()
|
||||
|
||||
## Set RCCL link options
|
||||
## Find out available memory
|
||||
@@ -807,7 +800,7 @@ if (HAVE_KERNARG_PRELOAD)
|
||||
target_link_options(rccl PRIVATE -Xoffload-linker -mllvm=-amdgpu-kernarg-preload-count=16)
|
||||
endif()
|
||||
|
||||
if(BUILD_MSCCLPP)
|
||||
if(ENABLE_MSCCLPP)
|
||||
include(cmake/MSCCLPP.cmake)
|
||||
message(STATUS "Building MSCCL++ with NCCL API support")
|
||||
endif()
|
||||
@@ -831,9 +824,6 @@ file(COPY tools/msccl-unit-test-algorithms DESTINATION ${PROJECT_BINARY_DIR})
|
||||
## Install Algorithm files under share folder
|
||||
rocm_install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-algorithms DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
|
||||
rocm_install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-unit-test-algorithms DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
|
||||
if(BUILD_MSCCLPP)
|
||||
rocm_install(FILES ${MSCCLPP_OUT_LIBS} DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT "runtime")
|
||||
endif()
|
||||
|
||||
rocm_export_targets(
|
||||
NAMESPACE roc::
|
||||
|
||||
+13
-2
@@ -68,13 +68,18 @@ By default, RCCL builds for all GPU targets defined in `DEFAULT_GPUS` in `CMakeL
|
||||
### To build the library using CMake:
|
||||
|
||||
```shell
|
||||
$ git clone https://github.com/ROCm/rccl.git
|
||||
$ git clone https://github.com/ROCm/rccl.git --recurse-submodules
|
||||
$ cd rccl
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ cmake ..
|
||||
$ make -j 16 # Or some other suitable number of parallel jobs
|
||||
```
|
||||
If you have already cloned the repository without `--recurse-submodules`, you can check out the `mscclpp` submodule manually:
|
||||
```shell
|
||||
$ cd ext-src/mscclpp
|
||||
$ git submodule update --init --recursive
|
||||
```
|
||||
You may substitute an installation path of your own choosing by passing `CMAKE_INSTALL_PREFIX`. For example:
|
||||
```shell
|
||||
$ cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
|
||||
@@ -134,7 +139,13 @@ RCCL integrates [MSCCL](https://github.com/Azure/msccl) and [MSCCL++](https://gi
|
||||
|
||||
MSCCL uses XMLs for different collective algorithms on different architectures. RCCL collectives can leverage those algorithms once the corresponding XML has been provided by the user. The XML files contain the sequence of send-recv and reduction operations to be executed by the kernel. On MI300X, MSCCL is enabled by default. On other platforms, the users may have to enable this by setting `RCCL_MSCCL_FORCE_ENABLE=1`. By default, MSCCL will only be used if every rank belongs to a unique process; to disable this restriction for multi-threaded or single-threaded configurations, set `RCCL_MSCCL_ENABLE_SINGLE_PROCESS=1`.
|
||||
|
||||
On the other hand, RCCL allreduce and allgather collectives can leverage the efficient MSCCL++ communication kernels for certain message sizes. MSCCL++ support is available whenever MSCCL support is available. Users need to set the RCCL environment variable `RCCL_ENABLE_MSCCLPP=1` to run RCCL workload with MSCCL++ support. It is also possible to set the message size threshold for using MSCCL++ by using the environment variable `RCCL_MSCCLPP_THRESHOLD`. Once `RCCL_MSCCLPP_THRESHOLD` (the default value is 1MB) is set, RCCL will invoke MSCCL++ kernels for all message sizes less than or equal to the specified threshold.
|
||||
On the other hand, RCCL allreduce and allgather collectives can leverage the efficient MSCCL++ communication kernels for certain message sizes. MSCCL++ support is available whenever MSCCL support is available. Users need to set the RCCL environment variable `RCCL_MSCCLPP_ENABLE=1` to run RCCL workloads with MSCCL++ support. It is also possible to set the message size threshold for using MSCCL++ with the environment variable `RCCL_MSCCLPP_THRESHOLD` (the default value is 1MB). RCCL will invoke MSCCL++ kernels for all message sizes less than or equal to this threshold.
|
||||
|
||||
If any of the restrictions below is not met, the collective falls back to MSCCL or RCCL. The following are restrictions on using MSCCL++:
|
||||
- Message size must be a non-zero multiple of 32 bytes
|
||||
- Does not support `hipMallocManaged` buffers
|
||||
- Allreduce only supports `float16`, `int32`, `uint32`, `float32`, and `bfloat16` data types
|
||||
- Allreduce only supports the `sum` op
|
||||
|
||||
## Library and API Documentation
|
||||
|
||||
|
||||
+41
-13
@@ -32,7 +32,20 @@
|
||||
# For downloading, building, and installing required dependencies
|
||||
include(cmake/DownloadProject.cmake)
|
||||
|
||||
# mscclpp_cmake_arg(<NAME>)
#
# Prepares the variable <NAME> for forwarding to the MSCCL++ sub-build and
# stores the result in <NAME>_ARG in the caller's scope:
#   * every ";" in the value is replaced by the text $<SEMICOLON>, so the value
#     is not split as a list when it is passed on;
#   * leading and trailing whitespace is stripped;
#   * an empty (unset or whitespace-only) value yields an empty <NAME>_ARG,
#     otherwise <NAME>_ARG becomes -D<NAME>="<value>".
#
# Example: mscclpp_cmake_arg(CMAKE_PREFIX_PATH) sets CMAKE_PREFIX_PATH_ARG.
function(mscclpp_cmake_arg NAME)
  string(REPLACE ";" "$<SEMICOLON>" forwarded_value "${${NAME}}")
  string(STRIP "${forwarded_value}" forwarded_value)

  # Only produce a -D definition when there is something to forward.
  if("${forwarded_value}" STREQUAL "")
    set(forwarded_arg "")
  else()
    set(forwarded_arg "-D${NAME}=\"${forwarded_value}\"")
  endif()

  set(${NAME}_ARG "${forwarded_arg}" PARENT_SCOPE)
endfunction()
|
||||
|
||||
|
||||
if(ENABLE_MSCCLPP)
|
||||
# Try to find the mscclpp install
|
||||
set(MSCCLPP_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/ext/mscclpp CACHE PATH "")
|
||||
execute_process(
|
||||
COMMAND mkdir -p ${MSCCLPP_ROOT}
|
||||
@@ -41,30 +54,45 @@ if(ENABLE_MSCCLPP)
|
||||
find_package(mscclpp_nccl)
|
||||
|
||||
if(NOT mscclpp_nccl_FOUND)
|
||||
message(STATUS "MSCCL++ not found. Downloading and building MSCCL++ only for gfx942.")
|
||||
# Download, build and install mscclpp
|
||||
# Ensure the source code is checked out
|
||||
set(MSCCLPP_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp CACHE PATH "")
|
||||
# Populate the mscclpp submodule when its sources are not present yet
# (for example when the repository was cloned without --recurse-submodules).
if(NOT EXISTS "${MSCCLPP_SOURCE}/CMakeLists.txt")
  message(STATUS "Checking out microsoft/mscclpp")
  execute_process(
    COMMAND git submodule update --init --recursive
    WORKING_DIRECTORY ${MSCCLPP_SOURCE}
    RESULT_VARIABLE mscclpp_checkout_result
  )

  # Fail here with a clear message rather than later inside the MSCCL++ build.
  # The result is compared as a string because it may hold an error
  # description instead of an exit code when git could not be started.
  if(NOT "${mscclpp_checkout_result}" STREQUAL "0" OR NOT EXISTS "${MSCCLPP_SOURCE}/CMakeLists.txt")
    message(FATAL_ERROR
      "Failed to check out the mscclpp submodule into ${MSCCLPP_SOURCE} "
      "(git result: ${mscclpp_checkout_result}). "
      "Run 'git submodule update --init --recursive' manually, or configure with -DENABLE_MSCCLPP=OFF.")
  endif()
endif()
|
||||
|
||||
message(STATUS "Building mscclpp only for gfx942.")
|
||||
|
||||
mscclpp_cmake_arg(CMAKE_PREFIX_PATH)
|
||||
mscclpp_cmake_arg(CMAKE_SHARED_LINKER_FLAGS_INIT)
|
||||
mscclpp_cmake_arg(CMAKE_EXE_LINKER_FLAGS_INIT)
|
||||
mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH)
|
||||
mscclpp_cmake_arg(HIP_COMPILER)
|
||||
|
||||
download_project(PROJ mscclpp_nccl
|
||||
GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
|
||||
GIT_TAG b1b9d0626cfa40319c18c05f8c16650568395c29
|
||||
# GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
|
||||
# GIT_TAG 1e82dd444fc1ed8b7add354eebaab8a94e67d5fc
|
||||
INSTALL_DIR ${MSCCLPP_ROOT}
|
||||
CMAKE_ARGS -DGPU_TARGETS=gfx942 -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_APPS_NCCL=ON -DBUILD_PYTHON_BINDINGS=OFF -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
|
||||
CMAKE_ARGS -DGPU_TARGETS=gfx942 -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_APPS_NCCL=ON -DBUILD_PYTHON_BINDINGS=OFF -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" "${CMAKE_SHARED_LINKER_FLAGS_INIT_ARG}" "${CMAKE_EXE_LINKER_FLAGS_INIT_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}"
|
||||
LOG_DOWNLOAD FALSE
|
||||
LOG_CONFIGURE FALSE
|
||||
LOG_BUILD FALSE
|
||||
LOG_INSTALL FALSE
|
||||
UPDATE_DISCONNECTED TRUE
|
||||
SOURCE_DIR ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
find_package(mscclpp_nccl REQUIRED)
|
||||
endif()
|
||||
|
||||
# Copy the outputs to the PROJECT_BINARY_DIR, list them in MSCCLPP_OUT_LIBS
|
||||
file(GLOB MSCCLPP_LIB_FILES "${MSCCLPP_ROOT}/lib/*")
|
||||
file(GLOB MSCCLPP_LIB_NAMES RELATIVE ${MSCCLPP_ROOT}/lib "${MSCCLPP_ROOT}/lib/*")
|
||||
set(MSCCLPP_OUT_LIBS "")
|
||||
foreach(LIB_NAME ${MSCCLPP_LIB_NAMES})
|
||||
list(APPEND MSCCLPP_OUT_LIBS ${PROJECT_BINARY_DIR}/${LIB_NAME})
|
||||
endforeach()
|
||||
file(COPY ${MSCCLPP_LIB_FILES} DESTINATION ${PROJECT_BINARY_DIR})
|
||||
# Write a copy of the static MSCCL++ NCCL archive into the build tree with its
# NCCL API symbols renamed as listed in mscclpp_nccl_syms.txt
# (e.g. ncclAllReduce -> mscclpp_ncclAllReduce).
execute_process(
  COMMAND objcopy
    --redefine-syms=${CMAKE_CURRENT_SOURCE_DIR}/src/misc/mscclpp/mscclpp_nccl_syms.txt
    "${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a"
    "${PROJECT_BINARY_DIR}/libmscclpp_nccl.a"
  RESULT_VARIABLE mscclpp_objcopy_result
  ERROR_VARIABLE mscclpp_objcopy_error
)

# Without this check a failed objcopy would only show up later as a missing
# libmscclpp_nccl.a when the renamed archive is linked.
if(NOT "${mscclpp_objcopy_result}" STREQUAL "0")
  message(FATAL_ERROR
    "objcopy failed to rename the NCCL symbols in "
    "${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a "
    "(result: ${mscclpp_objcopy_result}): ${mscclpp_objcopy_error}")
endif()
|
||||
add_library(mscclpp_nccl STATIC IMPORTED)
|
||||
set_target_properties(mscclpp_nccl PROPERTIES IMPORTED_LOCATION ${PROJECT_BINARY_DIR}/libmscclpp_nccl.a)
|
||||
endif()
|
||||
|
||||
Submódulo
+1
Submodule ext-src/mscclpp added at 1e82dd444f
+1
-1
@@ -24,7 +24,7 @@ install_dependencies=false
|
||||
install_library=false
|
||||
install_prefix="${ROCM_PATH}"
|
||||
msccl_kernel_enabled=true
|
||||
mscclpp_enabled=false
|
||||
mscclpp_enabled=true
|
||||
num_parallel_jobs=$(nproc)
|
||||
npkit_enabled=false
|
||||
openmp_test_enabled=false
|
||||
|
||||
@@ -396,7 +396,7 @@ struct ncclComm {
|
||||
#if defined(ENABLE_MSCCLPP)
|
||||
// Whether this comm is compatible with MSCCLPP
|
||||
bool mscclppCompatible;
|
||||
struct mscclpp_ncclComm* mscclpp_comm;
|
||||
struct mscclppComm* mscclpp_comm;
|
||||
size_t mscclpp_threshold;
|
||||
#endif
|
||||
|
||||
|
||||
@@ -9,32 +9,36 @@
|
||||
|
||||
#include "nccl.h"
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
typedef struct mscclpp_ncclComm* mscclpp_ncclComm_t;
|
||||
typedef struct mscclppComm* mscclppComm_t;
|
||||
|
||||
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } mscclpp_ncclUniqueId;
|
||||
typedef ncclUniqueId mscclppUniqueId;
|
||||
|
||||
bool mscclpp_init();
|
||||
/* A ncclUniqueId and a mscclppUniqueId will always be created together and used alternatively. This maps between them. */
|
||||
extern std::unordered_map<ncclUniqueId, mscclppUniqueId> mscclpp_uniqueIdMap;
|
||||
extern std::unordered_map<mscclppUniqueId, std::unordered_set<ncclUniqueId>> mscclpp_uniqueIdReverseMap;
|
||||
extern std::unordered_map<mscclppComm_t, mscclppUniqueId> mscclpp_commToUniqueIdMap;
|
||||
extern std::unordered_map<ncclComm_t, ncclUniqueId> ncclCommToUniqueIdMap;
|
||||
|
||||
/* A ncclUniqueId and a mscclpp_ncclUniqueId will always be created together and used alternatively. This maps between them. */
|
||||
extern std::unordered_map<ncclUniqueId, mscclpp_ncclUniqueId> mscclpp_uniqueIdMap;
|
||||
extern "C" {
|
||||
/* See ncclGetUniqueId. */
|
||||
ncclResult_t mscclpp_ncclGetUniqueId(mscclppUniqueId* uniqueId);
|
||||
|
||||
/* See ncclGetUniqueId. */
|
||||
extern ncclResult_t (*mscclpp_ncclGetUniqueId)(mscclpp_ncclUniqueId* uniqueId);
|
||||
/* See ncclCommInitRank. */
|
||||
ncclResult_t mscclpp_ncclCommInitRank(mscclppComm_t* comm, int nranks, mscclppUniqueId commId, int rank);
|
||||
|
||||
/* See ncclCommInitRank. */
|
||||
extern ncclResult_t (*mscclpp_ncclCommInitRank)(mscclpp_ncclComm_t* comm, int nranks, mscclpp_ncclUniqueId commId, int rank);
|
||||
/* See ncclCommDestroy. */
|
||||
ncclResult_t mscclpp_ncclCommDestroy(mscclppComm_t comm);
|
||||
|
||||
/* See ncclCommDestroy. */
|
||||
extern ncclResult_t (*mscclpp_ncclCommDestroy)(mscclpp_ncclComm_t comm);
|
||||
/* See ncclAllReduce. */
|
||||
ncclResult_t mscclpp_ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, mscclppComm_t comm, hipStream_t stream);
|
||||
|
||||
/* See ncclAllReduce. */
|
||||
extern ncclResult_t (*mscclpp_ncclAllReduce)(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, mscclpp_ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
/* See ncclAllGather. */
|
||||
extern ncclResult_t (*mscclpp_ncclAllGather)(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, mscclpp_ncclComm_t comm, hipStream_t stream);
|
||||
/* See ncclAllGather. */
|
||||
ncclResult_t mscclpp_ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, mscclppComm_t comm, hipStream_t stream);
|
||||
}
|
||||
|
||||
namespace std {
|
||||
template <>
|
||||
|
||||
+56
-20
@@ -101,10 +101,14 @@ bool operator ==(const ncclUniqueId& a, const ncclUniqueId& b) {
|
||||
return memcmp(a.internal, b.internal, NCCL_UNIQUE_ID_BYTES) == 0;
|
||||
}
|
||||
|
||||
RCCL_PARAM(EnableMscclpp, "ENABLE_MSCCLPP", 0);
|
||||
RCCL_PARAM(MscclppThreshold, "MSCCLPP_THRESHOLD", (size_t)(1024*1024));
|
||||
static constexpr int64_t defaultEnableMscclpp = 1;
|
||||
#else
|
||||
static constexpr int64_t defaultEnableMscclpp = 0;
|
||||
#endif
|
||||
|
||||
RCCL_PARAM(MscclppEnabled, "MSCCLPP_ENABLE", defaultEnableMscclpp);
|
||||
|
||||
// GDRCOPY support: Off by default
|
||||
NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
|
||||
|
||||
@@ -165,11 +169,6 @@ static ncclResult_t ncclInit() {
|
||||
}
|
||||
#ifndef NVTX_NO_IMPL
|
||||
initNvtxRegisteredEnums();
|
||||
#endif
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
if (rcclParamEnableMscclpp() && !mscclpp_init()) {
|
||||
return ncclSystemError;
|
||||
}
|
||||
#endif
|
||||
__atomic_store_n(&initialized, true, __ATOMIC_RELEASE);
|
||||
}
|
||||
@@ -190,21 +189,25 @@ ncclResult_t ncclGetUniqueId_impl(ncclUniqueId* out) {
|
||||
NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
|
||||
ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out);
|
||||
TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
|
||||
if (rcclParamMscclppEnabled()) {
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
if (rcclParamEnableMscclpp()) {
|
||||
NCCLCHECK(res);
|
||||
int dev;
|
||||
CUDACHECK(cudaGetDevice(&dev));
|
||||
hipDeviceProp_t devProp;
|
||||
CUDACHECK(hipGetDeviceProperties(&devProp, dev));
|
||||
if (IsArchMatch(devProp.gcnArchName, "gfx94")) {
|
||||
res = mscclpp_ncclGetUniqueId(&(mscclpp_uniqueIdMap[*out]));
|
||||
TRACE_CALL("mscclpp_ncclGetUniqueId");
|
||||
auto& mscclppUniqueId = mscclpp_uniqueIdMap[*out];
|
||||
res = mscclpp_ncclGetUniqueId(&mscclppUniqueId);
|
||||
TRACE_CALL("mscclpp_ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(mscclppUniqueId));
|
||||
mscclpp_uniqueIdReverseMap[mscclppUniqueId].insert(*out);
|
||||
} else {
|
||||
WARN("MSCCL++: Cannot enable MSCCL++ on %s architecture", devProp.gcnArchName);
|
||||
}
|
||||
}
|
||||
#else
|
||||
WARN("MSCCL++: Feature not enabled. ENABLE_MSCCLPP must be defined at compile-time to enable this feature.");
|
||||
#endif
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -1914,6 +1917,9 @@ fail:
|
||||
static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_;
|
||||
ncclComm_t comm = job->comm;
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
ncclUniqueId origUniqueId = job->commId;
|
||||
#endif
|
||||
ncclResult_t res = ncclSuccess;
|
||||
int archMajor, archMinor;
|
||||
size_t maxLocalSizeBytes = 0;
|
||||
@@ -1967,22 +1973,41 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail);
|
||||
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
if (rcclParamEnableMscclpp()) {
|
||||
if (job->parent) {
|
||||
if (job->parent->mscclppCompatible) {
|
||||
INFO(NCCL_INIT, "MSCCL++: Splitting a compatible communicator; using parent mscclpp_comm");
|
||||
comm->mscclppCompatible = true;
|
||||
comm->mscclpp_threshold = job->parent->mscclpp_threshold;
|
||||
comm->mscclpp_comm = job->parent->mscclpp_comm;
|
||||
auto& mscclppUniqueId = mscclpp_uniqueIdMap[origUniqueId];
|
||||
mscclpp_uniqueIdMap[job->commId] = mscclppUniqueId;
|
||||
mscclpp_uniqueIdReverseMap[mscclppUniqueId].insert(job->commId);
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
if (rcclParamMscclppEnabled()) {
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
hipDeviceProp_t devProp;
|
||||
CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev));
|
||||
comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94");
|
||||
if (comm->mscclppCompatible) {
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, &(mscclpp_uniqueIdMap[job->commId]), sizeof(mscclpp_ncclUniqueId)), res, fail);
|
||||
TRACE_CALL("bootstrapIntraNodeBroadcast(rank=%d, nranks=%d, root=%d, bcastData=<mscclpp_ncclUniqueId>)", comm->localRank, comm->localRanks, 0);
|
||||
auto& mscclppUniqueId = mscclpp_uniqueIdMap[job->commId];
|
||||
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, &mscclppUniqueId, sizeof(mscclppUniqueId)), res, fail);
|
||||
unsigned long long mscclppUniqueIdHash; (void)mscclppUniqueIdHash;
|
||||
TRACE_CALL("bootstrapIntraNodeBroadcast(rank=%d, nranks=%d, root=%d, bcastData=hash:0x%llx)", comm->localRank, comm->localRanks, 0, (mscclppUniqueIdHash = (unsigned long long)hashUniqueId(mscclppUniqueId)));
|
||||
comm->mscclpp_threshold = rcclParamMscclppThreshold();
|
||||
INFO(NCCL_INIT, "MSCCL++: Enabled! Msg size threshold=%zu", comm->mscclpp_threshold);
|
||||
NCCLCHECKGOTO(mscclpp_ncclCommInitRank(&(comm->mscclpp_comm), job->nranks, mscclpp_uniqueIdMap[job->commId], job->myrank), res, fail);
|
||||
TRACE_CALL("mscclpp_ncclCommInitRank (nranks=%d, myrank=%d)", job->nranks, job->myrank);
|
||||
NCCLCHECKGOTO(mscclpp_ncclCommInitRank(&(comm->mscclpp_comm), job->nranks, mscclppUniqueId, job->myrank), res, fail);
|
||||
TRACE_CALL("mscclpp_ncclCommInitRank (*comm=%p, nranks=%d, commId=hash:0x%llx, myrank=%d)", comm->mscclpp_comm, job->nranks, mscclppUniqueIdHash, job->myrank);
|
||||
mscclpp_commToUniqueIdMap[comm->mscclpp_comm] = mscclppUniqueId;
|
||||
} else {
|
||||
WARN("MSCCL++: Cannot enable MSCCL++ on %s architecture", devProp.gcnArchName);
|
||||
}
|
||||
}
|
||||
#else
|
||||
WARN("MSCCL++: Feature not enabled. ENABLE_MSCCLPP must be defined at compile-time to enable this feature.");
|
||||
#endif
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclLoadTunerPlugin(&comm->tuner), res, fail);
|
||||
if (comm->tuner) {
|
||||
@@ -2584,11 +2609,22 @@ ncclResult_t ncclCommDestroy_impl(ncclComm_t comm) {
|
||||
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
if (comm->mscclppCompatible) {
|
||||
ncclResult_t res = mscclpp_ncclCommDestroy(comm->mscclpp_comm);
|
||||
TRACE_CALL("mscclpp_ncclCommDestroy");
|
||||
if (res != ncclSuccess) {
|
||||
WARN("MSCCL++: mscclpp_ncclCommDestroy failed (%s)", ncclGetErrorString(res));
|
||||
auto& mscclppUniqueId = mscclpp_commToUniqueIdMap[comm->mscclpp_comm];
|
||||
auto& uniqueIds = mscclpp_uniqueIdReverseMap[mscclppUniqueId];
|
||||
auto& ncclUniqueId = ncclCommToUniqueIdMap[comm];
|
||||
if (uniqueIds.find(ncclUniqueId) == uniqueIds.end()) {
|
||||
WARN("MSCCL++: comm=%p not found in mscclpp_uniqueIdReverseMap for key=%p", comm, comm->mscclpp_comm);
|
||||
}
|
||||
uniqueIds.erase(ncclUniqueId);
|
||||
if (uniqueIds.size() == 0) {
|
||||
mscclpp_uniqueIdReverseMap.erase(mscclppUniqueId);
|
||||
ncclResult_t res = mscclpp_ncclCommDestroy(comm->mscclpp_comm);
|
||||
TRACE_CALL("mscclpp_ncclCommDestroy");
|
||||
if (res != ncclSuccess) {
|
||||
WARN("MSCCL++: mscclpp_ncclCommDestroy failed (%s)", ncclGetErrorString(res));
|
||||
}
|
||||
}
|
||||
|
||||
comm->mscclppCompatible = false;
|
||||
comm->mscclpp_comm = nullptr;
|
||||
}
|
||||
|
||||
@@ -468,6 +468,25 @@ static ncclResult_t mscclFallBackSavedParams() {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
// Returns true only when both the data type and the reduction op are accepted
// for the MSCCL++ AllReduce path:
//   - dataType is one of ncclFloat16, ncclInt32, ncclUint32, ncclFloat32, or
//     ncclBfloat16 (the latter only when RCCL_BFLOAT16 is defined);
//   - op is ncclSum.
static inline bool isMscclppAllReduceSupported(ncclDataType_t dataType, ncclRedOp_t op) {
  const bool typeSupported =
      dataType == ncclFloat16 ||
      dataType == ncclInt32 ||
      dataType == ncclUint32 ||
#ifdef RCCL_BFLOAT16
      dataType == ncclBfloat16 ||
#endif
      dataType == ncclFloat32;

  return typeSupported && op == ncclSum;
}
|
||||
#endif
|
||||
|
||||
ncclResult_t mscclEnqueueCheck(
|
||||
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
|
||||
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
|
||||
@@ -493,8 +512,13 @@ ncclResult_t mscclEnqueueCheck(
|
||||
}
|
||||
|
||||
/* check if one rank per GPU and graph mode is enabled */
|
||||
if ((threadLocalStatus.captureStatus != mscclNoCapture) && comm->mscclCompatible) {
|
||||
if (func == mscclFuncAllReduce && nBytes <= comm->mscclpp_threshold && (nBytes & 31) == 0) {
|
||||
if ((threadLocalStatus.captureStatus != mscclNoCapture) && comm->mscclCompatible && nBytes > 0 && (nBytes & 31) == 0) {
|
||||
bool isManagedBuffer = false;
|
||||
if (sendBuff) CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(sendBuff)));
|
||||
if (!isManagedBuffer && recvBuff) CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(recvBuff)));
|
||||
|
||||
if (isManagedBuffer) { /* MSCCL++ not enabled for managed memory buffers */ }
|
||||
else if (func == mscclFuncAllReduce && nBytes <= comm->mscclpp_threshold && isMscclppAllReduceSupported(dataType, op)) {
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
"mscclpp_ncclAllReduce", comm->opCount, sendBuff, recvBuff, count, dataType, op, root, comm, comm->nRanks, stream);
|
||||
NCCLCHECK(mscclpp_ncclAllReduce(sendBuff, recvBuff, count, dataType, op, comm->mscclpp_comm, stream));
|
||||
@@ -529,8 +553,13 @@ ncclResult_t mscclEnqueueCheck(
|
||||
}
|
||||
|
||||
/* check if one rank per GPU and graph mode is enabled */
|
||||
if ((threadLocalStatus.captureStatus != mscclNoCapture) && comm->mscclCompatible) {
|
||||
if (func == mscclFuncAllReduce && nBytes <= comm->mscclpp_threshold && (nBytes & 31) == 0) {
|
||||
if ((threadLocalStatus.captureStatus != mscclNoCapture) && comm->mscclCompatible && nBytes > 0 && (nBytes & 31) == 0) {
|
||||
bool isManagedBuffer = false;
|
||||
if (sendBuff) CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(sendBuff)));
|
||||
if (!isManagedBuffer && recvBuff) CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(recvBuff)));
|
||||
|
||||
if (isManagedBuffer) { /* MSCCL++ not enabled for managed memory buffers */ }
|
||||
else if (func == mscclFuncAllReduce && nBytes <= comm->mscclpp_threshold && isMscclppAllReduceSupported(dataType, op)) {
|
||||
INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
|
||||
"mscclpp_ncclAllReduce", comm->opCount, sendBuff, recvBuff, count, dataType, op, root, comm, comm->nRanks, stream);
|
||||
NCCLCHECK(mscclpp_ncclAllReduce(sendBuff, recvBuff, count, dataType, op, comm->mscclpp_comm, stream));
|
||||
|
||||
@@ -5,42 +5,8 @@
|
||||
************************************************************************/
|
||||
|
||||
#include "mscclpp/mscclpp_nccl.h"
|
||||
#include "debug.h"
|
||||
#include <dlfcn.h>
|
||||
#include <unordered_map>
|
||||
|
||||
#define MSCCLPP_DECLARE(X) decltype(mscclpp_##X) mscclpp_##X = nullptr
|
||||
#define MSCCLPP_LOAD(HANDLE, X) do { \
|
||||
(mscclpp_##X) = (decltype(mscclpp_##X))dlsym((HANDLE), (#X)); \
|
||||
const char* error; \
|
||||
if ((error = dlerror()) != nullptr) { \
|
||||
WARN("MSCCL++: failed to load %s : %s", (#X), error); \
|
||||
return false; \
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
static const char mscclpp_nccl_lib_name[] = "libmscclpp_nccl.so";
|
||||
|
||||
MSCCLPP_DECLARE(ncclGetUniqueId);
|
||||
MSCCLPP_DECLARE(ncclCommInitRank);
|
||||
MSCCLPP_DECLARE(ncclCommDestroy);
|
||||
MSCCLPP_DECLARE(ncclAllReduce);
|
||||
MSCCLPP_DECLARE(ncclAllGather);
|
||||
|
||||
bool mscclpp_init() {
|
||||
void* handle = dlopen(mscclpp_nccl_lib_name, RTLD_LAZY);
|
||||
if (!handle) {
|
||||
WARN("MSCCL++: failed to access %s : %s", mscclpp_nccl_lib_name, dlerror());
|
||||
return false;
|
||||
}
|
||||
dlerror(); // Clear any errors.
|
||||
|
||||
MSCCLPP_LOAD(handle, ncclGetUniqueId);
|
||||
MSCCLPP_LOAD(handle, ncclCommInitRank);
|
||||
MSCCLPP_LOAD(handle, ncclCommDestroy);
|
||||
MSCCLPP_LOAD(handle, ncclAllReduce);
|
||||
MSCCLPP_LOAD(handle, ncclAllGather);
|
||||
return true;
|
||||
}
|
||||
|
||||
std::unordered_map<ncclUniqueId, mscclpp_ncclUniqueId> mscclpp_uniqueIdMap;
|
||||
std::unordered_map<ncclUniqueId, mscclppUniqueId> mscclpp_uniqueIdMap;
|
||||
std::unordered_map<mscclppUniqueId, std::unordered_set<ncclUniqueId>> mscclpp_uniqueIdReverseMap;
|
||||
std::unordered_map<mscclppComm_t, mscclppUniqueId> mscclpp_commToUniqueIdMap;
|
||||
std::unordered_map<ncclComm_t, ncclUniqueId> ncclCommToUniqueIdMap;
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
# > ${PROJECT_BINARY_DIR}/mscclpp_nccl_syms.txt;
|
||||
# for sym in $(nm -fjust-symbols ${MSCCLPP_ROOT}/lib/libmscclpp_nccl_static.a | grep "^nccl"); do
|
||||
# echo $sym mscclpp_$sym>> ${PROJECT_BINARY_DIR}/mscclpp_nccl_syms.txt;
|
||||
# done
|
||||
ncclAllGather mscclpp_ncclAllGather
|
||||
ncclAllReduce mscclpp_ncclAllReduce
|
||||
ncclAllToAll mscclpp_ncclAllToAll
|
||||
ncclBcast mscclpp_ncclBcast
|
||||
ncclBroadcast mscclpp_ncclBroadcast
|
||||
ncclCommAbort mscclpp_ncclCommAbort
|
||||
ncclCommCount mscclpp_ncclCommCount
|
||||
ncclCommCuDevice mscclpp_ncclCommCuDevice
|
||||
ncclCommDestroy mscclpp_ncclCommDestroy
|
||||
ncclCommFinalize mscclpp_ncclCommFinalize
|
||||
ncclCommGetAsyncError mscclpp_ncclCommGetAsyncError
|
||||
ncclCommInitAll mscclpp_ncclCommInitAll
|
||||
ncclCommInitRank mscclpp_ncclCommInitRank
|
||||
ncclCommInitRankConfig mscclpp_ncclCommInitRankConfig
|
||||
ncclCommSplit mscclpp_ncclCommSplit
|
||||
ncclCommUserRank mscclpp_ncclCommUserRank
|
||||
ncclGetErrorString mscclpp_ncclGetErrorString
|
||||
ncclGetLastError mscclpp_ncclGetLastError
|
||||
ncclGetUniqueId mscclpp_ncclGetUniqueId
|
||||
ncclGetVersion mscclpp_ncclGetVersion
|
||||
ncclGroupEnd mscclpp_ncclGroupEnd
|
||||
ncclGroupStart mscclpp_ncclGroupStart
|
||||
ncclRecv mscclpp_ncclRecv
|
||||
ncclRedOpCreatePreMulSum mscclpp_ncclRedOpCreatePreMulSum
|
||||
ncclRedOpDestroy mscclpp_ncclRedOpDestroy
|
||||
ncclReduce mscclpp_ncclReduce
|
||||
ncclReduceScatter mscclpp_ncclReduceScatter
|
||||
ncclSend mscclpp_ncclSend
|
||||
Referencia en una nueva incidencia
Block a user