Integrated RCCL with MSCCL++ for small message sizes (#1231)

[ROCm/rccl commit: 6dc47eecd7]
Этот коммит содержится в:
corey-derochie-amd
2024-07-12 15:32:58 -06:00
коммит произвёл GitHub
родитель f60367f1c3
Коммит b8542c2477
15 изменённых файлов: 441 добавлений и 4 удалений
+1
Просмотреть файл
@@ -2,3 +2,4 @@
*.gcov
/coverage/
build/
ext/
+2 -1
Просмотреть файл
@@ -21,10 +21,11 @@ def runTestCommand (platform, project, gfilter, envars)
def command = """#!/usr/bin/env bash
set -x
export RUN_TEST_ROOT=\$(pwd)
cd ${project.paths.project_build_prefix}/build/release/test
${sudo} ulimit -l unlimited
ulimit -a
${sudo} ${envars} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 UT_MULTITHREAD=1 UT_PROCESS_MASK=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
${sudo} ${envars} LD_LIBRARY_PATH=\${RUN_TEST_ROOT}/${project.paths.project_build_prefix}/build/release:\${LD_LIBRARY_PATH} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 UT_MULTITHREAD=1 UT_PROCESS_MASK=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
"""
platform.runCommand(this, command)
+31
Просмотреть файл
@@ -26,6 +26,7 @@ option(BUILD_SHARED_LIBS "Build as shared library"
option(BUILD_TESTS "Build unit test programs" OFF)
option(COLLTRACE "Collective Trace Option" ON)
option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON)
option(ENABLE_MSCCLPP "Enable MSCCL++" ON)
option(ENABLE_IFC "Enable indirect function call" OFF)
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
option(ROCTX "Enable ROCTX" OFF)
@@ -59,6 +60,8 @@ include(CheckSymbolExists)
include(cmake/Dependencies.cmake) # GTest, rocm-cmake, rocm_local_targets
include(cmake/Generator.cmake) # Configure functions that goes into RCCL
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY)
message(STATUS "Building only for local GPU target")
@@ -242,6 +245,15 @@ if (HAVE_KERNARG_PRELOAD)
message(STATUS "Kernarg preloading to SGPR enabled")
endif()
# Check for IBVerbs; disable MSCCL++ if not present.
# find_package(IBVerbs) is resolved through the project's cmake/ directory,
# which is appended to CMAKE_MODULE_PATH above.
if (ENABLE_MSCCLPP)
find_package(IBVerbs)
if (NOT IBVerbs_FOUND)
# Turn the feature off for the rest of this configure run instead of failing,
# so RCCL still builds on systems without the verbs headers/library.
set(ENABLE_MSCCLPP OFF)
message(WARNING "IBVerbs not found; disabling MSCCL++")
endif()
endif()
# Determine version from makefiles/version.mk and fill in templates
#==================================================================================================
## parse version from Makefile NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH must exist
@@ -482,6 +494,14 @@ if (ENABLE_MSCCL_KERNEL)
list(APPEND SRC_FILES ${MSCCL_KERNEL_SOURCES})
endif()
# MSCCL++ integration sources (header + implementation); only added to the
# RCCL source list when MSCCL++ support is enabled.
if (ENABLE_MSCCLPP)
set(MSCCLPP_SOURCES
src/include/mscclpp/mscclpp_nccl.h
src/misc/mscclpp/mscclpp_nccl.cc
)
list(APPEND SRC_FILES ${MSCCLPP_SOURCES})
endif()
# Hipify source files (copy of source generated into hipify directory)
#==================================================================================================
find_program(hipify-perl_executable hipify-perl)
@@ -563,6 +583,9 @@ endif()
if(ENABLE_MSCCL_KERNEL)
target_compile_definitions(rccl PRIVATE COMPILE_MSCCL_KERNEL)
endif()
# Expose the ENABLE_MSCCLPP preprocessor symbol to the rccl sources so the
# `#ifdef ENABLE_MSCCLPP` / `#if defined(ENABLE_MSCCLPP)` sections are compiled in.
if(ENABLE_MSCCLPP)
target_compile_definitions(rccl PRIVATE ENABLE_MSCCLPP)
endif()
if(HAVE_ROCM_SMI64CONFIG)
target_compile_definitions(rccl PRIVATE USE_ROCM_SMI64CONFIG)
endif()
@@ -682,6 +705,11 @@ if (HAVE_KERNARG_PRELOAD)
target_link_options(rccl PRIVATE -Xoffload-linker -mllvm=-amdgpu-kernarg-preload-count=16)
endif()
# Pull in cmake/MSCCLPP.cmake, which finds (or downloads and builds) MSCCL++
# and populates MSCCLPP_OUT_LIBS with the libraries copied into the build tree.
if(ENABLE_MSCCLPP)
include(cmake/MSCCLPP.cmake)
message(STATUS "Building RCCL with MSCCL++ support")
endif()
## Track linking time
set_property(TARGET rccl PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time")
@@ -699,6 +727,9 @@ file(COPY tools/msccl-unit-test-algorithms DESTINATION ${PROJECT_BINARY_DIR})
## Install Algorithm files under share folder
rocm_install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-algorithms DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
rocm_install(DIRECTORY ${PROJECT_BINARY_DIR}/msccl-unit-test-algorithms DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
# Install the MSCCL++ libraries collected in MSCCLPP_OUT_LIBS (set by
# cmake/MSCCLPP.cmake) next to RCCL as part of the "runtime" component.
if(ENABLE_MSCCLPP)
rocm_install(FILES ${MSCCLPP_OUT_LIBS} DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT "runtime")
endif()
rocm_export_targets(
NAMESPACE roc::
+1 -1
Просмотреть файл
@@ -4,7 +4,7 @@ Attributions
Contains contributions from NVIDIA.
Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
Redistribution and use in source and binary forms, with or without
+30
Просмотреть файл
@@ -67,3 +67,33 @@ Dependencies on NPKit (MIT License)
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
_______________________________________________________________
Dependencies on MSCCL++ (MIT License)
Copyright (c) Microsoft Corporation.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
See:
https://github.com/microsoft/mscclpp
for more information and license details.
+9
Просмотреть файл
@@ -37,6 +37,7 @@ RCCL build & installation helper script
--enable_backtrace Build with custom backtrace support
--disable-colltrace Build without collective trace
--disable-msccl-kernel Build without MSCCL kernels
--disable-mscclpp Build without MSCCL++ support
-f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)
-h|--help Prints this help message
-i|--install Install RCCL library (see --prefix argument below)
@@ -45,6 +46,7 @@ RCCL build & installation helper script
--amdgpu_targets Only compile for specified GPU architecture(s). For multiple targets, seperate by ';' (builds for all supported GPU architectures by default)
--no_clean Don't delete files if they already exist
--npkit-enable Compile with npkit enabled
--openmp-test-enable Enable OpenMP in rccl unit tests
--roctx-enable Compile with roctx enabled (example usage: rocprof --roctx-trace ./rccl-program)
-p|--package_build Build RCCL package
--prefix Specify custom directory to install RCCL to (default: `/opt/rocm`)
@@ -123,6 +125,13 @@ To manually run RCCL with NPKit enabled, environment variable `NPKIT_DUMP_DIR` n
To manually analyze NPKit dump results, please leverage [npkit_trace_generator.py](https://github.com/microsoft/NPKit/blob/main/rccl_samples/npkit_trace_generator.py).
## MSCCL/MSCCL++
RCCL integrates MSCCL (https://github.com/microsoft/msccl) and MSCCL++ (https://github.com/microsoft/mscclpp) to leverage their highly efficient GPU-GPU communication primitives for collective operations. Thanks to Microsoft Corporation for collaborating with us on this project.
MSCCL uses XMLs for different collective algorithms on different architectures. RCCL collectives can leverage those algorithms once the corresponding XML has been provided by the user. The XML files contain the sequence of send-recv and reduction operations to be executed by the kernel. On MI300X, MSCCL is enabled by default. On other platforms, the users may have to enable this by setting `RCCL_MSCCL_FORCE_ENABLE=1`.
On the other hand, RCCL allreduce and allgather collectives can leverage the efficient MSCCL++ communication kernels for certain message sizes. MSCCL++ support is available whenever MSCCL support is available. Users need to set the RCCL environment variable `RCCL_ENABLE_MSCCLPP=1` to run RCCL workloads with MSCCL++ support. It is also possible to set the message size threshold for using MSCCL++ with the environment variable `RCCL_MSCCLPP_THRESHOLD` (the default value is 1MB). RCCL will invoke MSCCL++ kernels for all message sizes less than or equal to this threshold.
## Library and API Documentation
Please refer to the [RCCL Documentation Site](https://rocm.docs.amd.com/projects/rccl/en/latest/) for current documentation.
+39
Просмотреть файл
@@ -0,0 +1,39 @@
# MIT License
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# FindIBVerbs
# -----------
# Locates the InfiniBand verbs headers and library.
#
# Optional hint variables:
#   IBVERBS_INCLUDE_DIR - directory containing infiniband/verbs.h
#   IBVERBS_LIB_DIR     - directory containing the ibverbs library
#   IBVERBS_ROOT_DIR    - installation prefix (its include/ and lib/ are searched)
#
# Result variables:
#   IBVerbs_FOUND        - set by find_package_handle_standard_args
#   IBVERBS_INCLUDE_DIRS - directory where infiniband/verbs.h was found
#   IBVERBS_LIBRARIES    - full path to the ibverbs library
find_path(IBVERBS_INCLUDE_DIRS
  NAMES infiniband/verbs.h
  HINTS
  ${IBVERBS_INCLUDE_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/include)

find_library(IBVERBS_LIBRARIES
  NAMES ibverbs
  HINTS
  ${IBVERBS_LIB_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)

# Hide the cache entries created by find_path/find_library above. The original
# listed IBVERBS_INCLUDE_DIR (a hint variable, not a cache result), which left
# IBVERBS_INCLUDE_DIRS visible in the non-advanced cache view.
mark_as_advanced(IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
+36
Просмотреть файл
@@ -0,0 +1,36 @@
# MIT License
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Find module for the MSCCL++ NCCL-API library (mscclpp_nccl).
#
# Hint variable:
#   MSCCLPP_ROOT - installation prefix whose include/ and lib/ are searched first
#
# Result variables:
#   mscclpp_nccl_FOUND   - set by find_package_handle_standard_args
#   MSCCLPP_INCLUDE_DIRS - directory containing mscclpp/gpu.hpp
#   MSCCLPP_LIBRARIES    - full path to the mscclpp_nccl library
include(FindPackageHandleStandardArgs)

find_library(MSCCLPP_LIBRARIES NAMES mscclpp_nccl HINTS ${MSCCLPP_ROOT}/lib)
find_path(MSCCLPP_INCLUDE_DIRS NAMES mscclpp/gpu.hpp HINTS ${MSCCLPP_ROOT}/include)

find_package_handle_standard_args(
  mscclpp_nccl
  DEFAULT_MSG
  MSCCLPP_INCLUDE_DIRS
  MSCCLPP_LIBRARIES)

mark_as_advanced(MSCCLPP_INCLUDE_DIRS MSCCLPP_LIBRARIES)
+70
Просмотреть файл
@@ -0,0 +1,70 @@
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# MSCCL++ dependency handling.
#
# When ENABLE_MSCCLPP is on, this file:
#   1. looks for an existing mscclpp_nccl installation under MSCCLPP_ROOT
#      (default: <source dir>/ext/mscclpp);
#   2. if none is found, downloads, builds and installs MSCCL++ into MSCCLPP_ROOT;
#   3. copies everything in ${MSCCLPP_ROOT}/lib into PROJECT_BINARY_DIR and records
#      the copied paths in MSCCLPP_OUT_LIBS.

# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)

if(ENABLE_MSCCLPP)
  set(MSCCLPP_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/ext/mscclpp CACHE PATH "")

  # Create the install prefix with CMake's built-in command rather than
  # shelling out to `mkdir -p`; it creates parent directories as needed and
  # does not depend on an external mkdir binary.
  file(MAKE_DIRECTORY "${MSCCLPP_ROOT}")

  list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

  find_package(mscclpp_nccl)

  if(NOT mscclpp_nccl_FOUND)
    message(STATUS "MSCCL++ not found. Downloading and building MSCCL++.")
    # Download, build and install mscclpp
    # NOTE(review): GPU_TARGETS is expanded unquoted here; if it holds a
    # ';'-separated list it may be split into several arguments - confirm how
    # download_project forwards CMAKE_ARGS.
    download_project(PROJ                mscclpp_nccl
                     GIT_REPOSITORY      https://github.com/microsoft/mscclpp.git
                     GIT_TAG             b1b9d0626cfa40319c18c05f8c16650568395c29
                     INSTALL_DIR         ${MSCCLPP_ROOT}
                     CMAKE_ARGS          -DGPU_TARGETS=${GPU_TARGETS} -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_APPS_NCCL=ON -DBUILD_PYTHON_BINDINGS=OFF -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
                     LOG_DOWNLOAD        TRUE
                     LOG_CONFIGURE       TRUE
                     LOG_BUILD           TRUE
                     LOG_INSTALL         TRUE
                     UPDATE_DISCONNECTED TRUE
    )

    # The freshly installed copy must now be discoverable.
    find_package(mscclpp_nccl REQUIRED)
  endif()

  # Copy the outputs to the PROJECT_BINARY_DIR, list them in MSCCLPP_OUT_LIBS
  file(GLOB MSCCLPP_LIB_FILES "${MSCCLPP_ROOT}/lib/*")
  file(GLOB MSCCLPP_LIB_NAMES RELATIVE ${MSCCLPP_ROOT}/lib "${MSCCLPP_ROOT}/lib/*")
  set(MSCCLPP_OUT_LIBS "")
  foreach(LIB_NAME ${MSCCLPP_LIB_NAMES})
    list(APPEND MSCCLPP_OUT_LIBS ${PROJECT_BINARY_DIR}/${LIB_NAME})
  endforeach()
  file(COPY ${MSCCLPP_LIB_FILES} DESTINATION ${PROJECT_BINARY_DIR})
endif()
+8 -1
Просмотреть файл
@@ -24,6 +24,7 @@ install_dependencies=false
install_library=false
install_prefix="${ROCM_PATH}"
msccl_kernel_enabled=true
mscclpp_enabled=true
num_parallel_jobs=$(nproc)
npkit_enabled=false
openmp_test_enabled=false
@@ -45,6 +46,7 @@ function display_help()
echo " --enable_backtrace Build with custom backtrace support"
echo " --disable-colltrace Build without collective trace"
echo " --disable-msccl-kernel Build without MSCCL kernels"
echo " --disable-mscclpp Build without MSCCL++ support"
echo " -f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)"
echo " -h|--help Prints this help message"
echo " -i|--install Install RCCL library (see --prefix argument below)"
@@ -73,7 +75,7 @@ function display_help()
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ "$?" -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@")
GETOPT_PARSE=$(getopt --name "${0}" --options dfhij:lprt --longoptions address-sanitizer,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,disable-mscclpp,fast,help,install,jobs:,local_gpu_only,amdgpu_targets:,no_clean,npkit-enable,openmp-test-enable,roctx-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose -- "$@")
else
echo "Need a new version of getopt"
exit 1
@@ -94,6 +96,7 @@ while true; do
--enable_backtrace) build_bfd=true; shift ;;
--disable-colltrace) collective_trace=false; shift ;;
--disable-msccl-kernel) msccl_kernel_enabled=false; shift ;;
--disable-mscclpp) mscclpp_enabled=false; shift ;;
-f | --fast) build_local_gpu_only=true; collective_trace=false; msccl_kernel_enabled=false; shift ;;
-h | --help) display_help; exit 0 ;;
-i | --install) install_library=true; shift ;;
@@ -234,6 +237,10 @@ if [[ "${msccl_kernel_enabled}" == false ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_MSCCL_KERNEL=OFF"
fi
# Forward --disable-mscclpp to CMake; MSCCL++ stays at its CMake default otherwise.
if [[ "${mscclpp_enabled}" == false ]]; then
cmake_common_options="${cmake_common_options} -DENABLE_MSCCLPP=OFF"
fi
# Install dependencies
if [[ "${install_dependencies}" == true ]]; then
cmake_common_options="${cmake_common_options} -DINSTALL_DEPENDENCIES=ON"
+7
Просмотреть файл
@@ -393,6 +393,13 @@ struct ncclComm {
// shared structures for finalization
int finalizeRankCnt;
#if defined(ENABLE_MSCCLPP)
// Whether this comm is compatible with MSCCLPP
bool mscclppCompatible;
struct mscclpp_ncclComm* mscclpp_comm;
size_t mscclpp_threshold;
#endif
// Whether this comm is compatible with MSCCL
bool mscclCompatible;
// group job to support multi-thread FT
+48
Просмотреть файл
@@ -0,0 +1,48 @@
/*************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt and NOTICES.txt for license information
************************************************************************/
#ifndef MSCCLPP_NCCL_H_
#define MSCCLPP_NCCL_H_
#include "nccl.h"
#include <unordered_map>
/* Opaque handle to an MSCCL++ communicator; the struct is only forward-declared here. */
typedef struct mscclpp_ncclComm* mscclpp_ncclComm_t;
/* MSCCL++ counterpart of ncclUniqueId, with the same byte size (NCCL_UNIQUE_ID_BYTES). */
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } mscclpp_ncclUniqueId;
/* Loads the MSCCL++ NCCL-API shared library and resolves the mscclpp_* function
   pointers declared below. Returns true on success and false if the library or
   any symbol could not be loaded. */
bool mscclpp_init();
/* Maps each ncclUniqueId to the mscclpp_ncclUniqueId that is paired with it. */
extern std::unordered_map<ncclUniqueId, mscclpp_ncclUniqueId> mscclpp_uniqueIdMap;
/* The function pointers below are null until mscclpp_init() succeeds. */
/* See ncclGetUniqueId. */
extern ncclResult_t (*mscclpp_ncclGetUniqueId)(mscclpp_ncclUniqueId* uniqueId);
/* See ncclCommInitRank. */
extern ncclResult_t (*mscclpp_ncclCommInitRank)(mscclpp_ncclComm_t* comm, int nranks, mscclpp_ncclUniqueId commId, int rank);
/* See ncclCommDestroy. */
extern ncclResult_t (*mscclpp_ncclCommDestroy)(mscclpp_ncclComm_t comm);
/* See ncclAllReduce. */
extern ncclResult_t (*mscclpp_ncclAllReduce)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, mscclpp_ncclComm_t comm, hipStream_t stream);
/* See ncclAllGather. */
extern ncclResult_t (*mscclpp_ncclAllGather)(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, mscclpp_ncclComm_t comm, hipStream_t stream);
/* Hash and equality for ncclUniqueId so it can be used as the key of
   mscclpp_uniqueIdMap; both are declared here and defined out of line. */
namespace std {
template <>
struct hash<ncclUniqueId> {
size_t operator ()(const ncclUniqueId& uniqueId) const noexcept;
};
}
bool operator ==(const ncclUniqueId& a, const ncclUniqueId& b);
#endif
+68
Просмотреть файл
@@ -45,6 +45,9 @@
#include "hip_rocm_version_info.h"
//#include "clique/CliqueManager.h"
//#include <hsa/hsa_ext_amd.h>
#ifdef ENABLE_MSCCLPP
#include "mscclpp/mscclpp_nccl.h"
#endif
// [/RCCL]
#include "msccl/msccl_lifecycle.h"
@@ -89,6 +92,16 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) {
return h;
}
#ifdef ENABLE_MSCCLPP
// Hash for ncclUniqueId keys: reuses hashUniqueId() and narrows the result to size_t.
size_t std::hash<ncclUniqueId>::operator ()(const ncclUniqueId& uniqueId) const noexcept {
  const uint64_t digest = hashUniqueId(uniqueId);
  return static_cast<size_t>(digest);
}

// Two unique ids compare equal when all NCCL_UNIQUE_ID_BYTES of their payload match.
bool operator ==(const ncclUniqueId& a, const ncclUniqueId& b) {
  const int byteDiff = memcmp(a.internal, b.internal, NCCL_UNIQUE_ID_BYTES);
  return byteDiff == 0;
}
#endif
// GDRCOPY support: Off by default
NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
@@ -149,6 +162,11 @@ static ncclResult_t ncclInit() {
}
#ifndef NVTX_NO_IMPL
initNvtxRegisteredEnums();
#endif
#ifdef ENABLE_MSCCLPP
if (!mscclpp_init()) {
return ncclSystemError;
}
#endif
__atomic_store_n(&initialized, true, __ATOMIC_RELEASE);
}
@@ -163,12 +181,32 @@ ncclResult_t ncclGetVersion(int* version) {
return ncclSuccess;
}
#ifdef ENABLE_MSCCLPP
RCCL_PARAM(EnableMscclpp, "ENABLE_MSCCLPP", 0);
RCCL_PARAM(MscclppThreshold, "MSCCLPP_THRESHOLD", (size_t)(1024*1024));
#endif
NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
// Creates a new unique id in *out via the bootstrap layer. When MSCCL++ is
// compiled in and enabled through the EnableMscclpp parameter, a matching
// MSCCL++ unique id is also created and stored in mscclpp_uniqueIdMap under
// *out, provided the current device's architecture matches "gfx94".
ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
  NCCLCHECK(ncclInit());
  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
  ncclResult_t res = bootstrapGetUniqueId((struct ncclBootstrapHandle*)out);
  TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out));
#ifdef ENABLE_MSCCLPP
  // Nothing more to do unless the user opted in to MSCCL++.
  if (!rcclParamEnableMscclpp()) {
    return res;
  }
  // The RCCL id must be valid before an MSCCL++ id is paired with it.
  NCCLCHECK(res);
  int currentDevice;
  CUDACHECK(cudaGetDevice(&currentDevice));
  hipDeviceProp_t deviceProps;
  CUDACHECK(hipGetDeviceProperties(&deviceProps, currentDevice));
  if (!IsArchMatch(deviceProps.gcnArchName, "gfx94")) {
    WARN("MSCCL++: Cannot enable MSCCL++ on %s architecture", deviceProps.gcnArchName);
    return res;
  }
  INFO(NCCL_INIT, "MSCCL++: mscclpp_ncclGetUniqueId");
  res = mscclpp_ncclGetUniqueId(&(mscclpp_uniqueIdMap[*out]));
#endif
  return res;
}
@@ -1930,6 +1968,24 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
NCCLCHECKGOTO(initTransportsRank(comm, job->parent), res, fail);
#ifdef ENABLE_MSCCLPP
  // Optional MSCCL++ communicator setup, performed only when the user enabled
  // it and the device architecture matches "gfx94".
  if (rcclParamEnableMscclpp()) {
    hipDeviceProp_t devProp;
    // Use the GOTO variant: this function reports errors through res/fail, and
    // a plain CUDACHECK would return directly and bypass that cleanup path.
    CUDACHECKGOTO(hipGetDeviceProperties(&devProp, cudaDev), res, fail);
    comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94");
    if (comm->mscclppCompatible) {
      // Share the MSCCL++ unique id paired with this communicator's id with
      // the local ranks (root index 0) before creating the MSCCL++ communicator.
      NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, &(mscclpp_uniqueIdMap[job->commId]), sizeof(mscclpp_ncclUniqueId)), res, fail);
      INFO(NCCL_INIT, "MSCCL++: Broadcast mscclpp_ncclUniqueId to %d ranks", (comm->localRanks - 1));
      comm->mscclpp_threshold = rcclParamMscclppThreshold();
      INFO(NCCL_INIT, "MSCCL++: Enabled! Msg size threshold=%zu", comm->mscclpp_threshold);
      INFO(NCCL_INIT, "MSCCL++: mscclpp_ncclCommInitRank (nranks=%d)", job->nranks);
      NCCLCHECKGOTO(mscclpp_ncclCommInitRank(&(comm->mscclpp_comm), job->nranks, mscclpp_uniqueIdMap[job->commId], job->myrank), res, fail);
    } else {
      WARN("MSCCL++: Cannot enable MSCCL++ on %s architecture", devProp.gcnArchName);
    }
  }
#endif
NCCLCHECKGOTO(ncclLoadTunerPlugin(&comm->tuner), res, fail);
if (comm->tuner) {
NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog));
@@ -2528,6 +2584,18 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
return ncclSuccess;
}
#ifdef ENABLE_MSCCLPP
if (comm->mscclppCompatible) {
INFO(NCCL_INIT, "MSCCL++: mscclpp_ncclCommDestroy");
ncclResult_t res = mscclpp_ncclCommDestroy(comm->mscclpp_comm);
if (res != ncclSuccess) {
WARN("MSCCL++: mscclpp_ncclCommDestroy failed (%s)", ncclGetErrorString(res));
}
comm->mscclppCompatible = false;
comm->mscclpp_comm = nullptr;
}
#endif
int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
NvtxParamsCommInitRank payload{rank, nranks, cudaDev};
+45 -1
Просмотреть файл
@@ -22,6 +22,10 @@
#include "msccl/msccl_setup.h"
#include "msccl/msccl_status.h"
#ifdef ENABLE_MSCCLPP
#include "mscclpp/mscclpp_nccl.h"
#endif
RCCL_PARAM(MscclEnabled, "MSCCL_ENABLE", 1);
RCCL_PARAM(MscclForceEnabled, "MSCCL_FORCE_ENABLE", 0);
static const char* mscclAlgoFilePathEnv = "MSCCL_ALGO_FILE_PATH";
@@ -448,18 +452,58 @@ ncclResult_t mscclEnqueueCheck(
count, dataType, root, peer, op, func, comm, stream,
&threadLocalStatus.savedSchedulerParams.back()));
size_t nBytes = count * ncclTypeSize(dataType);
switch (threadLocalStatus.groupStatus) {
case mscclNoGroup:
#ifdef ENABLE_MSCCLPP
if (comm->mscclppCompatible) {
/* check if one rank per GPU and graph mode is enabled */
if ((nBytes >= 32) && (threadLocalStatus.captureStatus != mscclNoCapture) && comm->mscclCompatible) {
if (func == mscclFuncAllReduce && nBytes <= comm->mscclpp_threshold) {
INFO(NCCL_INIT, "MSCCL++: mscclpp_ncclAllReduce (groupStatus=mscclNoGroup)");
NCCLCHECK(mscclpp_ncclAllReduce(sendBuff, recvBuff, count, dataType, op, comm->mscclpp_comm, stream));
threadLocalStatus.savedSchedulerParams.clear();
break;
}
else if (func == mscclFuncAllGather && nBytes * comm->nRanks <= comm->mscclpp_threshold) {
INFO(NCCL_INIT, "MSCCL++: mscclpp_ncclAllGather (groupStatus=mscclNoGroup)");
NCCLCHECK(mscclpp_ncclAllGather(sendBuff, recvBuff, count, dataType, comm->mscclpp_comm, stream));
threadLocalStatus.savedSchedulerParams.clear();
break;
}
}
}
#endif
if (comm->mscclCompatible) {
NCCLCHECK(mscclSchedulerSelectAlgo(&threadLocalStatus.savedSchedulerParams.back()));
if (threadLocalStatus.savedSchedulerParams.back().p.scheduled) {
NCCLCHECK(mscclRunSavedParams());
break;
}
}
}
NCCLCHECK(mscclFallBackSavedParams());
break;
case mscclGroupSupportedOp:
#ifdef ENABLE_MSCCLPP
if (comm->mscclppCompatible) {
/* check if one rank per GPU and graph mode is enabled */
if ((nBytes >= 32) && (threadLocalStatus.captureStatus != mscclNoCapture) && comm->mscclCompatible) {
if (func == mscclFuncAllReduce && nBytes <= comm->mscclpp_threshold) {
INFO(NCCL_INIT, "MSCCL++: mscclpp_ncclAllReduce (groupStatus=mscclGroupSupportedOp)");
NCCLCHECK(mscclpp_ncclAllReduce(sendBuff, recvBuff, count, dataType, op, comm->mscclpp_comm, stream));
threadLocalStatus.savedSchedulerParams.clear();
break;
}
else if (func == mscclFuncAllGather && nBytes * comm->nRanks <= comm->mscclpp_threshold) {
INFO(NCCL_INIT, "MSCCL++: mscclpp_ncclAllGather (groupStatus=mscclGroupSupportedOp)");
NCCLCHECK(mscclpp_ncclAllGather(sendBuff, recvBuff, count, dataType, comm->mscclpp_comm, stream));
threadLocalStatus.savedSchedulerParams.clear();
break;
}
}
}
#endif
if (comm->mscclCompatible) {
NCCLCHECK(mscclSchedulerSelectAlgo(&threadLocalStatus.savedSchedulerParams.back()));
if (threadLocalStatus.savedSchedulerParams.back().p.scheduled) {
+46
Просмотреть файл
@@ -0,0 +1,46 @@
/*************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt and NOTICES.txt for license information
************************************************************************/
#include "mscclpp/mscclpp_nccl.h"
#include "debug.h"
#include <dlfcn.h>
#include <unordered_map>
#define MSCCLPP_DECLARE(X) decltype(mscclpp_##X) mscclpp_##X = nullptr
#define MSCCLPP_LOAD(HANDLE, X) do { \
(mscclpp_##X) = (decltype(mscclpp_##X))dlsym((HANDLE), (#X)); \
const char* error; \
if ((error = dlerror()) != nullptr) { \
WARN("MSCCL++: failed to load %s : %s", (#X), error); \
return false; \
} \
} while (false)
static const char mscclpp_nccl_lib_name[] = "libmscclpp_nccl.so";
MSCCLPP_DECLARE(ncclGetUniqueId);
MSCCLPP_DECLARE(ncclCommInitRank);
MSCCLPP_DECLARE(ncclCommDestroy);
MSCCLPP_DECLARE(ncclAllReduce);
MSCCLPP_DECLARE(ncclAllGather);
bool mscclpp_init() {
void* handle = dlopen(mscclpp_nccl_lib_name, RTLD_LAZY);
if (!handle) {
WARN("MSCCL++: failed to access %s : %s", mscclpp_nccl_lib_name, dlerror());
return false;
}
dlerror(); // Clear any errors.
MSCCLPP_LOAD(handle, ncclGetUniqueId);
MSCCLPP_LOAD(handle, ncclCommInitRank);
MSCCLPP_LOAD(handle, ncclCommDestroy);
MSCCLPP_LOAD(handle, ncclAllReduce);
MSCCLPP_LOAD(handle, ncclAllGather);
return true;
}
std::unordered_map<ncclUniqueId, mscclpp_ncclUniqueId> mscclpp_uniqueIdMap;