update mscclpp (#1488)

* update commit hash for mscclpp submodule

* update mscclpp submodule

* remove print messages in cmake

* add back some print messages, update MSCLPP CMAKE_ARGS

* enable MSCCL++ patches regardless of finding mscclpp_nccl package

[ROCm/rccl commit: d89432e8c8]
이 커밋은 다음에 포함됨:
isaki001
2025-01-20 08:06:43 -06:00
커밋한 사람 GitHub
부모 8e6bedeedc
커밋 25150b1f20
6개의 변경된 파일81개의 추가작업 그리고 51개의 파일을 삭제
+13 -10
파일 보기
@@ -53,7 +53,7 @@ if(ENABLE_MSCCLPP)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
find_package(mscclpp_nccl)
if(NOT mscclpp_nccl_FOUND)
#if(NOT mscclpp_nccl_FOUND)
# Ensure the source code is checked out
set(MSCCLPP_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp CACHE PATH "")
if(NOT EXISTS ${MSCCLPP_SOURCE}/CMakeLists.txt)
@@ -63,11 +63,13 @@ if(ENABLE_MSCCLPP)
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
endif()
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
execute_process(
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
@@ -83,7 +85,6 @@ if(ENABLE_MSCCLPP)
)
message(STATUS "Building mscclpp only for gfx942.")
mscclpp_cmake_arg(CMAKE_PREFIX_PATH)
mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH)
mscclpp_cmake_arg(HIP_COMPILER)
@@ -94,10 +95,10 @@ if(ENABLE_MSCCLPP)
endif()
download_project(PROJ mscclpp_nccl
# GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
# GIT_TAG 1e82dd444fc1ed8b7add354eebaab8a94e67d5fc
#GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
#GIT_TAG 4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
INSTALL_DIR ${MSCCLPP_ROOT}
CMAKE_ARGS -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_APPS_NCCL=ON -DBUILD_PYTHON_BINDINGS=OFF -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json
CMAKE_ARGS -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json
LOG_DOWNLOAD FALSE
LOG_CONFIGURE FALSE
LOG_BUILD FALSE
@@ -106,26 +107,28 @@ if(ENABLE_MSCCLPP)
SOURCE_DIR ${MSCCLPP_SOURCE}
)
find_package(mscclpp_nccl REQUIRED)
execute_process(
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
execute_process(
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
)
endif()
#endif()
execute_process(COMMAND objcopy
--redefine-syms=${CMAKE_CURRENT_SOURCE_DIR}/src/misc/mscclpp/mscclpp_nccl_syms.txt
+16 -14
파일 보기
@@ -1,10 +1,10 @@
diff --git a/apps/nccl/include/nccl.h b/apps/nccl/include/nccl.h
index 7f50792..b8b146d 100644
index bfdb226..7fd07a8 100644
--- a/apps/nccl/include/nccl.h
+++ b/apps/nccl/include/nccl.h
@@ -344,6 +344,13 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
ncclComm_t comm, cudaStream_t stream);
@@ -167,6 +167,14 @@ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
+/*
+ * Register/Deregister
@@ -13,14 +13,15 @@ index 7f50792..b8b146d 100644
+ncclResult_t ncclCommDeregister(ncclComm_t comm, void* handle);
+bool mscclpp_BuffIsRegistered(ncclComm_t comm, const void* buff, size_t count);
+size_t mscclpp_BufferSize(ncclComm_t comm, void* handle);
/*
* Send
*
+
/* Reduction operation selector */
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
typedef enum {
diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu
index a697be2..1d4af61 100644
index 022d398..2a39643 100644
--- a/apps/nccl/src/nccl.cu
+++ b/apps/nccl/src/nccl.cu
@@ -65,6 +65,7 @@ struct ncclComm {
@@ -85,6 +85,7 @@ struct ncclComm {
std::unordered_map<channelKey, ChannelInfo> channelInInfos;
std::unordered_map<channelKey, ChannelInfo> channelOutInfos;
std::unordered_map<channelKey, ChannelInfo> channelScratchInfos;
@@ -28,7 +29,7 @@ index a697be2..1d4af61 100644
std::shared_ptr<char> scratchBuff;
std::vector<mscclpp::RegisteredMemory> remoteScratchRegMemories;
@@ -73,6 +74,11 @@ struct ncclComm {
@@ -92,6 +93,11 @@ struct ncclComm {
uint32_t buffFlag;
};
@@ -40,8 +41,8 @@ index a697be2..1d4af61 100644
static size_t ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
@@ -577,6 +583,104 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
return ncclSuccess;
@@ -561,6 +567,105 @@ NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t, ncclComm_t) {
return ncclInternalError;
}
+NCCL_API ncclResult_t ncclCommRegister(ncclComm_t comm, void* buff, size_t size, void** handle) {
@@ -142,6 +143,7 @@ index a697be2..1d4af61 100644
+ auto buffKeyIt = comm->handleKeys.find(handle);
+ return buffKeyIt != comm->handleKeys.end() ? buffKeyIt->second.bytes : 0;
+}
NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) {
+
NCCL_API ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t,
cudaStream_t) {
// TODO: implement this function
return ncclInternalError;
서브모듈 projects/rccl/ext-src/mscclpp 업데이트됨: 1e82dd444f...4ee15b7ad0
+45 -26
파일 보기
@@ -1,8 +1,8 @@
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 1b85136..ee90c2f 100644
index 4134241..d65be4b 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -386,24 +386,353 @@ __global__ void __launch_bounds__(512, 1)
@@ -495,24 +495,348 @@ __global__ void __launch_bounds__(512, 1)
}
}
@@ -311,9 +311,6 @@ index 1b85136..ee90c2f 100644
+ }
+
+}
+
+
+
+
template <typename T>
cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
@@ -326,7 +323,7 @@ index 1b85136..ee90c2f 100644
+ int readAllred = 0, hieAllred = 0;
+ char* envValue = nullptr;
+ char* envValue1 = nullptr;
+
+ nRanksPerNode = (worldSize < nRanksPerNode) ? worldSize : nRanksPerNode;
+
+ envValue = std::getenv("MSCCLPP_READ_ALLRED");
@@ -342,8 +339,6 @@ index 1b85136..ee90c2f 100644
+ hieAllred = 1;
+ }
+ }
+
if (sizeof(T) * nelems < worldSize * sizeof(int)) {
int nBlocks = 7;
int nThreadsPerBlock = 32;
@@ -361,8 +356,8 @@ index 1b85136..ee90c2f 100644
+ nBlocks = 8*(nRanksPerNode - 1);
nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
}
allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
@@ -412,9 +741,21 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
#if defined(ENABLE_NPKIT)
@@ -528,9 +852,21 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
} else {
int nBlocks = 35;
int nThreadsPerBlock = 512;
@@ -388,24 +383,25 @@ index 1b85136..ee90c2f 100644
return cudaGetLastError();
diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp
index 25c74e7..5e85468 100644
index 015e0a2..f8ba6d6 100644
--- a/apps/nccl/src/common.hpp
+++ b/apps/nccl/src/common.hpp
@@ -11,7 +11,9 @@
@@ -13,8 +13,10 @@
#define WARP_SIZE 32
#endif
+constexpr int NRANKS1_PER_NODE = 4;
constexpr int NRANKS_PER_NODE = 8;
constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
constexpr int NPEERS = 7;
+constexpr int NPEER = 7;
#endif // NCCL_COMMON_HPP_
constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu
index cb0e7d5..a697be2 100644
index f91d15e..022d398 100644
--- a/apps/nccl/src/nccl.cu
+++ b/apps/nccl/src/nccl.cu
@@ -49,7 +49,9 @@ struct hash<channelKey> {
@@ -70,7 +70,9 @@ struct hash<channelKey> {
struct ChannelInfo {
std::vector<mscclpp::SmChannel> smChannels;
@@ -415,18 +411,15 @@ index cb0e7d5..a697be2 100644
};
struct ncclComm {
@@ -212,8 +214,10 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
int rank = comm->comm->bootstrap()->getRank();
channelKey sendKey{(void*)sendBasePtr, sendBytes};
@@ -213,6 +215,7 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
channelKey recvKey{(void*)recvBasePtr, recvBytes};
+
mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels = nullptr;
mscclpp::DeviceHandle<mscclpp::SmChannel>* smOutChannels = nullptr;
+ mscclpp::DeviceHandle<mscclpp::SmChannel>* smScrChannels = nullptr;
// Creating the channels
if (count * ncclTypeSize(datatype) <= comm->largeMessageSizeBoundary) {
@@ -221,19 +225,25 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
if (count * ncclTypeSize(datatype) <= (1 << 20)) {
@@ -220,19 +223,24 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
if (sendIt == comm->channelScratchInfos.end()) {
std::vector<mscclpp::SmChannel> channels =
setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast<void*>((void*)sendBasePtr));
@@ -438,8 +431,8 @@ index cb0e7d5..a697be2 100644
smChannels = sendIt->second.smChannelDeviceHandles.get();
} else {
std::vector<mscclpp::RegisteredMemory> remoteMemories;
-
+ std::vector<mscclpp::RegisteredMemory> remoteMemories1;
auto sendIt = comm->channelInInfos.find(sendKey);
if (sendIt == comm->channelInInfos.end()) {
std::vector<mscclpp::SmChannel> channels =
@@ -454,7 +447,7 @@ index cb0e7d5..a697be2 100644
sendIt = comm->channelInInfos.emplace(sendKey, channelInfo).first;
}
@@ -243,35 +253,36 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
@@ -242,35 +250,36 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
setupRemoteMemories(comm->comm, rank, (void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc);
std::vector<mscclpp::SmChannel> outChannels =
setupSmChannels(comm, remoteMemories, const_cast<void*>((void*)recvBasePtr));
@@ -501,8 +494,8 @@ index cb0e7d5..a697be2 100644
+ NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream));
break;
default:
return ncclInvalidArgument;
@@ -550,7 +561,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
WARN("datatype is invalid");
@@ -315,7 +324,7 @@ static ncclResult_t ncclAllGatherFallback(const void* sendbuff, void* recvbuff,
std::vector<mscclpp::DeviceHandle<mscclpp::SmChannel>> smChannelDeviceHandles;
std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles),
[](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
@@ -511,3 +504,29 @@ index cb0e7d5..a697be2 100644
it = comm->channelOutInfos.emplace(recvKey, channelInfo).first;
}
@@ -597,7 +606,7 @@ NCCL_API ncclResult_t ncclBroadcastFallback(const void* sendbuff, void* recvbuff
std::vector<mscclpp::DeviceHandle<mscclpp::SmChannel>> smChannelDeviceHandles;
std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles),
[](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
- ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)};
+ ChannelInfo channelInfo{channels, channels, setupSmChannelDeviceHandles(channels), setupSmChannelDeviceHandles(channels)};
it = comm->channelOutInfos.emplace(recvKey, channelInfo).first;
}
@@ -805,16 +814,6 @@ NCCL_API ncclResult_t ncclGroupEnd() {
return ncclSuccess;
}
-NCCL_API ncclResult_t ncclCommRegister(const ncclComm_t, void*, size_t, void**) {
- // TODO: Implementation
- return ncclSuccess;
-}
-
-NCCL_API ncclResult_t ncclCommDeregister(const ncclComm_t, void*) {
- // TODO: Implementation
- return ncclSuccess;
-}
-
ncclResult_t ncclMemAlloc(void** ptr, size_t size) {
if (ptr == nullptr || size == 0) {
WARN("ptr is nullptr or size is 0");
+4
파일 보기
@@ -46,6 +46,10 @@ extern "C" {
bool mscclpp_BuffIsRegistered(mscclppComm_t comm, const void* buff, size_t count);
size_t mscclpp_BufferSize(mscclppComm_t comm, void* handle);
ncclResult_t mscclpp_ncclMemAlloc(void** ptr, size_t size);
ncclResult_t mscclpp_ncclMemFree(void* ptr);
}
namespace std {
+2
파일 보기
@@ -32,3 +32,5 @@ ncclReduceScatter mscclpp_ncclReduceScatter
ncclSend mscclpp_ncclSend
ncclCommRegister mscclpp_ncclCommRegister
ncclCommDeregister mscclpp_ncclCommDeregister
ncclMemAlloc mscclpp_ncclMemAlloc
ncclMemFree mscclpp_ncclMemFree