update mscclpp (#1488)
* update commit hash for mscclpp submodule * update mscclpp submodule * remove print messages in cmake * add back some print messages, update MSCLPP CMAKE_ARGS * enable MSCCL++ patches regardless of finding mscclpp_nccl package
Tento commit je obsažen v:
+13
-10
@@ -53,7 +53,7 @@ if(ENABLE_MSCCLPP)
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
|
||||
find_package(mscclpp_nccl)
|
||||
|
||||
if(NOT mscclpp_nccl_FOUND)
|
||||
#if(NOT mscclpp_nccl_FOUND)
|
||||
# Ensure the source code is checked out
|
||||
set(MSCCLPP_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp CACHE PATH "")
|
||||
if(NOT EXISTS ${MSCCLPP_SOURCE}/CMakeLists.txt)
|
||||
@@ -63,11 +63,13 @@ if(ENABLE_MSCCLPP)
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
execute_process(
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
@@ -83,7 +85,6 @@ if(ENABLE_MSCCLPP)
|
||||
)
|
||||
|
||||
message(STATUS "Building mscclpp only for gfx942.")
|
||||
|
||||
mscclpp_cmake_arg(CMAKE_PREFIX_PATH)
|
||||
mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH)
|
||||
mscclpp_cmake_arg(HIP_COMPILER)
|
||||
@@ -94,10 +95,10 @@ if(ENABLE_MSCCLPP)
|
||||
endif()
|
||||
|
||||
download_project(PROJ mscclpp_nccl
|
||||
# GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
|
||||
# GIT_TAG 1e82dd444fc1ed8b7add354eebaab8a94e67d5fc
|
||||
#GIT_REPOSITORY https://github.com/microsoft/mscclpp.git
|
||||
#GIT_TAG 4ee15b7ad085daaf74349d4c49c9b8480d28f0dc
|
||||
INSTALL_DIR ${MSCCLPP_ROOT}
|
||||
CMAKE_ARGS -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_APPS_NCCL=ON -DBUILD_PYTHON_BINDINGS=OFF -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json
|
||||
CMAKE_ARGS -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json
|
||||
LOG_DOWNLOAD FALSE
|
||||
LOG_CONFIGURE FALSE
|
||||
LOG_BUILD FALSE
|
||||
@@ -106,26 +107,28 @@ if(ENABLE_MSCCLPP)
|
||||
SOURCE_DIR ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
|
||||
find_package(mscclpp_nccl REQUIRED)
|
||||
execute_process(
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
endif()
|
||||
|
||||
#endif()
|
||||
|
||||
execute_process(COMMAND objcopy
|
||||
--redefine-syms=${CMAKE_CURRENT_SOURCE_DIR}/src/misc/mscclpp/mscclpp_nccl_syms.txt
|
||||
|
||||
+16
-14
@@ -1,10 +1,10 @@
|
||||
diff --git a/apps/nccl/include/nccl.h b/apps/nccl/include/nccl.h
|
||||
index 7f50792..b8b146d 100644
|
||||
index bfdb226..7fd07a8 100644
|
||||
--- a/apps/nccl/include/nccl.h
|
||||
+++ b/apps/nccl/include/nccl.h
|
||||
@@ -344,6 +344,13 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun
|
||||
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
@@ -167,6 +167,14 @@ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
|
||||
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
|
||||
|
||||
+/*
|
||||
+ * Register/Deregister
|
||||
@@ -13,14 +13,15 @@ index 7f50792..b8b146d 100644
|
||||
+ncclResult_t ncclCommDeregister(ncclComm_t comm, void* handle);
|
||||
+bool mscclpp_BuffIsRegistered(ncclComm_t comm, const void* buff, size_t count);
|
||||
+size_t mscclpp_BufferSize(ncclComm_t comm, void* handle);
|
||||
/*
|
||||
* Send
|
||||
*
|
||||
+
|
||||
/* Reduction operation selector */
|
||||
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
|
||||
typedef enum {
|
||||
diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu
|
||||
index a697be2..1d4af61 100644
|
||||
index 022d398..2a39643 100644
|
||||
--- a/apps/nccl/src/nccl.cu
|
||||
+++ b/apps/nccl/src/nccl.cu
|
||||
@@ -65,6 +65,7 @@ struct ncclComm {
|
||||
@@ -85,6 +85,7 @@ struct ncclComm {
|
||||
std::unordered_map<channelKey, ChannelInfo> channelInInfos;
|
||||
std::unordered_map<channelKey, ChannelInfo> channelOutInfos;
|
||||
std::unordered_map<channelKey, ChannelInfo> channelScratchInfos;
|
||||
@@ -28,7 +29,7 @@ index a697be2..1d4af61 100644
|
||||
std::shared_ptr<char> scratchBuff;
|
||||
std::vector<mscclpp::RegisteredMemory> remoteScratchRegMemories;
|
||||
|
||||
@@ -73,6 +74,11 @@ struct ncclComm {
|
||||
@@ -92,6 +93,11 @@ struct ncclComm {
|
||||
uint32_t buffFlag;
|
||||
};
|
||||
|
||||
@@ -40,8 +41,8 @@ index a697be2..1d4af61 100644
|
||||
static size_t ncclTypeSize(ncclDataType_t type) {
|
||||
switch (type) {
|
||||
case ncclInt8:
|
||||
@@ -577,6 +583,104 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
|
||||
return ncclSuccess;
|
||||
@@ -561,6 +567,105 @@ NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t, ncclComm_t) {
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
+NCCL_API ncclResult_t ncclCommRegister(ncclComm_t comm, void* buff, size_t size, void** handle) {
|
||||
@@ -142,6 +143,7 @@ index a697be2..1d4af61 100644
|
||||
+ auto buffKeyIt = comm->handleKeys.find(handle);
|
||||
+ return buffKeyIt != comm->handleKeys.end() ? buffKeyIt->second.bytes : 0;
|
||||
+}
|
||||
NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) {
|
||||
+
|
||||
NCCL_API ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t,
|
||||
cudaStream_t) {
|
||||
// TODO: implement this function
|
||||
return ncclInternalError;
|
||||
|
||||
+1
-1
Submodul ext-src/mscclpp aktualizován: 1e82dd444f...4ee15b7ad0
@@ -1,8 +1,8 @@
|
||||
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
|
||||
index 1b85136..ee90c2f 100644
|
||||
index 4134241..d65be4b 100644
|
||||
--- a/apps/nccl/src/allreduce.hpp
|
||||
+++ b/apps/nccl/src/allreduce.hpp
|
||||
@@ -386,24 +386,353 @@ __global__ void __launch_bounds__(512, 1)
|
||||
@@ -495,24 +495,348 @@ __global__ void __launch_bounds__(512, 1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -311,9 +311,6 @@ index 1b85136..ee90c2f 100644
|
||||
+ }
|
||||
+
|
||||
+}
|
||||
+
|
||||
+
|
||||
+
|
||||
+
|
||||
template <typename T>
|
||||
cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
|
||||
@@ -326,7 +323,7 @@ index 1b85136..ee90c2f 100644
|
||||
+ int readAllred = 0, hieAllred = 0;
|
||||
+ char* envValue = nullptr;
|
||||
+ char* envValue1 = nullptr;
|
||||
+
|
||||
|
||||
+ nRanksPerNode = (worldSize < nRanksPerNode) ? worldSize : nRanksPerNode;
|
||||
+
|
||||
+ envValue = std::getenv("MSCCLPP_READ_ALLRED");
|
||||
@@ -342,8 +339,6 @@ index 1b85136..ee90c2f 100644
|
||||
+ hieAllred = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
|
||||
if (sizeof(T) * nelems < worldSize * sizeof(int)) {
|
||||
int nBlocks = 7;
|
||||
int nThreadsPerBlock = 32;
|
||||
@@ -361,8 +356,8 @@ index 1b85136..ee90c2f 100644
|
||||
+ nBlocks = 8*(nRanksPerNode - 1);
|
||||
nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
|
||||
}
|
||||
allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
|
||||
@@ -412,9 +741,21 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
|
||||
#if defined(ENABLE_NPKIT)
|
||||
@@ -528,9 +852,21 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
|
||||
} else {
|
||||
int nBlocks = 35;
|
||||
int nThreadsPerBlock = 512;
|
||||
@@ -388,24 +383,25 @@ index 1b85136..ee90c2f 100644
|
||||
|
||||
return cudaGetLastError();
|
||||
diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp
|
||||
index 25c74e7..5e85468 100644
|
||||
index 015e0a2..f8ba6d6 100644
|
||||
--- a/apps/nccl/src/common.hpp
|
||||
+++ b/apps/nccl/src/common.hpp
|
||||
@@ -11,7 +11,9 @@
|
||||
@@ -13,8 +13,10 @@
|
||||
#define WARP_SIZE 32
|
||||
#endif
|
||||
|
||||
+constexpr int NRANKS1_PER_NODE = 4;
|
||||
constexpr int NRANKS_PER_NODE = 8;
|
||||
constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
|
||||
constexpr int NPEERS = 7;
|
||||
+constexpr int NPEER = 7;
|
||||
|
||||
#endif // NCCL_COMMON_HPP_
|
||||
constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
|
||||
|
||||
diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu
|
||||
index cb0e7d5..a697be2 100644
|
||||
index f91d15e..022d398 100644
|
||||
--- a/apps/nccl/src/nccl.cu
|
||||
+++ b/apps/nccl/src/nccl.cu
|
||||
@@ -49,7 +49,9 @@ struct hash<channelKey> {
|
||||
@@ -70,7 +70,9 @@ struct hash<channelKey> {
|
||||
|
||||
struct ChannelInfo {
|
||||
std::vector<mscclpp::SmChannel> smChannels;
|
||||
@@ -415,18 +411,15 @@ index cb0e7d5..a697be2 100644
|
||||
};
|
||||
|
||||
struct ncclComm {
|
||||
@@ -212,8 +214,10 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
|
||||
int rank = comm->comm->bootstrap()->getRank();
|
||||
channelKey sendKey{(void*)sendBasePtr, sendBytes};
|
||||
@@ -213,6 +215,7 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
|
||||
channelKey recvKey{(void*)recvBasePtr, recvBytes};
|
||||
+
|
||||
mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels = nullptr;
|
||||
mscclpp::DeviceHandle<mscclpp::SmChannel>* smOutChannels = nullptr;
|
||||
+ mscclpp::DeviceHandle<mscclpp::SmChannel>* smScrChannels = nullptr;
|
||||
|
||||
// Creating the channels
|
||||
if (count * ncclTypeSize(datatype) <= comm->largeMessageSizeBoundary) {
|
||||
@@ -221,19 +225,25 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
|
||||
if (count * ncclTypeSize(datatype) <= (1 << 20)) {
|
||||
@@ -220,19 +223,24 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
|
||||
if (sendIt == comm->channelScratchInfos.end()) {
|
||||
std::vector<mscclpp::SmChannel> channels =
|
||||
setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast<void*>((void*)sendBasePtr));
|
||||
@@ -438,8 +431,8 @@ index cb0e7d5..a697be2 100644
|
||||
smChannels = sendIt->second.smChannelDeviceHandles.get();
|
||||
} else {
|
||||
std::vector<mscclpp::RegisteredMemory> remoteMemories;
|
||||
-
|
||||
+ std::vector<mscclpp::RegisteredMemory> remoteMemories1;
|
||||
|
||||
auto sendIt = comm->channelInInfos.find(sendKey);
|
||||
if (sendIt == comm->channelInInfos.end()) {
|
||||
std::vector<mscclpp::SmChannel> channels =
|
||||
@@ -454,7 +447,7 @@ index cb0e7d5..a697be2 100644
|
||||
sendIt = comm->channelInInfos.emplace(sendKey, channelInfo).first;
|
||||
}
|
||||
|
||||
@@ -243,35 +253,36 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
|
||||
@@ -242,35 +250,36 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff,
|
||||
setupRemoteMemories(comm->comm, rank, (void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc);
|
||||
std::vector<mscclpp::SmChannel> outChannels =
|
||||
setupSmChannels(comm, remoteMemories, const_cast<void*>((void*)recvBasePtr));
|
||||
@@ -501,8 +494,8 @@ index cb0e7d5..a697be2 100644
|
||||
+ NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream));
|
||||
break;
|
||||
default:
|
||||
return ncclInvalidArgument;
|
||||
@@ -550,7 +561,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
|
||||
WARN("datatype is invalid");
|
||||
@@ -315,7 +324,7 @@ static ncclResult_t ncclAllGatherFallback(const void* sendbuff, void* recvbuff,
|
||||
std::vector<mscclpp::DeviceHandle<mscclpp::SmChannel>> smChannelDeviceHandles;
|
||||
std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles),
|
||||
[](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
|
||||
@@ -511,3 +504,29 @@ index cb0e7d5..a697be2 100644
|
||||
it = comm->channelOutInfos.emplace(recvKey, channelInfo).first;
|
||||
}
|
||||
|
||||
@@ -597,7 +606,7 @@ NCCL_API ncclResult_t ncclBroadcastFallback(const void* sendbuff, void* recvbuff
|
||||
std::vector<mscclpp::DeviceHandle<mscclpp::SmChannel>> smChannelDeviceHandles;
|
||||
std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles),
|
||||
[](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
|
||||
- ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)};
|
||||
+ ChannelInfo channelInfo{channels, channels, setupSmChannelDeviceHandles(channels), setupSmChannelDeviceHandles(channels)};
|
||||
it = comm->channelOutInfos.emplace(recvKey, channelInfo).first;
|
||||
}
|
||||
|
||||
@@ -805,16 +814,6 @@ NCCL_API ncclResult_t ncclGroupEnd() {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
-NCCL_API ncclResult_t ncclCommRegister(const ncclComm_t, void*, size_t, void**) {
|
||||
- // TODO: Implementation
|
||||
- return ncclSuccess;
|
||||
-}
|
||||
-
|
||||
-NCCL_API ncclResult_t ncclCommDeregister(const ncclComm_t, void*) {
|
||||
- // TODO: Implementation
|
||||
- return ncclSuccess;
|
||||
-}
|
||||
-
|
||||
ncclResult_t ncclMemAlloc(void** ptr, size_t size) {
|
||||
if (ptr == nullptr || size == 0) {
|
||||
WARN("ptr is nullptr or size is 0");
|
||||
|
||||
@@ -46,6 +46,10 @@ extern "C" {
|
||||
bool mscclpp_BuffIsRegistered(mscclppComm_t comm, const void* buff, size_t count);
|
||||
|
||||
size_t mscclpp_BufferSize(mscclppComm_t comm, void* handle);
|
||||
|
||||
ncclResult_t mscclpp_ncclMemAlloc(void** ptr, size_t size);
|
||||
|
||||
ncclResult_t mscclpp_ncclMemFree(void* ptr);
|
||||
}
|
||||
|
||||
namespace std {
|
||||
|
||||
@@ -32,3 +32,5 @@ ncclReduceScatter mscclpp_ncclReduceScatter
|
||||
ncclSend mscclpp_ncclSend
|
||||
ncclCommRegister mscclpp_ncclCommRegister
|
||||
ncclCommDeregister mscclpp_ncclCommDeregister
|
||||
ncclMemAlloc mscclpp_ncclMemAlloc
|
||||
ncclMemFree mscclpp_ncclMemFree
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele