diff --git a/projects/rccl/cmake/MSCCLPP.cmake b/projects/rccl/cmake/MSCCLPP.cmake index 48c1e22842..6e4c672a3f 100644 --- a/projects/rccl/cmake/MSCCLPP.cmake +++ b/projects/rccl/cmake/MSCCLPP.cmake @@ -53,7 +53,7 @@ if(ENABLE_MSCCLPP) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") find_package(mscclpp_nccl) - if(NOT mscclpp_nccl_FOUND) + #if(NOT mscclpp_nccl_FOUND) # Ensure the source code is checked out set(MSCCLPP_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp CACHE PATH "") if(NOT EXISTS ${MSCCLPP_SOURCE}/CMakeLists.txt) @@ -63,11 +63,13 @@ if(ENABLE_MSCCLPP) WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) endif() + execute_process( COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) - execute_process( + + execute_process( COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) @@ -83,7 +85,6 @@ if(ENABLE_MSCCLPP) ) message(STATUS "Building mscclpp only for gfx942.") - mscclpp_cmake_arg(CMAKE_PREFIX_PATH) mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH) mscclpp_cmake_arg(HIP_COMPILER) @@ -94,10 +95,10 @@ if(ENABLE_MSCCLPP) endif() download_project(PROJ mscclpp_nccl - # GIT_REPOSITORY https://github.com/microsoft/mscclpp.git - # GIT_TAG 1e82dd444fc1ed8b7add354eebaab8a94e67d5fc + #GIT_REPOSITORY https://github.com/microsoft/mscclpp.git + #GIT_TAG 4ee15b7ad085daaf74349d4c49c9b8480d28f0dc INSTALL_DIR ${MSCCLPP_ROOT} - CMAKE_ARGS -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DBUILD_APPS_NCCL=ON -DBUILD_PYTHON_BINDINGS=OFF -DBUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX= "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json + CMAKE_ARGS -DAMDGPU_TARGETS=${GFX942_VARIANT} -DGPU_TARGETS=${GFX942_VARIANT} -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DMSCCLPP_BUILD_APPS_NCCL=ON -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX= "${CMAKE_PREFIX_PATH_ARG}" -DCMAKE_VERBOSE_MAKEFILE=1 "${CMAKE_INSTALL_RPATH_USE_LINK_PATH_ARG}" "${HIP_COMPILER_ARG}" -DFETCHCONTENT_SOURCE_DIR_JSON=${CMAKE_CURRENT_SOURCE_DIR}/ext-src/json LOG_DOWNLOAD FALSE LOG_CONFIGURE FALSE LOG_BUILD FALSE @@ -106,26 +107,28 @@ if(ENABLE_MSCCLPP) SOURCE_DIR ${MSCCLPP_SOURCE} ) + find_package(mscclpp_nccl REQUIRED) - execute_process( + execute_process( COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) + execute_process( COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) - + execute_process( COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) - execute_process( COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) - endif() + + #endif() execute_process(COMMAND objcopy --redefine-syms=${CMAKE_CURRENT_SOURCE_DIR}/src/misc/mscclpp/mscclpp_nccl_syms.txt diff --git a/projects/rccl/ext-src/mem-reg.patch b/projects/rccl/ext-src/mem-reg.patch index f95b116b9a..0e848cf6c5 100644 --- a/projects/rccl/ext-src/mem-reg.patch +++ b/projects/rccl/ext-src/mem-reg.patch @@ -1,10 +1,10 @@ diff --git a/apps/nccl/include/nccl.h b/apps/nccl/include/nccl.h -index 7f50792..b8b146d 100644 +index bfdb226..7fd07a8 100644 --- a/apps/nccl/include/nccl.h +++ b/apps/nccl/include/nccl.h -@@ -344,6 +344,13 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun - ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, - ncclComm_t comm, cudaStream_t stream); +@@ -167,6 +167,14 @@ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); + ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); + ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); +/* + * Register/Deregister @@ -13,14 +13,15 @@ index 7f50792..b8b146d 100644 +ncclResult_t ncclCommDeregister(ncclComm_t comm, void* handle); +bool mscclpp_BuffIsRegistered(ncclComm_t comm, const void* buff, size_t count); +size_t mscclpp_BufferSize(ncclComm_t comm, void* handle); - /* - * Send - * ++ + /* Reduction operation selector */ + typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; + typedef enum { diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu -index a697be2..1d4af61 100644 +index 022d398..2a39643 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu -@@ -65,6 +65,7 @@ struct ncclComm { +@@ -85,6 +85,7 @@ struct ncclComm { std::unordered_map channelInInfos; std::unordered_map channelOutInfos; std::unordered_map channelScratchInfos; @@ -28,7 +29,7 @@ index a697be2..1d4af61 100644 std::shared_ptr scratchBuff; std::vector remoteScratchRegMemories; -@@ -73,6 +74,11 @@ struct ncclComm { +@@ -92,6 +93,11 @@ struct ncclComm { uint32_t buffFlag; }; @@ -40,8 +41,8 @@ index a697be2..1d4af61 100644 static size_t ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: -@@ -577,6 +583,104 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t - return ncclSuccess; +@@ -561,6 +567,105 @@ NCCL_API ncclResult_t ncclRedOpDestroy(ncclRedOp_t, ncclComm_t) { + return ncclInternalError; } +NCCL_API ncclResult_t ncclCommRegister(ncclComm_t comm, void* buff, size_t size, void** handle) { @@ -142,6 +143,7 @@ index a697be2..1d4af61 100644 + auto buffKeyIt = comm->handleKeys.find(handle); + return buffKeyIt != comm->handleKeys.end() ? buffKeyIt->second.bytes : 0; +} - NCCL_API ncclResult_t ncclSend(const void*, size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t) { ++ + NCCL_API ncclResult_t ncclReduce(const void*, void*, size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t, + cudaStream_t) { // TODO: implement this function - return ncclInternalError; diff --git a/projects/rccl/ext-src/mscclpp b/projects/rccl/ext-src/mscclpp index 1e82dd444f..4ee15b7ad0 160000 --- a/projects/rccl/ext-src/mscclpp +++ b/projects/rccl/ext-src/mscclpp @@ -1 +1 @@ -Subproject commit 1e82dd444fc1ed8b7add354eebaab8a94e67d5fc +Subproject commit 4ee15b7ad085daaf74349d4c49c9b8480d28f0dc diff --git a/projects/rccl/ext-src/read-allred.patch b/projects/rccl/ext-src/read-allred.patch index a51f42b779..09f81f1a13 100644 --- a/projects/rccl/ext-src/read-allred.patch +++ b/projects/rccl/ext-src/read-allred.patch @@ -1,8 +1,8 @@ diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp -index 1b85136..ee90c2f 100644 +index 4134241..d65be4b 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp -@@ -386,24 +386,353 @@ __global__ void __launch_bounds__(512, 1) +@@ -495,24 +495,348 @@ __global__ void __launch_bounds__(512, 1) } } @@ -311,9 +311,6 @@ index 1b85136..ee90c2f 100644 + } + +} -+ -+ -+ + template cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, @@ -326,7 +323,7 @@ index 1b85136..ee90c2f 100644 + int readAllred = 0, hieAllred = 0; + char* envValue = nullptr; + char* envValue1 = nullptr; -+ + + nRanksPerNode = (worldSize < nRanksPerNode) ? worldSize : nRanksPerNode; + + envValue = std::getenv("MSCCLPP_READ_ALLRED"); @@ -342,8 +339,6 @@ index 1b85136..ee90c2f 100644 + hieAllred = 1; + } + } -+ - if (sizeof(T) * nelems < worldSize * sizeof(int)) { int nBlocks = 7; int nThreadsPerBlock = 32; @@ -361,8 +356,8 @@ index 1b85136..ee90c2f 100644 + nBlocks = 8*(nRanksPerNode - 1); nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024; } - allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, -@@ -412,9 +741,21 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< + #if defined(ENABLE_NPKIT) +@@ -528,9 +852,21 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< } else { int nBlocks = 35; int nThreadsPerBlock = 512; @@ -388,24 +383,25 @@ index 1b85136..ee90c2f 100644 return cudaGetLastError(); diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp -index 25c74e7..5e85468 100644 +index 015e0a2..f8ba6d6 100644 --- a/apps/nccl/src/common.hpp +++ b/apps/nccl/src/common.hpp -@@ -11,7 +11,9 @@ +@@ -13,8 +13,10 @@ #define WARP_SIZE 32 #endif +constexpr int NRANKS1_PER_NODE = 4; constexpr int NRANKS_PER_NODE = 8; - constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB + constexpr int NPEERS = 7; +constexpr int NPEER = 7; - #endif // NCCL_COMMON_HPP_ + constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB + diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu -index cb0e7d5..a697be2 100644 +index f91d15e..022d398 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu -@@ -49,7 +49,9 @@ struct hash { +@@ -70,7 +70,9 @@ struct hash { struct ChannelInfo { std::vector smChannels; @@ -415,18 +411,15 @@ index cb0e7d5..a697be2 100644 }; struct ncclComm { -@@ -212,8 +214,10 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, - int rank = comm->comm->bootstrap()->getRank(); - channelKey sendKey{(void*)sendBasePtr, sendBytes}; +@@ -213,6 +215,7 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, channelKey recvKey{(void*)recvBasePtr, recvBytes}; -+ mscclpp::DeviceHandle* smChannels = nullptr; mscclpp::DeviceHandle* smOutChannels = nullptr; + mscclpp::DeviceHandle* smScrChannels = nullptr; // Creating the channels - if (count * ncclTypeSize(datatype) <= comm->largeMessageSizeBoundary) { -@@ -221,19 +225,25 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, + if (count * ncclTypeSize(datatype) <= (1 << 20)) { +@@ -220,19 +223,24 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, if (sendIt == comm->channelScratchInfos.end()) { std::vector channels = setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); @@ -438,8 +431,8 @@ index cb0e7d5..a697be2 100644 smChannels = sendIt->second.smChannelDeviceHandles.get(); } else { std::vector remoteMemories; +- + std::vector remoteMemories1; - auto sendIt = comm->channelInInfos.find(sendKey); if (sendIt == comm->channelInInfos.end()) { std::vector channels = @@ -454,7 +447,7 @@ index cb0e7d5..a697be2 100644 sendIt = comm->channelInInfos.emplace(sendKey, channelInfo).first; } -@@ -243,35 +253,36 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, +@@ -242,35 +250,36 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, setupRemoteMemories(comm->comm, rank, (void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc); std::vector outChannels = setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); @@ -501,8 +494,8 @@ index cb0e7d5..a697be2 100644 + NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; default: - return ncclInvalidArgument; -@@ -550,7 +561,7 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t + WARN("datatype is invalid"); +@@ -315,7 +324,7 @@ static ncclResult_t ncclAllGatherFallback(const void* sendbuff, void* recvbuff, std::vector> smChannelDeviceHandles; std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); @@ -511,3 +504,29 @@ index cb0e7d5..a697be2 100644 it = comm->channelOutInfos.emplace(recvKey, channelInfo).first; } +@@ -597,7 +606,7 @@ NCCL_API ncclResult_t ncclBroadcastFallback(const void* sendbuff, void* recvbuff + std::vector> smChannelDeviceHandles; + std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), + [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); +- ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; ++ ChannelInfo channelInfo{channels, channels, setupSmChannelDeviceHandles(channels), setupSmChannelDeviceHandles(channels)}; + it = comm->channelOutInfos.emplace(recvKey, channelInfo).first; + } + +@@ -805,16 +814,6 @@ NCCL_API ncclResult_t ncclGroupEnd() { + return ncclSuccess; + } + +-NCCL_API ncclResult_t ncclCommRegister(const ncclComm_t, void*, size_t, void**) { +- // TODO: Implementation +- return ncclSuccess; +-} +- +-NCCL_API ncclResult_t ncclCommDeregister(const ncclComm_t, void*) { +- // TODO: Implementation +- return ncclSuccess; +-} +- + ncclResult_t ncclMemAlloc(void** ptr, size_t size) { + if (ptr == nullptr || size == 0) { + WARN("ptr is nullptr or size is 0"); diff --git a/projects/rccl/src/include/mscclpp/mscclpp_nccl.h b/projects/rccl/src/include/mscclpp/mscclpp_nccl.h index 760405e499..1bd3d8168d 100644 --- a/projects/rccl/src/include/mscclpp/mscclpp_nccl.h +++ b/projects/rccl/src/include/mscclpp/mscclpp_nccl.h @@ -46,6 +46,10 @@ extern "C" { bool mscclpp_BuffIsRegistered(mscclppComm_t comm, const void* buff, size_t count); size_t mscclpp_BufferSize(mscclppComm_t comm, void* handle); + + ncclResult_t mscclpp_ncclMemAlloc(void** ptr, size_t size); + + ncclResult_t mscclpp_ncclMemFree(void* ptr); } namespace std { diff --git a/projects/rccl/src/misc/mscclpp/mscclpp_nccl_syms.txt b/projects/rccl/src/misc/mscclpp/mscclpp_nccl_syms.txt index bb2bd858dc..1c11dcf048 100644 --- a/projects/rccl/src/misc/mscclpp/mscclpp_nccl_syms.txt +++ b/projects/rccl/src/misc/mscclpp/mscclpp_nccl_syms.txt @@ -32,3 +32,5 @@ ncclReduceScatter mscclpp_ncclReduceScatter ncclSend mscclpp_ncclSend ncclCommRegister mscclpp_ncclCommRegister ncclCommDeregister mscclpp_ncclCommDeregister +ncclMemAlloc mscclpp_ncclMemAlloc +ncclMemFree mscclpp_ncclMemFree