diff --git a/cmake/MSCCLPP.cmake b/cmake/MSCCLPP.cmake
index cabc08b32e..15fbb60b27 100644
--- a/cmake/MSCCLPP.cmake
+++ b/cmake/MSCCLPP.cmake
@@ -85,6 +85,16 @@ if(ENABLE_MSCCLPP)
             WORKING_DIRECTORY ${MSCCLPP_SOURCE}
         )
 
+        # Round nInt4PerRank up in the mscclpp allreduce kernel when nInt4 is not a multiple of worldSize.
+        execute_process(
+            COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch
+            WORKING_DIRECTORY ${MSCCLPP_SOURCE}
+            RESULT_VARIABLE mscclpp_nm128_apply_result
+        )
+        if(NOT mscclpp_nm128_apply_result EQUAL 0)
+            message(WARNING "Failed to apply ext-src/non-multiple-128-fix.patch in ${MSCCLPP_SOURCE} (git apply result: ${mscclpp_nm128_apply_result}); mscclpp may be built without this fix.")
+        endif()
+
         message(STATUS "Building mscclpp only for gfx942.")
         mscclpp_cmake_arg(CMAKE_PREFIX_PATH)
         mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH)
@@ -128,6 +138,15 @@ if(ENABLE_MSCCLPP)
             COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
             WORKING_DIRECTORY ${MSCCLPP_SOURCE}
         )
+        # Undo the non-multiple-128 patch so it is not left applied in ${MSCCLPP_SOURCE}.
+        execute_process(
+            COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch
+            WORKING_DIRECTORY ${MSCCLPP_SOURCE}
+            RESULT_VARIABLE mscclpp_nm128_revert_result
+        )
+        if(NOT mscclpp_nm128_revert_result EQUAL 0)
+            message(WARNING "Failed to revert ext-src/non-multiple-128-fix.patch in ${MSCCLPP_SOURCE} (git apply --reverse result: ${mscclpp_nm128_revert_result}); the source tree may be left modified.")
+        endif()
     #endif()
 
 
diff --git a/ext-src/non-multiple-128-fix.patch b/ext-src/non-multiple-128-fix.patch
new file mode 100644
index 0000000000..ce7f8f2f6c
--- /dev/null
+++ b/ext-src/non-multiple-128-fix.patch
@@ -0,0 +1,16 @@
+diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
+index 76674ba..7a2cd4a 100644
+--- a/apps/nccl/src/allreduce.hpp
++++ b/apps/nccl/src/allreduce.hpp
+@@ -368,7 +368,10 @@ __global__ void __launch_bounds__(512, 1)
+   const size_t chanOffset = nPeer * blockIdx.x;
+   // assume (nelems * sizeof(T)) is divisible by (16 * worldSize)
+   const size_t nInt4 = nelems * sizeof(T) / sizeof(int4);
+-  const size_t nInt4PerRank = nInt4 / worldSize;
++  size_t nInt4PerRank = nInt4 / worldSize;
++  if (nInt4 % worldSize)
++    nInt4PerRank = nInt4PerRank + 1;
++
+   auto smChans = smChannels + chanOffset;
+   auto smOutChans = smOutChannels + chanOffset;
+ 