ext-src: fix mscclpp allreduce for non-multiple of 128 message sizes (#1556)
This commit is contained in:
committed by
GitHub
parent
ddc5d58b93
commit
83f8b191ff
@@ -85,6 +85,11 @@ if(ENABLE_MSCCLPP)
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
message(STATUS "Building mscclpp only for gfx942.")
|
||||
mscclpp_cmake_arg(CMAKE_PREFIX_PATH)
|
||||
mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH)
|
||||
@@ -128,6 +133,10 @@ if(ENABLE_MSCCLPP)
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
#endif()
|
||||
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
|
||||
index 76674ba..7a2cd4a 100644
|
||||
--- a/apps/nccl/src/allreduce.hpp
|
||||
+++ b/apps/nccl/src/allreduce.hpp
|
||||
@@ -368,7 +368,10 @@ __global__ void __launch_bounds__(512, 1)
|
||||
const size_t chanOffset = nPeer * blockIdx.x;
|
||||
// assume (nelems * sizeof(T)) is divisible by (16 * worldSize)
|
||||
const size_t nInt4 = nelems * sizeof(T) / sizeof(int4);
|
||||
- const size_t nInt4PerRank = nInt4 / worldSize;
|
||||
+ size_t nInt4PerRank = nInt4 / worldSize;
|
||||
+ if (nInt4 % worldSize)
|
||||
+ nInt4PerRank = nInt4PerRank + 1;
|
||||
+
|
||||
auto smChans = smChannels + chanOffset;
|
||||
auto smOutChans = smOutChannels + chanOffset;
|
||||
|
||||
Reference in new issue
Block a user