ext-src: tuning for allreduce8 kernel (#1560)
This PR tunes the number of threadblocks used for larger (>1MB)
message sizes.
[ROCm/rccl commit: fdf75fd2c1]
이 커밋은 다음에 포함됨:
@@ -90,6 +90,11 @@ if(ENABLE_MSCCLPP)
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
message(STATUS "Building mscclpp only for gfx942.")
|
||||
mscclpp_cmake_arg(CMAKE_PREFIX_PATH)
|
||||
mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH)
|
||||
@@ -115,28 +120,35 @@ if(ENABLE_MSCCLPP)
|
||||
|
||||
|
||||
find_package(mscclpp_nccl REQUIRED)
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
execute_process(
|
||||
COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch
|
||||
WORKING_DIRECTORY ${MSCCLPP_SOURCE}
|
||||
)
|
||||
|
||||
#endif()
|
||||
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
|
||||
index 7a2cd4a..a14dfbc 100644
|
||||
--- a/apps/nccl/src/allreduce.hpp
|
||||
+++ b/apps/nccl/src/allreduce.hpp
|
||||
@@ -850,7 +850,7 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
|
||||
flag++);
|
||||
#endif
|
||||
} else {
|
||||
- int nBlocks = 5*(nRanksPerNode - 1);
|
||||
+ int nBlocks = 8 * (nRanksPerNode - 1);
|
||||
int nThreadsPerBlock = 512;
|
||||
if (hieAllred && worldSize >= 8) {
|
||||
nBlocks = 20;
|
||||
diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp
|
||||
index ca2c272..a6056ea 100644
|
||||
--- a/apps/nccl/src/common.hpp
|
||||
+++ b/apps/nccl/src/common.hpp
|
||||
@@ -17,7 +17,7 @@ constexpr int NRANKS1_PER_NODE = 4;
|
||||
constexpr int NRANKS_PER_NODE = 8;
|
||||
constexpr int NPEERS = 7;
|
||||
|
||||
-constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
|
||||
+constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 112; // double buffer * 56 thread-blocks * 8 ranks * 256KB = 112MB
|
||||
|
||||
__device__ mscclpp::DeviceSyncer deviceSyncer;
|
||||
|
||||
새 이슈에서 참조
사용자 차단