diff --git a/projects/rccl/cmake/MSCCLPP.cmake b/projects/rccl/cmake/MSCCLPP.cmake index 15fbb60b27..1f870860e5 100644 --- a/projects/rccl/cmake/MSCCLPP.cmake +++ b/projects/rccl/cmake/MSCCLPP.cmake @@ -90,6 +90,11 @@ if(ENABLE_MSCCLPP) WORKING_DIRECTORY ${MSCCLPP_SOURCE} ) + execute_process( + COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) + message(STATUS "Building mscclpp only for gfx942.") mscclpp_cmake_arg(CMAKE_PREFIX_PATH) mscclpp_cmake_arg(CMAKE_INSTALL_RPATH_USE_LINK_PATH) @@ -115,28 +120,35 @@ if(ENABLE_MSCCLPP) find_package(mscclpp_nccl REQUIRED) - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) - - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) - - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) - execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) execute_process( - COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch - WORKING_DIRECTORY ${MSCCLPP_SOURCE} - ) + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/cpx.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) + + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/read-allred.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) + + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mscclpp_ibv_access_relaxed_ordering.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) + + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/mem-reg.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) + + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/non-multiple-128-fix.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) + + execute_process( + COMMAND git apply --reverse ${CMAKE_CURRENT_SOURCE_DIR}/ext-src/bf16-tuning.patch + WORKING_DIRECTORY ${MSCCLPP_SOURCE} + ) #endif() diff --git a/projects/rccl/ext-src/bf16-tuning.patch b/projects/rccl/ext-src/bf16-tuning.patch new file mode 100644 index 0000000000..6b59b9856b --- /dev/null +++ b/projects/rccl/ext-src/bf16-tuning.patch @@ -0,0 +1,26 @@ +diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp +index 7a2cd4a..a14dfbc 100644 +--- a/apps/nccl/src/allreduce.hpp ++++ b/apps/nccl/src/allreduce.hpp +@@ -850,7 +850,7 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< + flag++); + #endif + } else { +- int nBlocks = 5*(nRanksPerNode - 1); ++ int nBlocks = 8 * (nRanksPerNode - 1); + int nThreadsPerBlock = 512; + if (hieAllred && worldSize >= 8) { + nBlocks = 20; +diff --git a/apps/nccl/src/common.hpp b/apps/nccl/src/common.hpp +index ca2c272..a6056ea 100644 +--- a/apps/nccl/src/common.hpp ++++ b/apps/nccl/src/common.hpp +@@ -17,7 +17,7 @@ constexpr int NRANKS1_PER_NODE = 4; + constexpr int NRANKS_PER_NODE = 8; + constexpr int NPEERS = 7; + +-constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB ++constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 112; // double buffer * 56 thread-blocks * 8 ranks * 256KB = 112MB + + __device__ mscclpp::DeviceSyncer deviceSyncer; +