From 42b6831a3941a6258ea7e5dc7d41199ad96b8908 Mon Sep 17 00:00:00 2001 From: Nusrat Islam Date: Fri, 6 Dec 2024 08:16:39 -0600 Subject: [PATCH] ext-src: tune TP=8 case on MI308 CPX mode (#1446) Tune the number of blocks for hierarchical mscclpp allreduce. --- ext-src/read-allred.patch | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/ext-src/read-allred.patch b/ext-src/read-allred.patch index de632f6284..a51f42b779 100644 --- a/ext-src/read-allred.patch +++ b/ext-src/read-allred.patch @@ -1,8 +1,8 @@ diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp -index 1b85136..a08f822 100644 +index 1b85136..ee90c2f 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp -@@ -386,24 +386,361 @@ __global__ void __launch_bounds__(512, 1) +@@ -386,24 +386,353 @@ __global__ void __launch_bounds__(512, 1) } } @@ -147,11 +147,8 @@ index 1b85136..a08f822 100644 + + const size_t nItrs = nInt4OfThisBlock / nInt4PerChunk; + const size_t restNInt4 = nInt4OfThisBlock % nInt4PerChunk; -+ const size_t chunkSizePerRank = nNeededBlocks * nInt4PerChunk; + + const size_t blockOffset = nInt4PerChunk * blockIdx.x; -+ const size_t scratchChunkRankOffset = chunkSizePerRank * rank; -+ const size_t scratchBaseOffsetInt4 = channelScratchOffset / sizeof(int4); + + int localRank = rank % NRANKS1_PER_NODE; + @@ -190,8 +187,6 @@ index 1b85136..a08f822 100644 + int4 data = buff4[nInt4PerRank * localRank + idx + offsetOfThisBlock]; + for (int peerIdx = NRANKS1_PER_NODE*myNode; peerIdx < (NRANKS1_PER_NODE*myNode + + NRANKS1_PER_NODE - 1); peerIdx++) { -+ const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; -+ + int4 val = channels[peerIdx].read(nInt4PerRank * localRank + offsetOfThisBlock + idx); + data = add_vectors(val, data); + } @@ -206,7 +201,6 @@ index 1b85136..a08f822 100644 + } + __syncthreads(); + -+ int remoteRank, peerIdx; + //Reduce across OAMs + + for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) { @@ -227,7 +221,6 @@ index 1b85136..a08f822 100644 + resultBuff4[nInt4PerRank * localRank + idx + offsetOfThisBlock] = data; + + for (int peerIdx = NRANKS1_PER_NODE*myNode; peerIdx < (NRANKS1_PER_NODE*myNode + NRANKS1_PER_NODE - 1); peerIdx++) { -+ const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; + outChannels[peerIdx].write(nInt4PerRank * localRank + idx + offsetOfThisBlock + + channelOutDataOffset / sizeof(int4), data); + } @@ -282,7 +275,6 @@ index 1b85136..a08f822 100644 + } + __syncthreads(); + -+ int remoteRank, peerIdx; + for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) { + int4 data = scratch4[idx + blockOffset]; + @@ -370,7 +362,7 @@ index 1b85136..a08f822 100644 nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024; } allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, -@@ -412,9 +749,20 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< +@@ -412,9 +741,21 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< } else { int nBlocks = 35; int nThreadsPerBlock = 512; @@ -378,6 +370,7 @@ index 1b85136..a08f822 100644 - channelOutOffset, channelScratchOffset, rank, nRanksPerNode, - worldSize, nelems); + if (hieAllred && worldSize >= 8) { ++ nBlocks = 20; + allreduce10<<>>(buff, scratch, resultBuff, smChannels, smScrChannels, + smOutChannels, channelOutOffset, channelScratchOffset, rank, nRanksPerNode, + worldSize, nelems);