From 42b6831a3941a6258ea7e5dc7d41199ad96b8908 Mon Sep 17 00:00:00 2001
From: Nusrat Islam <Nusrat.Islam@amd.com>
Date: Fri, 6 Dec 2024 08:16:39 -0600
Subject: [PATCH] ext-src: tune TP=8 case on MI308 CPX mode (#1446)

Tune the number of blocks for hierarchical mscclpp allreduce.
---
 ext-src/read-allred.patch | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)
diff --git a/ext-src/read-allred.patch b/ext-src/read-allred.patch
index de632f6284..a51f42b779 100644
--- a/ext-src/read-allred.patch
+++ b/ext-src/read-allred.patch
@@ -1,8 +1,8 @@
 diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
-index 1b85136..a08f822 100644
+index 1b85136..ee90c2f 100644
 --- a/apps/nccl/src/allreduce.hpp
 +++ b/apps/nccl/src/allreduce.hpp
-@@ -386,24 +386,361 @@ __global__ void __launch_bounds__(512, 1)
+@@ -386,24 +386,353 @@ __global__ void __launch_bounds__(512, 1)
    }
  }
  
@@ -147,11 +147,8 @@ index 1b85136..a08f822 100644
 +
 +  const size_t nItrs = nInt4OfThisBlock / nInt4PerChunk;
 +  const size_t restNInt4 = nInt4OfThisBlock % nInt4PerChunk;
-+  const size_t chunkSizePerRank = nNeededBlocks * nInt4PerChunk;
 +
 +  const size_t blockOffset = nInt4PerChunk * blockIdx.x;
-+  const size_t scratchChunkRankOffset = chunkSizePerRank * rank;
-+  const size_t scratchBaseOffsetInt4 = channelScratchOffset / sizeof(int4);
 +
 +  int localRank = rank % NRANKS1_PER_NODE;
 +
@@ -190,8 +187,6 @@ index 1b85136..a08f822 100644
 +      int4 data = buff4[nInt4PerRank * localRank + idx + offsetOfThisBlock];
 +      for (int peerIdx = NRANKS1_PER_NODE*myNode; peerIdx < (NRANKS1_PER_NODE*myNode +
 +                              NRANKS1_PER_NODE - 1); peerIdx++)      {
-+        const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
-+
 +        int4 val = channels[peerIdx].read<int4>(nInt4PerRank  * localRank + offsetOfThisBlock + idx);
 +        data = add_vectors<T>(val, data);
 +      }
@@ -206,7 +201,6 @@ index 1b85136..a08f822 100644
 +    }
 +    __syncthreads();
 +
-+    int remoteRank, peerIdx;
 +    //Reduce across OAMs
 +
 +    for (size_t idx = threadIdx.x; idx < nInt4PerChunk; idx += blockDim.x) {
@@ -227,7 +221,6 @@ index 1b85136..a08f822 100644
 +      resultBuff4[nInt4PerRank * localRank + idx + offsetOfThisBlock] = data;
 +
 +      for (int peerIdx = NRANKS1_PER_NODE*myNode; peerIdx < (NRANKS1_PER_NODE*myNode + NRANKS1_PER_NODE - 1); peerIdx++)      {
-+        const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1;
 +        outChannels[peerIdx].write(nInt4PerRank * localRank + idx + offsetOfThisBlock +
 +                        channelOutDataOffset / sizeof(int4), data);
 +      }
@@ -282,7 +275,6 @@ index 1b85136..a08f822 100644
 +    }
 +    __syncthreads();
 +
-+    int remoteRank, peerIdx;
 +    for (size_t idx = threadIdx.x; idx < restNInt4; idx += blockDim.x) {
 +        int4 data = scratch4[idx + blockOffset];
 +
@@ -370,7 +362,7 @@ index 1b85136..a08f822 100644
        nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
      }
      allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
-@@ -412,9 +749,20 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
+@@ -412,9 +741,21 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
    } else {
      int nBlocks = 35;
      int nThreadsPerBlock = 512;
@@ -378,6 +370,7 @@ index 1b85136..a08f822 100644
 -                                                         channelOutOffset, channelScratchOffset, rank, nRanksPerNode,
 -                                                         worldSize, nelems);
 +    if (hieAllred && worldSize >= 8) {
++	nBlocks = 20;
 +	allreduce10<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, smScrChannels,
 +			smOutChannels, channelOutOffset, channelScratchOffset, rank, nRanksPerNode,
 +			worldSize, nelems);