From a724f1ebb79f8d0d1bb37630bce3c028dcefe6b6 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Wed, 29 May 2024 07:59:47 -0700
Subject: [PATCH] Add ring simple chunk size tuning (#1180)

* Add ring simple chunk size tuning

* modifying the tuning table to improve the performance of broadcast for 8MB to 32MB for single-node MI300X after ring simple chunk size tuning

* modifying the tuning table to improve the performance of reduce for 1MB to 4MB for single-node MI300X after ring simple chunk size tuning

---------

Co-authored-by: PedramAlizadeh <pmohamma@amd.com>

[ROCm/rccl commit: 73221b4230d877868035ebceb4a47db3df4acb82]
---
 projects/rccl/src/enqueue.cc      | 8 ++++++++
 projects/rccl/src/graph/tuning.cc | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc
index 5544c02d63..8c6755f8f9 100644
--- a/projects/rccl/src/enqueue.cc
+++ b/projects/rccl/src/enqueue.cc
@@ -1799,6 +1799,14 @@ static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nByte
       while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < collInfo->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
       while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < collInfo->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
     }
+  } else if (collInfo->algorithm == NCCL_ALGO_RING && collInfo->protocol == NCCL_PROTO_SIMPLE) {
+    if (collInfo->pattern == ncclPatternPipelineFrom || collInfo->pattern == ncclPatternPipelineTo) {
+      // Optimize chunkSize / nSteps
+      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < 64 && chunkSize > 262144) chunkSize /= 2;
+      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < 32 && chunkSize > 131072) chunkSize /= 2;
+      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < 16 && chunkSize > 65536) chunkSize /= 2;
+      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < 8 && chunkSize > 32768) chunkSize /= 2;
+    }
   } else if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
     // Optimize chunkSize / nSteps
     while (nBytes / (nChannels * collInfo->comm->channels[0].collnetDirect.nHeads * chunkSize) < collInfo->comm->channels[0].collnetDirect.depth * 64 && chunkSize > 131072) chunkSize /= 2;
diff --git a/projects/rccl/src/graph/tuning.cc b/projects/rccl/src/graph/tuning.cc
index d2e958e034..a67cb17e11 100644
--- a/projects/rccl/src/graph/tuning.cc
+++ b/projects/rccl/src/graph/tuning.cc
@@ -379,7 +379,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
           busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
         else
           busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];
-        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && coll == ncclFuncBroadcast && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 2.33; }
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 1.65; }
 #else
         if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
         if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);