gfx950 channel tuning for ReduceScatter and AllGather (#1940)

* add channel thresholds to override channel-count adjustments [ROCm/rccl commit: 0f99fd84a3]
2025-10-14 09:50:44 -05:00
@@ -7,6 +7,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 ### Added
 * Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
 * Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
+* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal channel-count adjustments based on threadThreshold.

 ### Changed

@@ -2015,6 +2015,8 @@ static ncclResult_t updateCollCostTable(
  return ncclSuccess;
 }

+extern int64_t ncclParamMinNchannels();
+
 static ncclResult_t topoGetAlgoInfo(
    struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes,
    float** collCostTable, ncclSimInfo_t* simInfo
@@ -2079,11 +2081,17 @@ static ncclResult_t topoGetAlgoInfo(
    nc = comm->nvlsChannels;
  } else {
    rcclUpdateThreadThreshold(comm, nBytes, info, threadThreshold);
+    INFO(NCCL_INIT, "pre-adjustment threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc);
+
+    int minNChannels = ncclParamMinNchannels();
    // Ring/Tree channel tuning
-    while (nBytes < nc * nt * threadThreshold) {
+    INFO(NCCL_INIT, "minNChannels:%i", minNChannels);
+    while (nBytes < nc * nt * threadThreshold && nc > minNChannels) {
      if (nc >= 2) nc--;
      else break;
    }
+    INFO(NCCL_INIT, "post-adjustment based on threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc);
+    rcclOverrideChannels(comm, info->func, nBytes, nc);
  }
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #else
@@ -151,6 +151,7 @@ struct tuningModel {
  float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
  float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
  uint64_t llProtoRanges[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE];
+  uint64_t channelThresholds[RCCL_TUNABLE_COLLS][RCCL_CHANNELS_TUNABLE_ENTRIES][3]; // for each collective, one entry per tuned channel count (2,4,8,16,32,40,48,56,64): {min, max, nchannels}
 };

 static struct tuningModel tuning_model_0 {
@@ -183,6 +184,7 @@ static struct tuningModel tuning_model_0 {
  },

  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
+  .channelThresholds  = {{{CHAN_THRESHOLDS_UNDEFINED}}},
 };

 static struct tuningModel tuning_model_1 {
@@ -215,6 +217,7 @@ static struct tuningModel tuning_model_1 {
  },

  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
+  .channelThresholds  = {{{CHAN_THRESHOLDS_UNDEFINED}}},
 };

 static struct tuningModel tuning_model_2 {
@@ -247,6 +250,7 @@ static struct tuningModel tuning_model_2 {
  },

  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
+  .channelThresholds  = {{{CHAN_THRESHOLDS_UNDEFINED}}},
 };

 static struct tuningModel tuning_model_3 {
@@ -279,6 +283,7 @@ static struct tuningModel tuning_model_3 {
  },

  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
+  .channelThresholds  = {{{CHAN_THRESHOLDS_UNDEFINED}}},
 };

 static struct tuningModel tuning_model_4 {
@@ -311,6 +316,7 @@ static struct tuningModel tuning_model_4 {
  },

  .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
+  .channelThresholds  = {{{CHAN_THRESHOLDS_UNDEFINED}}},
 };

 static struct tuningModel tuning_model_5 {
@@ -354,6 +360,9 @@ static struct tuningModel tuning_model_5 {
    /*Broadcast*/
    {/*LL (min/max/factor/thread_threshold)*/ {0, 8192, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {8192, 33554432, 1, 0}},
  },
+
+  .channelThresholds  = {{{CHAN_THRESHOLDS_UNDEFINED}}},
+   
 };

 static struct tuningModel tuning_model_6 {
@@ -396,7 +405,14 @@ static struct tuningModel tuning_model_6 {
    {/*LL (min/max/factor/thread_threshold)*/ {0, 16383, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {16383, 16777216, 1, 0}},
    /*Broadcast*/
    {/*LL (min/max/factor/thread_threshold)*/ {0, 2048, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {2048, 16777216, 1, 0}},
-  },                                                                                                                    
+  },   
+  
+    .channelThresholds  = {
+    // For each collective, one {min, max, nchannels} row per channel count (2,4,8,16,32,40,48,56,64); min/max bound the per-rank size
+    /*ReduceScatter*/ {{512, 1024, 2},{1024, 2048, 4},{2048, 4096, 8},{4096, 65536, 16}, {65536, 262144, 32}, {262144, 524288, 40}, {1,1, 48}, {524288, 1048576, 56}, {1048576, 268435457, 64}},
+    /*AllGather*/     {{2048, 4096, 2},{4096, 8192, 4},{8192, 16384, 8},{16384, 262144, 16},{262144, 524288, 32}, {524288, 1048576, 40}, {1,1, 48}, {1048576, 4194304, 56}, {4194304, 268435457, 64}},
+    /*AllReduce*/     {{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, {0,0,0}, {0,0,0}, {0,0,0}, {0,0,0}},
+  },                                                                                                                 
 };

 static struct tuningModel rcclTuningModel[] = {
@@ -407,7 +423,6 @@ static struct tuningModel rcclTuningModel[] = {
  tuning_model_4,
  tuning_model_5,
  tuning_model_6,
-
 };

 /* Array indexes used below */
@@ -519,6 +534,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
        rcclTuningModel[comm->topo->tuning].llProtoRanges,
        sizeof(rcclTuningModel[comm->topo->tuning].llProtoRanges));

+  memcpy(comm->minMaxChannelThresholds,
+        rcclTuningModel[comm->topo->tuning].channelThresholds,
+        sizeof(rcclTuningModel[comm->topo->tuning].channelThresholds));
+
  for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
    int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
@@ -550,6 +550,7 @@ struct ncclComm {
  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
  uint64_t minMaxLLRange[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE];
+  uint64_t minMaxChannelThresholds[RCCL_TUNABLE_COLLS][RCCL_CHANNELS_TUNABLE_ENTRIES][3]; // for each collective, one entry per tuned channel count (2,4,8,16,32,40,48,56,64): {min, max, nchannels}

  /* This attribute can indicate the states of communicators and return code of
  * asynchronous NCCL operations. */
@@ -91,4 +91,5 @@ typedef enum {
 #define NCCL_UNROLL_4 2

 #define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16/rccl_float8/rccl_bfloat8
+
 #endif
@@ -35,6 +35,9 @@ typedef enum RcclTunableColls {
  RCCL_TUNABLE_COLLS = 5  // LL/LL64/LL128 tunable collectives count
 } rcclTunableIndex_t;

+#define CHAN_THRESHOLDS_UNDEFINED 0 // placeholder bound used by tuning models without channel thresholds; rcclOverrideChannels stops scanning when a min or max equals it
+#define RCCL_CHANNELS_TUNABLE_ENTRIES 9 // 2,4,8,16,32,40,48,56,64 channels
+
 #define RCCL_LL_LIMITS_UNDEFINED 0
 #define RCCL_PROTOCOL_ENTRY_SIZE 4
 #define RCCL_PROTOCOL_MIN_IDX 0
@@ -88,6 +91,7 @@ inline size_t rcclGetSizePerRank(ncclFunc_t const& func, size_t const& nBytes, i
  // For AR, this is the send/recv size per rank
  return (func == ncclFuncReduceScatter || func == ncclFuncAllGather || func == ncclFuncBroadcast || func == ncclFuncReduce) ? nBytes / nRanks : nBytes;
 }
+ncclResult_t rcclOverrideChannels(struct ncclComm* comm, ncclFunc_t coll, size_t nBytes, int& nc);
 ncclResult_t rcclGetAlgoProtoIndex(const char *envStr, const char* algoProtoString[], int nEntries, int& result);
 ncclResult_t rcclOverrideProtocol(const char* ncclProtoStr[], float table[][NCCL_NUM_PROTOCOLS], struct ncclTaskColl* info);
 ncclResult_t rcclOverrideAlgorithm(const char* ncclAlgoStr[], float table[][NCCL_NUM_PROTOCOLS], struct ncclTaskColl* info);
@@ -25,7 +25,7 @@ THE SOFTWARE.
 #include "graph/topo.h"
 #include "enqueue.h"
 #include "rocm_smi/rocm_smi.h"
-
+#include <algorithm>
 // Use this param to experiment pipelining new data types besides bfloat16
 // Make sure you generate the device code with the new data type (i.e. in generate.py)
 RCCL_PARAM(PipelineAllDTypes, "PIPELINE_ALL_DATA_TYPES", 0);
@@ -108,6 +108,56 @@ ncclResult_t rcclGetAlgoProtoIndex(const char *envStr, const char* algoProtoStri
  return ncclInvalidUsage;
 }

+extern int64_t ncclParamMinNchannels(); // declared here, defined elsewhere; its value is logged below as NCCL_MIN_NCHANNELS
+extern int64_t ncclParamMaxNchannels(); // declared here, defined elsewhere; its value is logged below as NCCL_MAX_NCHANNELS
+RCCL_PARAM(ChannelTuningEnable, "CHANNEL_TUNING_ENABLE", 1); // gate checked by rcclOverrideChannels; the 1 is presumably the default (enabled) — confirm against RCCL_PARAM
+
+// Replaces nc with the tuning-model channel count for collective coll and the per-rank size of nBytes.
+// The first table row whose (min, max] byte range contains that size decides the outcome.
+// nc is left unchanged when tuning is off, comm has fewer than 2 nodes, coll is not tunable, no row matches, or the row's count violates the limits. Always returns ncclSuccess.
+ncclResult_t rcclOverrideChannels(struct ncclComm* comm, ncclFunc_t coll, size_t nBytes, int& nc){
+  if(comm->nNodes < 2 || !rcclParamChannelTuningEnable()){
+    INFO(NCCL_TUNING, "RCCL Channel Tuning not applied");
+    return ncclSuccess;
+  }
+
+  auto tunableIndex = rcclGetTunableIndex(coll);
+  if(tunableIndex == RCCL_UNSUPPORTED_TUNABLE){
+    INFO(NCCL_TUNING, "tunableIndex:%i not supported", tunableIndex);
+    return ncclSuccess;
+  }
+
+  int minCTAs = comm->config.minCTAs;
+  int maxCTAs = comm->config.maxCTAs;
+  int minNChannels = ncclParamMinNchannels();
+  int maxNChannels = std::max(comm->nChannels, static_cast<int>(ncclParamMaxNchannels()));
+  // NOTE(review): rcclGetSizePerRank() uses nBytes / nRanks for some collectives and plain nBytes for others; confirm divUp over nRanks is intended for every tunable collective
+  size_t bytesPerRank = divUp(nBytes, comm->nRanks);
+
+  for(int channelCountIndex = 0; channelCountIndex < RCCL_CHANNELS_TUNABLE_ENTRIES; ++channelCountIndex){
+    const uint64_t* entry = comm->minMaxChannelThresholds[tunableIndex][channelCountIndex]; // {min, max, nchannels}
+    size_t minByteThreshold = entry[0];
+    size_t maxByteThreshold = entry[1];
+    INFO(NCCL_TUNING, "nBytes:%zu bytesPerRank:%zu minByteThreshold:%zu maxByteThreshold:%zu  NCCL_MIN_NCHANNELS:%i or NCCL_MAX_NCHANNELS:%i minCTAs:%i maxCTAs:%i", nBytes, bytesPerRank, minByteThreshold, maxByteThreshold, minNChannels, maxNChannels, minCTAs, maxCTAs);
+    if(minByteThreshold == CHAN_THRESHOLDS_UNDEFINED || maxByteThreshold == CHAN_THRESHOLDS_UNDEFINED) {
+      INFO(NCCL_TUNING, "RCCL tuning model does not define threshold for coll:%i and nbytes:%zu", coll, nBytes);
+      break; // an undefined bound ends the scan: later rows are not examined
+    }
+    if(bytesPerRank <= minByteThreshold || bytesPerRank > maxByteThreshold) continue; // size is outside this row's (min, max] range
+
+    int channelCount = static_cast<int>(entry[2]);
+    // Apply the row only if it respects NCCL_MIN_NCHANNELS/NCCL_MAX_NCHANNELS and the communicator's minCTAs/maxCTAs
+    if(channelCount >= minNChannels && channelCount <= maxNChannels && channelCount >= minCTAs && channelCount <= maxCTAs){
+      nc = channelCount;
+      INFO(NCCL_TUNING, "RCCL tuning model overrides nchannels to %i, channels may be decreased further due to MinTrafficPerchannel thresholds", channelCount);
+    } else {
+      INFO(NCCL_TUNING, "RCCL tuning model cannot override nchannels to %i due to conflicting NCCL_MIN_NCHANNELS:%i or NCCL_MAX_NCHANNELS:%i minCTAs:%i maxCTAs:%i", channelCount, minNChannels, maxNChannels, minCTAs, maxCTAs);
+    }
+    break; // only the first matching row is considered, whether or not it was applied
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t rcclOverrideProtocol(const char* ncclProtoStr[], float table[][NCCL_NUM_PROTOCOLS], struct ncclTaskColl* info) {
  static const char* protoOverrideEnv = ncclGetEnv("RCCL_OVERRIDE_PROTO");
  static bool validInput = true;