Direct Reduce Scatter Implementation (#2765)

* Add new implementation of direct send/recv reduce scatter * Resolved conflicts * Add multiple channels support to the reduction kernel of direct reduce scatter and adjust offset into buffer to utilize multiple channels. * Resolve validation issue when number of elements is not divisible by number of channels leaving elements unaccount for in reduction. * fix proxy hang * set maxSrcs to 64 in reduceCopy * optimize multi-channel code * fix validation issue in single node MI300 * Tune the message size range for 2,4, and 8 Nodes * Move Direct RS into separate kernel * Add Copyright * resolve review comments * resolve review comments * fix merge build issue * revert move Direct RS into separate kernel * address review comments * address review comments --------- Co-authored-by: KawtharShafie <kawtharshafie@gmail.com> Co-authored-by: Ghadeer Alabandi <abandiga@gmail.com> Co-authored-by: systems-assistant[bot] <systems-assistant[bot]@users.noreply.github.com>
2026-01-30 09:27:27 -06:00
@@ -375,6 +375,7 @@ ncclResult_t ncclReduce_impl(const void* sendbuff, void* recvbuff, size_t count,
  return ncclEnqueueCheck(&info);
 }

+
 NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t recvcount,
@@ -386,6 +387,10 @@ ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t
    sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
    REDUCESCATTER_CHUNKSTEPS, comm -> rcclUseOneSlice ? REDUCESCATTER_SLICESTEPS_SINGLE_NODE : REDUCESCATTER_SLICESTEPS, nullptr };

+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  size_t msgSize = recvcount * ncclTypeSize(datatype) * nRanks;
+
  if (!mscclIsCaller()) // when msccl falls back to
  {
    NCCLCHECK(Recorder::instance().record(rrReduceScatter, info));
@@ -397,6 +402,41 @@ ncclResult_t ncclReduceScatter_impl(const void* sendbuff, void* recvbuff, size_t
      recvcount, datatype, 0, 0, op, mscclFuncReduceScatter, comm, stream);
  }
  
+  // Reset value forcing direct reduce scatter algorithm 
+  comm->enableDirectReduceScatter = 0; 
+
+  if (rcclUseReduceScatterDirect(comm, msgSize)) {
+    INFO(NCCL_INIT, "RCCL DIRECT REDUCE-SCATTER recvcount=%zu msgSize=%zu rank=%d nRanks=%d nNodes=%d comm=%p stream=%p sendbuff=%p recvbuff=%p",
+      recvcount, msgSize, comm->rank, nRanks, comm->nNodes, comm, stream, sendbuff, recvbuff);
+
+    // Temporary Buffer to store data from each rank
+    void* tempbuff = comm->tempBuff;
+
+    // Use Direct Reduce Scatter Algorithm
+    comm->enableDirectReduceScatter = 1;
+    
+    if (recvcount == 0) return ncclSuccess;
+    
+    // Calculate offset into buffers
+    size_t offset = recvcount * ncclTypeSize(datatype);
+    
+    // Copy Current ranks data to tempbuff
+    // Enqueue the copy on the user stream so it is correctly ordered w.r.t. the subsequent
+    // ncclSend/ncclRecv and the rest of the ReduceScatter work on the same stream.
+    NCCLCHECK(ncclCudaMemcpyAsync((char*)tempbuff + comm->rank * offset, (char*)sendbuff + comm->rank * offset, offset, stream));
+
+    NCCLCHECK(ncclGroupStart());
+    for (int i = 0; i < nRanks; i++) {
+      int peer = (comm->rank + i) % nRanks;
+      if (peer == comm->rank) {
+        continue;
+      }
+      NCCLCHECK(ncclSend((void*)((char*)sendbuff + peer * offset), recvcount, datatype, peer, comm, stream));
+      NCCLCHECK(ncclRecv((void*)((char*)tempbuff + peer * offset), recvcount, datatype, peer, comm, stream));
+    }
+    NCCLCHECK(ncclGroupEnd());
+  }
+  
  return ncclEnqueueCheck(&info);
 }

@@ -16,6 +16,42 @@ namespace {
 #else
  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
 #endif
+    //TODO: move Direct Reduce Scatter path to a separate kernel
+    size_t msgSize = work->count * sizeof(T) * ncclShmem.comm.nRanks;
+    if (work->enableDirectReduceScatter && msgSize <= (size_t)work->directReduceScatterLimitBytes) {
+      const int nRanks = ncclShmem.comm.nRanks; 
+      const ssize_t numElements = work->count;
+
+      // Calculate Offset to utilize multiple channels
+      ssize_t elementsPerBlock = numElements / gridDim.x;
+      ssize_t remainderElements = numElements % gridDim.x;
+      // Calculate the number of elements per block for each block
+      // The first n blocks get 1 extra element to account for the remainder (n = remainderElements)
+      ssize_t numElementsPerBlock = elementsPerBlock + (blockIdx.x < remainderElements ? 1 : 0);
+      ssize_t channelOffset = blockIdx.x * elementsPerBlock + min((ssize_t)blockIdx.x, remainderElements);
+
+      // Array of src pointers pointing to rank offsets in tempBuff
+      void** srcPtrs = (void**)ncclScratchForWarp(0); 
+      if (tid == 0) {
+        for (int i = 0; i < nRanks; i++) {
+          // Define offset into tempbuff for each rank's data
+          const ssize_t srcOffset = i * numElements + channelOffset;
+          srcPtrs[i] = (void*)((T*)work->tempBuff + srcOffset);
+        }
+      }
+      // Sync threads to ensure all srcPtrs are set before reduction
+      __syncthreads();
+
+      T* recvbuff = (T*)work->recvbuff;
+      // Array for destination pointer to recvbuff
+      void* dstPtrs[1];
+      dstPtrs[0] = (void*)(recvbuff + channelOffset);
+      if (tid < nthreads) {
+        // Call reduction across all rank offsets in tempbuff and store in recvbuff
+        reduceCopy<COLL_UNROLL, USE_ACC, RedOp, T, 0, 1, 64, 0, 1, 1, 0>
+          (tid, nthreads, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, nRanks, srcPtrs, 1, dstPtrs, numElementsPerBlock);
+      }
+    } else {
  #ifdef ENABLE_WARP_SPEED
      int warp = threadIdx.x / WARP_SIZE;
      ncclRing *ring = &ncclShmem.warpChannel[warp].ring;
@@ -139,6 +175,7 @@ namespace {
  #endif
    }
  }
+}

 #if defined(__gfx942__) || defined(__gfx950__) // Use a single slice per simple primitive for a single node on some GFX9 devices.
 #define rcclReduceScatterRunRingSimpleProtoImpl(tid, nthreads, work) \
@@ -27,6 +27,7 @@
 #include "scheduler.h"
 #include "common.h"
 #include "api_trace.h"
+#include "rccl_common.h"

 #include <cstring> // std::memcpy
 #include <cinttypes> // PRIx64
@@ -161,6 +162,7 @@ static inline int ncclFuncTrafficPerByte(ncclFunc_t func, int nRanks) {
 }

 RCCL_PARAM_DECLARE(EnableProxyTrace);
+RCCL_PARAM_DECLARE(DirectReduceScatterThreshold);
 /*****************************************************************************/
 /*       Launch system : synchronization and CUDA kernel launch              */
 /*****************************************************************************/
@@ -412,6 +414,21 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
        devWork.size = task->count;
    }
 #endif
+    // Direct Reduce Scatter
+    if (task->func == ncclFuncReduceScatter && comm->enableDirectReduceScatter) {
+      devWork.enableDirectReduceScatter = comm->enableDirectReduceScatter;
+      int64_t directReduceScatterLimit = rcclParamDirectReduceScatterThreshold();
+      if (directReduceScatterLimit >= 0) {
+        // set threshold to 2MiB hard limit
+        directReduceScatterLimit = std::min(directReduceScatterLimit, (int64_t)2097152);
+        devWork.directReduceScatterLimitBytes = (uint32_t) directReduceScatterLimit;
+      } else {
+        devWork.directReduceScatterLimitBytes = (uint32_t)0;
+      }
+      devWork.tempBuff = (void*)comm->tempBuff;
+      devWork.currentRank = comm->rank;
+      devWork.count = task->count;
+    }
    
    devWork.isOneRPN = comm->isOneRPN;
    devWork.netRegUsed = devWork.regUsed = 0;
@@ -725,7 +742,9 @@ static ncclResult_t scheduleCollTasksToPlan(
        proxyOp.incWorkCounter = true;
        addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes);
        // Set pattern to profiler to add a proxy profiler for kernel events
-        if (task->func != ncclFuncAllToAllGda) {
+        // for Direct Reduce Scatter (DRS), we don't need to add proxy op
+        bool isDRS = task->func == ncclFuncReduceScatter && comm->enableDirectReduceScatter;
+        if (!isDRS && task->func != ncclFuncAllToAllGda) {
            NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp));
            NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOp));
        }
@@ -875,7 +894,9 @@ static ncclResult_t scheduleCollTasksToPlan(
        // Coverity reports "proxyOp->connection" as being possibly uninitialized.  It's hard to
        // determine if that's actually true but it's also not clear if that would be an issue.
        // coverity[uninit_use_in_call:FALSE]
-        if (task->func != ncclFuncAllToAllGda) {
+        // for Direct Reduce Scatter (DRS), we don't need to add proxy op
+        bool isDRS = task->func == ncclFuncReduceScatter && comm->enableDirectReduceScatter;
+        if (!isDRS && task->func != ncclFuncAllToAllGda) {
            NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp));
            NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, proxyOp));
        }
@@ -763,6 +763,11 @@ struct ncclComm {
  int symId;
 #endif

+  // Direct Reduce Scatter [RCCL]
+  bool enableDirectReduceScatter;
+  // Temporary Buffer [RCCL]
+  void* tempBuff;
+
  uint64_t endMagic;
 };

@@ -385,6 +385,15 @@ struct alignas(16) ncclDevWorkColl {
  uintptr_t recvbuffOffset;
  uintptr_t* sendbuffRmtAddrs;
  uintptr_t* recvbuffRmtAddrs;
+
+  bool enableDirectReduceScatter;
+  // Per-work (per kernel launch) limit for Direct ReduceScatter in bytes.
+  // This is set by the host and used as a device-side safety gate.
+  uint32_t directReduceScatterLimitBytes;
+  void* tempBuff;
+  int currentRank;
+  size_t count;
+
  union {
    // Continuous-byte-distribution scheduling. The lo and hi channels are of
    // different size than the channels in the middle.
@@ -25,6 +25,7 @@ THE SOFTWARE.
 #include "nccl.h"
 #include "param.h"
 #include "core.h"
+
 typedef enum RcclTunableColls {
  RCCL_UNSUPPORTED_TUNABLE = -1,
  RCCL_RS_TUNABLE = 0,    // reduce_scatter index
@@ -114,12 +115,16 @@ NCCL_API(ncclResult_t, rcclGetAlgoInfo, struct ncclComm* comm, ncclFunc_t coll,
 NCCL_API(ncclResult_t, rcclGetAlgoName, int algo, const char** algoName);
 NCCL_API(ncclResult_t, rcclGetProtocolName, int protocol, const char** algoName);
 bool rcclUseAllGatherDirect(struct ncclComm* comm, size_t& msgSize);
+bool rcclUseReduceScatterDirect(struct ncclComm* comm, size_t& msgSize);
 bool rcclUseAllToAllGda(struct ncclComm* comm);
 void rcclSetPxn(struct ncclComm* comm,  int& rcclPxnDisable);
 void rcclSetP2pNetChunkSize(struct ncclComm* comm,  int& rcclP2pNetChunkSize);
 ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount);
 ncclResult_t commSetUnrollFactor(struct ncclComm* comm);
 bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, const char* archName);
+
+// Direct ReduceScatter Limit
+RCCL_PARAM_DECLARE(DirectReduceScatterThreshold);
 int getFirmwareVersion();
 bool rcclIsArchSupportedForFunc(struct ncclTaskColl* info, char const* archName);
 #ifdef ENABLE_WARP_SPEED
@@ -90,6 +90,8 @@
 #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
 #endif

+#define TEMP_BUFF_SIZE (4 * 1024 * 1024) // Define Size for Temporary Buffer for Direct RS
+
 using namespace rccl;

 const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+3] = { "AllGather", "AllReduce", "AlltoAllPivot", "AllToAllGda", "Broadcast", "Reduce", "ReduceScatter", "SendRecv"};
@@ -484,6 +486,13 @@ static ncclResult_t commFree(ncclComm_t comm) {

  NCCLCHECK(ncclCeFinalize(comm));

+  // tempBuff is allocated per-communicator for direct ReduceScatter on gfx950.
+  // It is owned by the communicator; free it during communicator teardown.
+  if (comm->tempBuff) {
+    NCCLCHECK(ncclCudaFree(comm->tempBuff));
+    comm->tempBuff = nullptr;
+  }
+
  if (comm->symmetricSupport) {
    NCCLCHECK(ncclSymkFinalize(comm));
    NCCLCHECK(ncclDevrFinalize(comm));
@@ -2285,6 +2294,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
  }
 #endif

+  // Allocate Temp Buffer for Direct Reduce Scatter
+  if (IsArchMatch(archName,"gfx950")) {
+    NCCLCHECK(ncclCudaMalloc(&(comm->tempBuff), TEMP_BUFF_SIZE));
+  }

 #ifdef ENABLE_MSCCLPP
  if (job->parent) {
@@ -41,6 +41,7 @@ RCCL_PARAM(PipelineAllDTypes, "PIPELINE_ALL_DATA_TYPES", 0);
 // Otherwise, it is automatically set for certain archs, datatypes and reduction collectives
 RCCL_PARAM(disableReduceCopyPipelining, "DISABLE_REDUCE_COPY_PIPELINING", 0);
 RCCL_PARAM(DirectAllGatherThreshold, "DIRECT_ALLGATHER_THRESHOLD", 75497472);
+RCCL_PARAM(DirectReduceScatterThreshold, "DIRECT_REDUCE_SCATTER_THRESHOLD", 2097152);
 RCCL_PARAM(ThreadsPerBlock, "THREADS_PER_BLOCK", -1);
 RCCL_PARAM(UnrollFactor, "UNROLL_FACTOR", -1);
 #ifdef ENABLE_WARP_SPEED
@@ -465,6 +466,41 @@ bool rcclUseAllGatherDirect(struct ncclComm* comm, size_t& msgSize) {
    ;
 }

+bool rcclUseReduceScatterDirect(struct ncclComm* comm, size_t& msgSize) {
+  // Direct ReduceScatter is supported for MI350 (gfx950):
+  // - 2 nodes: enable for 128KiB .. 2MiB
+  // - 4 and 8 nodes: enable up to 2MiB
+  static int userDirectReduceScatterInput = -2;
+  if (userDirectReduceScatterInput == -2) {
+    const char *inputStr = getenv("RCCL_DIRECT_REDUCE_SCATTER_DISABLE");
+    userDirectReduceScatterInput = !inputStr ? 0 : 1;
+  }
+  if (userDirectReduceScatterInput == 1) {
+    INFO(NCCL_INIT, "RCCL DIRECT REDUCE-SCATTER has been disabled.");
+    return false;
+  }
+  const bool archGfx950 = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950");
+  if (!archGfx950) return false;
+
+  size_t threshold = rcclParamDirectReduceScatterThreshold();
+  if (threshold > -1) { 
+    // Set threshold to 2MiB hard limit
+    // NOTE: If the DirectReduceScatterThreshold / hard-limit is increased, ensure TEMP_BUFF_SIZE (init.cc)
+    // is increased accordingly -> TEMP_BUFF_SIZE >= 2 * (max enabled msgSize) for headroom.
+    threshold = std::min(threshold, (size_t)2097152);
+  } else {
+    threshold = 2097152;
+  }
+  INFO(NCCL_INIT, "RCCL DIRECT REDUCE-SCATTER threshold set to: %zu", threshold);
+
+  if (msgSize > threshold) return false;
+  // for 2 nodes, enable if msgSize is in 128KiB .. 2MiB range
+  if (comm->nNodes == 2) return msgSize >= (size_t)131072;
+  if (comm->nNodes == 8 || comm->nNodes == 4) return true;
+  return false;
+}
+
+
 void rcclSetPxn(struct ncclComm* comm,  int& rcclPxnDisable) {
  static int pxnDisable = RCCL_VALUE_UNSET;
  comm->enableCustColl = false;