From 9ef45df8f7e103c88bb1a53bbf0969a16b48fd41 Mon Sep 17 00:00:00 2001
From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com>
Date: Mon, 12 May 2025 18:05:57 -0500
Subject: [PATCH] Fix when more than 64 channels are used for multi-collective
 group calls (#1688)

* Fix when more than 64 channels are used for multi-collective group calls

* Update CHANGELOG.md

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>

---------

Co-authored-by: Jeffrey Novotny <jnovotny@amd.com>
---
 CHANGELOG.md   |  7 +++----
 src/enqueue.cc | 22 ++++++++++++++--------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2b360cfb3a..c2f2d9c537 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 
 ## Unreleased - RCCL 2.24.3 for ROCm 6.5.0
 
+### Resolved issues
+* Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call.
+
 ### Added
 
 * Added new GPU target `gfx950`.
@@ -12,10 +15,6 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 
 * Compatibility with NCCL 2.24.3
 
-### Known issue
-
-* Using more than 64 channels can cause a segmentation fault when multiple collectives are used in the same `ncclGroup()` call.
-
 ## Unreleased - RCCL 2.23.4 for ROCm 6.4.1
 
 ### Added
diff --git a/src/enqueue.cc b/src/enqueue.cc
index bf325c58f3..ebad6f68fe 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -214,12 +214,16 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) {
   struct channelMasks hasBatchMask = plan->channelMask;
   struct ncclDevWorkBatch* batchPrev[MAXCHANNELS] = {}; // {0...}
   struct ncclDevWorkBatch* batchZero = (struct ncclDevWorkBatch*)(plan->kernelArgs+1);
+
+  // [RCCL] Preparing batchZero slightly different to support > 64 Channels
+  //        Need to ensure that all channels are processed first before dealing with
+  //        adding additional batches
   int batchIx = 0;
-  for (int maskIdx = 0; maskIdx < MAXCHANNELS/64; maskIdx++) {
-    while (hasBatchMask.masks[maskIdx] != 0) {
-      uint64_t tmpMask = hasBatchMask.masks[maskIdx]; // channels with a batch for this round.
-      do {
-        int c = popFirstOneBit(&tmpMask) + maskIdx * 64;
+  int done = 0;
+  while (!done) {
+    done = 1;
+    for (int c = 0; c < MAXCHANNELS; c++) {
+      if (hasBatchMask.masks[c / 64] & (1ULL << (c%64))) {
         if (!ncclIntruQueueEmpty(&wipChannels[c].workBatchQueue)) {
           struct ncclWorkBatchList* batchNode = ncclIntruQueueDequeue(&wipChannels[c].workBatchQueue);
           if (batchPrev[c] != nullptr) {
@@ -229,9 +233,11 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) {
           batchZero[batchIx++] = batchNode->batch;
         }
         if (ncclIntruQueueEmpty(&wipChannels[c].workBatchQueue)) {
-          hasBatchMask.masks[maskIdx] ^= 1ull<<(c%64);
+          hasBatchMask.masks[c / 64] ^= (1ULL << (c%64));
+        } else {
+          done = 0;
         }
-      } while (tmpMask != 0);
+      }
     }
   }
 
@@ -2131,7 +2137,7 @@ static ncclResult_t hostToDevRedOp(
   uint64_t allBits = uint64_t(-1)>>(64-nbits);
   uint64_t signBit = allBits^(allBits>>1);
   bool datatype_signed = false;
-  
+
   switch (int(op)) {
   case ncclSum:  opFull->op = ncclDevSum;  break;
   case ncclProd: opFull->op = ncclDevProd; break;