From 7f7c8d14f6e3f7a1170b1cd3d419c7499dcc28a8 Mon Sep 17 00:00:00 2001
From: AbandiGa <abandiga@gmail.com>
Date: Thu, 13 Nov 2025 14:55:09 -0600
Subject: [PATCH] Disable bfloat16 pipelining for reduction collectives for
 gfx950 (#2047)

* disable bf16 reduce_copy pipelining for gfx950

* edit CHANGELOG

* Combine unroll and pipeline local arch calculation into single function

* fix multi-node error and disable for gfx950 even if it's not a local build

* removed has_gfx950

* disable pipelining for gfx950 in rcclSetPipelining

---------

Co-authored-by: Ghadeer Alabandi <galaband@cv350-zts-gtu-h30-08.prov.gtu.zts.cpe.ice.amd.com>
Co-authored-by: Ghadeer Alabandi <galaband@cv350-zts-gtu-h30-18.prov.gtu.zts.cpe.ice.amd.com>
Co-authored-by: Ghadeer Alabandi <galaband@cv350-zts-gtu-h28a-08.prov.gtu.zts.cpe.ice.amd.com>

[ROCm/rccl commit: 277b6e9bacca66ce1821c4062430fa3492549779]
---
 projects/rccl/CHANGELOG.md           |  1 +
 projects/rccl/src/device/generate.py | 24 +++++++++++++-----------
 projects/rccl/src/rccl_wrap.cc       | 18 +-----------------
 3 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md
index c1a3de4630..6f9117b244 100644
--- a/projects/rccl/CHANGELOG.md
+++ b/projects/rccl/CHANGELOG.md
@@ -7,6 +7,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 ### Changed
 
 * RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
+* Disabled `reduceCopyPacks` pipelining for `gfx950`.
 
 ## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
 
diff --git a/projects/rccl/src/device/generate.py b/projects/rccl/src/device/generate.py
index 480ff74529..ef2ecf91b2 100755
--- a/projects/rccl/src/device/generate.py
+++ b/projects/rccl/src/device/generate.py
@@ -170,9 +170,10 @@ class Fn:
   def __iter__(self):
     return iter((self.coll, self.algo, self.proto, self.redop, self.ty, self.acc, self.pipeline, self.unroll))
 
-def calc_unroll_for_local_arch():
+def calc_unroll_and_pipeline_for_local_arch():
+
   if not is_local_arch_only:
-    return all_unrolls
+    return (all_unrolls, all_pipelines)
 
   rocminfo_path = os.environ.get('ROCM_PATH') + "/bin/rocminfo"
 
@@ -197,22 +198,22 @@ def calc_unroll_for_local_arch():
   # We want to remove duplicates but cannot use a dictionary since same gfx name can have different cu counts
   # Use (gfx_name, cu_count) as key for dictionary and convert it to list here
   gfx_targets = list(gfx_targets.keys())
-
+  
   # Homogeneous system is required to build for only 1 variant of unroll factor (except for gfx950)
   if len(gfx_targets) == 1:
     gfx_name, cu_count = gfx_targets[0]
     if "gfx950" == gfx_name:
-      return ["1", "2"]
+      return (["1", "2"], ["0"])  # Disable pipelining for gfx950
     elif "gfx908" == gfx_name or ("gfx942" == gfx_name and cu_count > 80):
-      return ["2"]
+      return (["2"], all_pipelines)
     else:
-      return ["4"]
+      return (["4"], all_pipelines)
   else:
-    return all_unrolls
+    return (all_unrolls, all_pipelines)
 
 # if building for local arch only, we only need to build for 1 variant of unroll for most gfx targets,
-# except for gfx950
-local_unroll = calc_unroll_for_local_arch()
+# except for gfx950. For gfx950, we also disable pipelining.
+local_unroll, local_pipeline = calc_unroll_and_pipeline_for_local_arch()
 
 # Helper function to check if the conditions for the collective is being met
 def func_validate(coll, algo, proto, redop, ty, acc,  pipeline, unroll):
@@ -226,6 +227,7 @@ def func_validate(coll, algo, proto, redop, ty, acc,  pipeline, unroll):
       ty not in tys_of_coll[coll] or
       acc not in acc_of_coll[coll] or
       pipeline not in pipelines_of_coll[coll] or (pipeline in ["1"] and ty not in pipelined_types) or
+      pipeline not in local_pipeline or
       unroll not in local_unroll):
     return False
   return True
@@ -318,7 +320,7 @@ def enumerate_func_rows():
           for redop in all_redops:
             for ty in all_tys:
               for acc in all_accs:
-                for pipeline in all_pipelines:
+                for pipeline in local_pipeline:
                   if func_validate(coll, algo, proto, redop, ty, acc, pipeline, unroll):
                     yield (coll, algo, proto, redop, ty, acc, pipeline, unroll)
 
@@ -332,7 +334,7 @@ def custom_sort_key(fn: Fn):
         all_redops.index(fn.redop),
         all_tys.index(fn.ty),
         all_accs.index(fn.acc),
-        all_pipelines.index(fn.pipeline)
+        local_pipeline.index(fn.pipeline)
     )
 
 def get_arch_guard(fn):
diff --git a/projects/rccl/src/rccl_wrap.cc b/projects/rccl/src/rccl_wrap.cc
index 5d700ace59..363e39ef9e 100644
--- a/projects/rccl/src/rccl_wrap.cc
+++ b/projects/rccl/src/rccl_wrap.cc
@@ -240,27 +240,11 @@ void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, stru
 
 void rcclSetPipelining(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info) {
   info->pipeline = 0; // Default to no pipelining
-  if (rcclParamdisableReduceCopyPipelining()) {
+  if (rcclParamdisableReduceCopyPipelining() || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
     return;
   }
   const bool dtypeOK = (info->datatype == ncclBfloat16) || rcclParamPipelineAllDTypes();
 
-  if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && dtypeOK) {
-    if (comm->nNodes > 1) {
-      switch (info->func) {
-        case ncclFuncAllReduce:
-        case ncclFuncReduceScatter:
-        case ncclFuncReduce:
-          // Enable for multi-node
-          info->pipeline = 1;
-          break;
-        default:
-          break;
-      }
-    }
-    return;
-  }
-
   if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") && dtypeOK) {
     switch (info->func) {
       // For multi-node case, we check if the number of bytes (`nBytes`) satisfies