Disable Bfloat16 pipelining for reduction collectives for gfx950 (#2047)

* disable bf16 reduce_copy pipelining for gfx950 * edit CHANGELOG * Combine unroll and pipeline local arch calculation into single function * fix multi-node error and disable for gfx950 even if it's not a local build * removed has_gfx950 * disable pipelining for gfx950 in rcclSetPipelining --------- Co-authored-by: Ghadeer Alabandi <galaband@cv350-zts-gtu-h30-08.prov.gtu.zts.cpe.ice.amd.com> Co-authored-by: Ghadeer Alabandi <galaband@cv350-zts-gtu-h30-18.prov.gtu.zts.cpe.ice.amd.com> Co-authored-by: Ghadeer Alabandi <galaband@cv350-zts-gtu-h28a-08.prov.gtu.zts.cpe.ice.amd.com> [ROCm/rccl commit: 277b6e9bac]
2025-11-13 14:55:09 -06:00
commit 7f7c8d14f6
@@ -7,6 +7,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 ### Changed

 * RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
+* Disabled `reduceCopyPacks` pipelining for `gfx950`.

 ## Unreleased - RCCL 2.27.7 for ROCm 7.1.1

@@ -170,9 +170,10 @@ class Fn:
  def __iter__(self):
    return iter((self.coll, self.algo, self.proto, self.redop, self.ty, self.acc, self.pipeline, self.unroll))

-def calc_unroll_for_local_arch():
+def calc_unroll_and_pipeline_for_local_arch():
+
  if not is_local_arch_only:
-    return all_unrolls
+    return (all_unrolls, all_pipelines)

  rocminfo_path = os.environ.get('ROCM_PATH') + "/bin/rocminfo"

@@ -197,22 +198,22 @@ def calc_unroll_for_local_arch():
  # We want to remove duplicates but cannot use a dictionary since same gfx name can have different cu counts
  # Use (gfx_name, cu_count) as key for dictionary and convert it to list here
  gfx_targets = list(gfx_targets.keys())
-
+  
  # Homogeneous system is required to build for only 1 variant of unroll factor (except for gfx950)
  if len(gfx_targets) == 1:
    gfx_name, cu_count = gfx_targets[0]
    if "gfx950" == gfx_name:
-      return ["1", "2"]
+      return (["1", "2"], ["0"])  # Disable pipelining for gfx950
    elif "gfx908" == gfx_name or ("gfx942" == gfx_name and cu_count > 80):
-      return ["2"]
+      return (["2"], all_pipelines)
    else:
-      return ["4"]
+      return (["4"], all_pipelines)
  else:
-    return all_unrolls
+    return (all_unrolls, all_pipelines)

 # if building for local arch only, we only need to build for 1 variant of unroll for most gfx targets,
-# except for gfx950
-local_unroll = calc_unroll_for_local_arch()
+# except for gfx950. For gfx950, we also disable pipelining.
+local_unroll, local_pipeline = calc_unroll_and_pipeline_for_local_arch()

 # Helper function to check if the conditions for the collective is being met
 def func_validate(coll, algo, proto, redop, ty, acc,  pipeline, unroll):
@@ -226,6 +227,7 @@ def func_validate(coll, algo, proto, redop, ty, acc,  pipeline, unroll):
      ty not in tys_of_coll[coll] or
      acc not in acc_of_coll[coll] or
      pipeline not in pipelines_of_coll[coll] or (pipeline in ["1"] and ty not in pipelined_types) or
+      pipeline not in local_pipeline or
      unroll not in local_unroll):
    return False
  return True
@@ -318,7 +320,7 @@ def enumerate_func_rows():
          for redop in all_redops:
            for ty in all_tys:
              for acc in all_accs:
-                for pipeline in all_pipelines:
+                for pipeline in local_pipeline:
                  if func_validate(coll, algo, proto, redop, ty, acc, pipeline, unroll):
                    yield (coll, algo, proto, redop, ty, acc, pipeline, unroll)

@@ -332,7 +334,7 @@ def custom_sort_key(fn: Fn):
        all_redops.index(fn.redop),
        all_tys.index(fn.ty),
        all_accs.index(fn.acc),
-        all_pipelines.index(fn.pipeline)
+        local_pipeline.index(fn.pipeline)
    )

 def get_arch_guard(fn):
@@ -240,27 +240,11 @@ void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, stru

 void rcclSetPipelining(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info) {
  info->pipeline = 0; // Default to no pipelining
-  if (rcclParamdisableReduceCopyPipelining()) {
+  if (rcclParamdisableReduceCopyPipelining() || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
    return;
  }
  const bool dtypeOK = (info->datatype == ncclBfloat16) || rcclParamPipelineAllDTypes();

-  if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && dtypeOK) {
-    if (comm->nNodes > 1) {
-      switch (info->func) {
-        case ncclFuncAllReduce:
-        case ncclFuncReduceScatter:
-        case ncclFuncReduce:
-          // Enable for multi-node
-          info->pipeline = 1;
-          break;
-        default:
-          break;
-      }
-    }
-    return;
-  }
-
  if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") && dtypeOK) {
    switch (info->func) {
      // For multi-node case, we check if the number of bytes (`nBytes`) satisfies