From 7f7c8d14f6e3f7a1170b1cd3d419c7499dcc28a8 Mon Sep 17 00:00:00 2001 From: AbandiGa Date: Thu, 13 Nov 2025 14:55:09 -0600 Subject: [PATCH] Disable Bfloat16 pipelining for reduction collectives for gfx950 (#2047) * disable bf16 reduce_copy pipelining for gfx950 * edit CHANGELOG * Combine unroll and pipeline local arch calculation into single function * fix multi-node error and disable for gfx950 even if it's not a local build * removed has_gfx950 * disable pipelining for gfx950 in rcclSetPipelining --------- Co-authored-by: Ghadeer Alabandi Co-authored-by: Ghadeer Alabandi Co-authored-by: Ghadeer Alabandi [ROCm/rccl commit: 277b6e9bacca66ce1821c4062430fa3492549779] --- projects/rccl/CHANGELOG.md | 1 + projects/rccl/src/device/generate.py | 24 +++++++++++++----------- projects/rccl/src/rccl_wrap.cc | 18 +----------------- 3 files changed, 15 insertions(+), 28 deletions(-) diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md index c1a3de4630..6f9117b244 100644 --- a/projects/rccl/CHANGELOG.md +++ b/projects/rccl/CHANGELOG.md @@ -7,6 +7,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: ### Changed * RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`. +* Disabled `reduceCopyPacks` pipelining for `gfx950`. 
## Unreleased - RCCL 2.27.7 for ROCm 7.1.1 diff --git a/projects/rccl/src/device/generate.py b/projects/rccl/src/device/generate.py index 480ff74529..ef2ecf91b2 100755 --- a/projects/rccl/src/device/generate.py +++ b/projects/rccl/src/device/generate.py @@ -170,9 +170,10 @@ class Fn: def __iter__(self): return iter((self.coll, self.algo, self.proto, self.redop, self.ty, self.acc, self.pipeline, self.unroll)) -def calc_unroll_for_local_arch(): +def calc_unroll_and_pipeline_for_local_arch(): + if not is_local_arch_only: - return all_unrolls + return (all_unrolls, all_pipelines) rocminfo_path = os.environ.get('ROCM_PATH') + "/bin/rocminfo" @@ -197,22 +198,22 @@ def calc_unroll_for_local_arch(): # We want to remove duplicates but cannot use a dictionary since same gfx name can have different cu counts # Use (gfx_name, cu_count) as key for dictionary and convert it to list here gfx_targets = list(gfx_targets.keys()) - + # Homogeneous system is required to build for only 1 variant of unroll factor (except for gfx950) if len(gfx_targets) == 1: gfx_name, cu_count = gfx_targets[0] if "gfx950" == gfx_name: - return ["1", "2"] + return (["1", "2"], ["0"]) # Disable pipelining for gfx950 elif "gfx908" == gfx_name or ("gfx942" == gfx_name and cu_count > 80): - return ["2"] + return (["2"], all_pipelines) else: - return ["4"] + return (["4"], all_pipelines) else: - return all_unrolls + return (all_unrolls, all_pipelines) # if building for local arch only, we only need to build for 1 variant of unroll for most gfx targets, -# except for gfx950 -local_unroll = calc_unroll_for_local_arch() +# except for gfx950. For gfx950, we also disable pipelining. 
+local_unroll, local_pipeline = calc_unroll_and_pipeline_for_local_arch() # Helper function to check if the conditions for the collective is being met def func_validate(coll, algo, proto, redop, ty, acc, pipeline, unroll): @@ -226,6 +227,7 @@ def func_validate(coll, algo, proto, redop, ty, acc, pipeline, unroll): ty not in tys_of_coll[coll] or acc not in acc_of_coll[coll] or pipeline not in pipelines_of_coll[coll] or (pipeline in ["1"] and ty not in pipelined_types) or + pipeline not in local_pipeline or unroll not in local_unroll): return False return True @@ -318,7 +320,7 @@ def enumerate_func_rows(): for redop in all_redops: for ty in all_tys: for acc in all_accs: - for pipeline in all_pipelines: + for pipeline in local_pipeline: if func_validate(coll, algo, proto, redop, ty, acc, pipeline, unroll): yield (coll, algo, proto, redop, ty, acc, pipeline, unroll) @@ -332,7 +334,7 @@ def custom_sort_key(fn: Fn): all_redops.index(fn.redop), all_tys.index(fn.ty), all_accs.index(fn.acc), - all_pipelines.index(fn.pipeline) + local_pipeline.index(fn.pipeline) ) def get_arch_guard(fn): diff --git a/projects/rccl/src/rccl_wrap.cc b/projects/rccl/src/rccl_wrap.cc index 5d700ace59..363e39ef9e 100644 --- a/projects/rccl/src/rccl_wrap.cc +++ b/projects/rccl/src/rccl_wrap.cc @@ -240,27 +240,11 @@ void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, stru void rcclSetPipelining(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info) { info->pipeline = 0; // Default to no pipelining - if (rcclParamdisableReduceCopyPipelining()) { + if (rcclParamdisableReduceCopyPipelining() || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) { return; } const bool dtypeOK = (info->datatype == ncclBfloat16) || rcclParamPipelineAllDTypes(); - if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && dtypeOK) { - if (comm->nNodes > 1) { - switch (info->func) { - case ncclFuncAllReduce: - case ncclFuncReduceScatter: - case ncclFuncReduce: 
- // Enable for multi-node - info->pipeline = 1; - break; - default: - break; - } - } - return; - } - if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") && dtypeOK) { switch (info->func) { // For multi-node case, we check if the number of bytes (`nBytes`) satisfies