Disable Bfloat16 pipelining for reduction collectives for gfx950 (#2047)

* disable bf16 reduce_copy pipelining for gfx950

* edit CHANGELOG

* Combine unroll and pipeline local arch calculation into single function

* fix multi-node error and disable for gfx950 even if it's not a local build

* removed has_gfx950

* disable pipelining for gfx950 in rcclSetPipelining

---------

Co-authored-by: Ghadeer Alabandi <galaband@cv350-zts-gtu-h30-08.prov.gtu.zts.cpe.ice.amd.com>
Co-authored-by: Ghadeer Alabandi <galaband@cv350-zts-gtu-h30-18.prov.gtu.zts.cpe.ice.amd.com>
Co-authored-by: Ghadeer Alabandi <galaband@cv350-zts-gtu-h28a-08.prov.gtu.zts.cpe.ice.amd.com>

[ROCm/rccl commit: 277b6e9bac]
This commit is contained in:
AbandiGa
2025-11-13 14:55:09 -06:00
zatwierdzone przez GitHub
rodzic 9a81823515
commit 7f7c8d14f6
3 zmienionych plików z 15 dodań i 28 usunięć
+1
Wyświetl plik
@@ -7,6 +7,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
### Changed
* RCCL error messages have been made more verbose in several cases. RCCL now prints out fatal error messages by default. Fatal error messages can be suppressed by setting `NCCL_DEBUG=NONE`.
* Disabled `reduceCopyPacks` pipelining for `gfx950`.
## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
+13 -11
Wyświetl plik
@@ -170,9 +170,10 @@ class Fn:
def __iter__(self):
return iter((self.coll, self.algo, self.proto, self.redop, self.ty, self.acc, self.pipeline, self.unroll))
def calc_unroll_for_local_arch():
def calc_unroll_and_pipeline_for_local_arch():
if not is_local_arch_only:
return all_unrolls
return (all_unrolls, all_pipelines)
rocminfo_path = os.environ.get('ROCM_PATH') + "/bin/rocminfo"
@@ -197,22 +198,22 @@ def calc_unroll_for_local_arch():
# We want to remove duplicates but cannot use a dictionary since same gfx name can have different cu counts
# Use (gfx_name, cu_count) as key for dictionary and convert it to list here
gfx_targets = list(gfx_targets.keys())
# Homogeneous system is required to build for only 1 variant of unroll factor (except for gfx950)
if len(gfx_targets) == 1:
gfx_name, cu_count = gfx_targets[0]
if "gfx950" == gfx_name:
return ["1", "2"]
return (["1", "2"], ["0"]) # Disable pipelining for gfx950
elif "gfx908" == gfx_name or ("gfx942" == gfx_name and cu_count > 80):
return ["2"]
return (["2"], all_pipelines)
else:
return ["4"]
return (["4"], all_pipelines)
else:
return all_unrolls
return (all_unrolls, all_pipelines)
# if building for local arch only, we only need to build for 1 variant of unroll for most gfx targets,
# except for gfx950
local_unroll = calc_unroll_for_local_arch()
# except for gfx950. For gfx950, we also disable pipelining.
local_unroll, local_pipeline = calc_unroll_and_pipeline_for_local_arch()
# Helper function to check if the conditions for the collective is being met
def func_validate(coll, algo, proto, redop, ty, acc, pipeline, unroll):
@@ -226,6 +227,7 @@ def func_validate(coll, algo, proto, redop, ty, acc, pipeline, unroll):
ty not in tys_of_coll[coll] or
acc not in acc_of_coll[coll] or
pipeline not in pipelines_of_coll[coll] or (pipeline in ["1"] and ty not in pipelined_types) or
pipeline not in local_pipeline or
unroll not in local_unroll):
return False
return True
@@ -318,7 +320,7 @@ def enumerate_func_rows():
for redop in all_redops:
for ty in all_tys:
for acc in all_accs:
for pipeline in all_pipelines:
for pipeline in local_pipeline:
if func_validate(coll, algo, proto, redop, ty, acc, pipeline, unroll):
yield (coll, algo, proto, redop, ty, acc, pipeline, unroll)
@@ -332,7 +334,7 @@ def custom_sort_key(fn: Fn):
all_redops.index(fn.redop),
all_tys.index(fn.ty),
all_accs.index(fn.acc),
all_pipelines.index(fn.pipeline)
local_pipeline.index(fn.pipeline)
)
def get_arch_guard(fn):
+1 -17
Wyświetl plik
@@ -240,27 +240,11 @@ void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, stru
void rcclSetPipelining(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info) {
info->pipeline = 0; // Default to no pipelining
if (rcclParamdisableReduceCopyPipelining()) {
if (rcclParamdisableReduceCopyPipelining() || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
return;
}
const bool dtypeOK = (info->datatype == ncclBfloat16) || rcclParamPipelineAllDTypes();
if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && dtypeOK) {
if (comm->nNodes > 1) {
switch (info->func) {
case ncclFuncAllReduce:
case ncclFuncReduceScatter:
case ncclFuncReduce:
// Enable for multi-node
info->pipeline = 1;
break;
default:
break;
}
}
return;
}
if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") && dtypeOK) {
switch (info->func) {
// For multi-node case, we check if the number of bytes (`nBytes`) satisfies