From 56e0b4e4454b0fc2267aab365c2db1b14b7059f2 Mon Sep 17 00:00:00 2001 From: alex-breslow-amd Date: Thu, 6 Nov 2025 12:12:45 -0800 Subject: [PATCH] [gfx950] Turn On Single Node One Slice Optimization for gfx950 and MI300A (#2017) * Internal benchmarking shows a nice single-node performance uplift on MI300A and MI350 (gfx950) --- src/init.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/init.cc b/src/init.cc index 07ce94b219..69ce95b0ca 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1416,12 +1416,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } INFO(NCCL_INIT, "GFX9 cheap fence is %s", comm -> gfx9CheapFenceOff ? "OFF" : "ON"); #endif + // RCCL: Only use one slice per primitive on some single node gfx9xx systems, only currently enabled for AllReduce, ReduceScatter, and AllGather + if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){ + comm->rcclUseOneSlice = nNodes == 1; + } if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) { // Multi-node MI300A int managed = 0; CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0)); - // RCCL: Only use one slice per primitive on some single node gfx9xx systems - comm->rcclUseOneSlice = !managed && nNodes == 1; if (managed && nNodes > 1) { // This forces the minimum channels to 24 allGather3Data[rank].nc = 6;