From 455d516dc45cada97144cecd3ce6442fb21b5f91 Mon Sep 17 00:00:00 2001 From: alex-breslow-amd Date: Wed, 15 Oct 2025 09:15:36 -0700 Subject: [PATCH] [gfx950] Make bypassing __threadfence the default for multinode. (#1947) * Gate based on ROCM version, safe for ROCm 7.0.2 and beyond. * Updates naming to gfx9CheapFenceOff since we use this for gfx942 and gfx950. Thanks Nilesh. * Add info logging statement to NCCL_INIT to print whether enabled when INFO logging is enabled. [ROCm/rccl commit: c70f5b4621e951150a1d4140bb82382340b88745] --- projects/rccl/src/device/prims_simple.h | 2 +- projects/rccl/src/enqueue.cc | 4 ++-- projects/rccl/src/include/comm.h | 2 +- projects/rccl/src/include/device.h | 2 +- projects/rccl/src/init.cc | 13 +++++++------ 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/projects/rccl/src/device/prims_simple.h b/projects/rccl/src/device/prims_simple.h index f8187bdb28..1aceac87de 100644 --- a/projects/rccl/src/device/prims_simple.h +++ b/projects/rccl/src/device/prims_simple.h @@ -875,7 +875,7 @@ public: patBarrier(); } if(collWork){ - skip_fence = !collWork -> gfx942CheapFenceOff; + skip_fence = !collWork -> gfx9CheapFenceOff; } } diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index c03ba7b8cd..f55ed35a16 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -348,7 +348,7 @@ static bool testBudget( // Returns whether this should be disabled at the device level. Should be called after devWork fields have been set for what // it depends on. -bool gfx942CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){ +bool gfx9CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){ bool fenceOk = devWork.regUsed == 0 && devWork.netRegUsed == 0 && !disabledByPrecheck; return !fenceOk; } @@ -388,7 +388,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { devWork.isOneRPN = comm->isOneRPN; devWork.netRegUsed = devWork.regUsed = 0; - devWork.gfx942CheapFenceOff = gfx942CheapFenceOff(devWork, comm->gfx942CheapFenceOff); + devWork.gfx9CheapFenceOff = gfx9CheapFenceOff(devWork, comm->gfx9CheapFenceOff); devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh); if (task->regBufType & NCCL_NET_REG_BUFFER) devWork.netRegUsed = 1; diff --git a/projects/rccl/src/include/comm.h b/projects/rccl/src/include/comm.h index a60d0d96c1..1038ebbb54 100644 --- a/projects/rccl/src/include/comm.h +++ b/projects/rccl/src/include/comm.h @@ -504,7 +504,7 @@ struct ncclComm { int node; int nNodes; int rcclUseOneSlice; // RCCL: true if this comm is using one slice per primitive - int gfx942CheapFenceOff; // RCCL: true if gfx942 cheap fence is disabled + int gfx9CheapFenceOff; // RCCL: true if gfx9 cheap fence is disabled int localRank; int localRanks; int maxLocalRanks; diff --git a/projects/rccl/src/include/device.h b/projects/rccl/src/include/device.h index d65ffd3664..9df54adfda 100644 --- a/projects/rccl/src/include/device.h +++ b/projects/rccl/src/include/device.h @@ -327,7 +327,7 @@ struct alignas(16) ncclDevWorkColl { // nChannels == (channelHi - channelLo) + 1 uint32_t channelLo:8, channelHi:8; uint32_t nWarps:8; - uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1, gfx942CheapFenceOff:1; + uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1, gfx9CheapFenceOff:1; uint32_t root:30, connIndex:2; uint16_t pivotA2ANumBiRings:15, profilerEnabled:1; void* recvbuff; diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index f684ea8248..b31d87029f 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -111,8 +111,8 @@ RCCL_PARAM(MscclppThreshold, "MSCCLPP_THRESHOLD", (size_t)(16*1024*1024)); static constexpr int64_t defaultEnableMscclpp = 0; RCCL_PARAM(MscclppEnabled, "MSCCLPP_ENABLE", defaultEnableMscclpp); RCCL_PARAM(MscclppForceEnabled, "MSCCLPP_FORCE_ENABLE", 0); -// Turn off cheap fence for gfx942 -RCCL_PARAM(Gfx942CheapFenceOff, "GFX942_CHEAP_FENCE_OFF", 0); +// Turn off cheap fence for gfx942/gfx950 +RCCL_PARAM(Gfx9CheapFenceOff, "GFX9_CHEAP_FENCE_OFF", 0); // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); @@ -1397,16 +1397,17 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels); if (ringGraph->nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; - comm -> gfx942CheapFenceOff = 1; + comm -> gfx9CheapFenceOff = 1; #ifdef HIP_UNCACHED_MEMORY - if(!rcclParamGfx942CheapFenceOff()){ + if(!rcclParamGfx9CheapFenceOff()){ if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")){ - comm -> gfx942CheapFenceOff = 0; + comm -> gfx9CheapFenceOff = 0; } else if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){ - comm -> gfx942CheapFenceOff = nNodes > 1; + comm -> gfx9CheapFenceOff = ROCM_VERSION < 70200 && nNodes > 1; // Enable for single node only prior to ROCm 7.0.2 } } + INFO(NCCL_INIT, "GFX9 cheap fence is %s", comm -> gfx9CheapFenceOff ? "OFF" : "ON"); #endif if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) { // Multi-node MI300A