[gfx950] Make bypassing __threadfence the default for multinode. (#1947)

* Gate based on the ROCm version; safe for ROCm 7.0.2 and beyond.
* Rename to gfx9CheapFenceOff, since we use this for both gfx942 and gfx950. Thanks Nilesh.
* Add an INFO logging statement under NCCL_INIT that prints whether the cheap fence is enabled.

[ROCm/rccl commit: c70f5b4621]
Этот коммит содержится в:
alex-breslow-amd
2025-10-15 09:15:36 -07:00
коммит произвёл GitHub
родитель 6d151d4e21
Коммит 455d516dc4
5 изменённых файлов: 12 добавлений и 11 удалений
+1 -1
Просмотреть файл
@@ -875,7 +875,7 @@ public:
patBarrier();
}
if(collWork){
skip_fence = !collWork -> gfx942CheapFenceOff;
skip_fence = !collWork -> gfx9CheapFenceOff;
}
}
+2 -2
Просмотреть файл
@@ -348,7 +348,7 @@ static bool testBudget(
// Returns whether the cheap fence should be disabled at the device level for this work item.
// The result is true (cheap fence OFF) when devWork.regUsed or devWork.netRegUsed is non-zero,
// or when disabledByPrecheck is true; it is false only when all three are clear.
// Should be called after the devWork fields it depends on (regUsed, netRegUsed) have been set.
// NOTE(review): the visible call site computes this right after regUsed/netRegUsed are zeroed
// and before netRegUsed is assigned -- confirm the flag is recomputed once those fields are final.
bool gfx942CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){
bool gfx9CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){
// fenceOk: neither regUsed nor netRegUsed is set, and the precheck did not disable the cheap fence.
bool fenceOk = devWork.regUsed == 0 && devWork.netRegUsed == 0 && !disabledByPrecheck;
return !fenceOk;
}
@@ -388,7 +388,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
devWork.isOneRPN = comm->isOneRPN;
devWork.netRegUsed = devWork.regUsed = 0;
devWork.gfx942CheapFenceOff = gfx942CheapFenceOff(devWork, comm->gfx942CheapFenceOff);
devWork.gfx9CheapFenceOff = gfx9CheapFenceOff(devWork, comm->gfx9CheapFenceOff);
devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh);
if (task->regBufType & NCCL_NET_REG_BUFFER)
devWork.netRegUsed = 1;
+1 -1
Просмотреть файл
@@ -504,7 +504,7 @@ struct ncclComm {
int node;
int nNodes;
int rcclUseOneSlice; // RCCL: true if this comm is using one slice per primitive
int gfx942CheapFenceOff; // RCCL: true if gfx942 cheap fence is disabled
int gfx9CheapFenceOff; // RCCL: true if gfx9 cheap fence is disabled
int localRank;
int localRanks;
int maxLocalRanks;
+1 -1
Просмотреть файл
@@ -327,7 +327,7 @@ struct alignas(16) ncclDevWorkColl {
// nChannels == (channelHi - channelLo) + 1
uint32_t channelLo:8, channelHi:8;
uint32_t nWarps:8;
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1, gfx942CheapFenceOff:1;
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1, gfx9CheapFenceOff:1;
uint32_t root:30, connIndex:2;
uint16_t pivotA2ANumBiRings:15, profilerEnabled:1;
void* recvbuff;
+7 -6
Просмотреть файл
@@ -111,8 +111,8 @@ RCCL_PARAM(MscclppThreshold, "MSCCLPP_THRESHOLD", (size_t)(16*1024*1024));
static constexpr int64_t defaultEnableMscclpp = 0;
RCCL_PARAM(MscclppEnabled, "MSCCLPP_ENABLE", defaultEnableMscclpp);
RCCL_PARAM(MscclppForceEnabled, "MSCCLPP_FORCE_ENABLE", 0);
// Turn off cheap fence for gfx942
RCCL_PARAM(Gfx942CheapFenceOff, "GFX942_CHEAP_FENCE_OFF", 0);
// Turn off cheap fence for gfx942/gfx950
RCCL_PARAM(Gfx9CheapFenceOff, "GFX9_CHEAP_FENCE_OFF", 0);
// GDRCOPY support: Off by default
NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
@@ -1397,16 +1397,17 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels);
if (ringGraph->nChannels > MAXCHANNELS/2)
allGather3Data[rank].nc = 1;
// Default: cheap fence disabled. It is only re-enabled below when HIP_UNCACHED_MEMORY is defined.
comm -> gfx942CheapFenceOff = 1;
comm -> gfx9CheapFenceOff = 1;
#ifdef HIP_UNCACHED_MEMORY
// If rcclParamGfx9CheapFenceOff() returns non-zero, the flag keeps its default of 1 (cheap fence OFF).
if(!rcclParamGfx942CheapFenceOff()){
if(!rcclParamGfx9CheapFenceOff()){
if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")){
// gfx942: cheap fence is left ON regardless of node count.
comm -> gfx942CheapFenceOff = 0;
comm -> gfx9CheapFenceOff = 0;
}
else if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){
comm -> gfx942CheapFenceOff = nNodes > 1;
// gfx950: cheap fence is turned OFF only when both the ROCM_VERSION check and nNodes > 1 hold.
// NOTE(review): the trailing comment says 7.0.2, but the literal is 70200. If ROCM_VERSION is
// encoded as major*10000 + minor*100 + patch, 70200 denotes 7.2.0 and 7.0.2 would be 70002 --
// confirm the encoding and the intended threshold.
comm -> gfx9CheapFenceOff = ROCM_VERSION < 70200 && nNodes > 1; // Enable for single node only prior to ROCm 7.0.2
}
}
// Report the final decision; this is emitted only inside the HIP_UNCACHED_MEMORY build.
INFO(NCCL_INIT, "GFX9 cheap fence is %s", comm -> gfx9CheapFenceOff ? "OFF" : "ON");
#endif
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) {
// Multi-node MI300A