[gfx950] Make bypassing __threadfence the default for multinode. (#1947)
* Gate based on the ROCm version; safe for ROCm 7.0.2 and beyond.
* Updates naming to gfx9CheapFenceOff since we use this for gfx942 and gfx950. Thanks Nilesh.
* Add an INFO log statement under NCCL_INIT that prints whether the cheap fence is enabled.
[ROCm/rccl commit: c70f5b4621]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
6d151d4e21
Коммит
455d516dc4
@@ -875,7 +875,7 @@ public:
|
||||
patBarrier();
|
||||
}
|
||||
if(collWork){
|
||||
skip_fence = !collWork -> gfx942CheapFenceOff;
|
||||
skip_fence = !collWork -> gfx9CheapFenceOff;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -348,7 +348,7 @@ static bool testBudget(
|
||||
|
||||
// Returns whether the cheap fence should be disabled at the device level. Should be called after the devWork fields
|
||||
// it depends on have been set.
|
||||
bool gfx942CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){
|
||||
bool gfx9CheapFenceOff(const ncclDevWorkColl& devWork, bool disabledByPrecheck){
|
||||
bool fenceOk = devWork.regUsed == 0 && devWork.netRegUsed == 0 && !disabledByPrecheck;
|
||||
return !fenceOk;
|
||||
}
|
||||
@@ -388,7 +388,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
|
||||
|
||||
devWork.isOneRPN = comm->isOneRPN;
|
||||
devWork.netRegUsed = devWork.regUsed = 0;
|
||||
devWork.gfx942CheapFenceOff = gfx942CheapFenceOff(devWork, comm->gfx942CheapFenceOff);
|
||||
devWork.gfx9CheapFenceOff = gfx9CheapFenceOff(devWork, comm->gfx9CheapFenceOff);
|
||||
devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh);
|
||||
if (task->regBufType & NCCL_NET_REG_BUFFER)
|
||||
devWork.netRegUsed = 1;
|
||||
|
||||
@@ -504,7 +504,7 @@ struct ncclComm {
|
||||
int node;
|
||||
int nNodes;
|
||||
int rcclUseOneSlice; // RCCL: true if this comm is using one slice per primitive
|
||||
int gfx942CheapFenceOff; // RCCL: true if gfx942 cheap fence is disabled
|
||||
int gfx9CheapFenceOff; // RCCL: true if gfx9 cheap fence is disabled
|
||||
int localRank;
|
||||
int localRanks;
|
||||
int maxLocalRanks;
|
||||
|
||||
@@ -327,7 +327,7 @@ struct alignas(16) ncclDevWorkColl {
|
||||
// nChannels == (channelHi - channelLo) + 1
|
||||
uint32_t channelLo:8, channelHi:8;
|
||||
uint32_t nWarps:8;
|
||||
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1, gfx942CheapFenceOff:1;
|
||||
uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1, rcclUseOneSlice:1, gfx9CheapFenceOff:1;
|
||||
uint32_t root:30, connIndex:2;
|
||||
uint16_t pivotA2ANumBiRings:15, profilerEnabled:1;
|
||||
void* recvbuff;
|
||||
|
||||
@@ -111,8 +111,8 @@ RCCL_PARAM(MscclppThreshold, "MSCCLPP_THRESHOLD", (size_t)(16*1024*1024));
|
||||
static constexpr int64_t defaultEnableMscclpp = 0;
|
||||
RCCL_PARAM(MscclppEnabled, "MSCCLPP_ENABLE", defaultEnableMscclpp);
|
||||
RCCL_PARAM(MscclppForceEnabled, "MSCCLPP_FORCE_ENABLE", 0);
|
||||
// Turn off cheap fence for gfx942
|
||||
RCCL_PARAM(Gfx942CheapFenceOff, "GFX942_CHEAP_FENCE_OFF", 0);
|
||||
// Turn off cheap fence for gfx942/gfx950
|
||||
RCCL_PARAM(Gfx9CheapFenceOff, "GFX9_CHEAP_FENCE_OFF", 0);
|
||||
|
||||
// GDRCOPY support: Off by default
|
||||
NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
|
||||
@@ -1397,16 +1397,17 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels);
|
||||
if (ringGraph->nChannels > MAXCHANNELS/2)
|
||||
allGather3Data[rank].nc = 1;
|
||||
comm -> gfx942CheapFenceOff = 1;
|
||||
comm -> gfx9CheapFenceOff = 1;
|
||||
#ifdef HIP_UNCACHED_MEMORY
|
||||
if(!rcclParamGfx942CheapFenceOff()){
|
||||
if(!rcclParamGfx9CheapFenceOff()){
|
||||
if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")){
|
||||
comm -> gfx942CheapFenceOff = 0;
|
||||
comm -> gfx9CheapFenceOff = 0;
|
||||
}
|
||||
else if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){
|
||||
comm -> gfx942CheapFenceOff = nNodes > 1;
|
||||
comm -> gfx9CheapFenceOff = ROCM_VERSION < 70200 && nNodes > 1; // Enable for single node only prior to ROCm 7.0.2
|
||||
}
|
||||
}
|
||||
INFO(NCCL_INIT, "GFX9 cheap fence is %s", comm -> gfx9CheapFenceOff ? "OFF" : "ON");
|
||||
#endif
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) {
|
||||
// Multi-node MI300A
|
||||
|
||||
Ссылка в новой задаче
Block a user