[gfx950] Turn On Single Node One Slice Optimization for gfx950 and MI300A (#2017)

* Internal benchmarking shows a nice single-node performance uplift for MI300A and MI350.
Этот коммит содержится в:
alex-breslow-amd
2025-11-06 12:12:45 -08:00
коммит произвёл GitHub
родитель d6a53d2022
Коммит 56e0b4e445
+4 -2
Просмотреть файл
@@ -1416,12 +1416,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
}
INFO(NCCL_INIT, "GFX9 cheap fence is %s", comm -> gfx9CheapFenceOff ? "OFF" : "ON");
#endif
// RCCL: Only use one slice per primitive on some single node gfx9xx systems, only currently enabled for AllReduce, ReduceScatter, and AllGather
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){
comm->rcclUseOneSlice = nNodes == 1;
}
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) {
// Multi-node MI300A
int managed = 0;
CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
// RCCL: Only use one slice per primitive on some single node gfx9xx systems
comm->rcclUseOneSlice = !managed && nNodes == 1;
if (managed && nNodes > 1) {
// This forces the minimum channels to 24
allGather3Data[rank].nc = 6;