[gfx950] Turn On Single Node One Slice Optimization for gfx950 and MI300A (#2017)
* Internal benchmarking shows a notable single-node performance uplift on MI300A and MI350
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
d6a53d2022
Коммит
56e0b4e445
+4
-2
@@ -1416,12 +1416,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
}
|
||||
INFO(NCCL_INIT, "GFX9 cheap fence is %s", comm -> gfx9CheapFenceOff ? "OFF" : "ON");
|
||||
#endif
|
||||
// RCCL: Use only one slice per primitive on some single-node gfx9xx systems; currently enabled only for AllReduce, ReduceScatter, and AllGather
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){
|
||||
comm->rcclUseOneSlice = nNodes == 1;
|
||||
}
|
||||
if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) {
|
||||
// Multi-node MI300A
|
||||
int managed = 0;
|
||||
CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
|
||||
// RCCL: Only use one slice per primitive on some single node gfx9xx systems
|
||||
comm->rcclUseOneSlice = !managed && nNodes == 1;
|
||||
if (managed && nNodes > 1) {
|
||||
// This forces the minimum channels to 24
|
||||
allGather3Data[rank].nc = 6;
|
||||
|
||||
Ссылка в новой задаче
Block a user