From 73acf3eeec474aa59ed80a2bdb2eb495beb02677 Mon Sep 17 00:00:00 2001 From: Pedram Alizadeh Date: Wed, 8 May 2024 15:49:33 -0400 Subject: [PATCH] modifying the tuning table to improve the performance of broadcast for 1MB to 64MB for single-node MI300X (#1172) --- src/graph/tuning.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 9d3441401e..d2e958e034 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -379,6 +379,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p]; else busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p]; + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && coll == ncclFuncBroadcast && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) { busBw = busBw * 2.33; } #else if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); } if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);