From ccfb35fa6dcb8546659ec7ecaaafe0433be53d1a Mon Sep 17 00:00:00 2001
From: Pedram Alizadeh
Date: Fri, 26 Jan 2024 09:05:53 -0500
Subject: [PATCH] modifying the tuning table to improve the performance of allreduce for 8MB and 16MB for single-node MI300X (#1063)

---
 src/graph/tuning.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index fc9e7aa950..964a8446a6 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -362,6 +362,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       if (coll == ncclFuncAllGather && a != NCCL_ALGO_RING) continue;
 
       for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") && comm->topo->nodes[GPU].count == comm->topo->nRanks) continue;
         if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
         int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
         float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;