From 4f56aa5f8ce4524ee295f8d22f145ad9ea648796 Mon Sep 17 00:00:00 2001 From: ClementLinCF <162283536+ClementLinCF@users.noreply.github.com> Date: Sat, 1 Jun 2024 22:07:46 +0800 Subject: [PATCH] Optimize NCHANNELS and MSCCL config for gfx942 80CUs (#1195) * Optimize NCHANNELS and MSCCL config for gfx942 80CUs Set appropriately for different NCCL_MIN_NCHANNELS and MSCCL config, potentially improving communication perf on the MI300x 80CUs * Delete tools/msccl-algorithms/allreduce_1step_mccl_8_2_16777216_LL.xml * Change the factor of gfx94 and update msccl config [ROCm/rccl commit: cab25f919ee52919e7a081ba258624c67796dc9e] --- projects/rccl/src/init.cc | 14 +- .../allreduce-1step-4n-ll-1pass.xml | 306 ++++++++++++++++++ 2 files changed, 319 insertions(+), 1 deletion(-) create mode 100644 projects/rccl/tools/msccl-algorithms/allreduce-1step-4n-ll-1pass.xml diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index d383d4e924..b68639ada8 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -1390,6 +1390,18 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels); if (ringGraph.nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; + if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94")) { + if (nranks == 2) + // NCCL_MIN_NCHANNELS=32 + allGather3Data[rank].nc = 16; + else if (nranks == 4) + // NCCL_MIN_NCHANNELS=24 + allGather3Data[rank].nc = 6; + else if (nranks == 8) + // NCCL_MIN_NCHANNELS=56 + allGather3Data[rank].nc = 2; + } + allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable(); comm->topo->ll128Enabled = comm->topo->ll128Enabled || rcclParamLL128ForceEnable(); allGather3Data[rank].ll128Enabled = comm->topo->ll128Enabled; @@ -2788,4 +2800,4 @@ exit: return ret; fail: goto exit; -} \ No newline at end of file +} diff --git a/projects/rccl/tools/msccl-algorithms/allreduce-1step-4n-ll-1pass.xml b/projects/rccl/tools/msccl-algorithms/allreduce-1step-4n-ll-1pass.xml new file mode 100644 index 0000000000..5cc05d0326 --- /dev/null +++ b/projects/rccl/tools/msccl-algorithms/allreduce-1step-4n-ll-1pass.xml @@ -0,0 +1,306 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +