diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index d383d4e924..b68639ada8 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -1390,6 +1390,18 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels); if (ringGraph.nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; + if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx94")) { + if (nranks == 2) + // NCCL_MIN_NCHANNELS=32 + allGather3Data[rank].nc = 16; + else if (nranks == 4) + // NCCL_MIN_NCHANNELS=24 + allGather3Data[rank].nc = 6; + else if (nranks == 8) + // NCCL_MIN_NCHANNELS=56 + allGather3Data[rank].nc = 2; + } + allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable(); comm->topo->ll128Enabled = comm->topo->ll128Enabled || rcclParamLL128ForceEnable(); allGather3Data[rank].ll128Enabled = comm->topo->ll128Enabled; @@ -2788,4 +2800,4 @@ exit: return ret; fail: goto exit; -} \ No newline at end of file +} diff --git a/projects/rccl/tools/msccl-algorithms/allreduce-1step-4n-ll-1pass.xml b/projects/rccl/tools/msccl-algorithms/allreduce-1step-4n-ll-1pass.xml new file mode 100644 index 0000000000..5cc05d0326 --- /dev/null +++ b/projects/rccl/tools/msccl-algorithms/allreduce-1step-4n-ll-1pass.xml @@ -0,0 +1,306 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +