From 63d5846452e1647a2ae7dcb559ec272c79042232 Mon Sep 17 00:00:00 2001 From: David DeBonis Date: Fri, 31 Oct 2025 08:59:46 -0600 Subject: [PATCH] Single-node AllGather and ReduceScatter Optimization (#2019) * Single-node performance tuning * Normalizing value to individual rank --- src/rccl_wrap.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/rccl_wrap.cc b/src/rccl_wrap.cc index 73bb816ac6..74b3994a61 100644 --- a/src/rccl_wrap.cc +++ b/src/rccl_wrap.cc @@ -38,14 +38,20 @@ RCCL_PARAM(DirectAllGatherThreshold, "DIRECT_ALLGATHER_THRESHOLD", 75497472); void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info) { // Honor user input for protocol choice static int userProtocolInput = -2; + size_t sizePerRank = rcclGetSizePerRank(info->func, nBytes, comm->nRanks); if (userProtocolInput == -2) { const char *protoStr = getenv("NCCL_PROTO"); userProtocolInput = !protoStr ? 0 : 1; } - if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && comm->nNodes == 1 && (info->func == ncclFuncAllGather) && nBytes <= 524288) { + if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && comm->nNodes == 1 && (info->func == ncclFuncAllGather) && sizePerRank <= 88448) { + // Change LL protocol threshold + info->protocol = NCCL_PROTO_LL; + } else if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && comm->nNodes == 1 && (info->func == ncclFuncReduceScatter) && sizePerRank <= 175488) { + // Change LL protocol threshold + info->protocol = NCCL_PROTO_LL; + } else if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") && comm->nNodes == 1 && (info->func == ncclFuncReduceScatter) && sizePerRank <= 352128) { // Change LL protocol threshold info->protocol = NCCL_PROTO_LL; - } else if(!userProtocolInput && comm->nNodes >= 2 && (info->func == ncclFuncReduceScatter || info->func == ncclFuncAllGather || info->func == ncclFuncAllReduce || info->func == ncclFuncBroadcast || info->func == ncclFuncReduce)) { auto tunableIndex = rcclGetTunableIndex(info->func); auto llMin = comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX];