Single-node AllGather and ReduceScatter Optimization (#2019)

* Single-node performance tuning

* Normalizing LL protocol thresholds to per-rank message size
Этот коммит содержится в:
David DeBonis
2025-10-31 08:59:46 -06:00
коммит произвёл GitHub
родитель 1ce83d5cc0
Коммит 63d5846452
+8 -2
Просмотреть файл
@@ -38,14 +38,20 @@ RCCL_PARAM(DirectAllGatherThreshold, "DIRECT_ALLGATHER_THRESHOLD", 75497472);
void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info) {
// Honor user input for protocol choice
static int userProtocolInput = -2;
size_t sizePerRank = rcclGetSizePerRank(info->func, nBytes, comm->nRanks);
if (userProtocolInput == -2) {
const char *protoStr = getenv("NCCL_PROTO");
userProtocolInput = !protoStr ? 0 : 1;
}
if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && comm->nNodes == 1 && (info->func == ncclFuncAllGather) && nBytes <= 524288) {
if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && comm->nNodes == 1 && (info->func == ncclFuncAllGather) && sizePerRank <= 88448) {
// Change LL protocol threshold
info->protocol = NCCL_PROTO_LL;
} else if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && comm->nNodes == 1 && (info->func == ncclFuncReduceScatter) && sizePerRank <= 175488) {
// Change LL protocol threshold
info->protocol = NCCL_PROTO_LL;
} else if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") && comm->nNodes == 1 && (info->func == ncclFuncReduceScatter) && sizePerRank <= 352128) {
// Change LL protocol threshold
info->protocol = NCCL_PROTO_LL;
} else if(!userProtocolInput && comm->nNodes >= 2 && (info->func == ncclFuncReduceScatter || info->func == ncclFuncAllGather || info->func == ncclFuncAllReduce || info->func == ncclFuncBroadcast || info->func == ncclFuncReduce)) {
auto tunableIndex = rcclGetTunableIndex(info->func);
auto llMin = comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX];