From 058264b3f324430d3fd550644e67a0af596fc697 Mon Sep 17 00:00:00 2001
From: Mustafa Abduljabbar
Date: Wed, 9 Jul 2025 20:17:10 -0400
Subject: [PATCH] Fix AllReduce regression due to previous max range increase for LL64/LL128 (#1787)

* Adjust tuning factor impacting more than 2 nodes
* Scale max LL128 size for > 2 nodes
* Retune max LL128 range for N > 2
---
 src/rccl_wrap.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/rccl_wrap.cc b/src/rccl_wrap.cc
index f77b9fb7ee..7fba83dbf8 100644
--- a/src/rccl_wrap.cc
+++ b/src/rccl_wrap.cc
@@ -52,7 +52,10 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s
   // When LL128 is performant, the next condition overrides the previous LL choice
   if (comm->topo->ll128Enabled) {
     if (info->func == ncclFuncAllReduce) {
-      ll128Max += (log2i(comm->nNodes) - 1) * comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX];
+      if(comm->nNodes > 2) {
+        ll128Max *= 3.8; // Scale max message size for n > 2 since Tree has special behavior at 2 nodes
+      }
+      // ll128Max += (log2i(comm->nNodes) - 1) * comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX];
     }
     if (sizePerRank <= ll128Max && sizePerRank > ll128Min) {
       info->protocol = NCCL_PROTO_LL128;