Fix AllReduce regression due to previous max range increase for LL64/LL128 (#1787)

* Adjust tuning factor impacting more than 2 nodes
* Scale max LL128 size for > 2 nodes
* Retune max LL128 range for N > 2
Этот коммит содержится в:
Mustafa Abduljabbar
2025-07-09 20:17:10 -04:00
коммит произвёл GitHub
родитель a28d5cb986
Коммит 058264b3f3
+4 -1
Просмотреть файл
@@ -52,7 +52,10 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s
// When LL128 is performant, the next condition overrides the previous LL choice
if (comm->topo->ll128Enabled) {
if (info->func == ncclFuncAllReduce) {
-ll128Max += (log2i(comm->nNodes) - 1) * comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX];
+if(comm->nNodes > 2) {
+ll128Max *= 3.8; // Scale max message size for n > 2 since Tree has special behavior at 2 nodes
+}
+// ll128Max += (log2i(comm->nNodes) - 1) * comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX];
}
if (sizePerRank <= ll128Max && sizePerRank > ll128Min) {
info->protocol = NCCL_PROTO_LL128;