Fix AllReduce regression due to previous max range increase for LL64/LL128 (#1787)
* Adjust tuning factor impacting more than 2 nodes
* Scale max LL128 size for > 2 nodes
* Retune max LL128 range for N > 2
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
a28d5cb986
Коммит
058264b3f3
@@ -52,7 +52,10 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s
|
||||
// When LL128 is performant, the next condition overrides the previous LL choice
|
||||
if (comm->topo->ll128Enabled) {
|
||||
if (info->func == ncclFuncAllReduce) {
|
||||
ll128Max += (log2i(comm->nNodes) - 1) * comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX];
|
||||
if(comm->nNodes > 2) {
|
||||
ll128Max *= 3.8; // Scale max message size for n > 2 since Tree has special behavior at 2 nodes
|
||||
}
|
||||
// ll128Max += (log2i(comm->nNodes) - 1) * comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL128][RCCL_PROTOCOL_FACTOR_IDX];
|
||||
}
|
||||
if (sizePerRank <= ll128Max && sizePerRank > ll128Min) {
|
||||
info->protocol = NCCL_PROTO_LL128;
|
||||
|
||||
Ссылка в новой задаче
Block a user