From 7c1049d2a4409ccff0f87d704a64e1e7a326722f Mon Sep 17 00:00:00 2001 From: isaki001 <36317038+isaki001@users.noreply.github.com> Date: Mon, 15 Dec 2025 18:55:46 -0600 Subject: [PATCH] Remove node-count and threshold restrictions from p2p-batching (#2077) * remove node-count and threshold restrictions from p2p-batching * remove batching threshold usage, fix typo for using batching-enablement flag --------- Co-authored-by: Mustafa Abduljabbar --- src/device/common.h | 4 ++-- src/enqueue.cc | 16 ++-------------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/device/common.h b/src/device/common.h index e526d85c14..2ac5511c19 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -288,10 +288,10 @@ __device__ __forceinline__ void loadWorkBatchToShmem( if (WARP_SIZE == 64) { if (uint64_t(batch.offsetBitset) & (1ull<wipBatch.workBytes + workSize; if (workType == ncclDevWorkTypeP2p) { - newBatch |= (comm->nNodes > 2 && batchP2P && comm->nNodes <= 32)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1); + newBatch |= (comm->nNodes > 2 && batchP2P)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1); for (int i=0; i < chan->wipBatch.nP2ps; i++) { newBatch |= p2pRound == chan->wipBatch.p2pRounds[i]; } @@ -937,15 +937,6 @@ NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 8192); RCCL_PARAM(P2pNetThreshold, "P2P_NET_THRESHOLD", 131072); NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0); -// This is the maximum P2P message size that can be batched with others -// Below this message size, NCCL_MAX_DEV_WORK_P2P_PER_BATCH will be applicable -// For alltoall, this can be mutiplied by number of ranks to match Size (B) in rccl-tests -// Without a threshold, RCCL will suffer large message regression due to limitation at a larger scale -// when more batches are needed to saturate the NIC BW in RCCL. -// The threshold can be set to a higher value to experiment on other platforms. -// This value has been tested on MI300. -RCCL_PARAM(P2pBatchThreshold, "P2P_BATCH_THRESHOLD", 1 << 16); // 64k - // Need this temporary parameter to disable p2p batching to avoid some dips at 4MB - 32 MB message size at large scale // This parameter must be removed after further investigation, @@ -974,9 +965,6 @@ static ncclResult_t addP2pToPlan( bool proxySameProcess[2] = {true, true}; void** handles[2] = {NULL, NULL}; auto batchP2PEnableEnv = rcclParamP2pBatchEnable(); - auto p2pBatchThreshold = rcclParamP2pBatchThreshold(); - bool belowThreshold = (recvBytes <= p2pBatchThreshold) && (sendBytes <= p2pBatchThreshold); - bool batchP2P = batchP2PEnableEnv && (sendBytes == recvBytes) && belowThreshold; //ncclP2pChannelBaseForRound now computes channel-base based on batching enablement (env. variable RCCL_P2P_BATCH_ENABLE=1) //but batching is only applicable if msg size is below threshold which is not checked below @@ -1156,7 +1144,7 @@ static ncclResult_t addP2pToPlan( plan->channelMask.masks[channelId/64] |= uint64_t(1)<<(channelId%64); // Add batch first. int funcIdx = ncclDevFuncId_P2p(); - addWorkBatchToPlan(comm, plan, channelId, ncclDevWorkTypeP2p, funcIdx, workOffset, p2pRound, batchP2P); + addWorkBatchToPlan(comm, plan, channelId, ncclDevWorkTypeP2p, funcIdx, workOffset, p2pRound, batchP2PEnableEnv); if (funcIdx < 0) { WARN("%s: unsupported collective. Please ensure the collective has been enabled in build.", __func__); return ncclInvalidUsage;