Remove node-count and threshold restrictions from p2p-batching (#2077)

* remove node-count and threshold restrictions from p2p-batching

* remove batching threshold usage, fix typo for using batching-enablement flag

---------

Co-authored-by: Mustafa Abduljabbar <mustafa.abduljabbar@amd.com>
이 커밋은 다음에 포함됨:
isaki001
2025-12-15 18:55:46 -06:00
커밋한 사람 GitHub
부모 5787c960fc
커밋 7c1049d2a4
2개의 변경된 파일4개의 추가작업 그리고 16개의 파일을 삭제
+2 -2
파일 보기
@@ -288,10 +288,10 @@ __device__ __forceinline__ void loadWorkBatchToShmem(
if (WARP_SIZE == 64) {
if (uint64_t(batch.offsetBitset) & (1ull<<lane)) {
int nWorksBelow = __popc(uint64_t(batch.offsetBitset) & ((1ull<<lane)-1));
int nWorksBelow = __popcll(uint64_t(batch.offsetBitset) & ((1ull<<lane)-1));
fnsOfBitset[nWorksBelow] = lane;
}
nWorks = __popc(uint64_t(batch.offsetBitset));
nWorks = __popcll(uint64_t(batch.offsetBitset));
} else {
// WARP_SIZE == 32
if (uint32_t(batch.offsetBitset) & (1u<<lane)) {
+2 -14
파일 보기
@@ -191,7 +191,7 @@ static void addWorkBatchToPlan(
// batch further down.
newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize;
if (workType == ncclDevWorkTypeP2p) {
newBatch |= (comm->nNodes > 2 && batchP2P && comm->nNodes <= 32)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
newBatch |= (comm->nNodes > 2 && batchP2P)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
for (int i=0; i < chan->wipBatch.nP2ps; i++) {
newBatch |= p2pRound == chan->wipBatch.p2pRounds[i];
}
@@ -937,15 +937,6 @@ NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 8192);
RCCL_PARAM(P2pNetThreshold, "P2P_NET_THRESHOLD", 131072);
NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);
// This is the maximum P2P message size that can be batched with others
// Below this message size, NCCL_MAX_DEV_WORK_P2P_PER_BATCH will be applicable
// For alltoall, this can be mutiplied by number of ranks to match Size (B) in rccl-tests
// Without a threshold, RCCL will suffer large message regression due to limitation at a larger scale
// when more batches are needed to saturate the NIC BW in RCCL.
// The threshold can be set to a higher value to experiment on other platforms.
// This value has been tested on MI300.
RCCL_PARAM(P2pBatchThreshold, "P2P_BATCH_THRESHOLD", 1 << 16); // 64k
// Need this temporary parameter to disable p2p batching to avoid some dips at 4MB - 32 MB message size at large scale
// This parameter must be removed after further investigation,
@@ -974,9 +965,6 @@ static ncclResult_t addP2pToPlan(
bool proxySameProcess[2] = {true, true};
void** handles[2] = {NULL, NULL};
auto batchP2PEnableEnv = rcclParamP2pBatchEnable();
auto p2pBatchThreshold = rcclParamP2pBatchThreshold();
bool belowThreshold = (recvBytes <= p2pBatchThreshold) && (sendBytes <= p2pBatchThreshold);
bool batchP2P = batchP2PEnableEnv && (sendBytes == recvBytes) && belowThreshold;
//ncclP2pChannelBaseForRound now computes channel-base based on batching enablement (env. variable RCCL_P2P_BATCH_ENABLE=1)
//but batching is only applicable if msg size is below threshold which is not checked below
@@ -1156,7 +1144,7 @@ static ncclResult_t addP2pToPlan(
plan->channelMask.masks[channelId/64] |= uint64_t(1)<<(channelId%64);
// Add batch first.
int funcIdx = ncclDevFuncId_P2p();
addWorkBatchToPlan(comm, plan, channelId, ncclDevWorkTypeP2p, funcIdx, workOffset, p2pRound, batchP2P);
addWorkBatchToPlan(comm, plan, channelId, ncclDevWorkTypeP2p, funcIdx, workOffset, p2pRound, batchP2PEnableEnv);
if (funcIdx < 0) {
WARN("%s: unsupported collective. Please ensure the collective has been enabled in build.", __func__);
return ncclInvalidUsage;