Remove node-count and threshold restrictions from p2p-batching (#2077)
* remove node-count and threshold restrictions from p2p-batching
* remove batching threshold usage, fix typo for using batching-enablement flag
---------
Co-authored-by: Mustafa Abduljabbar <mustafa.abduljabbar@amd.com>
[ROCm/rccl commit: 7c1049d2a4]
Этот коммит содержится в:
@@ -288,10 +288,10 @@ __device__ __forceinline__ void loadWorkBatchToShmem(
|
||||
|
||||
if (WARP_SIZE == 64) {
|
||||
if (uint64_t(batch.offsetBitset) & (1ull<<lane)) {
|
||||
int nWorksBelow = __popc(uint64_t(batch.offsetBitset) & ((1ull<<lane)-1));
|
||||
int nWorksBelow = __popcll(uint64_t(batch.offsetBitset) & ((1ull<<lane)-1));
|
||||
fnsOfBitset[nWorksBelow] = lane;
|
||||
}
|
||||
nWorks = __popc(uint64_t(batch.offsetBitset));
|
||||
nWorks = __popcll(uint64_t(batch.offsetBitset));
|
||||
} else {
|
||||
// WARP_SIZE == 32
|
||||
if (uint32_t(batch.offsetBitset) & (1u<<lane)) {
|
||||
|
||||
@@ -191,7 +191,7 @@ static void addWorkBatchToPlan(
|
||||
// batch further down.
|
||||
newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize;
|
||||
if (workType == ncclDevWorkTypeP2p) {
|
||||
newBatch |= (comm->nNodes > 2 && batchP2P && comm->nNodes <= 32)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
|
||||
newBatch |= (comm->nNodes > 2 && batchP2P)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
|
||||
for (int i=0; i < chan->wipBatch.nP2ps; i++) {
|
||||
newBatch |= p2pRound == chan->wipBatch.p2pRounds[i];
|
||||
}
|
||||
@@ -937,15 +937,6 @@ NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 8192);
|
||||
RCCL_PARAM(P2pNetThreshold, "P2P_NET_THRESHOLD", 131072);
|
||||
NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);
|
||||
|
||||
// This is the maximum P2P message size that can be batched with others
|
||||
// Below this message size, NCCL_MAX_DEV_WORK_P2P_PER_BATCH will be applicable
|
||||
// For alltoall, this can be mutiplied by number of ranks to match Size (B) in rccl-tests
|
||||
// Without a threshold, RCCL will suffer large message regression due to limitation at a larger scale
|
||||
// when more batches are needed to saturate the NIC BW in RCCL.
|
||||
// The threshold can be set to a higher value to experiment on other platforms.
|
||||
// This value has been tested on MI300.
|
||||
RCCL_PARAM(P2pBatchThreshold, "P2P_BATCH_THRESHOLD", 1 << 16); // 64k
|
||||
|
||||
|
||||
// Need this temporary parameter to disable p2p batching to avoid some dips at 4MB - 32 MB message size at large scale
|
||||
// This parameter must be removed after further investigation,
|
||||
@@ -974,9 +965,6 @@ static ncclResult_t addP2pToPlan(
|
||||
bool proxySameProcess[2] = {true, true};
|
||||
void** handles[2] = {NULL, NULL};
|
||||
auto batchP2PEnableEnv = rcclParamP2pBatchEnable();
|
||||
auto p2pBatchThreshold = rcclParamP2pBatchThreshold();
|
||||
bool belowThreshold = (recvBytes <= p2pBatchThreshold) && (sendBytes <= p2pBatchThreshold);
|
||||
bool batchP2P = batchP2PEnableEnv && (sendBytes == recvBytes) && belowThreshold;
|
||||
|
||||
//ncclP2pChannelBaseForRound now computes channel-base based on batching enablement (env. variable RCCL_P2P_BATCH_ENABLE=1)
|
||||
//but batching is only applicable if msg size is below threshold which is not checked below
|
||||
@@ -1156,7 +1144,7 @@ static ncclResult_t addP2pToPlan(
|
||||
plan->channelMask.masks[channelId/64] |= uint64_t(1)<<(channelId%64);
|
||||
// Add batch first.
|
||||
int funcIdx = ncclDevFuncId_P2p();
|
||||
addWorkBatchToPlan(comm, plan, channelId, ncclDevWorkTypeP2p, funcIdx, workOffset, p2pRound, batchP2P);
|
||||
addWorkBatchToPlan(comm, plan, channelId, ncclDevWorkTypeP2p, funcIdx, workOffset, p2pRound, batchP2PEnableEnv);
|
||||
if (funcIdx < 0) {
|
||||
WARN("%s: unsupported collective. Please ensure the collective has been enabled in build.", __func__);
|
||||
return ncclInvalidUsage;
|
||||
|
||||
Ссылка в новой задаче
Block a user