From c601f9b3f8dcdb765baa3f21e4971b4018d153b6 Mon Sep 17 00:00:00 2001 From: Dingming Wu Date: Mon, 10 Nov 2025 09:23:49 -0800 Subject: [PATCH] Increment opCount for intra-node comms as well (#2024) * Enhance logging in NCCL initialization It's convenient to log comms obj and default channels together for debugging * Add opCount to collDevWork and update increment logic Added opCount to collDevWork and incremented it when proxyOpQueue is empty (e.g., for intra-node comms) * Clarify opCount increment logic in enqueue.cc Updated comment to clarify incrementing opCount for intranode communications. * Refactor NCCL_INIT logging format Updated logging format for NCCL_INIT to improve clarity. * Remove duplicate INFO logging in init.cc [ROCm/rccl commit: b00ee4c83cbc7693f3610d1a644316ad967a7902] --- projects/rccl/src/enqueue.cc | 10 +++++++++- projects/rccl/src/init.cc | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index 0baf4df801..1e61d0a05e 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -388,7 +388,9 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; devWork.oneNode = (comm->nNodes == 1); devWork.rcclUseOneSlice = comm->rcclUseOneSlice; - + //[Added-comment] opCount is missing for collDevWork, adding here + devWork.opCount = task->opCount; + devWork.isOneRPN = comm->isOneRPN; devWork.netRegUsed = devWork.regUsed = 0; devWork.gfx9CheapFenceOff = gfx9CheapFenceOff(devWork, comm->gfx9CheapFenceOff); @@ -1881,6 +1883,12 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern // hostStreamPlanTask directly NCCLCHECK(hostStreamPlanTask(comm, plan)); } + + // Increment the opCount for intranode comms as well. Previously if proxyOpQueue was empty + // opCount was not incremented because ncclProxyStart wasn't called in hostStreamPlanTask + if (!plan->persistent && ncclIntruQueueHead(&plan->proxyOpQueue) == nullptr) { + comm->opCount++; + } return ncclSuccess; } diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index f21b6f0042..4a511633ec 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -1793,8 +1793,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Compute time models for algorithm and protocol combinations NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail); - INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); - + INFO(NCCL_INIT, "comm:%p, nRanks:%d, nNodes:%d, coll channels:%d collnet channels:%d, nvls channels:%d, p2p channels:%d, p2p channels per peer:%d", comm, comm->nRanks, comm->nNodes, comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); + if (comm->intraRank == 0) { // Load ncclParamLaunchMode const char* str = ncclGetEnv("NCCL_LAUNCH_MODE"); enum ncclLaunchMode mode, modeOld;