Increment opCount for intra-node comms as well (#2024)
* Enhance logging in NCCL initialization
It is convenient to log the comm object and its default channel counts together for debugging.
* Add opCount to collDevWork and update increment logic
Added opCount to collDevWork; comm->opCount is now also incremented for non-persistent plans whose proxyOpQueue is empty (e.g., for intra-node comms).
* Clarify opCount increment logic in enqueue.cc
Updated comment to clarify incrementing opCount for intra-node communications.
* Refactor NCCL_INIT logging format
Updated logging format for NCCL_INIT to improve clarity.
* Remove duplicate INFO logging in init.cc
[ROCm/rccl commit: b00ee4c83c]
Этот коммит содержится в:
@@ -388,7 +388,9 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
|
||||
devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
|
||||
devWork.oneNode = (comm->nNodes == 1);
|
||||
devWork.rcclUseOneSlice = comm->rcclUseOneSlice;
|
||||
|
||||
//[Added-comment] opCount is missing for collDevWork, adding here
|
||||
devWork.opCount = task->opCount;
|
||||
|
||||
devWork.isOneRPN = comm->isOneRPN;
|
||||
devWork.netRegUsed = devWork.regUsed = 0;
|
||||
devWork.gfx9CheapFenceOff = gfx9CheapFenceOff(devWork, comm->gfx9CheapFenceOff);
|
||||
@@ -1881,6 +1883,12 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern
|
||||
// hostStreamPlanTask directly
|
||||
NCCLCHECK(hostStreamPlanTask(comm, plan));
|
||||
}
|
||||
|
||||
// Increment the opCount for intranode comms as well. Previously if proxyOpQueue was empty
|
||||
// opCount was not incremented because ncclProxyStart wasn't called in hostStreamPlanTask
|
||||
if (!plan->persistent && ncclIntruQueueHead(&plan->proxyOpQueue) == nullptr) {
|
||||
comm->opCount++;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -1793,8 +1793,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
||||
// Compute time models for algorithm and protocol combinations
|
||||
NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
|
||||
|
||||
INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||
|
||||
INFO(NCCL_INIT, "comm:%p, nRanks:%d, nNodes:%d, coll channels:%d collnet channels:%d, nvls channels:%d, p2p channels:%d, p2p channels per peer:%d", comm, comm->nRanks, comm->nNodes, comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
|
||||
|
||||
if (comm->intraRank == 0) { // Load ncclParamLaunchMode
|
||||
const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
|
||||
enum ncclLaunchMode mode, modeOld;
|
||||
|
||||
Ссылка в новой задаче
Block a user