diff --git a/src/device/common.h b/src/device/common.h index ca5c8fe642..bb0ed1fbb9 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -46,7 +46,8 @@ uint32_t pos = atomicAdd(&ncclShmem.collTraceTail->tail, 1)%COLLTRACE_NUM_ITEMS; \ struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \ collTrace->timeStamp = wall_clock64(); \ - collTrace->bid = blockIdx.x; + collTrace->bid = blockIdx.x; \ + collTrace->channelId = ncclShmem.channelId; // TODO: switch to atomicInc after llvm crash is fixed // uint32_t pos = atomicInc(&ncclShmem.collTraceTail->tail, COLLTRACE_NUM_ITEMS) @@ -65,6 +66,8 @@ collTrace->p2p.connIndex = p2pWork->connIndex; \ collTrace->p2p.sendProtoLL = p2pWork->sendProtoLL; \ collTrace->p2p.recvProtoLL = p2pWork->recvProtoLL; \ + collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \ + collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \ collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \ } else if (ncclShmem.workType == ncclDevWorkTypeColl) { \ struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \ @@ -72,6 +75,7 @@ collTrace->coll.nChannels = collWork->channelHi-collWork->channelLo+1; \ collTrace->coll.bid = ncclShmem.channelId - collWork->channelLo; \ collTrace->coll.root = collWork->root; \ + collTrace->opCount = collWork->opCount; \ collTrace->type = (launch_type) | ncclCollTraceCollElemType; \ } \ } diff --git a/src/enqueue.cc b/src/enqueue.cc index f622309be0..5a68432779 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -566,6 +566,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool devWork.oneNode = (comm->nNodes == 1); devWork.regUsed = task->regBufType; devWork.pivotA2ANumBiRings = comm->topo->pivotA2ANumBiRings; + devWork.opCount = task->opCount; struct ncclWorkList* workNode; switch (task->regBufType) { @@ -867,7 +868,8 @@ static ncclResult_t addP2pToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, int nChannelsMin, int nChannelsMax, int p2pRound, int sendRank, void* sendAddr, ssize_t sendBytes, - int recvRank, void* recvAddr, ssize_t recvBytes + int recvRank, void* recvAddr, ssize_t recvBytes, + uint64_t sendOpCount, uint64_t recvOpCount ) { int connIndex = 1; bool selfSend = (sendRank == comm->rank); @@ -976,6 +978,8 @@ static ncclResult_t addP2pToPlan( work->recvAddr = recvAddr; work->recvBytes = recvBytes==-1 ? 0 : recvBytes; work->connIndex = connIndex; + work->sendOpCount = sendOpCount; + work->recvOpCount = recvOpCount; struct ncclProxyOp proxyOps[2] = {}; int nProxyOps = selfSend ? 0 : 2; @@ -1112,7 +1116,8 @@ static ncclResult_t scheduleP2pTasksToPlan( if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) { return ncclSuccess; } - NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes)); + NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, + send ? send->opCount : 0, recv ? recv->opCount : 0)); if (send != nullptr) { ncclIntruQueueDequeue(&peers[sendRank].sendQueue); comm->planner.nTasksP2p -= 1; @@ -2106,6 +2111,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { struct ncclTaskP2p* p2p = ncclMemoryStackAlloc(&comm->memScoped); p2p->buff = (void*)info->recvbuff; p2p->bytes = nBytes; + p2p->opCount = comm->opCount; ncclIntruQueueEnqueue( isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, p2p); @@ -2182,6 +2188,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { t->opDev = opDev; // C++ struct assignment t->chunkSteps = info->chunkSteps; t->sliceSteps = info->sliceSteps; + t->opCount = comm->opCount; planner->nTasksColl += 1; ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); diff --git a/src/include/comm.h b/src/include/comm.h index 94b8b4457e..254981263e 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -233,11 +233,13 @@ struct ncclTaskColl { void* sendMhandle; void* recvMhandle; + uint64_t opCount; }; struct ncclTaskP2p { struct ncclTaskP2p* next; void* buff; size_t bytes; + uint64_t opCount; }; //////////////////////////////////////////////////////////////////////////////// diff --git a/src/include/device.h b/src/include/device.h index c130a89819..5083f7ac07 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -234,6 +234,7 @@ struct alignas(16) ncclDevWorkP2p { void *sendAddr, *recvAddr; size_t sendBytes, recvBytes; int sendRank, recvRank; + uint64_t sendOpCount, recvOpCount; // From the part index, nP2pChannels, and channelBase the device code can // calculate which part of the transfer a channel is responsible for. uint8_t nP2pChannels; // Always equal to comm->p2pnChannels @@ -298,6 +299,7 @@ struct alignas(16) ncclDevWorkColl { } collnet; }; uint64_t redOpArg; + uint64_t opCount; }; @@ -418,8 +420,12 @@ struct ncclCollTrace { uint8_t bid; int16_t funcIndex; uint32_t data_0; - uint64_t timeStamp; - uint64_t opCount; + uint8_t channelId; + uint64_t timeStamp:56; + union { + uint64_t opCount; + uint32_t p2pOpCount[2]; + }; union { uint64_t data_1; struct { diff --git a/src/init.cc b/src/init.cc index 586102c291..ddd2c9134e 100644 --- a/src/init.cc +++ b/src/init.cc @@ -248,11 +248,14 @@ void *ncclCommThreadMain(void *arg) { int offset = 0; uint16_t fIdx = td->funcIndex; if (type == ncclCollTraceDataType) { - sprintf(line, "## [%012.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx", - (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, + sprintf(line, "## [%012.6f] [%02d:%02d:%02d] L:%04d DT %08x %016lx %016lx", + (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, fIdx, td->data_0, td->opCount, td->data_1); } else { - sprintf(line, "## [%012.6f] [%02d:%02d]", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid); + if (type & ncclCollTraceP2pElemType) + sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->p2pOpCount[0], td->p2pOpCount[1]); + else if (type & ncclCollTraceCollElemType) + sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->opCount); offset = strlen(line); if (type == ncclCollTraceCollElemType) { sprintf(line+offset, " CE %s nw %d bi %d nc %d root %d busId %lx nRanks %d", funcNames[fIdx], td->coll.nWarps, td->coll.bid, td->coll.nChannels, td->coll.root, comm->busId, comm->nRanks);