Add back opCount and channel ID to debug trace (#1520)

이 커밋은 다음에 포함됨:
Wenkai Du
2025-02-03 08:55:27 -08:00
커밋한 사람 GitHub
부모 7af21dd996
커밋 a5c6b547a2
5개의 변경된 파일30개의 추가작업 그리고 8개의 파일을 삭제
+5 -1
파일 보기
@@ -46,7 +46,8 @@
uint32_t pos = atomicAdd(&ncclShmem.collTraceTail->tail, 1)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x;
collTrace->bid = blockIdx.x; \
collTrace->channelId = ncclShmem.channelId;
// TODO: switch to atomicInc after llvm crash is fixed
// uint32_t pos = atomicInc(&ncclShmem.collTraceTail->tail, COLLTRACE_NUM_ITEMS)
@@ -65,6 +66,8 @@
collTrace->p2p.connIndex = p2pWork->connIndex; \
collTrace->p2p.sendProtoLL = p2pWork->sendProtoLL; \
collTrace->p2p.recvProtoLL = p2pWork->recvProtoLL; \
collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
} else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
@@ -72,6 +75,7 @@
collTrace->coll.nChannels = collWork->channelHi-collWork->channelLo+1; \
collTrace->coll.bid = ncclShmem.channelId - collWork->channelLo; \
collTrace->coll.root = collWork->root; \
collTrace->opCount = collWork->opCount; \
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
} \
}
+9 -2
파일 보기
@@ -566,6 +566,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
devWork.oneNode = (comm->nNodes == 1);
devWork.regUsed = task->regBufType;
devWork.pivotA2ANumBiRings = comm->topo->pivotA2ANumBiRings;
devWork.opCount = task->opCount;
struct ncclWorkList* workNode;
switch (task->regBufType) {
@@ -867,7 +868,8 @@ static ncclResult_t addP2pToPlan(
struct ncclComm* comm, struct ncclKernelPlan* plan,
int nChannelsMin, int nChannelsMax, int p2pRound,
int sendRank, void* sendAddr, ssize_t sendBytes,
int recvRank, void* recvAddr, ssize_t recvBytes
int recvRank, void* recvAddr, ssize_t recvBytes,
uint64_t sendOpCount, uint64_t recvOpCount
) {
int connIndex = 1;
bool selfSend = (sendRank == comm->rank);
@@ -976,6 +978,8 @@ static ncclResult_t addP2pToPlan(
work->recvAddr = recvAddr;
work->recvBytes = recvBytes==-1 ? 0 : recvBytes;
work->connIndex = connIndex;
work->sendOpCount = sendOpCount;
work->recvOpCount = recvOpCount;
struct ncclProxyOp proxyOps[2] = {};
int nProxyOps = selfSend ? 0 : 2;
@@ -1112,7 +1116,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) {
return ncclSuccess;
}
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes));
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes,
send ? send->opCount : 0, recv ? recv->opCount : 0));
if (send != nullptr) {
ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
comm->planner.nTasksP2p -= 1;
@@ -2106,6 +2111,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
struct ncclTaskP2p* p2p = ncclMemoryStackAlloc<struct ncclTaskP2p>(&comm->memScoped);
p2p->buff = (void*)info->recvbuff;
p2p->bytes = nBytes;
p2p->opCount = comm->opCount;
ncclIntruQueueEnqueue(
isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
p2p);
@@ -2182,6 +2188,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
t->opDev = opDev; // C++ struct assignment
t->chunkSteps = info->chunkSteps;
t->sliceSteps = info->sliceSteps;
t->opCount = comm->opCount;
planner->nTasksColl += 1;
ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes);
+2
파일 보기
@@ -233,11 +233,13 @@ struct ncclTaskColl {
void* sendMhandle;
void* recvMhandle;
uint64_t opCount;
};
struct ncclTaskP2p {
struct ncclTaskP2p* next;
void* buff;
size_t bytes;
uint64_t opCount;
};
////////////////////////////////////////////////////////////////////////////////
+8 -2
파일 보기
@@ -234,6 +234,7 @@ struct alignas(16) ncclDevWorkP2p {
void *sendAddr, *recvAddr;
size_t sendBytes, recvBytes;
int sendRank, recvRank;
uint64_t sendOpCount, recvOpCount;
// From the part index, nP2pChannels, and channelBase the device code can
// calculate which part of the transfer a channel is responsible for.
uint8_t nP2pChannels; // Always equal to comm->p2pnChannels
@@ -298,6 +299,7 @@ struct alignas(16) ncclDevWorkColl {
} collnet;
};
uint64_t redOpArg;
uint64_t opCount;
};
@@ -418,8 +420,12 @@ struct ncclCollTrace {
uint8_t bid;
int16_t funcIndex;
uint32_t data_0;
uint64_t timeStamp;
uint64_t opCount;
uint8_t channelId;
uint64_t timeStamp:56;
union {
uint64_t opCount;
uint32_t p2pOpCount[2];
};
union {
uint64_t data_1;
struct {
+6 -3
파일 보기
@@ -248,11 +248,14 @@ void *ncclCommThreadMain(void *arg) {
int offset = 0;
uint16_t fIdx = td->funcIndex;
if (type == ncclCollTraceDataType) {
sprintf(line, "## [%012.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx",
(double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid,
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] L:%04d DT %08x %016lx %016lx",
(double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId,
fIdx, td->data_0, td->opCount, td->data_1);
} else {
sprintf(line, "## [%012.6f] [%02d:%02d]", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid);
if (type & ncclCollTraceP2pElemType)
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->p2pOpCount[0], td->p2pOpCount[1]);
else if (type & ncclCollTraceCollElemType)
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->opCount);
offset = strlen(line);
if (type == ncclCollTraceCollElemType) {
sprintf(line+offset, " CE %s nw %d bi %d nc %d root %d busId %lx nRanks %d", funcNames[fIdx], td->coll.nWarps, td->coll.bid, td->coll.nChannels, td->coll.root, comm->busId, comm->nRanks);