Add back opCount and channel ID to debug trace (#1520)
이 커밋은 다음에 포함됨:
+5
-1
@@ -46,7 +46,8 @@
|
||||
uint32_t pos = atomicAdd(&ncclShmem.collTraceTail->tail, 1)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x;
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->channelId = ncclShmem.channelId;
|
||||
// TODO: switch to atomicInc after llvm crash is fixed
|
||||
// uint32_t pos = atomicInc(&ncclShmem.collTraceTail->tail, COLLTRACE_NUM_ITEMS)
|
||||
|
||||
@@ -65,6 +66,8 @@
|
||||
collTrace->p2p.connIndex = p2pWork->connIndex; \
|
||||
collTrace->p2p.sendProtoLL = p2pWork->sendProtoLL; \
|
||||
collTrace->p2p.recvProtoLL = p2pWork->recvProtoLL; \
|
||||
collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
|
||||
collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
|
||||
} else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
|
||||
struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
|
||||
@@ -72,6 +75,7 @@
|
||||
collTrace->coll.nChannels = collWork->channelHi-collWork->channelLo+1; \
|
||||
collTrace->coll.bid = ncclShmem.channelId - collWork->channelLo; \
|
||||
collTrace->coll.root = collWork->root; \
|
||||
collTrace->opCount = collWork->opCount; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
|
||||
} \
|
||||
}
|
||||
|
||||
+9
-2
@@ -566,6 +566,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
|
||||
devWork.oneNode = (comm->nNodes == 1);
|
||||
devWork.regUsed = task->regBufType;
|
||||
devWork.pivotA2ANumBiRings = comm->topo->pivotA2ANumBiRings;
|
||||
devWork.opCount = task->opCount;
|
||||
|
||||
struct ncclWorkList* workNode;
|
||||
switch (task->regBufType) {
|
||||
@@ -867,7 +868,8 @@ static ncclResult_t addP2pToPlan(
|
||||
struct ncclComm* comm, struct ncclKernelPlan* plan,
|
||||
int nChannelsMin, int nChannelsMax, int p2pRound,
|
||||
int sendRank, void* sendAddr, ssize_t sendBytes,
|
||||
int recvRank, void* recvAddr, ssize_t recvBytes
|
||||
int recvRank, void* recvAddr, ssize_t recvBytes,
|
||||
uint64_t sendOpCount, uint64_t recvOpCount
|
||||
) {
|
||||
int connIndex = 1;
|
||||
bool selfSend = (sendRank == comm->rank);
|
||||
@@ -976,6 +978,8 @@ static ncclResult_t addP2pToPlan(
|
||||
work->recvAddr = recvAddr;
|
||||
work->recvBytes = recvBytes==-1 ? 0 : recvBytes;
|
||||
work->connIndex = connIndex;
|
||||
work->sendOpCount = sendOpCount;
|
||||
work->recvOpCount = recvOpCount;
|
||||
|
||||
struct ncclProxyOp proxyOps[2] = {};
|
||||
int nProxyOps = selfSend ? 0 : 2;
|
||||
@@ -1112,7 +1116,8 @@ static ncclResult_t scheduleP2pTasksToPlan(
|
||||
if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes));
|
||||
NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes,
|
||||
send ? send->opCount : 0, recv ? recv->opCount : 0));
|
||||
if (send != nullptr) {
|
||||
ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
|
||||
comm->planner.nTasksP2p -= 1;
|
||||
@@ -2106,6 +2111,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
struct ncclTaskP2p* p2p = ncclMemoryStackAlloc<struct ncclTaskP2p>(&comm->memScoped);
|
||||
p2p->buff = (void*)info->recvbuff;
|
||||
p2p->bytes = nBytes;
|
||||
p2p->opCount = comm->opCount;
|
||||
ncclIntruQueueEnqueue(
|
||||
isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,
|
||||
p2p);
|
||||
@@ -2182,6 +2188,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
t->opDev = opDev; // C++ struct assignment
|
||||
t->chunkSteps = info->chunkSteps;
|
||||
t->sliceSteps = info->sliceSteps;
|
||||
t->opCount = comm->opCount;
|
||||
|
||||
planner->nTasksColl += 1;
|
||||
ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes);
|
||||
|
||||
@@ -233,11 +233,13 @@ struct ncclTaskColl {
|
||||
|
||||
void* sendMhandle;
|
||||
void* recvMhandle;
|
||||
uint64_t opCount;
|
||||
};
|
||||
struct ncclTaskP2p {
|
||||
struct ncclTaskP2p* next;
|
||||
void* buff;
|
||||
size_t bytes;
|
||||
uint64_t opCount;
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
+8
-2
@@ -234,6 +234,7 @@ struct alignas(16) ncclDevWorkP2p {
|
||||
void *sendAddr, *recvAddr;
|
||||
size_t sendBytes, recvBytes;
|
||||
int sendRank, recvRank;
|
||||
uint64_t sendOpCount, recvOpCount;
|
||||
// From the part index, nP2pChannels, and channelBase the device code can
|
||||
// calculate which part of the transfer a channel is responsible for.
|
||||
uint8_t nP2pChannels; // Always equal to comm->p2pnChannels
|
||||
@@ -298,6 +299,7 @@ struct alignas(16) ncclDevWorkColl {
|
||||
} collnet;
|
||||
};
|
||||
uint64_t redOpArg;
|
||||
uint64_t opCount;
|
||||
};
|
||||
|
||||
|
||||
@@ -418,8 +420,12 @@ struct ncclCollTrace {
|
||||
uint8_t bid;
|
||||
int16_t funcIndex;
|
||||
uint32_t data_0;
|
||||
uint64_t timeStamp;
|
||||
uint64_t opCount;
|
||||
uint8_t channelId;
|
||||
uint64_t timeStamp:56;
|
||||
union {
|
||||
uint64_t opCount;
|
||||
uint32_t p2pOpCount[2];
|
||||
};
|
||||
union {
|
||||
uint64_t data_1;
|
||||
struct {
|
||||
|
||||
+6
-3
@@ -248,11 +248,14 @@ void *ncclCommThreadMain(void *arg) {
|
||||
int offset = 0;
|
||||
uint16_t fIdx = td->funcIndex;
|
||||
if (type == ncclCollTraceDataType) {
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d] L:%04d DT %08x %016lx %016lx",
|
||||
(double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid,
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] L:%04d DT %08x %016lx %016lx",
|
||||
(double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId,
|
||||
fIdx, td->data_0, td->opCount, td->data_1);
|
||||
} else {
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d]", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid);
|
||||
if (type & ncclCollTraceP2pElemType)
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->p2pOpCount[0], td->p2pOpCount[1]);
|
||||
else if (type & ncclCollTraceCollElemType)
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->opCount);
|
||||
offset = strlen(line);
|
||||
if (type == ncclCollTraceCollElemType) {
|
||||
sprintf(line+offset, " CE %s nw %d bi %d nc %d root %d busId %lx nRanks %d", funcNames[fIdx], td->coll.nWarps, td->coll.bid, td->coll.nChannels, td->coll.root, comm->busId, comm->nRanks);
|
||||
|
||||
새 이슈에서 참조
사용자 차단