collective trace improvements for debugging (#1661)
[ROCm/rccl commit: c54a0c085a]
This commit is contained in:
@@ -76,6 +76,7 @@
|
||||
}
|
||||
#define traceKernelEnd(end_type) { \
|
||||
INC_COLL_TRACE \
|
||||
collTrace->funcIndex = ncclShmem.funcId;\
|
||||
if (ncclShmem.workType == ncclDevWorkTypeP2p) { \
|
||||
struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \
|
||||
collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
|
||||
@@ -95,10 +96,16 @@
|
||||
collTrace->data_1 = data8_1; \
|
||||
collTrace->type = ncclCollTraceDataType; \
|
||||
}
|
||||
#define traceAbort(){\
|
||||
INC_COLL_TRACE\
|
||||
collTrace->funcIndex = ncclShmem.funcId;\
|
||||
collTrace->type = ncclCollTraceAbortType;\
|
||||
}
|
||||
#else
|
||||
#define traceKernelLaunch(launch_type, batchIx)
|
||||
#define traceKernelEnd(end_type)
|
||||
#define traceData(data2, data4, data8_0, data8_1)
|
||||
#define traceAbort()
|
||||
#endif
|
||||
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
@@ -600,8 +607,10 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
// ncclShmem.workConsumed written by loadWorkBatchToShmem before barrier_red_or()
|
||||
ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
|
||||
}
|
||||
if (aborted) break;
|
||||
|
||||
if (aborted) {
|
||||
if(COLLTRACE && tid%WARP_SIZE == 0) traceAbort();
|
||||
break;
|
||||
}
|
||||
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceCollLaunchType, batchIx);
|
||||
}
|
||||
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelEnd(ncclCollTraceKernelEndType);
|
||||
|
||||
@@ -258,12 +258,12 @@ void *ncclCommThreadMain(void *arg) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
volatile struct ncclCollTrace *td = comm->collTrace+COLLTRACE_NUM_ITEMS*channel+head[channel]%COLLTRACE_NUM_ITEMS;
|
||||
head[channel] ++;
|
||||
uint8_t type = td->type;
|
||||
const uint8_t type = td->type;
|
||||
if (type == ncclCollTraceNotReady)
|
||||
continue;
|
||||
char line[1024];
|
||||
int offset = 0;
|
||||
uint16_t fIdx = td->funcIndex;
|
||||
const uint16_t fIdx = td->funcIndex;
|
||||
if (type == ncclCollTraceDataType) {
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] L:%04d DT %08x %016lx %016lx",
|
||||
(double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, fIdx, td->data_0, td->opCount, td->data_1);
|
||||
@@ -284,9 +284,9 @@ void *ncclCommThreadMain(void *arg) {
|
||||
case ncclCollTraceKernelLaunchType:
|
||||
case ncclCollTraceCollLaunchType:
|
||||
if ((type&0xf) == ncclCollTraceKernelLaunchType)
|
||||
sprintf(line+offset, " KL HWID %8x %s", td->data_0, funcNames[fIdx]);
|
||||
sprintf(line+offset, " KL %s [%02d:%02d-%02d:%02x] HWID %8x ", funcNames[fIdx], comm->rank, td->bid, td->channelId, td->tid, td->data_0);
|
||||
else if ((type&0xf) == ncclCollTraceCollLaunchType)
|
||||
sprintf(line+offset, " CL %d %s", td->batchIx, funcNames[fIdx]);
|
||||
sprintf(line+offset, " CL %s [%02d:%02d-%02d:%02x] %d ", funcNames[fIdx], comm->rank, td->bid, td->channelId, td->tid, td->batchIx);
|
||||
offset = strlen(line);
|
||||
if ((type&0xf0) == ncclCollTraceCollElemType)
|
||||
sprintf(line+offset, " nw %d bi %d nc %d root %d busId %lx nRanks %d", td->coll.nWarps, td->coll.bid, td->coll.nChannels, td->coll.root, comm->busId, comm->nRanks);
|
||||
@@ -296,10 +296,10 @@ void *ncclCommThreadMain(void *arg) {
|
||||
comm->busId, comm->nRanks);
|
||||
break;
|
||||
case ncclCollTraceKernelEndType:
|
||||
sprintf(line+offset, " KE busId %lx nRanks %d", comm->busId, comm->nRanks);
|
||||
sprintf(line+offset, " KE %s [%02d:%02d-%02d:%02x] busId %lx nRanks %d", funcNames[fIdx], comm->rank, td->bid, td->channelId, td->tid, comm->busId, comm->nRanks);
|
||||
break;
|
||||
case ncclCollTraceAbortType:
|
||||
sprintf(line+offset, " Abort");
|
||||
sprintf(line+offset, " KA %s [%02d:%02d-%02d:%02x]", funcNames[fIdx], comm->rank, td->bid, td->channelId, td->tid);
|
||||
break;
|
||||
default:
|
||||
sprintf(line+offset, " unknown collective trace data type");
|
||||
@@ -307,7 +307,7 @@ void *ncclCommThreadMain(void *arg) {
|
||||
}
|
||||
}
|
||||
}
|
||||
INFO(NCCL_COLL, "%s", line);
|
||||
INFO(NCCL_COLL, "%s td->type:%d", line, type);
|
||||
td->type = ncclCollTraceNotReady;
|
||||
}
|
||||
}
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user