Print KL/CL/KE events for all warps (#1544)
* Print KL/CL/KE events for all warps
* Fix count off-by-one issue
* Fix opCount in KE and restore CPU thread option
* Simplify count calculation
[ROCm/rccl commit: ebf7e2305e]
Cette révision appartient à :
@@ -47,6 +47,7 @@
|
||||
struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \
|
||||
collTrace->timeStamp = wall_clock64(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->tid = threadIdx.x; \
|
||||
collTrace->channelId = ncclShmem.channelId;
|
||||
// TODO: switch to atomicInc after llvm crash is fixed
|
||||
// uint32_t pos = atomicInc(&ncclShmem.collTraceTail->tail, COLLTRACE_NUM_ITEMS)
|
||||
@@ -82,7 +83,16 @@
|
||||
}
|
||||
#define traceKernelEnd(end_type) { \
|
||||
INC_COLL_TRACE \
|
||||
collTrace->type = end_type; \
|
||||
if (ncclShmem.workType == ncclDevWorkTypeP2p) { \
|
||||
struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \
|
||||
collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
|
||||
collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
|
||||
collTrace->type = (end_type) | ncclCollTraceP2pElemType; \
|
||||
} else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
|
||||
struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
|
||||
collTrace->opCount = collWork->opCount; \
|
||||
collTrace->type = (end_type) | ncclCollTraceCollElemType; \
|
||||
} \
|
||||
}
|
||||
#define traceData(data2, data4, data8_0, data8_1) { \
|
||||
INC_COLL_TRACE \
|
||||
@@ -519,7 +529,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
}
|
||||
#endif
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
if (COLLTRACE && tid == 0) traceKernelLaunch(ncclCollTraceKernelLaunchType);
|
||||
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceKernelLaunchType);
|
||||
|
||||
if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
|
||||
// ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads()
|
||||
@@ -569,9 +579,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
}
|
||||
if (aborted) break;
|
||||
|
||||
if (COLLTRACE && tid == 0) traceKernelLaunch(ncclCollTraceCollLaunchType);
|
||||
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceCollLaunchType);
|
||||
}
|
||||
if (COLLTRACE && tid == 0) traceKernelEnd(ncclCollTraceKernelEndType);
|
||||
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelEnd(ncclCollTraceKernelEndType);
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
|
||||
|
||||
@@ -419,7 +419,8 @@ struct ncclCollTrace {
|
||||
uint8_t type;
|
||||
uint8_t bid;
|
||||
int16_t funcIndex;
|
||||
uint32_t data_0;
|
||||
uint32_t data_0:24;
|
||||
uint8_t tid;
|
||||
uint8_t channelId;
|
||||
uint64_t timeStamp:56;
|
||||
union {
|
||||
|
||||
@@ -226,8 +226,14 @@ void *ncclCommThreadMain(void *arg) {
|
||||
int head[MAXCHANNELS];
|
||||
double vega_gpu_rtc_freq;
|
||||
|
||||
memset(head, 0, sizeof(int)*MAXCHANNELS);
|
||||
vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3;
|
||||
for (int channel = 0; channel < MAXCHANNELS; channel++) {
|
||||
int tail = comm->collTraceTail[channel].tail;
|
||||
if (tail < COLLTRACE_NUM_ITEMS)
|
||||
head[channel] = 0;
|
||||
else
|
||||
head[channel] = tail - COLLTRACE_NUM_ITEMS;
|
||||
}
|
||||
do {
|
||||
int numActiveChans = MAXCHANNELS;
|
||||
for (int channel = 0; channel < MAXCHANNELS; channel++) {
|
||||
@@ -238,24 +244,23 @@ void *ncclCommThreadMain(void *arg) {
|
||||
numActiveChans--;
|
||||
continue;
|
||||
}
|
||||
count = count%COLLTRACE_NUM_ITEMS;
|
||||
for (int i = 0; i < count; i++) {
|
||||
volatile struct ncclCollTrace *td = comm->collTrace+COLLTRACE_NUM_ITEMS*channel+head[channel];
|
||||
volatile struct ncclCollTrace *td = comm->collTrace+COLLTRACE_NUM_ITEMS*channel+head[channel]%COLLTRACE_NUM_ITEMS;
|
||||
head[channel] ++;
|
||||
uint8_t type = td->type;
|
||||
if (type == ncclCollTraceNotReady)
|
||||
break;
|
||||
continue;
|
||||
char line[1024];
|
||||
int offset = 0;
|
||||
uint16_t fIdx = td->funcIndex;
|
||||
if (type == ncclCollTraceDataType) {
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] L:%04d DT %08x %016lx %016lx",
|
||||
(double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId,
|
||||
fIdx, td->data_0, td->opCount, td->data_1);
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] L:%04d DT %08x %016lx %016lx",
|
||||
(double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, fIdx, td->data_0, td->opCount, td->data_1);
|
||||
} else {
|
||||
if (type & ncclCollTraceP2pElemType)
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->p2pOpCount[0], td->p2pOpCount[1]);
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, td->p2pOpCount[0], td->p2pOpCount[1]);
|
||||
else
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->opCount);
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, td->opCount);
|
||||
offset = strlen(line);
|
||||
if (type == ncclCollTraceCollElemType) {
|
||||
sprintf(line+offset, " CE %s nw %d bi %d nc %d root %d busId %lx nRanks %d", funcNames[fIdx], td->coll.nWarps, td->coll.bid, td->coll.nChannels, td->coll.root, comm->busId, comm->nRanks);
|
||||
@@ -293,7 +298,6 @@ void *ncclCommThreadMain(void *arg) {
|
||||
}
|
||||
INFO(NCCL_COLL, "%s", line);
|
||||
td->type = ncclCollTraceNotReady;
|
||||
head[channel] ++;
|
||||
}
|
||||
}
|
||||
if (comm->collTraceExit && numActiveChans == 0)
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur