Print KL/CL/KE events for all warps (#1544)

* Print KL/CL/KE events for all warps

* Fix count off-by-one issue

* Fix opCount in KE and restore CPU thread option

* Simplify count calculation

[ROCm/rccl commit: ebf7e2305e]
Cette révision appartient à :
Wenkai Du
2025-02-12 13:36:31 -08:00
révisé par GitHub
Parent 7fdbcdfdec
révision 170cc1afde
3 fichiers modifiés avec 30 ajouts et 15 suppressions
+14 -4
Voir le fichier
@@ -47,6 +47,7 @@
struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x; \
collTrace->tid = threadIdx.x; \
collTrace->channelId = ncclShmem.channelId;
// TODO: switch to atomicInc after llvm crash is fixed
// uint32_t pos = atomicInc(&ncclShmem.collTraceTail->tail, COLLTRACE_NUM_ITEMS)
@@ -82,7 +83,16 @@
}
#define traceKernelEnd(end_type) { \
INC_COLL_TRACE \
collTrace->type = end_type; \
if (ncclShmem.workType == ncclDevWorkTypeP2p) { \
struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \
collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
collTrace->type = (end_type) | ncclCollTraceP2pElemType; \
} else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
collTrace->opCount = collWork->opCount; \
collTrace->type = (end_type) | ncclCollTraceCollElemType; \
} \
}
#define traceData(data2, data4, data8_0, data8_1) { \
INC_COLL_TRACE \
@@ -519,7 +529,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
}
#endif
if (tid == 0) __insert_timestamp(__LINE__);
if (COLLTRACE && tid == 0) traceKernelLaunch(ncclCollTraceKernelLaunchType);
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceKernelLaunchType);
if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
// ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads()
@@ -569,9 +579,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
}
if (aborted) break;
if (COLLTRACE && tid == 0) traceKernelLaunch(ncclCollTraceCollLaunchType);
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceCollLaunchType);
}
if (COLLTRACE && tid == 0) traceKernelEnd(ncclCollTraceKernelEndType);
if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelEnd(ncclCollTraceKernelEndType);
#ifdef ENABLE_PROFILING
if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
+2 -1
Voir le fichier
@@ -419,7 +419,8 @@ struct ncclCollTrace {
uint8_t type;
uint8_t bid;
int16_t funcIndex;
uint32_t data_0;
uint32_t data_0:24;
uint8_t tid;
uint8_t channelId;
uint64_t timeStamp:56;
union {
+14 -10
Voir le fichier
@@ -226,8 +226,14 @@ void *ncclCommThreadMain(void *arg) {
int head[MAXCHANNELS];
double vega_gpu_rtc_freq;
memset(head, 0, sizeof(int)*MAXCHANNELS);
vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3;
for (int channel = 0; channel < MAXCHANNELS; channel++) {
int tail = comm->collTraceTail[channel].tail;
if (tail < COLLTRACE_NUM_ITEMS)
head[channel] = 0;
else
head[channel] = tail - COLLTRACE_NUM_ITEMS;
}
do {
int numActiveChans = MAXCHANNELS;
for (int channel = 0; channel < MAXCHANNELS; channel++) {
@@ -238,24 +244,23 @@ void *ncclCommThreadMain(void *arg) {
numActiveChans--;
continue;
}
count = count%COLLTRACE_NUM_ITEMS;
for (int i = 0; i < count; i++) {
volatile struct ncclCollTrace *td = comm->collTrace+COLLTRACE_NUM_ITEMS*channel+head[channel];
volatile struct ncclCollTrace *td = comm->collTrace+COLLTRACE_NUM_ITEMS*channel+head[channel]%COLLTRACE_NUM_ITEMS;
head[channel] ++;
uint8_t type = td->type;
if (type == ncclCollTraceNotReady)
break;
continue;
char line[1024];
int offset = 0;
uint16_t fIdx = td->funcIndex;
if (type == ncclCollTraceDataType) {
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] L:%04d DT %08x %016lx %016lx",
(double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId,
fIdx, td->data_0, td->opCount, td->data_1);
sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] L:%04d DT %08x %016lx %016lx",
(double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, fIdx, td->data_0, td->opCount, td->data_1);
} else {
if (type & ncclCollTraceP2pElemType)
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->p2pOpCount[0], td->p2pOpCount[1]);
sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, td->p2pOpCount[0], td->p2pOpCount[1]);
else
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->opCount);
sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, td->opCount);
offset = strlen(line);
if (type == ncclCollTraceCollElemType) {
sprintf(line+offset, " CE %s nw %d bi %d nc %d root %d busId %lx nRanks %d", funcNames[fIdx], td->coll.nWarps, td->coll.bid, td->coll.nChannels, td->coll.root, comm->busId, comm->nRanks);
@@ -293,7 +298,6 @@ void *ncclCommThreadMain(void *arg) {
}
INFO(NCCL_COLL, "%s", line);
td->type = ncclCollTraceNotReady;
head[channel] ++;
}
}
if (comm->collTraceExit && numActiveChans == 0)