From 170cc1afded076d5fd8dd94e4d89c7deafe11335 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Wed, 12 Feb 2025 13:36:31 -0800 Subject: [PATCH] Print KL/CL/KE events for all warps (#1544) * Print KL/CL/KE events for all warps * Fix count off-by-one issue * Fix opCount in KE and restore CPU thread option * Simplify count calculation [ROCm/rccl commit: ebf7e2305e1a37246b70847a447b3b8d4b045be2] --- projects/rccl/src/device/common.h | 18 ++++++++++++++---- projects/rccl/src/include/device.h | 3 ++- projects/rccl/src/init.cc | 24 ++++++++++++++---------- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/projects/rccl/src/device/common.h b/projects/rccl/src/device/common.h index 49d3ee1c02..a58a2e28d9 100644 --- a/projects/rccl/src/device/common.h +++ b/projects/rccl/src/device/common.h @@ -47,6 +47,7 @@ struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \ collTrace->timeStamp = wall_clock64(); \ collTrace->bid = blockIdx.x; \ + collTrace->tid = threadIdx.x; \ collTrace->channelId = ncclShmem.channelId; // TODO: switch to atomicInc after llvm crash is fixed // uint32_t pos = atomicInc(&ncclShmem.collTraceTail->tail, COLLTRACE_NUM_ITEMS) @@ -82,7 +83,16 @@ } #define traceKernelEnd(end_type) { \ INC_COLL_TRACE \ - collTrace->type = end_type; \ + if (ncclShmem.workType == ncclDevWorkTypeP2p) { \ + struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \ + collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \ + collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \ + collTrace->type = (end_type) | ncclCollTraceP2pElemType; \ + } else if (ncclShmem.workType == ncclDevWorkTypeColl) { \ + struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \ + collTrace->opCount = collWork->opCount; \ + collTrace->type = (end_type) | ncclCollTraceCollElemType; \ + } \ } #define traceData(data2, data4, data8_0, data8_1) { \ INC_COLL_TRACE \ @@ -519,7 +529,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a } #endif if (tid == 0) __insert_timestamp(__LINE__); - if (COLLTRACE && tid == 0) traceKernelLaunch(ncclCollTraceKernelLaunchType); + if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceKernelLaunchType); if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) { // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads() @@ -569,9 +579,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a } if (aborted) break; - if (COLLTRACE && tid == 0) traceKernelLaunch(ncclCollTraceCollLaunchType); + if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceCollLaunchType); } - if (COLLTRACE && tid == 0) traceKernelEnd(ncclCollTraceKernelEndType); + if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelEnd(ncclCollTraceKernelEndType); #ifdef ENABLE_PROFILING if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) { diff --git a/projects/rccl/src/include/device.h b/projects/rccl/src/include/device.h index 436bd118dc..bbce1337b8 100644 --- a/projects/rccl/src/include/device.h +++ b/projects/rccl/src/include/device.h @@ -419,7 +419,8 @@ struct ncclCollTrace { uint8_t type; uint8_t bid; int16_t funcIndex; - uint32_t data_0; + uint32_t data_0:24; + uint8_t tid; uint8_t channelId; uint64_t timeStamp:56; union { diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index e7de0f877b..ec9407c4c9 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -226,8 +226,14 @@ void *ncclCommThreadMain(void *arg) { int head[MAXCHANNELS]; double vega_gpu_rtc_freq; - memset(head, 0, sizeof(int)*MAXCHANNELS); vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3; + for (int channel = 0; channel < MAXCHANNELS; channel++) { + int tail = comm->collTraceTail[channel].tail; + if (tail < COLLTRACE_NUM_ITEMS) + head[channel] = 0; + else + head[channel] = tail - COLLTRACE_NUM_ITEMS; + } do { int numActiveChans = MAXCHANNELS; for (int channel = 0; channel < MAXCHANNELS; channel++) { @@ -238,24 +244,23 @@ void *ncclCommThreadMain(void *arg) { numActiveChans--; continue; } - count = count%COLLTRACE_NUM_ITEMS; for (int i = 0; i < count; i++) { - volatile struct ncclCollTrace *td = comm->collTrace+COLLTRACE_NUM_ITEMS*channel+head[channel]; + volatile struct ncclCollTrace *td = comm->collTrace+COLLTRACE_NUM_ITEMS*channel+head[channel]%COLLTRACE_NUM_ITEMS; + head[channel] ++; uint8_t type = td->type; if (type == ncclCollTraceNotReady) - break; + continue; char line[1024]; int offset = 0; uint16_t fIdx = td->funcIndex; if (type == ncclCollTraceDataType) { - sprintf(line, "## [%012.6f] [%02d:%02d:%02d] L:%04d DT %08x %016lx %016lx", - (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, - fIdx, td->data_0, td->opCount, td->data_1); + sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] L:%04d DT %08x %016lx %016lx", + (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, fIdx, td->data_0, td->opCount, td->data_1); } else { if (type & ncclCollTraceP2pElemType) - sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->p2pOpCount[0], td->p2pOpCount[1]); + sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, td->p2pOpCount[0], td->p2pOpCount[1]); else - sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->opCount); + sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, td->opCount); offset = strlen(line); if (type == ncclCollTraceCollElemType) { sprintf(line+offset, " CE %s nw %d bi %d nc %d root %d busId %lx nRanks %d", funcNames[fIdx], td->coll.nWarps, td->coll.bid, td->coll.nChannels, td->coll.root, comm->busId, comm->nRanks); @@ -293,7 +298,6 @@ void *ncclCommThreadMain(void *arg) { } INFO(NCCL_COLL, "%s", line); td->type = ncclCollTraceNotReady; - head[channel] ++; } } if (comm->collTraceExit && numActiveChans == 0)