From c81ea25407eb4a79ea30e7828e253cdaaedc1ca9 Mon Sep 17 00:00:00 2001 From: Avinash <44542533+PJAvinash@users.noreply.github.com> Date: Wed, 7 May 2025 13:37:31 -0500 Subject: [PATCH] collective trace improvements for debugging (#1661) [ROCm/rccl commit: c54a0c085a24bd1880f845d2db90cf1e5e01fdaf] --- projects/rccl/src/device/common.h | 13 +++++++++++-- projects/rccl/src/init.cc | 14 +++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/projects/rccl/src/device/common.h b/projects/rccl/src/device/common.h index c90d3e78d5..26ac84af9c 100644 --- a/projects/rccl/src/device/common.h +++ b/projects/rccl/src/device/common.h @@ -76,6 +76,7 @@ } #define traceKernelEnd(end_type) { \ INC_COLL_TRACE \ + collTrace->funcIndex = ncclShmem.funcId;\ if (ncclShmem.workType == ncclDevWorkTypeP2p) { \ struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \ collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \ @@ -95,10 +96,16 @@ collTrace->data_1 = data8_1; \ collTrace->type = ncclCollTraceDataType; \ } + #define traceAbort(){\ + INC_COLL_TRACE\ + collTrace->funcIndex = ncclShmem.funcId;\ + collTrace->type = ncclCollTraceAbortType;\ + } #else #define traceKernelLaunch(launch_type, batchIx) #define traceKernelEnd(end_type) #define traceData(data2, data4, data8_0, data8_1) +#define traceAbort() #endif #if __CUDA_ARCH__ >= 700 @@ -600,8 +607,10 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a // ncclShmem.workConsumed written by loadWorkBatchToShmem before barrier_red_or() ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; } - if (aborted) break; - + if (aborted) { + if(COLLTRACE && tid%WARP_SIZE == 0) traceAbort(); + break; + } if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceCollLaunchType, batchIx); } if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelEnd(ncclCollTraceKernelEndType); diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 44d63ecaef..c1e1403f0e 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -258,12 +258,12 @@ void *ncclCommThreadMain(void *arg) { for (int i = 0; i < count; i++) { volatile struct ncclCollTrace *td = comm->collTrace+COLLTRACE_NUM_ITEMS*channel+head[channel]%COLLTRACE_NUM_ITEMS; head[channel] ++; - uint8_t type = td->type; + const uint8_t type = td->type; if (type == ncclCollTraceNotReady) continue; char line[1024]; int offset = 0; - uint16_t fIdx = td->funcIndex; + const uint16_t fIdx = td->funcIndex; if (type == ncclCollTraceDataType) { sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] L:%04d DT %08x %016lx %016lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, fIdx, td->data_0, td->opCount, td->data_1); @@ -284,9 +284,9 @@ void *ncclCommThreadMain(void *arg) { case ncclCollTraceKernelLaunchType: case ncclCollTraceCollLaunchType: if ((type&0xf) == ncclCollTraceKernelLaunchType) - sprintf(line+offset, " KL HWID %8x %s", td->data_0, funcNames[fIdx]); + sprintf(line+offset, " KL %s [%02d:%02d-%02d:%02x] HWID %8x ", funcNames[fIdx], comm->rank, td->bid, td->channelId, td->tid, td->data_0); else if ((type&0xf) == ncclCollTraceCollLaunchType) - sprintf(line+offset, " CL %d %s", td->batchIx, funcNames[fIdx]); + sprintf(line+offset, " CL %s [%02d:%02d-%02d:%02x] %d ", funcNames[fIdx], comm->rank, td->bid, td->channelId, td->tid, td->batchIx); offset = strlen(line); if ((type&0xf0) == ncclCollTraceCollElemType) sprintf(line+offset, " nw %d bi %d nc %d root %d busId %lx nRanks %d", td->coll.nWarps, td->coll.bid, td->coll.nChannels, td->coll.root, comm->busId, comm->nRanks); @@ -296,10 +296,10 @@ void *ncclCommThreadMain(void *arg) { comm->busId, comm->nRanks); break; case ncclCollTraceKernelEndType: - sprintf(line+offset, " KE busId %lx nRanks %d", comm->busId, comm->nRanks); + sprintf(line+offset, " KE %s [%02d:%02d-%02d:%02x] busId %lx nRanks %d", funcNames[fIdx], comm->rank, td->bid, td->channelId, td->tid, comm->busId, comm->nRanks); break; case ncclCollTraceAbortType: - sprintf(line+offset, " Abort"); + sprintf(line+offset, " KA %s [%02d:%02d-%02d:%02x]", funcNames[fIdx], comm->rank, td->bid, td->channelId, td->tid); break; default: sprintf(line+offset, " unknown collective trace data type"); @@ -307,7 +307,7 @@ void *ncclCommThreadMain(void *arg) { } } } - INFO(NCCL_COLL, "%s", line); + INFO(NCCL_COLL, "%s td->type:%d", line, type); td->type = ncclCollTraceNotReady; } }