Print KL/CL/KE events for all warps (#1544)

* Print KL/CL/KE events for all warps * Fix count off-by-one issue * Fix opCount in KE and restore CPU thread option * Simplify count calculation [ROCm/rccl commit: ebf7e2305e]
2025-02-12 13:36:31 -08:00
Parent 7fdbcdfdec
@@ -47,6 +47,7 @@
    struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \
    collTrace->timeStamp = wall_clock64(); \
    collTrace->bid = blockIdx.x; \
+    collTrace->tid = threadIdx.x; \
    collTrace->channelId = ncclShmem.channelId;
    // TODO: switch to atomicInc after llvm crash is fixed
    // uint32_t pos = atomicInc(&ncclShmem.collTraceTail->tail, COLLTRACE_NUM_ITEMS)
@@ -82,7 +83,16 @@
  }
  #define traceKernelEnd(end_type)  { \
    INC_COLL_TRACE \
-    collTrace->type = end_type; \
+    if (ncclShmem.workType == ncclDevWorkTypeP2p) { \
+      struct ncclDevWorkP2p *p2pWork = (struct ncclDevWorkP2p*)ncclShmem.workStorage; \
+      collTrace->p2pOpCount[0] = p2pWork->sendOpCount; \
+      collTrace->p2pOpCount[1] = p2pWork->recvOpCount; \
+      collTrace->type = (end_type) | ncclCollTraceP2pElemType; \
+    } else if (ncclShmem.workType == ncclDevWorkTypeColl) { \
+      struct ncclDevWorkColl *collWork = (struct ncclDevWorkColl*)ncclShmem.workStorage; \
+      collTrace->opCount = collWork->opCount; \
+      collTrace->type = (end_type) | ncclCollTraceCollElemType; \
+    } \
  }
  #define traceData(data2, data4, data8_0, data8_1) { \
    INC_COLL_TRACE \
@@ -519,7 +529,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
  }
 #endif
  if (tid == 0) __insert_timestamp(__LINE__);
-  if (COLLTRACE && tid == 0) traceKernelLaunch(ncclCollTraceKernelLaunchType);
+  if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceKernelLaunchType);

  if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
    // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads()
@@ -569,9 +579,9 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
    }
    if (aborted) break;

-    if (COLLTRACE && tid == 0) traceKernelLaunch(ncclCollTraceCollLaunchType);
+    if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelLaunch(ncclCollTraceCollLaunchType);
  }
-  if (COLLTRACE && tid == 0) traceKernelEnd(ncclCollTraceKernelEndType);
+  if (COLLTRACE && tid%WARP_SIZE == 0) traceKernelEnd(ncclCollTraceKernelEndType);

 #ifdef ENABLE_PROFILING
  if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
@@ -419,7 +419,8 @@ struct ncclCollTrace {
  uint8_t type;
  uint8_t bid;
  int16_t funcIndex;
-  uint32_t data_0;
+  uint32_t data_0:24;
+  uint8_t tid;
  uint8_t channelId;
  uint64_t timeStamp:56;
  union {
@@ -226,8 +226,14 @@ void *ncclCommThreadMain(void *arg) {
  int head[MAXCHANNELS];
  double vega_gpu_rtc_freq;

-  memset(head, 0, sizeof(int)*MAXCHANNELS);
  vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3;
+  for (int channel = 0; channel < MAXCHANNELS; channel++) {
+    int tail = comm->collTraceTail[channel].tail;
+    if (tail < COLLTRACE_NUM_ITEMS)
+      head[channel] = 0;
+    else
+      head[channel] = tail - COLLTRACE_NUM_ITEMS;
+  }
  do {
    int numActiveChans = MAXCHANNELS;
    for (int channel = 0; channel < MAXCHANNELS; channel++) {
@@ -238,24 +244,23 @@ void *ncclCommThreadMain(void *arg) {
        numActiveChans--;
        continue;
      }
-      count = count%COLLTRACE_NUM_ITEMS;
      for (int i = 0; i < count; i++) {
-        volatile struct ncclCollTrace *td = comm->collTrace+COLLTRACE_NUM_ITEMS*channel+head[channel];
+        volatile struct ncclCollTrace *td = comm->collTrace+COLLTRACE_NUM_ITEMS*channel+head[channel]%COLLTRACE_NUM_ITEMS;
+        head[channel] ++;
        uint8_t type = td->type;
        if (type == ncclCollTraceNotReady)
-          break;
+          continue;
        char line[1024];
        int offset = 0;
        uint16_t fIdx = td->funcIndex;
        if (type == ncclCollTraceDataType) {
-          sprintf(line, "## [%012.6f] [%02d:%02d:%02d] L:%04d DT %08x %016lx %016lx",
-            (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId,
-            fIdx, td->data_0, td->opCount, td->data_1);
+          sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] L:%04d DT %08x %016lx %016lx",
+            (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid,             fIdx, td->data_0, td->opCount, td->data_1);
        } else {
          if (type & ncclCollTraceP2pElemType)
-            sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->p2pOpCount[0], td->p2pOpCount[1]);
+            sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, td->p2pOpCount[0], td->p2pOpCount[1]);
          else
-            sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->opCount);
+            sprintf(line, "## [%012.6f] [%02d:%02d-%02d:%02x] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->tid, td->opCount);
          offset = strlen(line);
          if (type == ncclCollTraceCollElemType) {
            sprintf(line+offset, " CE %s nw %d bi %d nc %d root %d busId %lx nRanks %d", funcNames[fIdx], td->coll.nWarps, td->coll.bid, td->coll.nChannels, td->coll.root, comm->busId, comm->nRanks);
@@ -293,7 +298,6 @@ void *ncclCommThreadMain(void *arg) {
        }
        INFO(NCCL_COLL, "%s", line);
        td->type = ncclCollTraceNotReady;
-        head[channel] ++;
      }
    }
    if (comm->collTraceExit && numActiveChans == 0)