Repurpose profiling implementation to simple timestamps tracing (#600)

[ROCm/rccl commit: 14b8ff153f]
2022-08-18 15:34:46 -07:00
parent 6c3f1366e8
commit c2e9ada40b
7 changed files with 60 additions and 180 deletions
@@ -26,11 +26,6 @@ namespace {
    const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
    const int nranks = ncclShmem->comm.nRanks;
    const ssize_t loopSize = nChannels*nranks*chunkSize;
-#ifdef ENABLE_PROFILING
-    auto devProf = ncclShmem->comm.devProf;
-    uint64_t clk, t0 = 0ULL, ws;
-    if (tid == 0) clk = __builtin_amdgcn_s_memrealtime();
-#endif
    const ssize_t size = args->count;

 #if defined(ENABLE_NPKIT)
@@ -67,10 +62,8 @@ namespace {
      minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2;
    }

-    INIT_COUNTER;
    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
-    ACCUMULATE_PRIM_COUNTER(prim);

 #if defined(ENABLE_NPKIT)
    if (tid == 0) {
@@ -115,9 +108,7 @@ namespace {
      }
 #endif

-      INIT_COUNTER;
      prims.send(offset, nelem);
-      ACCUMULATE_COUNTER(send);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT)
      if (tid == 0) {
@@ -140,9 +131,7 @@ namespace {
        chunk = modRanks(ringIx + nranks-j);
        offset = calcOffset(chunk);
        nelem = min(realChunkSize, size-offset);
-        INIT_COUNTER;
        prims.recvReduceSend(offset, nelem);
-        ACCUMULATE_COUNTER(recvReduceSend);
      }

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT)
@@ -166,9 +155,7 @@ namespace {
      }
 #endif

-      INIT_COUNTER;
      prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true);
-      ACCUMULATE_COUNTER(directRecvReduceCopySend);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
      if (tid == 0) {
@@ -190,9 +177,7 @@ namespace {
        chunk = modRanks(ringIx + nranks-j);
        offset = calcOffset(chunk);
        nelem = min(realChunkSize, size-offset);
-        INIT_COUNTER;
        prims.directRecvCopySend(offset, offset, nelem);
-        ACCUMULATE_COUNTER(directRecvCopySend);
      }

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
@@ -215,9 +200,7 @@ namespace {
      }
 #endif

-      INIT_COUNTER;
      prims.directRecv(offset, nelem);
-      ACCUMULATE_COUNTER(directRecv);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
      if (tid == 0) {
@@ -227,12 +210,6 @@ namespace {
 #endif

    }
-#ifdef ENABLE_PROFILING
-    if (tid == 0) {
-      struct ncclProfElem *elem = devProf.elems+args->opCount;
-      elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
-    }
-#endif

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT)
    if (tid == 0) {
@@ -597,6 +574,7 @@ template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
    using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>;
+    if (threadIdx.x == 0) __insert_timestamp(__LINE__);
    runRing<T, RedOp, Proto>(args);
  }
 };
@@ -604,6 +582,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+    if (threadIdx.x == 0) __insert_timestamp(__LINE__);
    runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(args);
  }
 };
@@ -709,6 +688,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+    if (threadIdx.x == 0) __insert_timestamp(__LINE__);
    runRing<T, RedOp, ProtoLL>(args);
  }
 };
@@ -716,6 +696,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
 template<typename T, typename RedOp>
 struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
  __device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
+    if (threadIdx.x == 0) __insert_timestamp(__LINE__);
    if (args->pad_0 == 0) runTreeUpDown<T, RedOp, ProtoLL>(args);
    else runTreeSplit<T, RedOp, ProtoLL>(args);
  }
@@ -22,11 +22,6 @@ namespace {
    const ssize_t size = args->count;
    const int rank = ring->devUserRanks[0];
    const int nextRank = ring->devUserRanks[1];
-#ifdef ENABLE_PROFILING
-    auto devProf = ncclShmem->comm.devProf;
-    uint64_t clk, t0 = 0ULL, ws;
-    if (tid == 0) clk = __builtin_amdgcn_s_memrealtime();
-#endif
    const int root = args->root;

    T *inputBuf = (T*)args->sendbuff;
@@ -51,30 +46,16 @@ namespace {

      if (rank == root) {
        if (inputBuf == outputBuf) {
-          INIT_COUNTER;
          prims.send(offset, nelem);
-          ACCUMULATE_COUNTER(send);
        } else {
-          INIT_COUNTER;
          prims.copySend(offset, offset, nelem);
-          ACCUMULATE_COUNTER(copySend);
        }
      } else if (nextRank == root) {
-        INIT_COUNTER;
        prims.recv(offset, nelem);
-        ACCUMULATE_COUNTER(recv);
      } else {
-        INIT_COUNTER;
        prims.recvCopySend(offset, nelem);
-        ACCUMULATE_COUNTER(recvCopySend);
      }
    }
-#ifdef ENABLE_PROFILING
-    if (tid == 0) {
-      struct ncclProfElem *elem = devProf.elems+args->opCount;
-      elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
-    }
-#endif
  }
 }

@@ -279,6 +279,18 @@ class ncclFunction {
 #define traceData(data2, data4, data8_0, data8_1)
 #endif

+#ifdef ENABLE_PROFILING
+#define __insert_timestamp(line_num) do { \
+      if (ncclShmem->prof.count < PROFILE_NUM_ITEMS) { \
+        ncclShmem->prof.elem[ncclShmem->prof.count].line = line_num; \
+        ncclShmem->prof.elem[ncclShmem->prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
+        ncclShmem->prof.count++; \
+      } \
+    } while(0);
+#else
+#define __insert_timestamp(line_num)
+#endif
+
 __device__ inline bool barrierReduceAny(int bit, uint32_t* abortCount) {
  if (bit) atomicAdd(abortCount, 1); \
  __syncthreads(); \
@@ -382,6 +394,9 @@ struct ncclShmemData {
  struct ncclChannel channel;
  uint64_t pad;
  struct ncclWork work;
+#ifdef ENABLE_PROFILING
+  struct ncclProf prof;
+#endif
 };

 static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
@@ -421,10 +436,17 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first)  {
      shmem.groups[i].barrier = 0;
      for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0;
    }
-  }
+ }
  __syncthreads();

  int turn = copyToShmem(&ncclShmem->comm, comm);
+#ifdef ENABLE_PROFILING
+  if (tid == 0) {
+    ncclShmem->prof.count = 0;
+    ncclShmem->prof.seq = ncclShmem->comm.devProf[bid].seq;
+  }
+#endif
+  if (tid == 0) __insert_timestamp(__LINE__);
  // get address of channel without incurring indirect load from ncclDevCom::channels
  ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid];
  turn = copyToShmem(&ncclShmem->channel, channel, turn);
@@ -435,6 +457,8 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first)  {
    copyToShmem(&ncclShmem->work, &first, tid, nthreads);
  }
  __syncthreads(); // publish ncclShmem
+  if (tid == 0) __insert_timestamp(__LINE__);
+  if (tid == 0) __insert_timestamp(__LINE__);

  ncclWork *workFifoHost = ncclShmem->channel.workFifo;
  ncclWork *workFifoDev = ncclShmem->channel.workFifoDev;
@@ -447,6 +471,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first)  {
  while (true) {
    if (!skipLoadWork) {
      copyToShmem(&ncclShmem->work, &workFifoDev[workFifoIx], tid, nthreads);
+      if (tid == 0) __insert_timestamp(__LINE__);
      { // Check whether the last operation was aborted and make sure all threads exit
        int aborted = tid == 0 ? *comm->abortFlag : 0;
        if (barrierReduceAny(aborted, &abortCount)) { // publish ncclShmem->work
@@ -457,6 +482,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first)  {
          workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused;
      }
    }
+    if (tid == 0) __insert_timestamp(__LINE__);

    workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS;
    if (tid == 0)
@@ -478,6 +504,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first)  {
        traceColl(ncclShmem->work.elems[e], 0);
      }
    }
+    if (tid == 0) __insert_timestamp(__LINE__);
    if (ncclShmem->work.header.funcIndex == FnIndex)
      RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem->work);
    else
@@ -488,6 +515,12 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first)  {
    skipLoadWork = false;
  }
  if (COLLTRACE && tid == 0) traceKernelEnd()
+#ifdef ENABLE_PROFILING
+  if (ncclShmem->comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
+    copyToShmem(ncclShmem->comm.devProf+MAXCHANNELS*ncclShmem->prof.seq+blockIdx.x, &ncclShmem->prof);
+    if (tid == 0) ncclShmem->comm.devProf[bid].seq++;
+  }
+#endif
 }

 #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
@@ -156,20 +156,4 @@ struct PrimitivesWithoutDirect {
 #include "prims_ll.h"
 #include "prims_ll128.h"

-#ifdef ENABLE_PROFILING
-#define INIT_COUNTER \
-  if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; t0 = __builtin_amdgcn_s_memrealtime(); ws = elem->elem[blockIdx.x].wait_cycle; }
-#define ACCUMULATE_COUNTER(prim) \
-  if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; elem->elem[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \
-    + ws - elem->elem[blockIdx.x].wait_cycle); \
-    elem->elem[blockIdx.x].prim##_byte += nelem * sizeof(T); elem->elem[blockIdx.x].opCount = args->opCount;}
-#define ACCUMULATE_PRIM_COUNTER(prim) \
-  if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; elem->elem[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \
-    + ws - elem->elem[blockIdx.x].wait_cycle); elem->elem[blockIdx.x].opCount = args->opCount;}
-#else
-#define INIT_COUNTER
-#define ACCUMULATE_COUNTER(prim)
-#define ACCUMULATE_PRIM_COUNTER(prim)
-#endif
-
 #endif
@@ -204,18 +204,8 @@ private:
          ncclShmem->groups[group].srcs[0] = userBuff + srcIx + offset;
        if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
          ncclShmem->groups[group].dsts[0] = userBuff + dstIx + offset;
-#ifdef ENABLE_PROFILING
-        uint64_t t0;
-        if (tid == 0) t0 = __builtin_amdgcn_s_memrealtime();
-#endif
        waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
        subBarrier();
-#ifdef ENABLE_PROFILING
-        if (tid == 0) {
-          struct ncclProfElem *elem = ncclShmem->comm.devProf.elems+opCount%PROFILE_NUM_ITEMS;
-          elem->elem[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0);
-        }
-#endif
        if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
          // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
          if (Send) {
@@ -290,16 +290,6 @@ struct ncclChannel {
      int workCount;
      size_t totalSize;
      uint64_t workFifoTail; // Only used by CPU
-
-#ifdef ENABLE_PROFILING
-      struct timeval tvs;
-      uint64_t sizes;
-      int active_req;
-      uint64_t send_byte;
-      uint64_t recv_byte;
-      float bw_cumulative;
-      int bw_count;
-#endif
      uint16_t index;        // Only used by GPU

      // GDRCOPY support
@@ -314,49 +304,18 @@ static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must
 #pragma pack(pop)   /* restore original alignment from stack */

 #ifdef ENABLE_PROFILING
-struct ncclProfElem {
-  union {
-    struct {
-      uint64_t opCount;
-      uint64_t total_cycle;
-      uint64_t wait_cycle;      // total wait cycle
-      // primtive cycles
-      uint64_t prim_cycle;
-      uint64_t send_cycle;
-      uint64_t directSend_cycle;
-      uint64_t recv_cycle;
-      uint64_t directRecv_cycle;
-      uint64_t copySend_cycle;
-      uint64_t directCopySend_cycle;
-      uint64_t recvCopySend_cycle;
-      uint64_t directRecvCopySend_cycle;
-      uint64_t recvReduceCopy_cycle;
-      uint64_t recvReduceSend_cycle;
-      uint64_t recvReduceCopySend_cycle;
-      uint64_t directRecvReduceCopySend_cycle;
-      // primitive bytes
-      uint64_t send_byte;
-      uint64_t directSend_byte;
-      uint64_t recv_byte;
-      uint64_t directRecv_byte;
-      uint64_t copySend_byte;
-      uint64_t directCopySend_byte;
-      uint64_t recvCopySend_byte;
-      uint64_t directRecvCopySend_byte;
-      uint64_t recvReduceCopy_byte;
-      uint64_t recvReduceSend_byte;
-      uint64_t recvReduceCopySend_byte;
-      uint64_t directRecvReduceCopySend_byte;
-    };
-    int data[0x80];
-  } elem[MAXCHANNELS];
-};
+#define PROFILE_NUM_ITEMS 31
+#define PROFILE_NUM_LAUNCHES 1024

 struct ncclProf {
-  struct ncclProfElem* elems;
+  uint32_t count;
+  uint32_t seq; // only entry from first launch is used
+  struct {
+    uint64_t line:16;
+    uint64_t timeStamp:48;
+  } elem[PROFILE_NUM_ITEMS];
 };
-
-#define PROFILE_NUM_ITEMS 1024
+static_assert(sizeof(struct ncclProf) == 256, "ncclProf must have size of 256");
 #endif

 #ifdef ENABLE_COLLTRACE
@@ -420,7 +379,7 @@ struct ncclDevComm {

 #ifdef ENABLE_PROFILING
  // Profiling counters
-  struct ncclProf devProf;
+  struct ncclProf* devProf;
 #endif

 #ifdef ENABLE_COLLTRACE
@@ -233,7 +233,6 @@ void *ncclCommThreadMain(void *arg) {
 #endif

 #undef NCCL_NO_OPTIMIZE
-#define PROFILE_USE_TIME

 static ncclResult_t commFree(ncclComm_t comm) {
  if (comm == NULL)
@@ -255,67 +254,20 @@ static ncclResult_t commFree(ncclComm_t comm) {
  free(comm->asyncOps);

 #ifdef ENABLE_PROFILING
-  struct ncclProf prof;
-  prof.elems = (struct ncclProfElem*)malloc(sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS);
-  CUDACHECK(hipMemcpy(prof.elems, comm->hostDevComm.devProf.elems, sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS, hipMemcpyDeviceToHost));
+  struct ncclProf *prof, *prof_seq;
+  prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES);
+  CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES, hipMemcpyDeviceToHost));
  #define VEGA_GPU_RTC_FREQUENCY 2.5E7
-  if (comm->rank == 0) {
-    INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank:Ch", "opCt", "total", "  prim", "  wait", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
-#ifdef PROFILE_USE_TIME
-    INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "", "  (us)", "  (us)", "  (us)", "  (us)", "  (us)", "  (us)", "  (us)", "  (us)", "  (us)", "  (us)", "  (us)");
-#else
-    INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "", "   (s)", "   (s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
-#endif
-  }
-  for (int i = 1; i < PROFILE_NUM_ITEMS; i++) {
-    int valid = 0;
-    for (int chan=0; chan<comm->nChannels; chan++) {
-      struct ncclProfElem *elem = prof.elems+i;
-      if (elem->elem[chan].opCount == 0)
-        continue;
-      valid++;
-#ifdef PROFILE_USE_TIME
-      INFO(NCCL_INIT, "# [%02d:%02d] %04d %6.2f %6.2f %6.2f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
-        comm->rank, chan, (uint32_t)elem->elem[chan].opCount, (double)elem->elem[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6,
-        (double)elem->elem[chan].prim_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6, (double)elem->elem[chan].wait_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6,
-        (elem->elem[chan].send_cycle) ? ((double)elem->elem[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
-        (elem->elem[chan].recvReduceSend_cycle) ? ((double)elem->elem[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
-        (elem->elem[chan].directRecvReduceCopySend_cycle) ? ((double)elem->elem[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
-        (elem->elem[chan].directRecvCopySend_cycle) ? ((double)elem->elem[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
-        (elem->elem[chan].directRecv_cycle) ? ((double)elem->elem[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
-        (elem->elem[chan].copySend_cycle) ? ((double)elem->elem[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
-        (elem->elem[chan].recv_cycle) ? ((double)elem->elem[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
-        (elem->elem[chan].recvCopySend_cycle) ? ((double)elem->elem[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0);
-#else
-      INFO(NCCL_INIT, "# [%02d:%02d] %04d %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
-        comm->rank, chan, (uint32_t)elem->elem[chan].opCount, (double)elem->elem[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY,
-        (double)elem->elem[chan].wait_cycle/VEGA_GPU_RTC_FREQUENCY,
-        (elem->elem[chan].send_cycle) ? (double)elem->elem[chan].send_byte/((double)elem->elem[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-        (elem->elem[chan].recvReduceSend_cycle) ? (double)elem->elem[chan].recvReduceSend_byte/((double)elem->elem[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-        (elem->elem[chan].directRecvReduceCopySend_cycle) ? (double)elem->elem[chan].directRecvReduceCopySend_byte/((double)elem->elem[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-        (elem->elem[chan].directRecvCopySend_cycle) ? (double)elem->elem[chan].directRecvCopySend_byte/((double)elem->elem[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-        (elem->elem[chan].directRecv_cycle) ? (double)elem->elem[chan].directRecv_byte/((double)elem->elem[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-        (elem->elem[chan].copySend_cycle) ? (double)elem->elem[chan].copySend_byte/((double)elem->elem[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-        (elem->elem[chan].recv_cycle) ? (double)elem->elem[chan].recv_byte/((double)elem->elem[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
-        (elem->elem[chan].recvCopySend_cycle) ? (double)elem->elem[chan].recvCopySend_byte/((double)elem->elem[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0);
-#endif
+  for (int i=0; i<comm->nChannels; i++) {
+    for (int s=0; s<prof[MAXCHANNELS*i].seq; s++) {
+      if (prof[MAXCHANNELS*s+i].count == 0) continue;
+      for (int j=0; j<prof[MAXCHANNELS*s+i].count; j++) {
+        INFO(NCCL_INIT, "# [%02d:%02d] %02d-%02d L:%04u %6.2fus", comm->rank, i, s, j, prof[MAXCHANNELS*s+i].elem[j].line, (prof[MAXCHANNELS*s+i].elem[j].timeStamp-prof[MAXCHANNELS*s+i].elem[0].timeStamp)/VEGA_GPU_RTC_FREQUENCY*1.0E6);
+      }
    }
-    if (valid == 0)
-      break;
-  }
-  free(prof.elems);
-  CUDACHECK(hipFree(comm->hostDevComm.devProf.elems));
-
-  for (int channel=0; channel<std::max(comm->nChannels, comm->p2pnChannels); channel++) {
-    if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %6.2f GB/s (%ld bytes %d measurements)",
-      comm->rank, channel, (comm->channels[channel].bw_count) ?
-      (float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0,
-      comm->channels[channel].send_byte, comm->channels[channel].bw_count);
-    if (comm->channels[channel].recv_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Recv %6.2f GB/s (%ld bytes %d measurements)",
-      comm->rank, channel, (comm->channels[channel].bw_count) ?
-      (float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0,
-      comm->channels[channel].recv_byte, comm->channels[channel].bw_count);
  }
+  free(prof);
+  CUDACHECK(hipFree(comm->hostDevComm.devProf));
 #endif

 #ifdef ENABLE_COLLTRACE
@@ -432,7 +384,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
  comm->argsptrs[0] = &comm->devComm;
  comm->argsptrs[1] = &comm->args;
 #ifdef ENABLE_PROFILING
-  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf.elems, PROFILE_NUM_ITEMS));
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, MAXCHANNELS*PROFILE_NUM_LAUNCHES));
 #endif

 #ifdef ENABLE_COLLTRACE