From c2e9ada40b7d14873cedb11090fc8fc658ded2ef Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Thu, 18 Aug 2022 15:34:46 -0700 Subject: [PATCH] Repurpose profiling implementation to simple timestamps tracing (#600) [ROCm/rccl commit: 14b8ff153ffaa734d07622581520b3e59c969d01] --- .../rccl/src/collectives/device/all_reduce.h | 27 ++----- .../rccl/src/collectives/device/broadcast.h | 19 ----- projects/rccl/src/collectives/device/common.h | 35 ++++++++- .../rccl/src/collectives/device/primitives.h | 16 ----- .../src/collectives/device/prims_simple.h | 10 --- projects/rccl/src/include/devcomm.h | 61 +++------------- projects/rccl/src/init.cc | 72 ++++--------------- 7 files changed, 60 insertions(+), 180 deletions(-) diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h index b041b6bf1b..79aa3fbfa8 100644 --- a/projects/rccl/src/collectives/device/all_reduce.h +++ b/projects/rccl/src/collectives/device/all_reduce.h @@ -26,11 +26,6 @@ namespace { const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1)); const int nranks = ncclShmem->comm.nRanks; const ssize_t loopSize = nChannels*nranks*chunkSize; -#ifdef ENABLE_PROFILING - auto devProf = ncclShmem->comm.devProf; - uint64_t clk, t0 = 0ULL, ws; - if (tid == 0) clk = __builtin_amdgcn_s_memrealtime(); -#endif const ssize_t size = args->count; #if defined(ENABLE_NPKIT) @@ -67,10 +62,8 @@ namespace { minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2; } - INIT_COUNTER; Primitives, 0, Proto, 0> prims (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16); - ACCUMULATE_PRIM_COUNTER(prim); #if defined(ENABLE_NPKIT) if (tid == 0) { @@ -115,9 +108,7 @@ namespace { } #endif - INIT_COUNTER; prims.send(offset, nelem); - ACCUMULATE_COUNTER(send); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT) if (tid == 0) { @@ -140,9 +131,7 @@ namespace { chunk = modRanks(ringIx + nranks-j); offset = calcOffset(chunk); nelem = min(realChunkSize, size-offset); - INIT_COUNTER; prims.recvReduceSend(offset, nelem); - ACCUMULATE_COUNTER(recvReduceSend); } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT) @@ -166,9 +155,7 @@ namespace { } #endif - INIT_COUNTER; prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true); - ACCUMULATE_COUNTER(directRecvReduceCopySend); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT) if (tid == 0) { @@ -190,9 +177,7 @@ namespace { chunk = modRanks(ringIx + nranks-j); offset = calcOffset(chunk); nelem = min(realChunkSize, size-offset); - INIT_COUNTER; prims.directRecvCopySend(offset, offset, nelem); - ACCUMULATE_COUNTER(directRecvCopySend); } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT) @@ -215,9 +200,7 @@ namespace { } #endif - INIT_COUNTER; prims.directRecv(offset, nelem); - ACCUMULATE_COUNTER(directRecv); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT) if (tid == 0) { @@ -227,12 +210,6 @@ namespace { #endif } -#ifdef ENABLE_PROFILING - if (tid == 0) { - struct ncclProfElem *elem = devProf.elems+args->opCount; - elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk); - } -#endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT) if (tid == 0) { @@ -597,6 +574,7 @@ template struct RunWorkElement { __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { using Proto = ProtoSimple; + if (threadIdx.x == 0) __insert_timestamp(__LINE__); runRing(args); } }; @@ -604,6 +582,7 @@ struct RunWorkElement struct RunWorkElement { __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + if (threadIdx.x == 0) __insert_timestamp(__LINE__); runTreeUpDown>(args); } }; @@ -709,6 +688,7 @@ struct RunWorkElement struct RunWorkElement { __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + if (threadIdx.x == 0) __insert_timestamp(__LINE__); runRing(args); } }; @@ -716,6 +696,7 @@ struct RunWorkElement struct RunWorkElement { __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + if (threadIdx.x == 0) __insert_timestamp(__LINE__); if (args->pad_0 == 0) runTreeUpDown(args); else runTreeSplit(args); } diff --git a/projects/rccl/src/collectives/device/broadcast.h b/projects/rccl/src/collectives/device/broadcast.h index a97836c672..3a17e0edd3 100644 --- a/projects/rccl/src/collectives/device/broadcast.h +++ b/projects/rccl/src/collectives/device/broadcast.h @@ -22,11 +22,6 @@ namespace { const ssize_t size = args->count; const int rank = ring->devUserRanks[0]; const int nextRank = ring->devUserRanks[1]; -#ifdef ENABLE_PROFILING - auto devProf = ncclShmem->comm.devProf; - uint64_t clk, t0 = 0ULL, ws; - if (tid == 0) clk = __builtin_amdgcn_s_memrealtime(); -#endif const int root = args->root; T *inputBuf = (T*)args->sendbuff; @@ -51,30 +46,16 @@ namespace { if (rank == root) { if (inputBuf == outputBuf) { - INIT_COUNTER; prims.send(offset, nelem); - ACCUMULATE_COUNTER(send); } else { - INIT_COUNTER; prims.copySend(offset, offset, nelem); - ACCUMULATE_COUNTER(copySend); } } else if (nextRank == root) { - INIT_COUNTER; prims.recv(offset, nelem); - ACCUMULATE_COUNTER(recv); } else { - INIT_COUNTER; prims.recvCopySend(offset, nelem); - ACCUMULATE_COUNTER(recvCopySend); } } -#ifdef ENABLE_PROFILING - if (tid == 0) { - struct ncclProfElem *elem = devProf.elems+args->opCount; - elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk); - } -#endif } } diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h index 084612f9a2..689a91576c 100644 --- a/projects/rccl/src/collectives/device/common.h +++ b/projects/rccl/src/collectives/device/common.h @@ -279,6 +279,18 @@ class ncclFunction { #define traceData(data2, data4, data8_0, data8_1) #endif +#ifdef ENABLE_PROFILING +#define __insert_timestamp(line_num) do { \ + if (ncclShmem->prof.count < PROFILE_NUM_ITEMS) { \ + ncclShmem->prof.elem[ncclShmem->prof.count].line = line_num; \ + ncclShmem->prof.elem[ncclShmem->prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \ + ncclShmem->prof.count++; \ + } \ + } while(0); +#else +#define __insert_timestamp(line_num) +#endif + __device__ inline bool barrierReduceAny(int bit, uint32_t* abortCount) { if (bit) atomicAdd(abortCount, 1); \ __syncthreads(); \ @@ -382,6 +394,9 @@ struct ncclShmemData { struct ncclChannel channel; uint64_t pad; struct ncclWork work; +#ifdef ENABLE_PROFILING + struct ncclProf prof; +#endif }; static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { @@ -421,10 +436,17 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { shmem.groups[i].barrier = 0; for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0; } - } + } __syncthreads(); int turn = copyToShmem(&ncclShmem->comm, comm); +#ifdef ENABLE_PROFILING + if (tid == 0) { + ncclShmem->prof.count = 0; + ncclShmem->prof.seq = ncclShmem->comm.devProf[bid].seq; + } +#endif + if (tid == 0) __insert_timestamp(__LINE__); // get address of channel without incurring indirect load from ncclDevCom::channels ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid]; turn = copyToShmem(&ncclShmem->channel, channel, turn); @@ -435,6 +457,8 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { copyToShmem(&ncclShmem->work, &first, tid, nthreads); } __syncthreads(); // publish ncclShmem + if (tid == 0) __insert_timestamp(__LINE__); + if (tid == 0) __insert_timestamp(__LINE__); ncclWork *workFifoHost = ncclShmem->channel.workFifo; ncclWork *workFifoDev = ncclShmem->channel.workFifoDev; @@ -447,6 +471,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { while (true) { if (!skipLoadWork) { copyToShmem(&ncclShmem->work, &workFifoDev[workFifoIx], tid, nthreads); + if (tid == 0) __insert_timestamp(__LINE__); { // Check whether the last operation was aborted and make sure all threads exit int aborted = tid == 0 ? *comm->abortFlag : 0; if (barrierReduceAny(aborted, &abortCount)) { // publish ncclShmem->work @@ -457,6 +482,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused; } } + if (tid == 0) __insert_timestamp(__LINE__); workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS; if (tid == 0) @@ -478,6 +504,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { traceColl(ncclShmem->work.elems[e], 0); } } + if (tid == 0) __insert_timestamp(__LINE__); if (ncclShmem->work.header.funcIndex == FnIndex) RunWork().run(&ncclShmem->work); else @@ -488,6 +515,12 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { skipLoadWork = false; } if (COLLTRACE && tid == 0) traceKernelEnd() +#ifdef ENABLE_PROFILING + if (ncclShmem->comm.devProf->seq < PROFILE_NUM_LAUNCHES) { + copyToShmem(ncclShmem->comm.devProf+MAXCHANNELS*ncclShmem->prof.seq+blockIdx.x, &ncclShmem->prof); + if (tid == 0) ncclShmem->comm.devProf[bid].seq++; + } +#endif } #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \ diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h index ce97c48232..bf6b7677f4 100644 --- a/projects/rccl/src/collectives/device/primitives.h +++ b/projects/rccl/src/collectives/device/primitives.h @@ -156,20 +156,4 @@ struct PrimitivesWithoutDirect { #include "prims_ll.h" #include "prims_ll128.h" -#ifdef ENABLE_PROFILING -#define INIT_COUNTER \ - if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; t0 = __builtin_amdgcn_s_memrealtime(); ws = elem->elem[blockIdx.x].wait_cycle; } -#define ACCUMULATE_COUNTER(prim) \ - if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; elem->elem[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \ - + ws - elem->elem[blockIdx.x].wait_cycle); \ - elem->elem[blockIdx.x].prim##_byte += nelem * sizeof(T); elem->elem[blockIdx.x].opCount = args->opCount;} -#define ACCUMULATE_PRIM_COUNTER(prim) \ - if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; elem->elem[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \ - + ws - elem->elem[blockIdx.x].wait_cycle); elem->elem[blockIdx.x].opCount = args->opCount;} -#else -#define INIT_COUNTER -#define ACCUMULATE_COUNTER(prim) -#define ACCUMULATE_PRIM_COUNTER(prim) -#endif - #endif diff --git a/projects/rccl/src/collectives/device/prims_simple.h b/projects/rccl/src/collectives/device/prims_simple.h index bd58deb881..a7634cb7c4 100644 --- a/projects/rccl/src/collectives/device/prims_simple.h +++ b/projects/rccl/src/collectives/device/prims_simple.h @@ -204,18 +204,8 @@ private: ncclShmem->groups[group].srcs[0] = userBuff + srcIx + offset; if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput))) ncclShmem->groups[group].dsts[0] = userBuff + dstIx + offset; -#ifdef ENABLE_PROFILING - uint64_t t0; - if (tid == 0) t0 = __builtin_amdgcn_s_memrealtime(); -#endif waitPeer(dstIx, remoteIx, offset, sliceSize); subBarrier(); -#ifdef ENABLE_PROFILING - if (tid == 0) { - struct ncclProfElem *elem = ncclShmem->comm.devProf.elems+opCount%PROFILE_NUM_ITEMS; - elem->elem[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0); - } -#endif if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy if (Send) { diff --git a/projects/rccl/src/include/devcomm.h b/projects/rccl/src/include/devcomm.h index 8e4e97d11e..4f3887fe25 100644 --- a/projects/rccl/src/include/devcomm.h +++ b/projects/rccl/src/include/devcomm.h @@ -290,16 +290,6 @@ struct ncclChannel { int workCount; size_t totalSize; uint64_t workFifoTail; // Only used by CPU - -#ifdef ENABLE_PROFILING - struct timeval tvs; - uint64_t sizes; - int active_req; - uint64_t send_byte; - uint64_t recv_byte; - float bw_cumulative; - int bw_count; -#endif uint16_t index; // Only used by GPU // GDRCOPY support @@ -314,49 +304,18 @@ static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must #pragma pack(pop) /* restore original alignment from stack */ #ifdef ENABLE_PROFILING -struct ncclProfElem { - union { - struct { - uint64_t opCount; - uint64_t total_cycle; - uint64_t wait_cycle; // total wait cycle - // primtive cycles - uint64_t prim_cycle; - uint64_t send_cycle; - uint64_t directSend_cycle; - uint64_t recv_cycle; - uint64_t directRecv_cycle; - uint64_t copySend_cycle; - uint64_t directCopySend_cycle; - uint64_t recvCopySend_cycle; - uint64_t directRecvCopySend_cycle; - uint64_t recvReduceCopy_cycle; - uint64_t recvReduceSend_cycle; - uint64_t recvReduceCopySend_cycle; - uint64_t directRecvReduceCopySend_cycle; - // primitive bytes - uint64_t send_byte; - uint64_t directSend_byte; - uint64_t recv_byte; - uint64_t directRecv_byte; - uint64_t copySend_byte; - uint64_t directCopySend_byte; - uint64_t recvCopySend_byte; - uint64_t directRecvCopySend_byte; - uint64_t recvReduceCopy_byte; - uint64_t recvReduceSend_byte; - uint64_t recvReduceCopySend_byte; - uint64_t directRecvReduceCopySend_byte; - }; - int data[0x80]; - } elem[MAXCHANNELS]; -}; +#define PROFILE_NUM_ITEMS 31 +#define PROFILE_NUM_LAUNCHES 1024 struct ncclProf { - struct ncclProfElem* elems; + uint32_t count; + uint32_t seq; // only entry from first launch is used + struct { + uint64_t line:16; + uint64_t timeStamp:48; + } elem[PROFILE_NUM_ITEMS]; }; - -#define PROFILE_NUM_ITEMS 1024 +static_assert(sizeof(struct ncclProf) == 256, "ncclProf must have size of 256"); #endif #ifdef ENABLE_COLLTRACE @@ -420,7 +379,7 @@ struct ncclDevComm { #ifdef ENABLE_PROFILING // Profiling counters - struct ncclProf devProf; + struct ncclProf* devProf; #endif #ifdef ENABLE_COLLTRACE diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 5a8f0bcf73..670b257bc9 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -233,7 +233,6 @@ void *ncclCommThreadMain(void *arg) { #endif #undef NCCL_NO_OPTIMIZE -#define PROFILE_USE_TIME static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) @@ -255,67 +254,20 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->asyncOps); #ifdef ENABLE_PROFILING - struct ncclProf prof; - prof.elems = (struct ncclProfElem*)malloc(sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS); - CUDACHECK(hipMemcpy(prof.elems, comm->hostDevComm.devProf.elems, sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS, hipMemcpyDeviceToHost)); + struct ncclProf *prof, *prof_seq; + prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES); + CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES, hipMemcpyDeviceToHost)); #define VEGA_GPU_RTC_FREQUENCY 2.5E7 - if (comm->rank == 0) { - INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank:Ch", "opCt", "total", " prim", " wait", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS"); -#ifdef PROFILE_USE_TIME - INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)"); -#else - INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "", " (s)", " (s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)"); -#endif - } - for (int i = 1; i < PROFILE_NUM_ITEMS; i++) { - int valid = 0; - for (int chan=0; channChannels; chan++) { - struct ncclProfElem *elem = prof.elems+i; - if (elem->elem[chan].opCount == 0) - continue; - valid++; -#ifdef PROFILE_USE_TIME - INFO(NCCL_INIT, "# [%02d:%02d] %04d %6.2f %6.2f %6.2f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f", - comm->rank, chan, (uint32_t)elem->elem[chan].opCount, (double)elem->elem[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6, - (double)elem->elem[chan].prim_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6, (double)elem->elem[chan].wait_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6, - (elem->elem[chan].send_cycle) ? ((double)elem->elem[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0, - (elem->elem[chan].recvReduceSend_cycle) ? ((double)elem->elem[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0, - (elem->elem[chan].directRecvReduceCopySend_cycle) ? ((double)elem->elem[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0, - (elem->elem[chan].directRecvCopySend_cycle) ? ((double)elem->elem[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0, - (elem->elem[chan].directRecv_cycle) ? ((double)elem->elem[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0, - (elem->elem[chan].copySend_cycle) ? ((double)elem->elem[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0, - (elem->elem[chan].recv_cycle) ? ((double)elem->elem[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0, - (elem->elem[chan].recvCopySend_cycle) ? ((double)elem->elem[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0); -#else - INFO(NCCL_INIT, "# [%02d:%02d] %04d %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f", - comm->rank, chan, (uint32_t)elem->elem[chan].opCount, (double)elem->elem[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY, - (double)elem->elem[chan].wait_cycle/VEGA_GPU_RTC_FREQUENCY, - (elem->elem[chan].send_cycle) ? (double)elem->elem[chan].send_byte/((double)elem->elem[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (elem->elem[chan].recvReduceSend_cycle) ? (double)elem->elem[chan].recvReduceSend_byte/((double)elem->elem[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (elem->elem[chan].directRecvReduceCopySend_cycle) ? (double)elem->elem[chan].directRecvReduceCopySend_byte/((double)elem->elem[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (elem->elem[chan].directRecvCopySend_cycle) ? (double)elem->elem[chan].directRecvCopySend_byte/((double)elem->elem[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (elem->elem[chan].directRecv_cycle) ? (double)elem->elem[chan].directRecv_byte/((double)elem->elem[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (elem->elem[chan].copySend_cycle) ? (double)elem->elem[chan].copySend_byte/((double)elem->elem[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (elem->elem[chan].recv_cycle) ? (double)elem->elem[chan].recv_byte/((double)elem->elem[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (elem->elem[chan].recvCopySend_cycle) ? (double)elem->elem[chan].recvCopySend_byte/((double)elem->elem[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0); -#endif + for (int i=0; inChannels; i++) { + for (int s=0; srank, i, s, j, prof[MAXCHANNELS*s+i].elem[j].line, (prof[MAXCHANNELS*s+i].elem[j].timeStamp-prof[MAXCHANNELS*s+i].elem[0].timeStamp)/VEGA_GPU_RTC_FREQUENCY*1.0E6); + } } - if (valid == 0) - break; - } - free(prof.elems); - CUDACHECK(hipFree(comm->hostDevComm.devProf.elems)); - - for (int channel=0; channelnChannels, comm->p2pnChannels); channel++) { - if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %6.2f GB/s (%ld bytes %d measurements)", - comm->rank, channel, (comm->channels[channel].bw_count) ? - (float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0, - comm->channels[channel].send_byte, comm->channels[channel].bw_count); - if (comm->channels[channel].recv_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Recv %6.2f GB/s (%ld bytes %d measurements)", - comm->rank, channel, (comm->channels[channel].bw_count) ? - (float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0, - comm->channels[channel].recv_byte, comm->channels[channel].bw_count); } + free(prof); + CUDACHECK(hipFree(comm->hostDevComm.devProf)); #endif #ifdef ENABLE_COLLTRACE @@ -432,7 +384,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua comm->argsptrs[0] = &comm->devComm; comm->argsptrs[1] = &comm->args; #ifdef ENABLE_PROFILING - NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf.elems, PROFILE_NUM_ITEMS)); + NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, MAXCHANNELS*PROFILE_NUM_LAUNCHES)); #endif #ifdef ENABLE_COLLTRACE