Repurpose profiling implementation to simple timestamps tracing (#600)

[ROCm/rccl commit: 14b8ff153f]
This commit is contained in:
Wenkai Du
2022-08-18 15:34:46 -07:00
committed by GitHub
parent 6c3f1366e8
commit c2e9ada40b
7 changed files with 60 additions and 180 deletions
@@ -26,11 +26,6 @@ namespace {
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
const int nranks = ncclShmem->comm.nRanks;
const ssize_t loopSize = nChannels*nranks*chunkSize;
#ifdef ENABLE_PROFILING
auto devProf = ncclShmem->comm.devProf;
uint64_t clk, t0 = 0ULL, ws;
if (tid == 0) clk = __builtin_amdgcn_s_memrealtime();
#endif
const ssize_t size = args->count;
#if defined(ENABLE_NPKIT)
@@ -67,10 +62,8 @@ namespace {
minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2;
}
INIT_COUNTER;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
ACCUMULATE_PRIM_COUNTER(prim);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
@@ -115,9 +108,7 @@ namespace {
}
#endif
INIT_COUNTER;
prims.send(offset, nelem);
ACCUMULATE_COUNTER(send);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT)
if (tid == 0) {
@@ -140,9 +131,7 @@ namespace {
chunk = modRanks(ringIx + nranks-j);
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.recvReduceSend(offset, nelem);
ACCUMULATE_COUNTER(recvReduceSend);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT)
@@ -166,9 +155,7 @@ namespace {
}
#endif
INIT_COUNTER;
prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true);
ACCUMULATE_COUNTER(directRecvReduceCopySend);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
if (tid == 0) {
@@ -190,9 +177,7 @@ namespace {
chunk = modRanks(ringIx + nranks-j);
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
INIT_COUNTER;
prims.directRecvCopySend(offset, offset, nelem);
ACCUMULATE_COUNTER(directRecvCopySend);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
@@ -215,9 +200,7 @@ namespace {
}
#endif
INIT_COUNTER;
prims.directRecv(offset, nelem);
ACCUMULATE_COUNTER(directRecv);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
if (tid == 0) {
@@ -227,12 +210,6 @@ namespace {
#endif
}
#ifdef ENABLE_PROFILING
if (tid == 0) {
struct ncclProfElem *elem = devProf.elems+args->opCount;
elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT)
if (tid == 0) {
@@ -597,6 +574,7 @@ template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>;
if (threadIdx.x == 0) __insert_timestamp(__LINE__);
runRing<T, RedOp, Proto>(args);
}
};
@@ -604,6 +582,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
if (threadIdx.x == 0) __insert_timestamp(__LINE__);
runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(args);
}
};
@@ -709,6 +688,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
if (threadIdx.x == 0) __insert_timestamp(__LINE__);
runRing<T, RedOp, ProtoLL>(args);
}
};
@@ -716,6 +696,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
if (threadIdx.x == 0) __insert_timestamp(__LINE__);
if (args->pad_0 == 0) runTreeUpDown<T, RedOp, ProtoLL>(args);
else runTreeSplit<T, RedOp, ProtoLL>(args);
}
@@ -22,11 +22,6 @@ namespace {
const ssize_t size = args->count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
#ifdef ENABLE_PROFILING
auto devProf = ncclShmem->comm.devProf;
uint64_t clk, t0 = 0ULL, ws;
if (tid == 0) clk = __builtin_amdgcn_s_memrealtime();
#endif
const int root = args->root;
T *inputBuf = (T*)args->sendbuff;
@@ -51,30 +46,16 @@ namespace {
if (rank == root) {
if (inputBuf == outputBuf) {
INIT_COUNTER;
prims.send(offset, nelem);
ACCUMULATE_COUNTER(send);
} else {
INIT_COUNTER;
prims.copySend(offset, offset, nelem);
ACCUMULATE_COUNTER(copySend);
}
} else if (nextRank == root) {
INIT_COUNTER;
prims.recv(offset, nelem);
ACCUMULATE_COUNTER(recv);
} else {
INIT_COUNTER;
prims.recvCopySend(offset, nelem);
ACCUMULATE_COUNTER(recvCopySend);
}
}
#ifdef ENABLE_PROFILING
if (tid == 0) {
struct ncclProfElem *elem = devProf.elems+args->opCount;
elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
}
#endif
}
}
+34 -1
View File
@@ -279,6 +279,18 @@ class ncclFunction {
#define traceData(data2, data4, data8_0, data8_1)
#endif
#ifdef ENABLE_PROFILING
#define __insert_timestamp(line_num) do { \
if (ncclShmem->prof.count < PROFILE_NUM_ITEMS) { \
ncclShmem->prof.elem[ncclShmem->prof.count].line = line_num; \
ncclShmem->prof.elem[ncclShmem->prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
ncclShmem->prof.count++; \
} \
} while(0);
#else
#define __insert_timestamp(line_num)
#endif
__device__ inline bool barrierReduceAny(int bit, uint32_t* abortCount) {
if (bit) atomicAdd(abortCount, 1); \
__syncthreads(); \
@@ -382,6 +394,9 @@ struct ncclShmemData {
struct ncclChannel channel;
uint64_t pad;
struct ncclWork work;
#ifdef ENABLE_PROFILING
struct ncclProf prof;
#endif
};
static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
@@ -421,10 +436,17 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
shmem.groups[i].barrier = 0;
for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0;
}
}
}
__syncthreads();
int turn = copyToShmem(&ncclShmem->comm, comm);
#ifdef ENABLE_PROFILING
if (tid == 0) {
ncclShmem->prof.count = 0;
ncclShmem->prof.seq = ncclShmem->comm.devProf[bid].seq;
}
#endif
if (tid == 0) __insert_timestamp(__LINE__);
// get address of channel without incurring indirect load from ncclDevCom::channels
ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid];
turn = copyToShmem(&ncclShmem->channel, channel, turn);
@@ -435,6 +457,8 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
copyToShmem(&ncclShmem->work, &first, tid, nthreads);
}
__syncthreads(); // publish ncclShmem
if (tid == 0) __insert_timestamp(__LINE__);
if (tid == 0) __insert_timestamp(__LINE__);
ncclWork *workFifoHost = ncclShmem->channel.workFifo;
ncclWork *workFifoDev = ncclShmem->channel.workFifoDev;
@@ -447,6 +471,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
while (true) {
if (!skipLoadWork) {
copyToShmem(&ncclShmem->work, &workFifoDev[workFifoIx], tid, nthreads);
if (tid == 0) __insert_timestamp(__LINE__);
{ // Check whether the last operation was aborted and make sure all threads exit
int aborted = tid == 0 ? *comm->abortFlag : 0;
if (barrierReduceAny(aborted, &abortCount)) { // publish ncclShmem->work
@@ -457,6 +482,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused;
}
}
if (tid == 0) __insert_timestamp(__LINE__);
workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS;
if (tid == 0)
@@ -478,6 +504,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
traceColl(ncclShmem->work.elems[e], 0);
}
}
if (tid == 0) __insert_timestamp(__LINE__);
if (ncclShmem->work.header.funcIndex == FnIndex)
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem->work);
else
@@ -488,6 +515,12 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
skipLoadWork = false;
}
if (COLLTRACE && tid == 0) traceKernelEnd()
#ifdef ENABLE_PROFILING
if (ncclShmem->comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
copyToShmem(ncclShmem->comm.devProf+MAXCHANNELS*ncclShmem->prof.seq+blockIdx.x, &ncclShmem->prof);
if (tid == 0) ncclShmem->comm.devProf[bid].seq++;
}
#endif
}
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
@@ -156,20 +156,4 @@ struct PrimitivesWithoutDirect {
#include "prims_ll.h"
#include "prims_ll128.h"
#ifdef ENABLE_PROFILING
#define INIT_COUNTER \
if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; t0 = __builtin_amdgcn_s_memrealtime(); ws = elem->elem[blockIdx.x].wait_cycle; }
#define ACCUMULATE_COUNTER(prim) \
if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; elem->elem[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \
+ ws - elem->elem[blockIdx.x].wait_cycle); \
elem->elem[blockIdx.x].prim##_byte += nelem * sizeof(T); elem->elem[blockIdx.x].opCount = args->opCount;}
#define ACCUMULATE_PRIM_COUNTER(prim) \
if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; elem->elem[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \
+ ws - elem->elem[blockIdx.x].wait_cycle); elem->elem[blockIdx.x].opCount = args->opCount;}
#else
#define INIT_COUNTER
#define ACCUMULATE_COUNTER(prim)
#define ACCUMULATE_PRIM_COUNTER(prim)
#endif
#endif
@@ -204,18 +204,8 @@ private:
ncclShmem->groups[group].srcs[0] = userBuff + srcIx + offset;
if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
ncclShmem->groups[group].dsts[0] = userBuff + dstIx + offset;
#ifdef ENABLE_PROFILING
uint64_t t0;
if (tid == 0) t0 = __builtin_amdgcn_s_memrealtime();
#endif
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
subBarrier();
#ifdef ENABLE_PROFILING
if (tid == 0) {
struct ncclProfElem *elem = ncclShmem->comm.devProf.elems+opCount%PROFILE_NUM_ITEMS;
elem->elem[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0);
}
#endif
if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
+10 -51
View File
@@ -290,16 +290,6 @@ struct ncclChannel {
int workCount;
size_t totalSize;
uint64_t workFifoTail; // Only used by CPU
#ifdef ENABLE_PROFILING
struct timeval tvs;
uint64_t sizes;
int active_req;
uint64_t send_byte;
uint64_t recv_byte;
float bw_cumulative;
int bw_count;
#endif
uint16_t index; // Only used by GPU
// GDRCOPY support
@@ -314,49 +304,18 @@ static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must
#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
struct ncclProfElem {
union {
struct {
uint64_t opCount;
uint64_t total_cycle;
uint64_t wait_cycle; // total wait cycle
// primtive cycles
uint64_t prim_cycle;
uint64_t send_cycle;
uint64_t directSend_cycle;
uint64_t recv_cycle;
uint64_t directRecv_cycle;
uint64_t copySend_cycle;
uint64_t directCopySend_cycle;
uint64_t recvCopySend_cycle;
uint64_t directRecvCopySend_cycle;
uint64_t recvReduceCopy_cycle;
uint64_t recvReduceSend_cycle;
uint64_t recvReduceCopySend_cycle;
uint64_t directRecvReduceCopySend_cycle;
// primitive bytes
uint64_t send_byte;
uint64_t directSend_byte;
uint64_t recv_byte;
uint64_t directRecv_byte;
uint64_t copySend_byte;
uint64_t directCopySend_byte;
uint64_t recvCopySend_byte;
uint64_t directRecvCopySend_byte;
uint64_t recvReduceCopy_byte;
uint64_t recvReduceSend_byte;
uint64_t recvReduceCopySend_byte;
uint64_t directRecvReduceCopySend_byte;
};
int data[0x80];
} elem[MAXCHANNELS];
};
#define PROFILE_NUM_ITEMS 31
#define PROFILE_NUM_LAUNCHES 1024
struct ncclProf {
struct ncclProfElem* elems;
uint32_t count;
uint32_t seq; // only entry from first launch is used
struct {
uint64_t line:16;
uint64_t timeStamp:48;
} elem[PROFILE_NUM_ITEMS];
};
#define PROFILE_NUM_ITEMS 1024
static_assert(sizeof(struct ncclProf) == 256, "ncclProf must have size of 256");
#endif
#ifdef ENABLE_COLLTRACE
@@ -420,7 +379,7 @@ struct ncclDevComm {
#ifdef ENABLE_PROFILING
// Profiling counters
struct ncclProf devProf;
struct ncclProf* devProf;
#endif
#ifdef ENABLE_COLLTRACE
+12 -60
View File
@@ -233,7 +233,6 @@ void *ncclCommThreadMain(void *arg) {
#endif
#undef NCCL_NO_OPTIMIZE
#define PROFILE_USE_TIME
static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
@@ -255,67 +254,20 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->asyncOps);
#ifdef ENABLE_PROFILING
struct ncclProf prof;
prof.elems = (struct ncclProfElem*)malloc(sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS);
CUDACHECK(hipMemcpy(prof.elems, comm->hostDevComm.devProf.elems, sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS, hipMemcpyDeviceToHost));
struct ncclProf *prof, *prof_seq;
prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES);
CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES, hipMemcpyDeviceToHost));
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
if (comm->rank == 0) {
INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank:Ch", "opCt", "total", " prim", " wait", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
#ifdef PROFILE_USE_TIME
INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)");
#else
INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "", " (s)", " (s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
#endif
}
for (int i = 1; i < PROFILE_NUM_ITEMS; i++) {
int valid = 0;
for (int chan=0; chan<comm->nChannels; chan++) {
struct ncclProfElem *elem = prof.elems+i;
if (elem->elem[chan].opCount == 0)
continue;
valid++;
#ifdef PROFILE_USE_TIME
INFO(NCCL_INIT, "# [%02d:%02d] %04d %6.2f %6.2f %6.2f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
comm->rank, chan, (uint32_t)elem->elem[chan].opCount, (double)elem->elem[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6,
(double)elem->elem[chan].prim_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6, (double)elem->elem[chan].wait_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6,
(elem->elem[chan].send_cycle) ? ((double)elem->elem[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
(elem->elem[chan].recvReduceSend_cycle) ? ((double)elem->elem[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
(elem->elem[chan].directRecvReduceCopySend_cycle) ? ((double)elem->elem[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
(elem->elem[chan].directRecvCopySend_cycle) ? ((double)elem->elem[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
(elem->elem[chan].directRecv_cycle) ? ((double)elem->elem[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
(elem->elem[chan].copySend_cycle) ? ((double)elem->elem[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
(elem->elem[chan].recv_cycle) ? ((double)elem->elem[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
(elem->elem[chan].recvCopySend_cycle) ? ((double)elem->elem[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0);
#else
INFO(NCCL_INIT, "# [%02d:%02d] %04d %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
comm->rank, chan, (uint32_t)elem->elem[chan].opCount, (double)elem->elem[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY,
(double)elem->elem[chan].wait_cycle/VEGA_GPU_RTC_FREQUENCY,
(elem->elem[chan].send_cycle) ? (double)elem->elem[chan].send_byte/((double)elem->elem[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(elem->elem[chan].recvReduceSend_cycle) ? (double)elem->elem[chan].recvReduceSend_byte/((double)elem->elem[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(elem->elem[chan].directRecvReduceCopySend_cycle) ? (double)elem->elem[chan].directRecvReduceCopySend_byte/((double)elem->elem[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(elem->elem[chan].directRecvCopySend_cycle) ? (double)elem->elem[chan].directRecvCopySend_byte/((double)elem->elem[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(elem->elem[chan].directRecv_cycle) ? (double)elem->elem[chan].directRecv_byte/((double)elem->elem[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(elem->elem[chan].copySend_cycle) ? (double)elem->elem[chan].copySend_byte/((double)elem->elem[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(elem->elem[chan].recv_cycle) ? (double)elem->elem[chan].recv_byte/((double)elem->elem[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(elem->elem[chan].recvCopySend_cycle) ? (double)elem->elem[chan].recvCopySend_byte/((double)elem->elem[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0);
#endif
for (int i=0; i<comm->nChannels; i++) {
for (int s=0; s<prof[MAXCHANNELS*i].seq; s++) {
if (prof[MAXCHANNELS*s+i].count == 0) continue;
for (int j=0; j<prof[MAXCHANNELS*s+i].count; j++) {
INFO(NCCL_INIT, "# [%02d:%02d] %02d-%02d L:%04u %6.2fus", comm->rank, i, s, j, prof[MAXCHANNELS*s+i].elem[j].line, (prof[MAXCHANNELS*s+i].elem[j].timeStamp-prof[MAXCHANNELS*s+i].elem[0].timeStamp)/VEGA_GPU_RTC_FREQUENCY*1.0E6);
}
}
if (valid == 0)
break;
}
free(prof.elems);
CUDACHECK(hipFree(comm->hostDevComm.devProf.elems));
for (int channel=0; channel<std::max(comm->nChannels, comm->p2pnChannels); channel++) {
if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %6.2f GB/s (%ld bytes %d measurements)",
comm->rank, channel, (comm->channels[channel].bw_count) ?
(float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0,
comm->channels[channel].send_byte, comm->channels[channel].bw_count);
if (comm->channels[channel].recv_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Recv %6.2f GB/s (%ld bytes %d measurements)",
comm->rank, channel, (comm->channels[channel].bw_count) ?
(float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0,
comm->channels[channel].recv_byte, comm->channels[channel].bw_count);
}
free(prof);
CUDACHECK(hipFree(comm->hostDevComm.devProf));
#endif
#ifdef ENABLE_COLLTRACE
@@ -432,7 +384,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
comm->argsptrs[0] = &comm->devComm;
comm->argsptrs[1] = &comm->args;
#ifdef ENABLE_PROFILING
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf.elems, PROFILE_NUM_ITEMS));
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, MAXCHANNELS*PROFILE_NUM_LAUNCHES));
#endif
#ifdef ENABLE_COLLTRACE