Repurpose profiling implementation to simple timestamps tracing (#600)
[ROCm/rccl commit: 14b8ff153f]
This commit is contained in:
@@ -26,11 +26,6 @@ namespace {
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
|
||||
const int nranks = ncclShmem->comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*nranks*chunkSize;
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = ncclShmem->comm.devProf;
|
||||
uint64_t clk, t0 = 0ULL, ws;
|
||||
if (tid == 0) clk = __builtin_amdgcn_s_memrealtime();
|
||||
#endif
|
||||
const ssize_t size = args->count;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
@@ -67,10 +62,8 @@ namespace {
|
||||
minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2;
|
||||
}
|
||||
|
||||
INIT_COUNTER;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
|
||||
ACCUMULATE_PRIM_COUNTER(prim);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (tid == 0) {
|
||||
@@ -115,9 +108,7 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.send(offset, nelem);
|
||||
ACCUMULATE_COUNTER(send);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
@@ -140,9 +131,7 @@ namespace {
|
||||
chunk = modRanks(ringIx + nranks-j);
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
INIT_COUNTER;
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
ACCUMULATE_COUNTER(recvReduceSend);
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT)
|
||||
@@ -166,9 +155,7 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true);
|
||||
ACCUMULATE_COUNTER(directRecvReduceCopySend);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
@@ -190,9 +177,7 @@ namespace {
|
||||
chunk = modRanks(ringIx + nranks-j);
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
INIT_COUNTER;
|
||||
prims.directRecvCopySend(offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecvCopySend);
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
|
||||
@@ -215,9 +200,7 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.directRecv(offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecv);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
|
||||
if (tid == 0) {
|
||||
@@ -227,12 +210,6 @@ namespace {
|
||||
#endif
|
||||
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) {
|
||||
struct ncclProfElem *elem = devProf.elems+args->opCount;
|
||||
elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT)
|
||||
if (tid == 0) {
|
||||
@@ -597,6 +574,7 @@ template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>;
|
||||
if (threadIdx.x == 0) __insert_timestamp(__LINE__);
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
}
|
||||
};
|
||||
@@ -604,6 +582,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
if (threadIdx.x == 0) __insert_timestamp(__LINE__);
|
||||
runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(args);
|
||||
}
|
||||
};
|
||||
@@ -709,6 +688,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
if (threadIdx.x == 0) __insert_timestamp(__LINE__);
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
}
|
||||
};
|
||||
@@ -716,6 +696,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
if (threadIdx.x == 0) __insert_timestamp(__LINE__);
|
||||
if (args->pad_0 == 0) runTreeUpDown<T, RedOp, ProtoLL>(args);
|
||||
else runTreeSplit<T, RedOp, ProtoLL>(args);
|
||||
}
|
||||
|
||||
@@ -22,11 +22,6 @@ namespace {
|
||||
const ssize_t size = args->count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
#ifdef ENABLE_PROFILING
|
||||
auto devProf = ncclShmem->comm.devProf;
|
||||
uint64_t clk, t0 = 0ULL, ws;
|
||||
if (tid == 0) clk = __builtin_amdgcn_s_memrealtime();
|
||||
#endif
|
||||
const int root = args->root;
|
||||
|
||||
T *inputBuf = (T*)args->sendbuff;
|
||||
@@ -51,30 +46,16 @@ namespace {
|
||||
|
||||
if (rank == root) {
|
||||
if (inputBuf == outputBuf) {
|
||||
INIT_COUNTER;
|
||||
prims.send(offset, nelem);
|
||||
ACCUMULATE_COUNTER(send);
|
||||
} else {
|
||||
INIT_COUNTER;
|
||||
prims.copySend(offset, offset, nelem);
|
||||
ACCUMULATE_COUNTER(copySend);
|
||||
}
|
||||
} else if (nextRank == root) {
|
||||
INIT_COUNTER;
|
||||
prims.recv(offset, nelem);
|
||||
ACCUMULATE_COUNTER(recv);
|
||||
} else {
|
||||
INIT_COUNTER;
|
||||
prims.recvCopySend(offset, nelem);
|
||||
ACCUMULATE_COUNTER(recvCopySend);
|
||||
}
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) {
|
||||
struct ncclProfElem *elem = devProf.elems+args->opCount;
|
||||
elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -279,6 +279,18 @@ class ncclFunction {
|
||||
#define traceData(data2, data4, data8_0, data8_1)
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
#define __insert_timestamp(line_num) do { \
|
||||
if (ncclShmem->prof.count < PROFILE_NUM_ITEMS) { \
|
||||
ncclShmem->prof.elem[ncclShmem->prof.count].line = line_num; \
|
||||
ncclShmem->prof.elem[ncclShmem->prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
ncclShmem->prof.count++; \
|
||||
} \
|
||||
} while(0);
|
||||
#else
|
||||
#define __insert_timestamp(line_num)
|
||||
#endif
|
||||
|
||||
__device__ inline bool barrierReduceAny(int bit, uint32_t* abortCount) {
|
||||
if (bit) atomicAdd(abortCount, 1); \
|
||||
__syncthreads(); \
|
||||
@@ -382,6 +394,9 @@ struct ncclShmemData {
|
||||
struct ncclChannel channel;
|
||||
uint64_t pad;
|
||||
struct ncclWork work;
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf prof;
|
||||
#endif
|
||||
};
|
||||
|
||||
static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
|
||||
@@ -421,10 +436,17 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
|
||||
shmem.groups[i].barrier = 0;
|
||||
for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
int turn = copyToShmem(&ncclShmem->comm, comm);
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) {
|
||||
ncclShmem->prof.count = 0;
|
||||
ncclShmem->prof.seq = ncclShmem->comm.devProf[bid].seq;
|
||||
}
|
||||
#endif
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
// get address of channel without incurring indirect load from ncclDevCom::channels
|
||||
ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid];
|
||||
turn = copyToShmem(&ncclShmem->channel, channel, turn);
|
||||
@@ -435,6 +457,8 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
|
||||
copyToShmem(&ncclShmem->work, &first, tid, nthreads);
|
||||
}
|
||||
__syncthreads(); // publish ncclShmem
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
|
||||
ncclWork *workFifoHost = ncclShmem->channel.workFifo;
|
||||
ncclWork *workFifoDev = ncclShmem->channel.workFifoDev;
|
||||
@@ -447,6 +471,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
|
||||
while (true) {
|
||||
if (!skipLoadWork) {
|
||||
copyToShmem(&ncclShmem->work, &workFifoDev[workFifoIx], tid, nthreads);
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
{ // Check whether the last operation was aborted and make sure all threads exit
|
||||
int aborted = tid == 0 ? *comm->abortFlag : 0;
|
||||
if (barrierReduceAny(aborted, &abortCount)) { // publish ncclShmem->work
|
||||
@@ -457,6 +482,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
|
||||
workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused;
|
||||
}
|
||||
}
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
|
||||
workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS;
|
||||
if (tid == 0)
|
||||
@@ -478,6 +504,7 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
|
||||
traceColl(ncclShmem->work.elems[e], 0);
|
||||
}
|
||||
}
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
if (ncclShmem->work.header.funcIndex == FnIndex)
|
||||
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem->work);
|
||||
else
|
||||
@@ -488,6 +515,12 @@ __device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
|
||||
skipLoadWork = false;
|
||||
}
|
||||
if (COLLTRACE && tid == 0) traceKernelEnd()
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (ncclShmem->comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
|
||||
copyToShmem(ncclShmem->comm.devProf+MAXCHANNELS*ncclShmem->prof.seq+blockIdx.x, &ncclShmem->prof);
|
||||
if (tid == 0) ncclShmem->comm.devProf[bid].seq++;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
|
||||
|
||||
@@ -156,20 +156,4 @@ struct PrimitivesWithoutDirect {
|
||||
#include "prims_ll.h"
|
||||
#include "prims_ll128.h"
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
#define INIT_COUNTER \
|
||||
if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; t0 = __builtin_amdgcn_s_memrealtime(); ws = elem->elem[blockIdx.x].wait_cycle; }
|
||||
#define ACCUMULATE_COUNTER(prim) \
|
||||
if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; elem->elem[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \
|
||||
+ ws - elem->elem[blockIdx.x].wait_cycle); \
|
||||
elem->elem[blockIdx.x].prim##_byte += nelem * sizeof(T); elem->elem[blockIdx.x].opCount = args->opCount;}
|
||||
#define ACCUMULATE_PRIM_COUNTER(prim) \
|
||||
if (tid == 0) { struct ncclProfElem *elem = devProf.elems+args->opCount%PROFILE_NUM_ITEMS; elem->elem[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \
|
||||
+ ws - elem->elem[blockIdx.x].wait_cycle); elem->elem[blockIdx.x].opCount = args->opCount;}
|
||||
#else
|
||||
#define INIT_COUNTER
|
||||
#define ACCUMULATE_COUNTER(prim)
|
||||
#define ACCUMULATE_PRIM_COUNTER(prim)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -204,18 +204,8 @@ private:
|
||||
ncclShmem->groups[group].srcs[0] = userBuff + srcIx + offset;
|
||||
if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
|
||||
ncclShmem->groups[group].dsts[0] = userBuff + dstIx + offset;
|
||||
#ifdef ENABLE_PROFILING
|
||||
uint64_t t0;
|
||||
if (tid == 0) t0 = __builtin_amdgcn_s_memrealtime();
|
||||
#endif
|
||||
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
|
||||
subBarrier();
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) {
|
||||
struct ncclProfElem *elem = ncclShmem->comm.devProf.elems+opCount%PROFILE_NUM_ITEMS;
|
||||
elem->elem[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0);
|
||||
}
|
||||
#endif
|
||||
if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
|
||||
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
|
||||
if (Send) {
|
||||
|
||||
@@ -290,16 +290,6 @@ struct ncclChannel {
|
||||
int workCount;
|
||||
size_t totalSize;
|
||||
uint64_t workFifoTail; // Only used by CPU
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct timeval tvs;
|
||||
uint64_t sizes;
|
||||
int active_req;
|
||||
uint64_t send_byte;
|
||||
uint64_t recv_byte;
|
||||
float bw_cumulative;
|
||||
int bw_count;
|
||||
#endif
|
||||
uint16_t index; // Only used by GPU
|
||||
|
||||
// GDRCOPY support
|
||||
@@ -314,49 +304,18 @@ static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must
|
||||
#pragma pack(pop) /* restore original alignment from stack */
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProfElem {
|
||||
union {
|
||||
struct {
|
||||
uint64_t opCount;
|
||||
uint64_t total_cycle;
|
||||
uint64_t wait_cycle; // total wait cycle
|
||||
// primtive cycles
|
||||
uint64_t prim_cycle;
|
||||
uint64_t send_cycle;
|
||||
uint64_t directSend_cycle;
|
||||
uint64_t recv_cycle;
|
||||
uint64_t directRecv_cycle;
|
||||
uint64_t copySend_cycle;
|
||||
uint64_t directCopySend_cycle;
|
||||
uint64_t recvCopySend_cycle;
|
||||
uint64_t directRecvCopySend_cycle;
|
||||
uint64_t recvReduceCopy_cycle;
|
||||
uint64_t recvReduceSend_cycle;
|
||||
uint64_t recvReduceCopySend_cycle;
|
||||
uint64_t directRecvReduceCopySend_cycle;
|
||||
// primitive bytes
|
||||
uint64_t send_byte;
|
||||
uint64_t directSend_byte;
|
||||
uint64_t recv_byte;
|
||||
uint64_t directRecv_byte;
|
||||
uint64_t copySend_byte;
|
||||
uint64_t directCopySend_byte;
|
||||
uint64_t recvCopySend_byte;
|
||||
uint64_t directRecvCopySend_byte;
|
||||
uint64_t recvReduceCopy_byte;
|
||||
uint64_t recvReduceSend_byte;
|
||||
uint64_t recvReduceCopySend_byte;
|
||||
uint64_t directRecvReduceCopySend_byte;
|
||||
};
|
||||
int data[0x80];
|
||||
} elem[MAXCHANNELS];
|
||||
};
|
||||
#define PROFILE_NUM_ITEMS 31
|
||||
#define PROFILE_NUM_LAUNCHES 1024
|
||||
|
||||
struct ncclProf {
|
||||
struct ncclProfElem* elems;
|
||||
uint32_t count;
|
||||
uint32_t seq; // only entry from first launch is used
|
||||
struct {
|
||||
uint64_t line:16;
|
||||
uint64_t timeStamp:48;
|
||||
} elem[PROFILE_NUM_ITEMS];
|
||||
};
|
||||
|
||||
#define PROFILE_NUM_ITEMS 1024
|
||||
static_assert(sizeof(struct ncclProf) == 256, "ncclProf must have size of 256");
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
@@ -420,7 +379,7 @@ struct ncclDevComm {
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
// Profiling counters
|
||||
struct ncclProf devProf;
|
||||
struct ncclProf* devProf;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
|
||||
+12
-60
@@ -233,7 +233,6 @@ void *ncclCommThreadMain(void *arg) {
|
||||
#endif
|
||||
|
||||
#undef NCCL_NO_OPTIMIZE
|
||||
#define PROFILE_USE_TIME
|
||||
|
||||
static ncclResult_t commFree(ncclComm_t comm) {
|
||||
if (comm == NULL)
|
||||
@@ -255,67 +254,20 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
free(comm->asyncOps);
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf prof;
|
||||
prof.elems = (struct ncclProfElem*)malloc(sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS);
|
||||
CUDACHECK(hipMemcpy(prof.elems, comm->hostDevComm.devProf.elems, sizeof(struct ncclProfElem)*PROFILE_NUM_ITEMS, hipMemcpyDeviceToHost));
|
||||
struct ncclProf *prof, *prof_seq;
|
||||
prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES);
|
||||
CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES, hipMemcpyDeviceToHost));
|
||||
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
|
||||
if (comm->rank == 0) {
|
||||
INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank:Ch", "opCt", "total", " prim", " wait", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
|
||||
#ifdef PROFILE_USE_TIME
|
||||
INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)", " (us)");
|
||||
#else
|
||||
INFO(NCCL_INIT, "# %7s %4s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "", " (s)", " (s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
|
||||
#endif
|
||||
}
|
||||
for (int i = 1; i < PROFILE_NUM_ITEMS; i++) {
|
||||
int valid = 0;
|
||||
for (int chan=0; chan<comm->nChannels; chan++) {
|
||||
struct ncclProfElem *elem = prof.elems+i;
|
||||
if (elem->elem[chan].opCount == 0)
|
||||
continue;
|
||||
valid++;
|
||||
#ifdef PROFILE_USE_TIME
|
||||
INFO(NCCL_INIT, "# [%02d:%02d] %04d %6.2f %6.2f %6.2f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
|
||||
comm->rank, chan, (uint32_t)elem->elem[chan].opCount, (double)elem->elem[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6,
|
||||
(double)elem->elem[chan].prim_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6, (double)elem->elem[chan].wait_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6,
|
||||
(elem->elem[chan].send_cycle) ? ((double)elem->elem[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
|
||||
(elem->elem[chan].recvReduceSend_cycle) ? ((double)elem->elem[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
|
||||
(elem->elem[chan].directRecvReduceCopySend_cycle) ? ((double)elem->elem[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
|
||||
(elem->elem[chan].directRecvCopySend_cycle) ? ((double)elem->elem[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
|
||||
(elem->elem[chan].directRecv_cycle) ? ((double)elem->elem[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
|
||||
(elem->elem[chan].copySend_cycle) ? ((double)elem->elem[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
|
||||
(elem->elem[chan].recv_cycle) ? ((double)elem->elem[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0,
|
||||
(elem->elem[chan].recvCopySend_cycle) ? ((double)elem->elem[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E6) : 0);
|
||||
#else
|
||||
INFO(NCCL_INIT, "# [%02d:%02d] %04d %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
|
||||
comm->rank, chan, (uint32_t)elem->elem[chan].opCount, (double)elem->elem[chan].total_cycle/VEGA_GPU_RTC_FREQUENCY,
|
||||
(double)elem->elem[chan].wait_cycle/VEGA_GPU_RTC_FREQUENCY,
|
||||
(elem->elem[chan].send_cycle) ? (double)elem->elem[chan].send_byte/((double)elem->elem[chan].send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
|
||||
(elem->elem[chan].recvReduceSend_cycle) ? (double)elem->elem[chan].recvReduceSend_byte/((double)elem->elem[chan].recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
|
||||
(elem->elem[chan].directRecvReduceCopySend_cycle) ? (double)elem->elem[chan].directRecvReduceCopySend_byte/((double)elem->elem[chan].directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
|
||||
(elem->elem[chan].directRecvCopySend_cycle) ? (double)elem->elem[chan].directRecvCopySend_byte/((double)elem->elem[chan].directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
|
||||
(elem->elem[chan].directRecv_cycle) ? (double)elem->elem[chan].directRecv_byte/((double)elem->elem[chan].directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
|
||||
(elem->elem[chan].copySend_cycle) ? (double)elem->elem[chan].copySend_byte/((double)elem->elem[chan].copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
|
||||
(elem->elem[chan].recv_cycle) ? (double)elem->elem[chan].recv_byte/((double)elem->elem[chan].recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
|
||||
(elem->elem[chan].recvCopySend_cycle) ? (double)elem->elem[chan].recvCopySend_byte/((double)elem->elem[chan].recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0);
|
||||
#endif
|
||||
for (int i=0; i<comm->nChannels; i++) {
|
||||
for (int s=0; s<prof[MAXCHANNELS*i].seq; s++) {
|
||||
if (prof[MAXCHANNELS*s+i].count == 0) continue;
|
||||
for (int j=0; j<prof[MAXCHANNELS*s+i].count; j++) {
|
||||
INFO(NCCL_INIT, "# [%02d:%02d] %02d-%02d L:%04u %6.2fus", comm->rank, i, s, j, prof[MAXCHANNELS*s+i].elem[j].line, (prof[MAXCHANNELS*s+i].elem[j].timeStamp-prof[MAXCHANNELS*s+i].elem[0].timeStamp)/VEGA_GPU_RTC_FREQUENCY*1.0E6);
|
||||
}
|
||||
}
|
||||
if (valid == 0)
|
||||
break;
|
||||
}
|
||||
free(prof.elems);
|
||||
CUDACHECK(hipFree(comm->hostDevComm.devProf.elems));
|
||||
|
||||
for (int channel=0; channel<std::max(comm->nChannels, comm->p2pnChannels); channel++) {
|
||||
if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %6.2f GB/s (%ld bytes %d measurements)",
|
||||
comm->rank, channel, (comm->channels[channel].bw_count) ?
|
||||
(float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0,
|
||||
comm->channels[channel].send_byte, comm->channels[channel].bw_count);
|
||||
if (comm->channels[channel].recv_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Recv %6.2f GB/s (%ld bytes %d measurements)",
|
||||
comm->rank, channel, (comm->channels[channel].bw_count) ?
|
||||
(float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0,
|
||||
comm->channels[channel].recv_byte, comm->channels[channel].bw_count);
|
||||
}
|
||||
free(prof);
|
||||
CUDACHECK(hipFree(comm->hostDevComm.devProf));
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
@@ -432,7 +384,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank, int virtua
|
||||
comm->argsptrs[0] = &comm->devComm;
|
||||
comm->argsptrs[1] = &comm->args;
|
||||
#ifdef ENABLE_PROFILING
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf.elems, PROFILE_NUM_ITEMS));
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, MAXCHANNELS*PROFILE_NUM_LAUNCHES));
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
|
||||
Reference in New Issue
Block a user