Remove atomic from profiling

[ROCm/rccl commit: 31bd4236f1]
This commit is contained in:
Wenkai Du
2021-09-08 14:20:32 -05:00
parent 310d51056f
commit d75504e9dc
4 changed files with 60 additions and 24 deletions
@@ -101,7 +101,7 @@ class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T
ACCUMULATE_COUNTER(directRecv);
}
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __builtin_amdgcn_s_memrealtime() - clk, __ATOMIC_SEQ_CST);
if (tid == 0) devProf->elems[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
#endif
}
};
@@ -146,14 +146,15 @@ class ncclPrimitives {
inline __device__ void waitRecv(ssize_t directOffset) {
spins = 0;
#ifdef ENABLE_PROFILING
uint64_t t0 = __builtin_amdgcn_s_memrealtime();
uint64_t t0;
if (tid == 0) t0 = __builtin_amdgcn_s_memrealtime();
#endif
while (connTailCache < step + SLICESTEPS) {
connTailCache = LOAD(connTailPtr);
if (checkAbort()) break;
}
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
if (tid == 0) comm->devProf->elems[blockIdx.x].wait_recv_cycle += (__builtin_amdgcn_s_memrealtime() - t0);
#endif
if (connPtrsFifoPtr) srcs[SRC+index] = (const T *)LOAD(connPtrsFifoPtr+step%NCCL_STEPS);
else srcs[SRC+index] = directPtr<DIRECTRECV>(directOffset);
@@ -180,7 +181,8 @@ class ncclPrimitives {
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
int realSize = max(0, min(dataSize, nelem-offset));
#ifdef ENABLE_PROFILING
uint64_t t0 = __builtin_amdgcn_s_memrealtime();
uint64_t t0;
if (tid == 0) t0 = __builtin_amdgcn_s_memrealtime();
#endif
if (tid < nworkers) {
if (SRC && (role & ROLE_SRC)) srcs[0] = srcPtr+offset;
@@ -189,7 +191,7 @@ class ncclPrimitives {
if (SEND && (role & ROLE_WAIT_SEND)) waitSend<DST, DIRECTSEND>(directOffset+offset, realSize*sizeof(T));
if (realSize > 0) {
#ifdef ENABLE_PROFILING
if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST);
if (tid == 0) comm->devProf->elems[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0);
#endif
subBarrier();
ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nworkers, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
@@ -428,12 +430,12 @@ class ncclPrimitives {
#ifdef ENABLE_PROFILING
#define INIT_COUNTER \
if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); }
if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = devProf->elems[blockIdx.x].wait_cycle; }
#define ACCUMULATE_COUNTER(prim) \
if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __builtin_amdgcn_s_memrealtime() - t0 \
+ ws - LOAD(&(devProf->wait_cycle[blockIdx.x])), __ATOMIC_SEQ_CST); \
__atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); }
if (tid == 0) { devProf->elems[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \
+ ws - devProf->elems[blockIdx.x].wait_cycle); \
devProf->elems[blockIdx.x].prim##_byte += nelem * sizeof(T); }
#else
#define INIT_COUNTER
#define ACCUMULATE_COUNTER(prim)
+7 -3
View File
@@ -280,12 +280,12 @@ static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must
#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
struct ncclProf {
struct ncclProfElem {
union {
struct {
uint64_t total_cycle;
uint64_t wait_cycle[MAXCHANNELS]; // total wait cycle
uint64_t wait_recv_cycle[MAXCHANNELS]; // recv wait cycle
uint64_t wait_cycle; // total wait cycle
uint64_t wait_recv_cycle; // recv wait cycle
// primtive cycles
uint64_t send_cycle;
uint64_t directSend_cycle;
@@ -316,6 +316,10 @@ struct ncclProf {
int data[0x80];
};
};
struct ncclProf {
struct ncclProfElem elems[MAXCHANNELS];
};
#endif
#ifdef ENABLE_COLLTRACE
+42 -12
View File
@@ -298,10 +298,40 @@ static ncclResult_t commFree(ncclComm_t comm) {
#ifdef ENABLE_PROFILING
struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf));
CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf), hipMemcpyDeviceToHost));
uint64_t wait_cycle = 0, wait_recv_cycle = 0;
uint64_t total_cycle = 0, wait_cycle = 0, wait_recv_cycle = 0, send_cycle = 0, directSend_cycle = 0, recv_cycle = 0, \
directRecv_cycle = 0, copySend_cycle = 0, directCopySend_cycle = 0, recvCopySend_cycle = 0, directRecvCopySend_cycle = 0, \
recvReduceCopy_cycle = 0, recvReduceSend_cycle = 0, recvReduceCopySend_cycle = 0, directRecvReduceCopySend_cycle = 0, \
send_byte = 0, directSend_byte = 0, recv_byte = 0, directRecv_byte = 0, copySend_byte = 0, directCopySend_byte = 0, \
recvCopySend_byte = 0, directRecvCopySend_byte = 0, recvReduceCopy_byte = 0, recvReduceSend_byte = 0, \
recvReduceCopySend_byte = 0, directRecvReduceCopySend_byte = 0;
for (int chan=0; chan<comm->nChannels; chan++) {
wait_cycle += prof->wait_cycle[chan];
wait_recv_cycle += prof->wait_recv_cycle[chan];
total_cycle += prof->elems[chan].total_cycle;
wait_cycle += prof->elems[chan].wait_cycle;
wait_recv_cycle += prof->elems[chan].wait_recv_cycle;
send_cycle += prof->elems[chan].send_cycle;
directSend_cycle += prof->elems[chan].directSend_cycle;
recv_cycle += prof->elems[chan].recv_cycle;
directRecv_cycle += prof->elems[chan].directRecv_cycle;
copySend_cycle += prof->elems[chan].copySend_cycle;
directCopySend_cycle += prof->elems[chan].directCopySend_cycle;
recvCopySend_cycle += prof->elems[chan].recvCopySend_cycle;
directRecvCopySend_cycle += prof->elems[chan].directRecvCopySend_cycle;
recvReduceCopy_cycle += prof->elems[chan].recvReduceCopy_cycle;
recvReduceSend_cycle += prof->elems[chan].recvReduceSend_cycle;
recvReduceCopySend_cycle += prof->elems[chan].recvReduceCopySend_cycle;
directRecvReduceCopySend_cycle += prof->elems[chan].directRecvReduceCopySend_cycle;
send_byte += prof->elems[chan].send_byte;
directSend_byte += prof->elems[chan].directSend_byte;
recv_byte += prof->elems[chan].recv_byte;
directRecv_byte += prof->elems[chan].directRecv_byte;
copySend_byte += prof->elems[chan].copySend_byte;
directCopySend_byte += prof->elems[chan].directCopySend_byte;
recvCopySend_byte += prof->elems[chan].recvCopySend_byte;
directRecvCopySend_byte += prof->elems[chan].directRecvCopySend_byte;
recvReduceCopy_byte += prof->elems[chan].recvReduceCopy_byte;
recvReduceSend_byte += prof->elems[chan].recvReduceSend_byte;
recvReduceCopySend_byte += prof->elems[chan].recvReduceCopySend_byte;
directRecvReduceCopySend_byte += prof->elems[chan].directRecvReduceCopySend_byte;
}
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
if (comm->rank == 0) {
@@ -309,17 +339,17 @@ static ncclResult_t commFree(ncclComm_t comm) {
INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
}
INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
comm->rank, (double)prof->total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
comm->rank, (double)total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
(double)wait_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
(double)wait_recv_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
(prof->send_cycle) ? (double)prof->send_byte*comm->nChannels/((double)prof->send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->recvReduceSend_cycle) ? (double)prof->recvReduceSend_byte*comm->nChannels/((double)prof->recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->directRecvReduceCopySend_cycle) ? (double)prof->directRecvReduceCopySend_byte*comm->nChannels/((double)prof->directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->directRecvCopySend_cycle) ? (double)prof->directRecvCopySend_byte*comm->nChannels/((double)prof->directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->directRecv_cycle) ? (double)prof->directRecv_byte*comm->nChannels/((double)prof->directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->copySend_cycle) ? (double)prof->copySend_byte*comm->nChannels/((double)prof->copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->recv_cycle) ? (double)prof->recv_byte*comm->nChannels/((double)prof->recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(prof->recvCopySend_cycle) ? (double)prof->recvCopySend_byte*comm->nChannels/((double)prof->recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0);
(send_cycle) ? (double)send_byte*comm->nChannels/((double)send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(recvReduceSend_cycle) ? (double)recvReduceSend_byte*comm->nChannels/((double)recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(directRecvReduceCopySend_cycle) ? (double)directRecvReduceCopySend_byte*comm->nChannels/((double)directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(directRecvCopySend_cycle) ? (double)directRecvCopySend_byte*comm->nChannels/((double)directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(directRecv_cycle) ? (double)directRecv_byte*comm->nChannels/((double)directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(copySend_cycle) ? (double)copySend_byte*comm->nChannels/((double)copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(recv_cycle) ? (double)recv_byte*comm->nChannels/((double)recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0,
(recvCopySend_cycle) ? (double)recvCopySend_byte*comm->nChannels/((double)recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0);
free(prof);
CUDACHECK(hipFree(comm->hostDevComm.devProf));