From d75504e9dc1519e99c712adfb69dfa5092aadccb Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Wed, 8 Sep 2021 14:20:32 -0500 Subject: [PATCH] Remove atomic from profiling [ROCm/rccl commit: 31bd4236f1ae59eb1b7b942dc77ed3ef9d54bf1c] --- .../rccl/src/collectives/device/all_reduce.h | 2 +- .../rccl/src/collectives/device/primitives.h | 18 ++++--- projects/rccl/src/include/devcomm.h | 10 ++-- projects/rccl/src/init.cc | 54 ++++++++++++++----- 4 files changed, 60 insertions(+), 24 deletions(-) diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h index a77b187870..1f54ec2b13 100644 --- a/projects/rccl/src/collectives/device/all_reduce.h +++ b/projects/rccl/src/collectives/device/all_reduce.h @@ -101,7 +101,7 @@ class ncclFunctiontotal_cycle), __builtin_amdgcn_s_memrealtime() - clk, __ATOMIC_SEQ_CST); + if (tid == 0) devProf->elems[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk); #endif } }; diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h index 3fc3f45f0f..c64d0bd153 100644 --- a/projects/rccl/src/collectives/device/primitives.h +++ b/projects/rccl/src/collectives/device/primitives.h @@ -146,14 +146,15 @@ class ncclPrimitives { inline __device__ void waitRecv(ssize_t directOffset) { spins = 0; #ifdef ENABLE_PROFILING - uint64_t t0 = __builtin_amdgcn_s_memrealtime(); + uint64_t t0; + if (tid == 0) t0 = __builtin_amdgcn_s_memrealtime(); #endif while (connTailCache < step + SLICESTEPS) { connTailCache = LOAD(connTailPtr); if (checkAbort()) break; } #ifdef ENABLE_PROFILING - if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST); + if (tid == 0) comm->devProf->elems[blockIdx.x].wait_recv_cycle += (__builtin_amdgcn_s_memrealtime() - t0); #endif if (connPtrsFifoPtr) srcs[SRC+index] = (const T *)LOAD(connPtrsFifoPtr+step%NCCL_STEPS); else srcs[SRC+index] = directPtr(directOffset); @@ -180,7 +181,8 @@ class ncclPrimitives { for (int slice=0; slice(directOffset+offset, realSize*sizeof(T)); if (realSize > 0) { #ifdef ENABLE_PROFILING - if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST); + if (tid == 0) comm->devProf->elems[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0); #endif subBarrier(); ReduceOrCopyMulti(tid, nworkers, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize); @@ -428,12 +430,12 @@ class ncclPrimitives { #ifdef ENABLE_PROFILING #define INIT_COUNTER \ - if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); } + if (tid == 0) { t0 = __builtin_amdgcn_s_memrealtime(); ws = devProf->elems[blockIdx.x].wait_cycle; } #define ACCUMULATE_COUNTER(prim) \ - if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __builtin_amdgcn_s_memrealtime() - t0 \ - + ws - LOAD(&(devProf->wait_cycle[blockIdx.x])), __ATOMIC_SEQ_CST); \ - __atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); } + if (tid == 0) { devProf->elems[blockIdx.x].prim##_cycle += (__builtin_amdgcn_s_memrealtime() - t0 \ + + ws - devProf->elems[blockIdx.x].wait_cycle); \ + devProf->elems[blockIdx.x].prim##_byte += nelem * sizeof(T); } #else #define INIT_COUNTER #define ACCUMULATE_COUNTER(prim) diff --git a/projects/rccl/src/include/devcomm.h b/projects/rccl/src/include/devcomm.h index 51b57ec790..42976e36c7 100644 --- a/projects/rccl/src/include/devcomm.h +++ b/projects/rccl/src/include/devcomm.h @@ -280,12 +280,12 @@ static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must #pragma pack(pop) /* restore original alignment from stack */ #ifdef ENABLE_PROFILING -struct ncclProf { +struct ncclProfElem { union { struct { uint64_t total_cycle; - uint64_t wait_cycle[MAXCHANNELS]; // total wait cycle - uint64_t wait_recv_cycle[MAXCHANNELS]; // recv wait cycle + uint64_t wait_cycle; // total wait cycle + uint64_t wait_recv_cycle; // recv wait cycle // primtive cycles uint64_t send_cycle; uint64_t directSend_cycle; @@ -316,6 +316,10 @@ struct ncclProf { int data[0x80]; }; }; + +struct ncclProf { + struct ncclProfElem elems[MAXCHANNELS]; +}; #endif #ifdef ENABLE_COLLTRACE diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 73c34219b5..d24b33cb55 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -298,10 +298,40 @@ static ncclResult_t commFree(ncclComm_t comm) { #ifdef ENABLE_PROFILING struct ncclProf* prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)); CUDACHECK(hipMemcpy(prof, comm->hostDevComm.devProf, sizeof(struct ncclProf), hipMemcpyDeviceToHost)); - uint64_t wait_cycle = 0, wait_recv_cycle = 0; + uint64_t total_cycle = 0, wait_cycle = 0, wait_recv_cycle = 0, send_cycle = 0, directSend_cycle = 0, recv_cycle = 0, \ + directRecv_cycle = 0, copySend_cycle = 0, directCopySend_cycle = 0, recvCopySend_cycle = 0, directRecvCopySend_cycle = 0, \ + recvReduceCopy_cycle = 0, recvReduceSend_cycle = 0, recvReduceCopySend_cycle = 0, directRecvReduceCopySend_cycle = 0, \ + send_byte = 0, directSend_byte = 0, recv_byte = 0, directRecv_byte = 0, copySend_byte = 0, directCopySend_byte = 0, \ + recvCopySend_byte = 0, directRecvCopySend_byte = 0, recvReduceCopy_byte = 0, recvReduceSend_byte = 0, \ + recvReduceCopySend_byte = 0, directRecvReduceCopySend_byte = 0; for (int chan=0; channChannels; chan++) { - wait_cycle += prof->wait_cycle[chan]; - wait_recv_cycle += prof->wait_recv_cycle[chan]; + total_cycle += prof->elems[chan].total_cycle; + wait_cycle += prof->elems[chan].wait_cycle; + wait_recv_cycle += prof->elems[chan].wait_recv_cycle; + send_cycle += prof->elems[chan].send_cycle; + directSend_cycle += prof->elems[chan].directSend_cycle; + recv_cycle += prof->elems[chan].recv_cycle; + directRecv_cycle += prof->elems[chan].directRecv_cycle; + copySend_cycle += prof->elems[chan].copySend_cycle; + directCopySend_cycle += prof->elems[chan].directCopySend_cycle; + recvCopySend_cycle += prof->elems[chan].recvCopySend_cycle; + directRecvCopySend_cycle += prof->elems[chan].directRecvCopySend_cycle; + recvReduceCopy_cycle += prof->elems[chan].recvReduceCopy_cycle; + recvReduceSend_cycle += prof->elems[chan].recvReduceSend_cycle; + recvReduceCopySend_cycle += prof->elems[chan].recvReduceCopySend_cycle; + directRecvReduceCopySend_cycle += prof->elems[chan].directRecvReduceCopySend_cycle; + send_byte += prof->elems[chan].send_byte; + directSend_byte += prof->elems[chan].directSend_byte; + recv_byte += prof->elems[chan].recv_byte; + directRecv_byte += prof->elems[chan].directRecv_byte; + copySend_byte += prof->elems[chan].copySend_byte; + directCopySend_byte += prof->elems[chan].directCopySend_byte; + recvCopySend_byte += prof->elems[chan].recvCopySend_byte; + directRecvCopySend_byte += prof->elems[chan].directRecvCopySend_byte; + recvReduceCopy_byte += prof->elems[chan].recvReduceCopy_byte; + recvReduceSend_byte += prof->elems[chan].recvReduceSend_byte; + recvReduceCopySend_byte += prof->elems[chan].recvReduceCopySend_byte; + directRecvReduceCopySend_byte += prof->elems[chan].directRecvReduceCopySend_byte; } #define VEGA_GPU_RTC_FREQUENCY 2.5E7 if (comm->rank == 0) { @@ -309,17 +339,17 @@ static ncclResult_t commFree(ncclComm_t comm) { INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)"); } INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f", - comm->rank, (double)prof->total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, + comm->rank, (double)total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, (double)wait_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, (double)wait_recv_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, - (prof->send_cycle) ? (double)prof->send_byte*comm->nChannels/((double)prof->send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (prof->recvReduceSend_cycle) ? (double)prof->recvReduceSend_byte*comm->nChannels/((double)prof->recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (prof->directRecvReduceCopySend_cycle) ? (double)prof->directRecvReduceCopySend_byte*comm->nChannels/((double)prof->directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (prof->directRecvCopySend_cycle) ? (double)prof->directRecvCopySend_byte*comm->nChannels/((double)prof->directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (prof->directRecv_cycle) ? (double)prof->directRecv_byte*comm->nChannels/((double)prof->directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (prof->copySend_cycle) ? (double)prof->copySend_byte*comm->nChannels/((double)prof->copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (prof->recv_cycle) ? (double)prof->recv_byte*comm->nChannels/((double)prof->recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, - (prof->recvCopySend_cycle) ? (double)prof->recvCopySend_byte*comm->nChannels/((double)prof->recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0); + (send_cycle) ? (double)send_byte*comm->nChannels/((double)send_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (recvReduceSend_cycle) ? (double)recvReduceSend_byte*comm->nChannels/((double)recvReduceSend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (directRecvReduceCopySend_cycle) ? (double)directRecvReduceCopySend_byte*comm->nChannels/((double)directRecvReduceCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (directRecvCopySend_cycle) ? (double)directRecvCopySend_byte*comm->nChannels/((double)directRecvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (directRecv_cycle) ? (double)directRecv_byte*comm->nChannels/((double)directRecv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (copySend_cycle) ? (double)copySend_byte*comm->nChannels/((double)copySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (recv_cycle) ? (double)recv_byte*comm->nChannels/((double)recv_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0, + (recvCopySend_cycle) ? (double)recvCopySend_byte*comm->nChannels/((double)recvCopySend_cycle/VEGA_GPU_RTC_FREQUENCY*1.0E9) : 0); free(prof); CUDACHECK(hipFree(comm->hostDevComm.devProf));