// Patch summary (descriptive header; the patch text below is unchanged).
//
// Files touched:
//   src/collectives/device/all_reduce.h
//   src/collectives/device/broadcast.h
//   src/collectives/device/primitives.h
//
// What changes: every hunk removes the `opCount > 0` / `args->opCount > 0`
// term from the condition guarding an `__atomic_fetch_add` on a `devProf`
// profiling counter. Nothing else in the statements is modified.
//
//   * all_reduce.h, ncclAllReduceRingKernel: `devProf->total_cycle` is now
//     incremented by `__rtc64() - clk` whenever `tid == 0`.
//   * broadcast.h, ncclBroadcastRingKernel: same change for
//     `devProf->total_cycle`.
//   * primitives.h (hunk at -146): the update of
//     `comm->devProf->wait_recv_cycle[blockIdx.x]` loses its only guard and
//     becomes unconditional inside the `#ifdef ENABLE_PROFILING` region.
//   * primitives.h (hunk at -237): the update of
//     `comm->devProf->wait_cycle[blockIdx.x]` after `barrier()` is now guarded
//     by `tid == 0` alone.
//   * primitives.h (hunk at -427): the ACCUMULATE_COUNTER(prim) macro now
//     updates `devProf->prim##_cycle` and `devProf->prim##_byte` whenever
//     `tid == 0`.
//
// The first four hunks sit under a visible `#ifdef ENABLE_PROFILING`; the
// macro hunk ends at an `#else` whose opening conditional lies outside the
// hunk context.
//
// NOTE(review): the reason for dropping the guard is not visible here.
// Presumably `opCount == 0` identified an initial operation that was being
// excluded from the profile -- confirm that counting it is intended.
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 7803807a1c..3af59a4c14 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -99,7 +99,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { ACCUMULATE_COUNTER(directRecv); } #ifdef ENABLE_PROFILING - if (tid == 0 && args->opCount > 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST); + if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST); #endif } diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index 270bad4114..8f508a0780 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -66,7 +66,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { } } #ifdef ENABLE_PROFILING - if (tid == 0 && args->opCount > 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST); + if (tid == 0) __atomic_fetch_add(&(devProf->total_cycle), __rtc64() - clk, __ATOMIC_SEQ_CST); #endif } diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index ef8c753a1f..c73325fcd4 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -146,7 +146,7 @@ class ncclPrimitives { if (checkAbort(wid, 0)) break; } #ifdef ENABLE_PROFILING - if (opCount > 0) __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST); + __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST); #endif recvConnTail += SLICESTEPS; } @@ -237,7 +237,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) { if (realSize > 0) { barrier(); #ifdef ENABLE_PROFILING - if (tid == 0 && opCount > 0) __atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST); + if (tid == 0) 
__atomic_fetch_add(&comm->devProf->wait_cycle[blockIdx.x], __rtc64() - t0, __ATOMIC_SEQ_CST); #endif #if defined(RCCL_USE_DIRECT_BUFFER) if (DIRECTRECV && recvDirectBuff[0]) { @@ -427,7 +427,7 @@ inline __device__ int directSendInc(int i, int directInc, int sliceInc) { if (tid == 0) { t0 = __rtc64(); ws = LOAD(&(devProf->wait_cycle[blockIdx.x])); } #define ACCUMULATE_COUNTER(prim) \ - if (tid == 0 && args->opCount > 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __rtc64() - t0 \ + if (tid == 0) { __atomic_fetch_add(&(devProf->prim##_cycle), __rtc64() - t0 \ + ws - LOAD(&(devProf->wait_cycle[blockIdx.x])), __ATOMIC_SEQ_CST); \ __atomic_fetch_add(&(devProf->prim##_byte), nelem * sizeof(T), __ATOMIC_SEQ_CST); } #else