diff --git a/src/device/msccl_kernel_impl.h b/src/device/msccl_kernel_impl.h index fdcc0502d2..ca29ea071c 100644 --- a/src/device/msccl_kernel_impl.h +++ b/src/device/msccl_kernel_impl.h @@ -26,27 +26,6 @@ extern __shared__ struct mscclShmemData mscclShmem; #define GET_WORKINDEX_FROM_FLAG(__FLAG__) \ (__FLAG__) / (MSCCL_MAX_ITER*MSCCL_MAX_NUM_STEPS) -#ifdef ENABLE_COLLTRACE - #define INC_COLL_TRACE \ - uint32_t pos = atomicAdd(&ncclShmem.collTraceTail->tail, 1)%COLLTRACE_NUM_ITEMS; \ - struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \ - collTrace->timeStamp = wall_clock64(); \ - collTrace->bid = blockIdx.x; - // TODO: switch to atomicInc after llvm crash is fixed - // uint32_t pos = atomicInc(&ncclShmem.collTraceTail->tail, COLLTRACE_NUM_ITEMS) - - #define traceData(data2, data4, data8_0, data8_1) { \ - INC_COLL_TRACE \ - collTrace->funcIndex = data2; \ - collTrace->data_0 = data4; \ - collTrace->opCount = data8_0; \ - collTrace->data_1 = data8_1; \ - collTrace->type = ncclCollTraceDataType; \ - } -#else -#define traceData(data2, data4, data8_0, data8_1) -#endif - inline __device__ static void barrier(int nthreads) { #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) assert(nthreads == NCCL_MAX_NTHREADS);