diff --git a/projects/rccl/src/device/prims_ll128.h b/projects/rccl/src/device/prims_ll128.h index 4b2a9f8fa5..7be997fe65 100644 --- a/projects/rccl/src/device/prims_ll128.h +++ b/projects/rccl/src/device/prims_ll128.h @@ -73,6 +73,13 @@ public: private: #endif +#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)) + uint64_t npKitWaitRecvDataProcessSize = 0; + uint64_t npKitWaitRecvEntryTime = 0; + uint64_t npKitWaitRecvExitTime = 0; + uint64_t npKitWaitRecvTotalTime = 0; +#endif + inline __device__ void barrier() { #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) if (nthreads != WARP_SIZE) @@ -101,6 +108,12 @@ private: } inline __device__ void waitSend(int nbytes) { +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY, nbytes, 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { @@ -113,6 +126,12 @@ private: } sendConnHead += 1; } +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT, nbytes, 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } inline __device__ void postRecv() { @@ -354,6 +373,23 @@ private: if (SEND) waitSend(divUp(nelem, DataEltPerSlice)*WireWordPerSlice*sizeof(uint64_t)); barrier(); + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) + if (tid == 0) { + npKitWaitRecvTotalTime = 0; + npKitWaitRecvDataProcessSize = nelem*sizeof(T); + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY, + npKitWaitRecvDataProcessSize, 0, NPKIT_GET_GPU_TIMESTAMP(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) + if (tid == 0) { + npKitWaitRecvTotalTime = 0; + npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP(); + } +#endif + nelem -= DataEltPerSlice*warp; srcPtr += DataEltPerSlice*warp; dstPtr += DataEltPerSlice*warp; @@ -384,6 +420,22 @@ private: } barrier(); + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) + if (tid == 0) { + npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP(); + npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime - npKitWaitRecvTotalTime; + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT, + npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + if (SEND) for (int i=0; i < MaxSend; i++) sendStep[i] += 1; if (SEND) postSend(); if (RECV) for (int i=0; i < MaxRecv; i++) recvStep[i] += 1; @@ -567,28 +619,125 @@ public: } __device__ void send(intptr_t inpIx, int eltN) { - return GenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + GenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } __device__ void sendFromOutput(intptr_t outIx, int eltN) { - return GenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + GenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } __device__ void recv(intptr_t outIx, int eltN, bool postOp=false) { - return GenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + GenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } __device__ void recvReduceSend(intptr_t inpIx, int eltN) { - return GenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + GenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } __device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - return GenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + GenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } __device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - return GenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + GenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } __device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { - return GenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + GenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } __device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - return GenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); + #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + GenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } __device__ void recvSend(int eltN) { return GenericOp<1, 1, -1, -1>(-1, -1, eltN, false);