From f957c4fe225f6afcdb159d2cf8d5463568b006a1 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Tue, 4 Mar 2025 10:03:56 -0800 Subject: [PATCH] NPKit: enable reduce scatter profiling (#1580) --- install.sh | 8 ++++ src/device/reduce_scatter.h | 74 ++++++++++++++++++++++++++++++++- src/include/npkit/npkit_event.h | 9 ++++ 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/install.sh b/install.sh index 9191585345..fd1a780437 100755 --- a/install.sh +++ b/install.sh @@ -377,6 +377,14 @@ if [[ "${npkit_enabled}" == true ]]; then -DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT \ -DENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY \ -DENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT \ + -DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY \ + -DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT \ + -DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY \ + -DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT \ -DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME" fi diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index 7d51c7ea12..e618b8c3b2 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -29,31 +29,103 @@ namespace { uint32_t nelem; int rankDest; +#if defined(ENABLE_NPKIT) + int npKitCtxIdx = ncclShmem.channelId; +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_GET_CPU_TIMESTAMP_FROM_BLOCK, + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + Primitives, 0, Proto, 0> prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, work->connIndex, work->connIndex); +#if defined(ENABLE_NPKIT) + if (tid == 0) { + prims.npKitCtxIdx = npKitCtxIdx; + } +#endif + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { nelem = min(chunkCount, channelCount - elemOffset); dataOffset = gridOffset + elemOffset; /////////////// begin ReduceScatter steps /////////////// // step 0: push data to next GPU +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif rankDest = ringRanks[nranks-1]; offset = dataOffset + rankDest * count; prims.send(offset, nelem); - +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif // k-2 steps: reduce and copy to next GPU +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif for (int j=2; j