From a0cef69110f9a0f1b3f4954190f14d0f0fc5fa60 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Tue, 7 May 2024 14:00:16 -0700 Subject: [PATCH] npkit: add broadcast trace (#1166) --- install.sh | 2 ++ src/device/broadcast.h | 13 +++++++++++++ src/include/npkit/npkit_event.h | 2 ++ 3 files changed, 17 insertions(+) diff --git a/install.sh b/install.sh index 4705579804..771ef80f9a 100755 --- a/install.sh +++ b/install.sh @@ -340,6 +340,8 @@ if ($npkit_enabled); then -DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT \ -DENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY \ -DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT \ + -DENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY \ + -DENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT \ -DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME" fi diff --git a/src/device/broadcast.h b/src/device/broadcast.h index 1497931017..f65b7fb2e7 100644 --- a/src/device/broadcast.h +++ b/src/device/broadcast.h @@ -46,6 +46,13 @@ namespace { } #endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_BROADCAST_RING_ENTRY, args->count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> @@ -73,6 +80,12 @@ namespace { prims.recvCopySend(offset, nelem); } } +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_BROADCAST_RING_EXIT, args->count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } } diff --git a/src/include/npkit/npkit_event.h b/src/include/npkit/npkit_event.h index da33f52402..0c2d5ec02c 100644 --- a/src/include/npkit/npkit_event.h +++ b/src/include/npkit/npkit_event.h @@ -127,4 +127,6 @@ #define NPKIT_EVENT_MSCCL_INIT_ENTRY 0x66 #define NPKIT_EVENT_MSCCL_INIT_EXIT 0x67 
+#define NPKIT_EVENT_BROADCAST_RING_ENTRY 0x68 +#define NPKIT_EVENT_BROADCAST_RING_EXIT 0x69 #endif