# Patch summary: add NPKit ENTRY/EXIT events around the ring broadcast path.
#
# install.sh
#   Inside the `if ($npkit_enabled); then` block, appends two defines to the flag
#   list, after -DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT and before
#   -DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME:
#     -DENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY
#     -DENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT
#
# src/device/broadcast.h
#   Adds two preprocessor-guarded blocks. Each is compiled only when ENABLE_NPKIT
#   and the matching ENABLE_NPKIT_EVENT_BROADCAST_RING_{ENTRY,EXIT} macro are both
#   defined. In each block only the thread with `tid == 0` calls
#   NpKit::CollectGpuEvent, passing the event id, `args->count*sizeof(T)`, 0,
#   NPKIT_GET_GPU_TIMESTAMP(), and
#   `ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx`.
#     - ENTRY block: inserted just before `T *inputBuf = (T*)args->sendbuff;`.
#     - EXIT block: inserted after the two closing braces that follow
#       `prims.recvCopySend(offset, nelem);`.
#   NOTE(review): `npKitCtxIdx` is not declared in any visible hunk; presumably it
#   comes from the code just above the first hunk's context -- confirm.
#   NOTE(review): the declared type of CollectGpuEvent's size argument is not
#   visible here; confirm `args->count*sizeof(T)` cannot exceed it.
#
# src/include/npkit/npkit_event.h
#   Defines NPKIT_EVENT_BROADCAST_RING_ENTRY as 0x68 and
#   NPKIT_EVENT_BROADCAST_RING_EXIT as 0x69, following
#   NPKIT_EVENT_MSCCL_INIT_ENTRY (0x66) and NPKIT_EVENT_MSCCL_INIT_EXIT (0x67).
#
# NOTE(review): this copy of the patch has all of its line breaks collapsed into
# spaces; hunk lines must be restored one-per-line before it can be applied.
# NOTE(review): the context fragment `Primitives, 0, Proto, 0>` has an unmatched
# `>`, so template arguments appear to have been lost in extraction -- compare
# against the real src/device/broadcast.h before relying on this context.
diff --git a/install.sh b/install.sh index 4705579804..771ef80f9a 100755 --- a/install.sh +++ b/install.sh @@ -340,6 +340,8 @@ if ($npkit_enabled); then -DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT \ -DENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY \ -DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT \ + -DENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY \ + -DENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT \ -DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME" fi diff --git a/src/device/broadcast.h b/src/device/broadcast.h index 1497931017..f65b7fb2e7 100644 --- a/src/device/broadcast.h +++ b/src/device/broadcast.h @@ -46,6 +46,13 @@ namespace { } #endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_BROADCAST_RING_ENTRY, args->count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives, 0, Proto, 0> @@ -73,6 +80,12 @@ namespace { prims.recvCopySend(offset, nelem); } } +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_BROADCAST_RING_EXIT, args->count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } } diff --git a/src/include/npkit/npkit_event.h b/src/include/npkit/npkit_event.h index da33f52402..0c2d5ec02c 100644 --- a/src/include/npkit/npkit_event.h +++ b/src/include/npkit/npkit_event.h @@ -127,4 +127,6 @@ #define NPKIT_EVENT_MSCCL_INIT_ENTRY 0x66 #define NPKIT_EVENT_MSCCL_INIT_EXIT 0x67 +#define NPKIT_EVENT_BROADCAST_RING_ENTRY 0x68 +#define NPKIT_EVENT_BROADCAST_RING_EXIT 0x69 #endif