diff --git a/install.sh b/install.sh index 42859bd48b..eaad65bf14 100755 --- a/install.sh +++ b/install.sh @@ -301,6 +301,14 @@ if ($npkit_enabled); then -DENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT \ -DENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY \ -DENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY \ + -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT \ -DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME" fi diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index dbfca9b082..b27c7c6513 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -48,6 +48,13 @@ namespace { } #endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; Primitives, 0, Proto, 0> prims @@ -82,12 +89,35 @@ namespace { rankDest = ringRanks[0]; offset = chunkOffset + rankDest * size; +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + prims.npKitDataProcessTotalTime = 0; + } +#endif + if (inputBuf + chunkOffset == outputBuf + offset) { // In place prims.directSend(chunkOffset, offset, nelem); } else { prims.directCopySend(chunkOffset, offset, offset, nelem); } +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY) + if (tid == 0 && nranks > 2) { + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + prims.npKitDataProcessTotalTime = 0; + } +#endif + // k-2 steps: copy to next GPU for (int j=1; j 2) { + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + // Make final copy from buffer to dest. rankDest = ringRanks[1]; offset = chunkOffset + rankDest * size; +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + prims.npKitDataProcessTotalTime = 0; + } +#endif // Final wait/copy. prims.directRecv(offset, nelem); + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + + + } +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_EXIT, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } } diff --git a/src/include/npkit/npkit_event.h b/src/include/npkit/npkit_event.h index fd1f940a88..c3892684c4 100644 --- a/src/include/npkit/npkit_event.h +++ b/src/include/npkit/npkit_event.h @@ -100,4 +100,14 @@ #define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4F +#define NPKIT_EVENT_ALL_GATHER_RING_ENTRY 0x50 +#define NPKIT_EVENT_ALL_GATHER_RING_EXIT 0x51 +#define NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY 0x52 +#define NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT 0x53 +#define NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY 0x54 +#define NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT 0x55 +#define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY 0x56 +#define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT 0x57 + + #endif