From a283f55f12492ec4dec002316c4cc00cae3883bd Mon Sep 17 00:00:00 2001 From: Nusrat Islam Date: Thu, 7 Sep 2023 10:38:23 -0500 Subject: [PATCH] msccl: add NPKIT profiling for MSCCL send-recv --- install.sh | 6 +++ src/collectives/device/msccl_kernel_impl.h | 48 +++++++++++++++++++++- src/include/npkit/npkit_event.h | 6 +++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/install.sh b/install.sh index 6e5af1f216..a7d5ad5d84 100755 --- a/install.sh +++ b/install.sh @@ -322,6 +322,12 @@ if ($npkit_enabled); then -DENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT \ -DENABLE_NPKIT_EVENT_MSCCL_REDUCE_ENTRY \ -DENABLE_NPKIT_EVENT_MSCCL_REDUCE_EXIT \ + -DENABLE_NPKIT_EVENT_MSCCL_SEND_ENTRY \ + -DENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT \ + -DENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY \ + -DENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT \ + -DENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY \ + -DENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT \ -DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME" fi diff --git a/src/collectives/device/msccl_kernel_impl.h b/src/collectives/device/msccl_kernel_impl.h index 416a8ee98c..fe4ff520bf 100644 --- a/src/collectives/device/msccl_kernel_impl.h +++ b/src/collectives/device/msccl_kernel_impl.h @@ -254,6 +254,17 @@ __device__ __forceinline__ void mscclRunInterpreter( const ssize_t sizePerMscclChunk = mscclShmem.work.count / mscclShmem.work.nChunksPerLoop; uint32_t maxAllowedCount = mscclShmem.work.maxAllowedCount; +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY) + if (tid == 0) { + int xcc_id = 0; +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) + asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id)); +#endif + NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + // msccl flags all start out with 0. this is used as a part of the flag to make sure different work items deal with different synchronization flags // this still needs more work. when we make a way around the queue, the flag might have been set to undesired values. will be fixed in subsequent versions. const int64_t workIndex = mscclShmem.work.workIndex; @@ -297,10 +308,37 @@ __device__ __forceinline__ void mscclRunInterpreter( dstOffset = gridOffset + (ssize_t) (t->dstOffset+c) * sizePerMscclChunk; int thisCount = min(maxAllowedCount, count - c); int thisNelem = nelem * thisCount; - if (t->type == MSCCL_SEND) + if (t->type == MSCCL_SEND) { +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_SEND_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_SEND_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif prims.sendWithBarrier(srcOffset, thisNelem); // LL.send is the only situation where there is no barrier at the end. - else if (t->type == MSCCL_RECV) + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_SEND_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + } + else if (t->type == MSCCL_RECV) { +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RECV_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif prims.recv(dstOffset, thisNelem); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RECV_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif + } else if (t->type == MSCCL_REDUCE) { int numReductions = t->numReductions; if (thisNelem < nthreads){ @@ -397,6 +435,12 @@ __device__ __forceinline__ void mscclRunInterpreter( step++; } } +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT) + if (tid == 0) { + NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(), + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); + } +#endif } #define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type) \ diff --git a/src/include/npkit/npkit_event.h b/src/include/npkit/npkit_event.h index a1d24fd3fe..dbb05dbcc3 100644 --- a/src/include/npkit/npkit_event.h +++ b/src/include/npkit/npkit_event.h @@ -116,5 +116,11 @@ #define NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT 0x5B #define NPKIT_EVENT_MSCCL_REDUCE_ENTRY 0x5C #define NPKIT_EVENT_MSCCL_REDUCE_EXIT 0x5D +#define NPKIT_EVENT_MSCCL_SEND_ENTRY 0x5E +#define NPKIT_EVENT_MSCCL_SEND_EXIT 0x5F +#define NPKIT_EVENT_MSCCL_RECV_ENTRY 0x60 +#define NPKIT_EVENT_MSCCL_RECV_EXIT 0x61 +#define NPKIT_EVENT_MSCCL_RUN_ENTRY 0x62 +#define NPKIT_EVENT_MSCCL_RUN_EXIT 0x63 #endif