From a49772289410eeb8f3e3152e7d6f67c829ffac77 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:00:12 -0700 Subject: [PATCH] NPkit: misc fixes for MSCCL (#936) * msccl: add xcc_id to timestamp sync * NPKit: add timestamp for rrc operator * NPKit: add timestamp for MSCCL init --- install.sh | 4 ++ src/collectives/device/msccl_kernel_impl.h | 49 ++++++++++++++++++---- src/include/npkit/npkit_event.h | 4 ++ 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/install.sh b/install.sh index 0c0aeaa4ef..71046c48cb 100755 --- a/install.sh +++ b/install.sh @@ -328,6 +328,10 @@ if ($npkit_enabled); then -DENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT \ -DENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY \ -DENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT \ + -DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY \ + -DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT \ + -DENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY \ + -DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT \ -DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME" fi diff --git a/src/collectives/device/msccl_kernel_impl.h b/src/collectives/device/msccl_kernel_impl.h index f1c444cce8..44505d1a13 100644 --- a/src/collectives/device/msccl_kernel_impl.h +++ b/src/collectives/device/msccl_kernel_impl.h @@ -135,6 +135,12 @@ __device__ __forceinline__ void mscclRunInterpreter( const int bid = blockIdx.x; const int nthreads = NCCL_MAX_NTHREADS; +#if defined(ENABLE_NPKIT) + uint64_t timestamp_entry = 0; + if (tid == 0) { + timestamp_entry = NPKIT_GET_GPU_TIMESTAMP(); + } +#endif // initialize mscclShmem.mscclTB threadBlockCopy( (uint64_t *)&mscclShmem.mscclTB, (uint64_t *)(algo->mscclTBs + bid), @@ -179,7 +185,13 @@ __device__ __forceinline__ void mscclRunInterpreter( #if defined(ENABLE_NPKIT) int npKitCtxIdx = bid; - if (tid == 0) ncclShmem.event_buffer_head = 0; + int xcc_id = 0; + if (tid == 0) { + ncclShmem.event_buffer_head = 0; +#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) + asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id)); +#endif + } #endif __synclds(); // publish shmem if (tid == 0) @@ -188,13 +200,13 @@ __device__ __forceinline__ void mscclRunInterpreter( #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) if (tid == 0) { uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; - NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp); + NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_CPU, 0, xcc_id, *cpuTimestamp); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) if (tid == 0) { - NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP()); + NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_GPU, 0, xcc_id, NPKIT_GET_GPU_TIMESTAMP()); } #endif @@ -250,11 +262,19 @@ __device__ __forceinline__ void mscclRunInterpreter( #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY) if (tid == 0) { - int xcc_id = 0; -#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) - asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id)); + NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, timestamp_entry); + } #endif - NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, NPKIT_GET_GPU_TIMESTAMP()); + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_INIT_ENTRY, 0, xcc_id, timestamp_entry); + } +#endif + +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT) + if (tid == 0) { + NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_INIT_EXIT, 0, xcc_id, NPKIT_GET_GPU_TIMESTAMP()); } #endif @@ -412,8 +432,19 @@ __device__ __forceinline__ void mscclRunInterpreter( prims.recvReduceSend(srcOffset, thisNelem); else if (t->type == MSCCL_RECV_REDUCE_COPY_SEND) prims.recvReduceCopySend(srcOffset, dstOffset, thisNelem); - else if (t->type == MSCCL_RECV_REDUCE_COPY) + else if (t->type == MSCCL_RECV_REDUCE_COPY) { +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY) + if (tid == 0) { + NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP()); + } +#endif prims.recvReduceCopy(srcOffset, dstOffset, thisNelem); +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT) + if (tid == 0) { + NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP()); + } +#endif + } else if (t->type == MSCCL_LOCAL_COPY) prims.localCopy(srcPointer+srcOffset, dstPointer+dstOffset, thisNelem); else @@ -427,7 +458,7 @@ __device__ __forceinline__ void mscclRunInterpreter( } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT) if (tid == 0) { - NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, 0, NPKIT_GET_GPU_TIMESTAMP()); + NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, NPKIT_GET_GPU_TIMESTAMP()); } #endif #if defined(ENABLE_NPKIT) diff --git a/src/include/npkit/npkit_event.h b/src/include/npkit/npkit_event.h index dbb05dbcc3..da33f52402 100644 --- a/src/include/npkit/npkit_event.h +++ b/src/include/npkit/npkit_event.h @@ -122,5 +122,9 @@ #define NPKIT_EVENT_MSCCL_RECV_EXIT 0x61 #define NPKIT_EVENT_MSCCL_RUN_ENTRY 0x62 #define NPKIT_EVENT_MSCCL_RUN_EXIT 0x63 +#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY 0x64 +#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT 0x65 +#define NPKIT_EVENT_MSCCL_INIT_ENTRY 0x66 +#define NPKIT_EVENT_MSCCL_INIT_EXIT 0x67 #endif