msccl: add NPKIT profiling for MSCCL send-recv

This commit is contained in:
Nusrat Islam
2023-09-07 10:38:23 -05:00
rodzic 62a09100a6
commit a283f55f12
3 zmienionych plików z 58 dodań i 2 usunięć
+6
Wyświetl plik
@@ -322,6 +322,12 @@ if ($npkit_enabled); then
-DENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT \
-DENABLE_NPKIT_EVENT_MSCCL_REDUCE_ENTRY \
-DENABLE_NPKIT_EVENT_MSCCL_REDUCE_EXIT \
-DENABLE_NPKIT_EVENT_MSCCL_SEND_ENTRY \
-DENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT \
-DENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY \
-DENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT \
-DENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY \
-DENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT \
-DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME"
fi
@@ -254,6 +254,17 @@ __device__ __forceinline__ void mscclRunInterpreter(
const ssize_t sizePerMscclChunk = mscclShmem.work.count / mscclShmem.work.nChunksPerLoop;
uint32_t maxAllowedCount = mscclShmem.work.maxAllowedCount;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY)
if (tid == 0) {
int xcc_id = 0;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
#endif
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
// msccl flags all start out with 0. this is used as a part of the flag to make sure different work items deal with different synchronization flags
// this still needs more work. when we make a way around the queue, the flag might have been set to undesired values. will be fixed in subsequent versions.
const int64_t workIndex = mscclShmem.work.workIndex;
@@ -297,10 +308,37 @@ __device__ __forceinline__ void mscclRunInterpreter(
dstOffset = gridOffset + (ssize_t) (t->dstOffset+c) * sizePerMscclChunk;
int thisCount = min(maxAllowedCount, count - c);
int thisNelem = nelem * thisCount;
if (t->type == MSCCL_SEND)
if (t->type == MSCCL_SEND) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_SEND_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
prims.sendWithBarrier(srcOffset, thisNelem); // LL.send is the only situation where there is no barrier at the end.
else if (t->type == MSCCL_RECV)
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_SEND_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
else if (t->type == MSCCL_RECV) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RECV_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
prims.recv(dstOffset, thisNelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RECV_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
else if (t->type == MSCCL_REDUCE) {
int numReductions = t->numReductions;
if (thisNelem < nthreads){
@@ -397,6 +435,12 @@ __device__ __forceinline__ void mscclRunInterpreter(
step++;
}
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type) \
+6
Wyświetl plik
@@ -116,5 +116,11 @@
#define NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT 0x5B
#define NPKIT_EVENT_MSCCL_REDUCE_ENTRY 0x5C
#define NPKIT_EVENT_MSCCL_REDUCE_EXIT 0x5D
#define NPKIT_EVENT_MSCCL_SEND_ENTRY 0x5E
#define NPKIT_EVENT_MSCCL_SEND_EXIT 0x5F
#define NPKIT_EVENT_MSCCL_RECV_ENTRY 0x60
#define NPKIT_EVENT_MSCCL_RECV_EXIT 0x61
#define NPKIT_EVENT_MSCCL_RUN_ENTRY 0x62
#define NPKIT_EVENT_MSCCL_RUN_EXIT 0x63
#endif