msccl: add NPKIT profiling for MSCCL send-recv
This commit is contained in:
@@ -322,6 +322,12 @@ if ($npkit_enabled); then
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_REDUCE_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_REDUCE_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_SEND_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT \
|
||||
-DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME"
|
||||
fi
|
||||
|
||||
|
||||
@@ -254,6 +254,17 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
const ssize_t sizePerMscclChunk = mscclShmem.work.count / mscclShmem.work.nChunksPerLoop;
|
||||
uint32_t maxAllowedCount = mscclShmem.work.maxAllowedCount;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY)
|
||||
if (tid == 0) {
|
||||
int xcc_id = 0;
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
|
||||
#endif
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
// msccl flags all start out with 0. this is used as a part of the flag to make sure different work items deal with different synchronization flags
|
||||
// this still needs more work. when we make a way around the queue, the flag might have been set to undesired values. will be fixed in subsequent versions.
|
||||
const int64_t workIndex = mscclShmem.work.workIndex;
|
||||
@@ -297,10 +308,37 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
dstOffset = gridOffset + (ssize_t) (t->dstOffset+c) * sizePerMscclChunk;
|
||||
int thisCount = min(maxAllowedCount, count - c);
|
||||
int thisNelem = nelem * thisCount;
|
||||
if (t->type == MSCCL_SEND)
|
||||
if (t->type == MSCCL_SEND) {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_SEND_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
prims.sendWithBarrier(srcOffset, thisNelem); // LL.send is the only situation where there is no barrier at the end.
|
||||
else if (t->type == MSCCL_RECV)
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_SEND_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if (t->type == MSCCL_RECV) {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RECV_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
prims.recv(dstOffset, thisNelem);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RECV_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if (t->type == MSCCL_REDUCE) {
|
||||
int numReductions = t->numReductions;
|
||||
if (thisNelem < nthreads){
|
||||
@@ -397,6 +435,12 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
step++;
|
||||
}
|
||||
}
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type) \
|
||||
|
||||
@@ -116,5 +116,11 @@
|
||||
#define NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT 0x5B
|
||||
#define NPKIT_EVENT_MSCCL_REDUCE_ENTRY 0x5C
|
||||
#define NPKIT_EVENT_MSCCL_REDUCE_EXIT 0x5D
|
||||
#define NPKIT_EVENT_MSCCL_SEND_ENTRY 0x5E
|
||||
#define NPKIT_EVENT_MSCCL_SEND_EXIT 0x5F
|
||||
#define NPKIT_EVENT_MSCCL_RECV_ENTRY 0x60
|
||||
#define NPKIT_EVENT_MSCCL_RECV_EXIT 0x61
|
||||
#define NPKIT_EVENT_MSCCL_RUN_ENTRY 0x62
|
||||
#define NPKIT_EVENT_MSCCL_RUN_EXIT 0x63
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user