NPKit: enable reduce scatter profiling (#1580)
This commit is contained in:
@@ -377,6 +377,14 @@ if [[ "${npkit_enabled}" == true ]]; then
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT \
|
||||
-DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME"
|
||||
fi
|
||||
|
||||
|
||||
@@ -29,31 +29,103 @@ namespace {
|
||||
uint32_t nelem;
|
||||
int rankDest;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
int npKitCtxIdx = ncclShmem.channelId;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_GET_CPU_TIMESTAMP_FROM_BLOCK,
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, work->connIndex, work->connIndex);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (tid == 0) {
|
||||
prims.npKitCtxIdx = npKitCtxIdx;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
|
||||
nelem = min(chunkCount, channelCount - elemOffset);
|
||||
|
||||
dataOffset = gridOffset + elemOffset;
|
||||
/////////////// begin ReduceScatter steps ///////////////
|
||||
// step 0: push data to next GPU
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
rankDest = ringRanks[nranks-1];
|
||||
offset = dataOffset + rankDest * count;
|
||||
prims.send(offset, nelem);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
rankDest = ringRanks[nranks-j];
|
||||
offset = dataOffset + rankDest * count;
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final result
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
rankDest = ringRanks[0];
|
||||
offset = dataOffset + rankDest * count;
|
||||
prims.recvReduceCopy(offset, dataOffset, nelem, /*postOp=*/true);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -129,4 +129,13 @@
|
||||
|
||||
#define NPKIT_EVENT_BROADCAST_RING_ENTRY 0x68
|
||||
#define NPKIT_EVENT_BROADCAST_RING_EXIT 0x69
|
||||
|
||||
#define NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY 0x70
|
||||
#define NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT 0x71
|
||||
#define NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY 0x72
|
||||
#define NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT 0x73
|
||||
#define NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY 0x74
|
||||
#define NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT 0x75
|
||||
#define NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY 0x76
|
||||
#define NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT 0x77
|
||||
#endif
|
||||
|
||||
مرجع در شماره جدید
Block a user