NPKit: enable reduce scatter profiling (#1580)

Bu işleme şunda yer alıyor:
Wenkai Du
2025-03-04 10:03:56 -08:00
işlemeyi yapan: GitHub
ebeveyn 063c6cfc11
işleme f957c4fe22
3 değiştirilmiş dosya ile 90 ekleme ve 1 silme
+8
Dosyayı Görüntüle
@@ -377,6 +377,14 @@ if [[ "${npkit_enabled}" == true ]]; then
-DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT \
-DENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY \
-DENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT \
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY \
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT \
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY \
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT \
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY \
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT \
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY \
-DENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT \
-DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME"
fi
+73 -1
Dosyayı Görüntüle
@@ -29,31 +29,103 @@ namespace {
uint32_t nelem;
int rankDest;
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = ncclShmem.channelId;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_GET_CPU_TIMESTAMP_FROM_BLOCK,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg, 0, work->connIndex, work->connIndex);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
nelem = min(chunkCount, channelCount - elemOffset);
dataOffset = gridOffset + elemOffset;
/////////////// begin ReduceScatter steps ///////////////
// step 0: push data to next GPU
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
rankDest = ringRanks[nranks-1];
offset = dataOffset + rankDest * count;
prims.send(offset, nelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
// k-2 steps: reduce and copy to next GPU
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
for (int j=2; j<nranks; ++j) {
rankDest = ringRanks[nranks-j];
offset = dataOffset + rankDest * count;
prims.recvReduceSend(offset, nelem);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
// step k-1: reduce this buffer and data, which will produce the final result
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
rankDest = ringRanks[0];
offset = dataOffset + rankDest * count;
prims.recvReduceCopy(offset, dataOffset, nelem, /*postOp=*/true);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
}
+9
Dosyayı Görüntüle
@@ -129,4 +129,13 @@
#define NPKIT_EVENT_BROADCAST_RING_ENTRY 0x68
#define NPKIT_EVENT_BROADCAST_RING_EXIT 0x69
#define NPKIT_EVENT_REDUCE_SCATTER_RING_ENTRY 0x70
#define NPKIT_EVENT_REDUCE_SCATTER_RING_EXIT 0x71
#define NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_ENTRY 0x72
#define NPKIT_EVENT_REDUCE_SCATTER_RING_SEND_EXIT 0x73
#define NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_ENTRY 0x74
#define NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_SEND_EXIT 0x75
#define NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_ENTRY 0x76
#define NPKIT_EVENT_REDUCE_SCATTER_RING_RECV_REDUCE_COPY_EXIT 0x77
#endif