Added NPKit support to the all_gather run ring algorithm (#790)
このコミットが含まれているのは:
@@ -301,6 +301,14 @@ if ($npkit_enabled); then
|
||||
-DENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT \
|
||||
-DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME"
|
||||
fi
|
||||
|
||||
|
||||
@@ -48,6 +48,13 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
T *inputBuf = (T*)args->sendbuff;
|
||||
T *outputBuf = (T*)args->recvbuff;
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
@@ -82,12 +89,35 @@ namespace {
|
||||
rankDest = ringRanks[0];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (inputBuf + chunkOffset == outputBuf + offset) { // In place
|
||||
prims.directSend(chunkOffset, offset, nelem);
|
||||
} else {
|
||||
prims.directCopySend(chunkOffset, offset, offset, nelem);
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY)
|
||||
if (tid == 0 && nranks > 2) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
rankDest = ringRanks[nranks-j];
|
||||
@@ -96,13 +126,43 @@ namespace {
|
||||
prims.directRecvCopySend(offset, offset, nelem);
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT)
|
||||
if (tid == 0 && nranks > 2) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
rankDest = ringRanks[1];
|
||||
offset = chunkOffset + rankDest * size;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
// Final wait/copy.
|
||||
prims.directRecv(offset, nelem);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
}
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_EXIT, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -100,4 +100,14 @@
|
||||
|
||||
#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4F
|
||||
|
||||
// NPKit event type IDs for the ring all-gather kernel. Each ID is passed as the
// first argument to NpKit::CollectGpuEvent by the instrumented kernel code.
// IDs come in ENTRY/EXIT pairs and continue the numbering after
// NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME (0x4F).

// Whole ring all-gather invocation (entry/exit of the instrumented function).
#define NPKIT_EVENT_ALL_GATHER_RING_ENTRY 0x50
|
||||
#define NPKIT_EVENT_ALL_GATHER_RING_EXIT 0x51
|
||||
// Initial send step (brackets the directSend / directCopySend call).
#define NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY 0x52
|
||||
#define NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT 0x53
|
||||
// Intermediate receive-copy-send steps (brackets the directRecvCopySend loop;
// the kernel only records these when nranks > 2).
#define NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY 0x54
|
||||
#define NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT 0x55
|
||||
// Final receive step (brackets the directRecv call).
#define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY 0x56
|
||||
#define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT 0x57
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
新しいイシューから参照
ユーザーをブロックする