Added NPKit support to the all_gather run ring algorithm (#790)

This commit is contained in:
akolliasAMD
2023-06-29 13:59:54 -06:00
committed by GitHub
parent 2e00fa4f09
commit 9bba4a2f2a
3 changed files with 78 additions and 0 deletions
+8
View File
@@ -301,6 +301,14 @@ if ($npkit_enabled); then
-DENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT \
-DENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY \
-DENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT \
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY \
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT \
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY \
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT \
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY \
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT \
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY \
-DENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT \
-DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME"
fi
+60
View File
@@ -48,6 +48,13 @@ namespace {
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
@@ -82,12 +89,35 @@ namespace {
rankDest = ringRanks[0];
offset = chunkOffset + rankDest * size;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
if (inputBuf + chunkOffset == outputBuf + offset) { // In place
prims.directSend(chunkOffset, offset, nelem);
} else {
prims.directCopySend(chunkOffset, offset, offset, nelem);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ringRanks[nranks-j];
@@ -96,13 +126,43 @@ namespace {
prims.directRecvCopySend(offset, offset, nelem);
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
// Make final copy from buffer to dest.
rankDest = ringRanks[1];
offset = chunkOffset + rankDest * size;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
// Final wait/copy.
prims.directRecv(offset, nelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_EXIT, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
}
+10
View File
@@ -100,4 +100,14 @@
#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4F
// NPKit event type IDs for the all_gather ring path (0x50-0x57).
// IDs are allocated in ENTRY/EXIT pairs: the even value marks the start of a
// phase and the following odd value marks its end. Each ID is passed as the
// event type argument to NpKit::CollectGpuEvent, and an event is only emitted
// when both ENABLE_NPKIT and the matching ENABLE_<event name> macro are defined.

// Whole ring all_gather invocation.
#define NPKIT_EVENT_ALL_GATHER_RING_ENTRY 0x50
#define NPKIT_EVENT_ALL_GATHER_RING_EXIT 0x51
// Initial send step (directSend / directCopySend).
#define NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY 0x52
#define NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT 0x53
// Intermediate receive-copy-send steps (directRecvCopySend loop); only
// recorded when nranks > 2.
#define NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY 0x54
#define NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT 0x55
// Final receive step (directRecv).
#define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY 0x56
#define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT 0x57
#endif