NPkit: misc fixes for MSCCL (#936)

* msccl: add xcc_id to timestamp sync

* NPKit: add timestamp for rrc operator

* NPKit: add timestamp for MSCCL init
Dieser Commit ist enthalten in:
Wenkai Du
2023-10-30 10:00:12 -07:00
committet von GitHub
Ursprung 1e5ca6820b
Commit a497722894
3 geänderte Dateien mit 48 neuen und 9 gelöschten Zeilen
+4
Datei anzeigen
@@ -328,6 +328,10 @@ if ($npkit_enabled); then
-DENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT \
-DENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY \
-DENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT \
-DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY \
-DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT \
-DENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY \
-DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT \
-DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME"
fi
+40 -9
Datei anzeigen
@@ -135,6 +135,12 @@ __device__ __forceinline__ void mscclRunInterpreter(
const int bid = blockIdx.x;
const int nthreads = NCCL_MAX_NTHREADS;
#if defined(ENABLE_NPKIT)
uint64_t timestamp_entry = 0;
if (tid == 0) {
timestamp_entry = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
// initialize mscclShmem.mscclTB
threadBlockCopy(
(uint64_t *)&mscclShmem.mscclTB, (uint64_t *)(algo->mscclTBs + bid),
@@ -179,7 +185,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = bid;
if (tid == 0) ncclShmem.event_buffer_head = 0;
int xcc_id = 0;
if (tid == 0) {
ncclShmem.event_buffer_head = 0;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
#endif
}
#endif
__synclds(); // publish shmem
if (tid == 0)
@@ -188,13 +200,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp);
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_CPU, 0, xcc_id, *cpuTimestamp);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP());
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_GPU, 0, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
@@ -250,11 +262,19 @@ __device__ __forceinline__ void mscclRunInterpreter(
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY)
if (tid == 0) {
int xcc_id = 0;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, timestamp_entry);
}
#endif
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_INIT_ENTRY, 0, xcc_id, timestamp_entry);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_INIT_EXIT, 0, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
@@ -412,8 +432,19 @@ __device__ __forceinline__ void mscclRunInterpreter(
prims.recvReduceSend(srcOffset, thisNelem);
else if (t->type == MSCCL_RECV_REDUCE_COPY_SEND)
prims.recvReduceCopySend(srcOffset, dstOffset, thisNelem);
else if (t->type == MSCCL_RECV_REDUCE_COPY)
else if (t->type == MSCCL_RECV_REDUCE_COPY) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
prims.recvReduceCopy(srcOffset, dstOffset, thisNelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
}
else if (t->type == MSCCL_LOCAL_COPY)
prims.localCopy(srcPointer+srcOffset, dstPointer+dstOffset, thisNelem);
else
@@ -427,7 +458,7 @@ __device__ __forceinline__ void mscclRunInterpreter(
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, 0, NPKIT_GET_GPU_TIMESTAMP());
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
#if defined(ENABLE_NPKIT)
+4
Datei anzeigen
@@ -122,5 +122,9 @@
#define NPKIT_EVENT_MSCCL_RECV_EXIT 0x61
#define NPKIT_EVENT_MSCCL_RUN_ENTRY 0x62
#define NPKIT_EVENT_MSCCL_RUN_EXIT 0x63
#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY 0x64
#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT 0x65
#define NPKIT_EVENT_MSCCL_INIT_ENTRY 0x66
#define NPKIT_EVENT_MSCCL_INIT_EXIT 0x67
#endif