NPkit: misc fixes for MSCCL (#936)
* msccl: add xcc_id to timestamp sync * NPKit: add timestamp for rrc operator * NPKit: add timestamp for MSCCL init
このコミットが含まれているのは:
@@ -328,6 +328,10 @@ if ($npkit_enabled); then
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY \
|
||||
-DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT \
|
||||
-DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME"
|
||||
fi
|
||||
|
||||
|
||||
@@ -135,6 +135,12 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
const int bid = blockIdx.x;
|
||||
const int nthreads = NCCL_MAX_NTHREADS;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
uint64_t timestamp_entry = 0;
|
||||
if (tid == 0) {
|
||||
timestamp_entry = NPKIT_GET_GPU_TIMESTAMP();
|
||||
}
|
||||
#endif
|
||||
// initialize mscclShmem.mscclTB
|
||||
threadBlockCopy(
|
||||
(uint64_t *)&mscclShmem.mscclTB, (uint64_t *)(algo->mscclTBs + bid),
|
||||
@@ -179,7 +185,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
int npKitCtxIdx = bid;
|
||||
if (tid == 0) ncclShmem.event_buffer_head = 0;
|
||||
int xcc_id = 0;
|
||||
if (tid == 0) {
|
||||
ncclShmem.event_buffer_head = 0;
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
__synclds(); // publish shmem
|
||||
if (tid == 0)
|
||||
@@ -188,13 +200,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (tid == 0) {
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp);
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_CPU, 0, xcc_id, *cpuTimestamp);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP());
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_GPU, 0, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -250,11 +262,19 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY)
|
||||
if (tid == 0) {
|
||||
int xcc_id = 0;
|
||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, timestamp_entry);
|
||||
}
|
||||
#endif
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_INIT_ENTRY, 0, xcc_id, timestamp_entry);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_INIT_EXIT, 0, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -412,8 +432,19 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
prims.recvReduceSend(srcOffset, thisNelem);
|
||||
else if (t->type == MSCCL_RECV_REDUCE_COPY_SEND)
|
||||
prims.recvReduceCopySend(srcOffset, dstOffset, thisNelem);
|
||||
else if (t->type == MSCCL_RECV_REDUCE_COPY)
|
||||
else if (t->type == MSCCL_RECV_REDUCE_COPY) {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
|
||||
}
|
||||
#endif
|
||||
prims.recvReduceCopy(srcOffset, dstOffset, thisNelem);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if (t->type == MSCCL_LOCAL_COPY)
|
||||
prims.localCopy(srcPointer+srcOffset, dstPointer+dstOffset, thisNelem);
|
||||
else
|
||||
@@ -427,7 +458,7 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
||||
}
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, 0, NPKIT_GET_GPU_TIMESTAMP());
|
||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
|
||||
}
|
||||
#endif
|
||||
#if defined(ENABLE_NPKIT)
|
||||
|
||||
@@ -122,5 +122,9 @@
|
||||
#define NPKIT_EVENT_MSCCL_RECV_EXIT 0x61
|
||||
#define NPKIT_EVENT_MSCCL_RUN_ENTRY 0x62
|
||||
#define NPKIT_EVENT_MSCCL_RUN_EXIT 0x63
|
||||
#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY 0x64
|
||||
#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT 0x65
|
||||
#define NPKIT_EVENT_MSCCL_INIT_ENTRY 0x66
|
||||
#define NPKIT_EVENT_MSCCL_INIT_EXIT 0x67
|
||||
|
||||
#endif
|
||||
|
||||
新しいイシューから参照
ユーザーをブロックする