NPkit: misc fixes for MSCCL (#936)
* msccl: add xcc_id to timestamp sync * NPKit: add timestamp for rrc operator * NPKit: add timestamp for MSCCL init
This commit is contained in:
@@ -328,6 +328,10 @@ if ($npkit_enabled); then
|
|||||||
-DENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT \
|
-DENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT \
|
||||||
-DENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY \
|
-DENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY \
|
||||||
-DENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT \
|
-DENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT \
|
||||||
|
-DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY \
|
||||||
|
-DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT \
|
||||||
|
-DENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY \
|
||||||
|
-DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT \
|
||||||
-DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME"
|
-DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -135,6 +135,12 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
|||||||
const int bid = blockIdx.x;
|
const int bid = blockIdx.x;
|
||||||
const int nthreads = NCCL_MAX_NTHREADS;
|
const int nthreads = NCCL_MAX_NTHREADS;
|
||||||
|
|
||||||
|
#if defined(ENABLE_NPKIT)
|
||||||
|
uint64_t timestamp_entry = 0;
|
||||||
|
if (tid == 0) {
|
||||||
|
timestamp_entry = NPKIT_GET_GPU_TIMESTAMP();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// initialize mscclShmem.mscclTB
|
// initialize mscclShmem.mscclTB
|
||||||
threadBlockCopy(
|
threadBlockCopy(
|
||||||
(uint64_t *)&mscclShmem.mscclTB, (uint64_t *)(algo->mscclTBs + bid),
|
(uint64_t *)&mscclShmem.mscclTB, (uint64_t *)(algo->mscclTBs + bid),
|
||||||
@@ -179,7 +185,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
|||||||
|
|
||||||
#if defined(ENABLE_NPKIT)
|
#if defined(ENABLE_NPKIT)
|
||||||
int npKitCtxIdx = bid;
|
int npKitCtxIdx = bid;
|
||||||
if (tid == 0) ncclShmem.event_buffer_head = 0;
|
int xcc_id = 0;
|
||||||
|
if (tid == 0) {
|
||||||
|
ncclShmem.event_buffer_head = 0;
|
||||||
|
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
||||||
|
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
__synclds(); // publish shmem
|
__synclds(); // publish shmem
|
||||||
if (tid == 0)
|
if (tid == 0)
|
||||||
@@ -188,13 +200,13 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
|||||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp);
|
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_CPU, 0, xcc_id, *cpuTimestamp);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP());
|
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_GPU, 0, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -250,11 +262,19 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
|||||||
|
|
||||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY)
|
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY)
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
int xcc_id = 0;
|
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, timestamp_entry);
|
||||||
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
|
}
|
||||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
|
|
||||||
#endif
|
#endif
|
||||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
|
|
||||||
|
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY)
|
||||||
|
if (tid == 0) {
|
||||||
|
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_INIT_ENTRY, 0, xcc_id, timestamp_entry);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT)
|
||||||
|
if (tid == 0) {
|
||||||
|
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_INIT_EXIT, 0, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -412,8 +432,19 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
|||||||
prims.recvReduceSend(srcOffset, thisNelem);
|
prims.recvReduceSend(srcOffset, thisNelem);
|
||||||
else if (t->type == MSCCL_RECV_REDUCE_COPY_SEND)
|
else if (t->type == MSCCL_RECV_REDUCE_COPY_SEND)
|
||||||
prims.recvReduceCopySend(srcOffset, dstOffset, thisNelem);
|
prims.recvReduceCopySend(srcOffset, dstOffset, thisNelem);
|
||||||
else if (t->type == MSCCL_RECV_REDUCE_COPY)
|
else if (t->type == MSCCL_RECV_REDUCE_COPY) {
|
||||||
|
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY)
|
||||||
|
if (tid == 0) {
|
||||||
|
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
|
||||||
|
}
|
||||||
|
#endif
|
||||||
prims.recvReduceCopy(srcOffset, dstOffset, thisNelem);
|
prims.recvReduceCopy(srcOffset, dstOffset, thisNelem);
|
||||||
|
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT)
|
||||||
|
if (tid == 0) {
|
||||||
|
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
else if (t->type == MSCCL_LOCAL_COPY)
|
else if (t->type == MSCCL_LOCAL_COPY)
|
||||||
prims.localCopy(srcPointer+srcOffset, dstPointer+dstOffset, thisNelem);
|
prims.localCopy(srcPointer+srcOffset, dstPointer+dstOffset, thisNelem);
|
||||||
else
|
else
|
||||||
@@ -427,7 +458,7 @@ __device__ __forceinline__ void mscclRunInterpreter(
|
|||||||
}
|
}
|
||||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT)
|
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT)
|
||||||
if (tid == 0) {
|
if (tid == 0) {
|
||||||
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, 0, NPKIT_GET_GPU_TIMESTAMP());
|
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(ENABLE_NPKIT)
|
#if defined(ENABLE_NPKIT)
|
||||||
|
|||||||
@@ -122,5 +122,9 @@
|
|||||||
#define NPKIT_EVENT_MSCCL_RECV_EXIT 0x61
|
#define NPKIT_EVENT_MSCCL_RECV_EXIT 0x61
|
||||||
#define NPKIT_EVENT_MSCCL_RUN_ENTRY 0x62
|
#define NPKIT_EVENT_MSCCL_RUN_ENTRY 0x62
|
||||||
#define NPKIT_EVENT_MSCCL_RUN_EXIT 0x63
|
#define NPKIT_EVENT_MSCCL_RUN_EXIT 0x63
|
||||||
|
#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY 0x64
|
||||||
|
#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT 0x65
|
||||||
|
#define NPKIT_EVENT_MSCCL_INIT_ENTRY 0x66
|
||||||
|
#define NPKIT_EVENT_MSCCL_INIT_EXIT 0x67
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Referens i nytt ärende
Block a user