Add Feature - Add NPKit Support in RCCL (#564)
* apply npkit * fix bug * add npkit in readme
This commit is contained in:
@@ -188,6 +188,7 @@ set(CC_SOURCES
|
||||
src/misc/nvmlwrap_stub.cc
|
||||
src/misc/rocm_smi_wrap.cc
|
||||
src/misc/profiler.cc
|
||||
src/misc/npkit.cc
|
||||
src/misc/shmutils.cc
|
||||
src/misc/signals.cc # RCCL
|
||||
src/misc/socket.cc
|
||||
@@ -220,6 +221,10 @@ if(PROFILE)
|
||||
add_definitions(-DENABLE_PROFILING)
|
||||
endif()
|
||||
|
||||
# Optional NPKit profiling support: when the user configures with
# -DNPKIT_FLAGS="...", forward those flags (e.g. -DENABLE_NPKIT and the
# ENABLE_NPKIT_* switches described in the README) to the compiler.
if(NPKIT_FLAGS)
|
||||
add_definitions(${NPKIT_FLAGS})
|
||||
endif()
|
||||
|
||||
set(COLLTRACE 1 CACHE BOOL "Collective Trace Option")
|
||||
if(COLLTRACE)
|
||||
add_definitions(-DENABLE_COLLTRACE)
|
||||
|
||||
@@ -83,6 +83,18 @@ will run only AllReduce correctness tests with float32 datatype. See "Running a
|
||||
There are also other performance and error-checking tests for RCCL. These are maintained separately at https://github.com/ROCmSoftwarePlatform/rccl-tests.
|
||||
See the rccl-tests README for more information on how to build and run those tests.
|
||||
|
||||
## NPKit
|
||||
|
||||
RCCL integrates [NPKit](https://github.com/microsoft/npkit), a profiling framework that enables the collection of fine-grained trace events in RCCL components, especially in giant collective GPU kernels.
|
||||
|
||||
See the [NPKit sample workflow for RCCL](https://github.com/microsoft/NPKit/tree/main/rccl_samples) for a fully automated usage example. It also provides good templates for the manual instructions below.
|
||||
|
||||
To manually build RCCL with NPKit enabled, pass `-DNPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_...(other NPKit compile-time switches)"` to the `cmake` command. All NPKit compile-time switches are declared in the RCCL code base as macros with the prefix `ENABLE_NPKIT_`, and they control which information is collected. Also note that NPKit currently only supports collecting non-overlapping events on the GPU, so the switches passed via `-DNPKIT_FLAGS` must respect this restriction.
|
||||
|
||||
To manually run RCCL with NPKit enabled, set the environment variable `NPKIT_DUMP_DIR` to the directory where NPKit should dump its events. Also note that NPKit currently only supports one GPU per process.
|
||||
|
||||
To manually analyze NPKit dump results, use [npkit_trace_generator.py](https://github.com/microsoft/NPKit/blob/main/rccl_samples/npkit_trace_generator.py).
|
||||
|
||||
## Library and API Documentation
|
||||
|
||||
Please refer to the [Library documentation](https://rccl.readthedocs.io/) for current documentation.
|
||||
|
||||
@@ -10,6 +10,10 @@
|
||||
#include "primitives.h"
|
||||
//#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
@@ -29,6 +33,32 @@ namespace {
|
||||
#endif
|
||||
const ssize_t size = args->count;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
int npKitCtxIdx = bid;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (tid == 0) {
|
||||
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
int minChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_LL)
|
||||
minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T));
|
||||
@@ -42,6 +72,12 @@ namespace {
|
||||
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, args->connIndex << 16);
|
||||
ACCUMULATE_PRIM_COUNTER(prim);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (tid == 0) {
|
||||
prims.npKitCtxIdx = npKitCtxIdx;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t realChunkSize;
|
||||
if (Proto::Id == NCCL_PROTO_SIMPLE) {
|
||||
@@ -70,11 +106,36 @@ namespace {
|
||||
chunk = modRanks(ringIx + nranks-1);
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.send(offset, nelem);
|
||||
ACCUMULATE_COUNTER(send);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
// k-2 steps: reduce and copy to next GPU
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY)
|
||||
if (tid == 0 && nranks > 2) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int j=2; j<nranks; ++j) {
|
||||
chunk = modRanks(ringIx + nranks-j);
|
||||
offset = calcOffset(chunk);
|
||||
@@ -84,15 +145,46 @@ namespace {
|
||||
ACCUMULATE_COUNTER(recvReduceSend);
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT)
|
||||
if (tid == 0 && nranks > 2) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
// step k-1: reduce this buffer and data, which will produce the final
|
||||
// result that we store in this data and push to the next GPU
|
||||
chunk = ringIx + 0;
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*postOp=*/true);
|
||||
ACCUMULATE_COUNTER(directRecvReduceCopySend);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY)
|
||||
if (tid == 0 && nranks > 2) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
// k-2 steps: copy to next GPU
|
||||
for (int j=1; j<nranks-1; ++j) {
|
||||
chunk = modRanks(ringIx + nranks-j);
|
||||
@@ -103,13 +195,37 @@ namespace {
|
||||
ACCUMULATE_COUNTER(directRecvCopySend);
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
|
||||
if (tid == 0 && nranks > 2) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
chunk = modRanks(ringIx + 1);
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
INIT_COUNTER;
|
||||
prims.directRecv(offset, nelem);
|
||||
ACCUMULATE_COUNTER(directRecv);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) {
|
||||
@@ -117,6 +233,14 @@ namespace {
|
||||
elem->elem[blockIdx.x].total_cycle += (__builtin_amdgcn_s_memrealtime() - clk);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
@@ -135,12 +259,53 @@ namespace {
|
||||
const ssize_t loopSize = int(nChannels*chunkSize);
|
||||
const ssize_t size = args->count;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
int npKitCtxIdx = bid;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (tid == 0) {
|
||||
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (loopSize > size)
|
||||
chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize);
|
||||
|
||||
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
|
||||
(tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (tid == 0) {
|
||||
prims.npKitCtxIdx = npKitCtxIdx;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tree->up == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
@@ -162,11 +327,34 @@ namespace {
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
{ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0> prims
|
||||
(tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (tid == 0) {
|
||||
prims.npKitCtxIdx = npKitCtxIdx;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tree->up == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
@@ -188,7 +376,23 @@ namespace {
|
||||
prims.directRecvCopySend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
@@ -217,6 +421,40 @@ namespace {
|
||||
nthreadsSplit = (nthreads*7/(10*WARP_SIZE))*WARP_SIZE;
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
bool isNpKitThread = false;
|
||||
int npKitCtxIdx = 0;
|
||||
if (threadIdx.x == 0) {
|
||||
isNpKitThread = true;
|
||||
npKitCtxIdx = bid * 2;
|
||||
} else if (tree->up != -1 && threadIdx.x == nthreadsSplit) {
|
||||
isNpKitThread = true;
|
||||
npKitCtxIdx = bid * 2 + 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (isNpKitThread) {
|
||||
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (loopSize > size)
|
||||
chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize);
|
||||
|
||||
@@ -224,11 +462,34 @@ namespace {
|
||||
// Reduce and broadcast. Max number of recv is 3, max number of send is 3
|
||||
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (isNpKitThread) {
|
||||
prims.npKitCtxIdx = npKitCtxIdx;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
int nelem = min(chunkSize, size-offset);
|
||||
prims.directRecvReduceCopySend(offset, offset, offset, nelem, /*doPost=*/true);
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
else if (tid < nthreadsSplit) {
|
||||
/* Reduce up. Max number of recv is 3, max number of send is 1 (binary tree + local).
|
||||
@@ -241,6 +502,21 @@ namespace {
|
||||
*/
|
||||
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (isNpKitThread) {
|
||||
prims.npKitCtxIdx = npKitCtxIdx;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tree->down[0] == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
@@ -255,11 +531,34 @@ namespace {
|
||||
prims.recvReduceSend(offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
else {
|
||||
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
|
||||
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
|
||||
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (isNpKitThread) {
|
||||
prims.npKitCtxIdx = npKitCtxIdx;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tree->down[0] == -1) {
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
ssize_t offset = gridOffset + bid*int(chunkSize);
|
||||
@@ -274,7 +573,23 @@ namespace {
|
||||
prims.directRecvCopySend(offset, offset, nelem);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,10 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
|
||||
template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
|
||||
class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>> {
|
||||
@@ -34,6 +38,22 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
union ncclLLFifoLine* recvBuff[MaxRecv];
|
||||
union ncclLLFifoLine* sendBuff[MaxSend];
|
||||
|
||||
// NPKit profiling state. Declared public so that the collective kernels that
// own this Primitives object can set the context index and read/reset the
// accumulated data-process time; access is switched back to private afterwards.
#if defined(ENABLE_NPKIT)
|
||||
public:
|
||||
// Index added to ncclShmem->comm.npKitEventCollectContexts to select the
// event-collect context that this object's events are written to.
int npKitCtxIdx = 0;
|
||||
// Timestamp (__builtin_amdgcn_s_memrealtime) taken when a generic LL op
// starts processing data.
uint64_t npKitDataProcessEntryTime = 0;
|
||||
// Timestamp taken when that generic LL op has finished processing data.
uint64_t npKitDataProcessExitTime = 0;
|
||||
// Running sum of (exit - entry - time spent waiting on receives); callers
// reset it to 0 before the phase they want to measure.
uint64_t npKitDataProcessTotalTime = 0;
|
||||
private:
|
||||
#endif
|
||||
|
||||
// Bookkeeping for the time spent polling for incoming LL data. Only compiled
// in when both PRIM_LL_DATA_PROCESS events are enabled or when data-process
// time collection is enabled.
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
// Payload size in bytes (nelem*sizeof(T)) reported with the data-process
// entry/exit events.
uint64_t npKitWaitRecvDataProcessSize = 0;
|
||||
// Timestamp taken just before a receive polling loop starts.
uint64_t npKitWaitRecvEntryTime = 0;
|
||||
// Timestamp taken right after the receive polling loop finishes.
uint64_t npKitWaitRecvExitTime = 0;
|
||||
// Accumulated wait time; each poll adds (exit - entry) scaled by
// (spins - 1) / spins. Reset to 0 at the start of each generic LL op.
uint64_t npKitWaitRecvTotalTime = 0;
|
||||
#endif
|
||||
|
||||
// Offset for receive peer i: its step counter wrapped modulo NCCL_STEPS,
// scaled by stepLines.
inline __device__ int recvOffset(int i) {
  const auto wrappedStep = recvStep[i] % NCCL_STEPS;
  return wrappedStep * stepLines;
}
|
||||
// Offset for send peer i: its step counter wrapped modulo NCCL_STEPS,
// scaled by stepLines.
inline __device__ int sendOffset(int i) {
  const auto wrappedStep = sendStep[i] % NCCL_STEPS;
  return wrappedStep * stepLines;
}
|
||||
// Pointer into receive buffer i, advanced by recvOffset(i) FIFO lines.
inline __device__ union ncclLLFifoLine* recvPtr(int i) {
  union ncclLLFifoLine* const base = recvBuff[i];
  return base + recvOffset(i);
}
|
||||
@@ -70,6 +90,12 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
}
|
||||
|
||||
inline __device__ void waitSend(int nbytes) {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY, nbytes, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
if (sendConnHeadPtr) {
|
||||
int spins = 0;
|
||||
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
|
||||
@@ -83,6 +109,12 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
sendConnHead += 1;
|
||||
}
|
||||
barrier();
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT, nbytes, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void incRecv(int i) {
|
||||
@@ -107,21 +139,43 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
uint32_t flag = recvFlag(i);
|
||||
uint32_t data1, flag1, data2, flag2;
|
||||
int spins = 0;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
int npkitWaitRecvSpins = 0;
|
||||
if (tid == 0) {
|
||||
npKitWaitRecvEntryTime = __builtin_amdgcn_s_memrealtime();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
union ncclLLFifoLine i4;
|
||||
do {
|
||||
i4.v[0] = __builtin_nontemporal_load(src->v);
|
||||
i4.v[1] = __builtin_nontemporal_load(src->v+1);
|
||||
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
npkitWaitRecvSpins++;
|
||||
#endif
|
||||
if (checkAbort(spins, 0)) break;
|
||||
} while ((i4.flag1 != flag) || (i4.flag2 != flag));
|
||||
uint64_t val64 = (uint64_t)(i4.data1) + (((uint64_t)i4.data2) << 32);
|
||||
#else
|
||||
do {
|
||||
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
|
||||
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
npkitWaitRecvSpins++;
|
||||
#endif
|
||||
if (checkAbort(spins, 0)) break;
|
||||
} while ((flag1 != flag) || (flag2 != flag));
|
||||
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
if (tid == 0) {
|
||||
npKitWaitRecvExitTime = __builtin_amdgcn_s_memrealtime();
|
||||
npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins;
|
||||
}
|
||||
#endif
|
||||
|
||||
return val64;
|
||||
}
|
||||
|
||||
@@ -144,16 +198,35 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
union ncclLLFifoLine* src = recvPtr(i) + offset;
|
||||
uint32_t flag = recvFlag(i);
|
||||
int spins = 0;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
int npkitWaitRecvSpins = 0;
|
||||
if (tid == 0) {
|
||||
npKitWaitRecvEntryTime = __builtin_amdgcn_s_memrealtime();
|
||||
}
|
||||
#endif
|
||||
|
||||
do {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
line[i].v[0] = __builtin_nontemporal_load(src->v);
|
||||
line[i].v[1] = __builtin_nontemporal_load(src->v+1);
|
||||
#else
|
||||
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
|
||||
#endif
|
||||
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
npkitWaitRecvSpins++;
|
||||
#endif
|
||||
if (checkAbort(spins, 0)) break;
|
||||
} while(line[i].flag1 != flag || line[i].flag2 != flag);
|
||||
uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
|
||||
if (tid == 0) {
|
||||
npKitWaitRecvExitTime = __builtin_amdgcn_s_memrealtime();
|
||||
npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins;
|
||||
}
|
||||
#endif
|
||||
|
||||
return val64;
|
||||
}
|
||||
|
||||
@@ -296,6 +369,22 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
nelem = nelem < 0 ? 0 : nelem;
|
||||
if (SEND) waitSend(divUp(nelem, EltPerLine)*sizeof(ncclLLFifoLine));
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT)
|
||||
if (tid == 0) {
|
||||
npKitWaitRecvTotalTime = 0;
|
||||
npKitWaitRecvDataProcessSize = nelem*sizeof(T);
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY,
|
||||
npKitWaitRecvDataProcessSize, 0, __builtin_amdgcn_s_memrealtime(), ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
if (tid == 0) {
|
||||
npKitWaitRecvTotalTime = 0;
|
||||
npKitDataProcessEntryTime = __builtin_amdgcn_s_memrealtime();
|
||||
}
|
||||
#endif
|
||||
|
||||
nelem -= tid*EltPerLine;
|
||||
srcElts += tid*EltPerLine;
|
||||
dstElts += tid*EltPerLine;
|
||||
@@ -344,6 +433,21 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
offset += nthreads;
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
if (tid == 0) {
|
||||
npKitDataProcessExitTime = __builtin_amdgcn_s_memrealtime();
|
||||
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime - npKitWaitRecvTotalTime;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT,
|
||||
npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (RECV) {
|
||||
for (int i=0; i < MaxRecv; i++) incRecv(i);
|
||||
postRecv();
|
||||
@@ -430,28 +534,124 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
|
||||
}
|
||||
|
||||
// LL-protocol send: forwards to LLGenericOp<0, 1, Input, -1> for eltN elements
// starting at input index inpIx.
//
// When NPKit is compiled in with ENABLE_NPKIT_EVENT_SEND_ENTRY / _EXIT, thread 0
// records an event right before and right after the operation. Each event
// carries the payload size in bytes (eltN*sizeof(T)) and a
// __builtin_amdgcn_s_memrealtime() timestamp, and is written to the context
// selected by npKitCtxIdx.
__device__ void send(intptr_t inpIx, int eltN) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
  // Called as a plain statement (not `return LLGenericOp...`) so the exit
  // event below stays reachable and the operation runs exactly once.
  LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
}
|
||||
// LL-protocol send sourced from the output buffer: forwards to
// LLGenericOp<0, 1, Output, -1> for eltN elements starting at output index outIx.
//
// When NPKit is compiled in with ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY /
// _EXIT, thread 0 records an event right before and right after the operation.
// Each event carries the payload size in bytes (eltN*sizeof(T)) and a
// __builtin_amdgcn_s_memrealtime() timestamp, and is written to the context
// selected by npKitCtxIdx.
__device__ void sendFromOutput(intptr_t outIx, int eltN) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
  // Called as a plain statement (not `return LLGenericOp...`) so the exit
  // event below stays reachable and the operation runs exactly once.
  LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
}
|
||||
// LL-protocol receive into the Output buffer: runs LLGenericOp<1, 0, -1, Output> on eltN
// elements starting at outIx, forwarding postOp unchanged. With NPKit enabled, thread 0
// records an ENTRY event before and an EXIT event after the operation (payload:
// eltN*sizeof(T) bytes, rsvd 0, current __builtin_amdgcn_s_memrealtime() value).
__device__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_ENTRY)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
  // Invoked as a plain statement (not `return`ed) so the EXIT probe below is still reached.
  LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
}
|
||||
// LL-protocol receive + reduce with the Input buffer + send: runs
// LLGenericOp<1, 1, Input, -1> on eltN elements starting at inpIx. With NPKit enabled,
// thread 0 records an ENTRY event before and an EXIT event after the operation (payload:
// eltN*sizeof(T) bytes, rsvd 0, current __builtin_amdgcn_s_memrealtime() value).
__device__ void recvReduceSend(intptr_t inpIx, int eltN) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
  // Invoked as a plain statement (not `return`ed) so the EXIT probe below is still reached.
  LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
}
|
||||
// LL-protocol receive + reduce with Input + store to Output: runs
// LLGenericOp<1, 0, Input, Output> on eltN elements (source offset inpIx, destination
// offset outIx), forwarding postOp unchanged. With NPKit enabled, thread 0 records an
// ENTRY event before and an EXIT event after the operation (payload: eltN*sizeof(T)
// bytes, rsvd 0, current __builtin_amdgcn_s_memrealtime() value).
__device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
  // Invoked as a plain statement (not `return`ed) so the EXIT probe below is still reached.
  LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
}
|
||||
// LL-protocol copy Input -> Output plus send: runs LLGenericOp<0, 1, Input, Output> on
// eltN elements (source offset inpIx, destination offset outIx), forwarding postOp
// unchanged. With NPKit enabled, thread 0 records an ENTRY event before and an EXIT event
// after the operation (payload: eltN*sizeof(T) bytes, rsvd 0, current
// __builtin_amdgcn_s_memrealtime() value).
__device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
  // Invoked as a plain statement (not `return`ed) so the EXIT probe below is still reached.
  LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
}
|
||||
// LL-protocol receive + store to Output + forward: runs LLGenericOp<1, 1, -1, Output> on
// eltN elements starting at outIx, forwarding postOp unchanged. With NPKit enabled,
// thread 0 records an ENTRY event before and an EXIT event after the operation (payload:
// eltN*sizeof(T) bytes, rsvd 0, current __builtin_amdgcn_s_memrealtime() value).
__device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
  // Invoked as a plain statement (not `return`ed) so the EXIT probe below is still reached.
  LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
}
|
||||
// LL-protocol receive + reduce with Input + store to Output + forward: runs
// LLGenericOp<1, 1, Input, Output> on eltN elements (source offset inpIx, destination
// offset outIx), forwarding postOp unchanged. With NPKit enabled, thread 0 records an
// ENTRY event before and an EXIT event after the operation (payload: eltN*sizeof(T)
// bytes, rsvd 0, current __builtin_amdgcn_s_memrealtime() value).
__device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
  // Invoked as a plain statement (not `return`ed) so the EXIT probe below is still reached.
  LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT)
  if (tid == 0) {
    NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
        ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
  }
#endif
}
|
||||
__device__ void recvSend(int eltN) {
|
||||
return LLGenericOp<1, 1, -1, -1>(-1, -1, eltN, false);
|
||||
|
||||
@@ -5,6 +5,10 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
|
||||
template<typename T, typename RedOp, typename Fan, int Direct,
|
||||
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p>
|
||||
class Primitives<
|
||||
@@ -49,6 +53,15 @@ class Primitives<
|
||||
const uint64_t opCount;
|
||||
uint32_t* next_hdp_reg;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
public:
|
||||
int npKitCtxIdx = 0;
|
||||
uint64_t npKitDataProcessEntryTime = 0;
|
||||
uint64_t npKitDataProcessExitTime = 0;
|
||||
uint64_t npKitDataProcessTotalTime = 0;
|
||||
private:
|
||||
#endif
|
||||
|
||||
// Don't use barrier 0 as it's used by the final sync
|
||||
inline __device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
@@ -206,21 +219,93 @@ class Primitives<
|
||||
if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
|
||||
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
|
||||
if (Send) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
if (tid == 0) {
|
||||
npKitDataProcessEntryTime = __builtin_amdgcn_s_memrealtime();
|
||||
}
|
||||
#endif
|
||||
|
||||
// (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0).
|
||||
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, (1-Send)+MaxSend, 0>
|
||||
(tid, nworkers, nullptr, false,
|
||||
1, (T const**)ncclShmem->groups[group].srcs,
|
||||
fan.nsend(), (T**)ncclShmem->groups[group].dsts+1,
|
||||
sliceSize);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
if (tid == 0) {
|
||||
npKitDataProcessExitTime = __builtin_amdgcn_s_memrealtime();
|
||||
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem->groups[group].dsts[Dst] == nullptr) {
|
||||
// For broadcast in CollNet to do empty send
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
if (tid == 0) {
|
||||
npKitDataProcessEntryTime = __builtin_amdgcn_s_memrealtime();
|
||||
}
|
||||
#endif
|
||||
|
||||
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
|
||||
(tid, nworkers, ncclShmem->redOpArgs, postOp,
|
||||
Recv, (T const**)ncclShmem->groups[group].srcs,
|
||||
Dst, (T**)ncclShmem->groups[group].dsts,
|
||||
sliceSize);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
if (tid == 0) {
|
||||
npKitDataProcessExitTime = __builtin_amdgcn_s_memrealtime();
|
||||
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
} else {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
if (tid == 0) {
|
||||
npKitDataProcessEntryTime = __builtin_amdgcn_s_memrealtime();
|
||||
}
|
||||
#endif
|
||||
|
||||
constexpr int PreOpN = SrcBuf != Input ? 0 :
|
||||
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
|
||||
ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpN>
|
||||
@@ -228,6 +313,21 @@ class Primitives<
|
||||
Recv*fan.nrecv()+Src, (T const**)ncclShmem->groups[group].srcs,
|
||||
Send*fan.nsend()+Dst, (T**)ncclShmem->groups[group].dsts,
|
||||
sliceSize);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
if (tid == 0) {
|
||||
npKitDataProcessExitTime = __builtin_amdgcn_s_memrealtime();
|
||||
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
barrier(); // This barrier has a counterpart in following loop
|
||||
#if defined(__gfx1030__)
|
||||
|
||||
@@ -8,14 +8,68 @@
|
||||
#include "devcomm.h"
|
||||
#include "collectives.h"
|
||||
#include "primitives.h"
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
bool isNpKitThread = (tid == 0);
|
||||
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (isNpKitThread) {
|
||||
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (args->peer == ncclShmem->comm.rank) {
|
||||
struct ncclWorkElemP2p* recvArgs = args-1;
|
||||
if (args->buff != recvArgs->buff) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
} else {
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
@@ -24,16 +78,59 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
|
||||
(tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (isNpKitThread) {
|
||||
prims.npKitCtxIdx = npKitCtxIdx;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
ssize_t offset = 0;
|
||||
do {
|
||||
int nelem = min(chunkSize, count-offset);
|
||||
prims.directSend(offset, offset, nelem);
|
||||
offset += nelem;
|
||||
} while(offset < count);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
|
||||
#if defined(ENABLE_NPKIT)
|
||||
bool isNpKitThread = (tid == 0);
|
||||
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + 1;
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (isNpKitThread) {
|
||||
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (args->peer != ncclShmem->comm.rank) {
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
ssize_t const count = args->count;
|
||||
@@ -41,12 +138,35 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
|
||||
(tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (isNpKitThread) {
|
||||
prims.npKitCtxIdx = npKitCtxIdx;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
ssize_t offset = 0;
|
||||
do {
|
||||
int nelem = min(chunkSize, count-offset);
|
||||
prims.directRecv(offset, nelem);
|
||||
offset += nelem;
|
||||
} while(offset < count);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -11,6 +11,9 @@
|
||||
#include "nccl.h"
|
||||
#include "rccl_bfloat16.h"
|
||||
#include "align.h"
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit_struct.h"
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
// [RCCL] Support for clique-based kernels
|
||||
//#include "clique/CliqueCommon.h"
|
||||
@@ -419,6 +422,11 @@ struct ncclDevComm {
|
||||
// Channels, device side
|
||||
struct ncclChannel* channels;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
NpKitEventCollectContext* npKitEventCollectContexts;
|
||||
uint64_t* cpuTimestamp;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
// Profiling counters
|
||||
struct ncclProf devProf;
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
#ifndef NPKIT_H_
|
||||
#define NPKIT_H_
|
||||
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include "npkit/npkit_event.h"
|
||||
#include "npkit/npkit_struct.h"
|
||||
|
||||
class NpKit {
|
||||
public:
|
||||
static const uint64_t kNumGpuEventBuffers = 512;
|
||||
|
||||
static const uint64_t kNumCpuEventBuffers = 32;
|
||||
|
||||
static ncclResult_t Init(int rank);
|
||||
|
||||
static ncclResult_t Dump(const std::string& dump_dir);
|
||||
|
||||
static ncclResult_t Shutdown();
|
||||
|
||||
static NpKitEventCollectContext* GetGpuEventCollectContexts();
|
||||
|
||||
static inline __device__ void CollectGpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp,
|
||||
NpKitEventCollectContext* ctx) {
|
||||
uint64_t event_buffer_head = ctx->event_buffer_head;
|
||||
if (event_buffer_head < kMaxNumGpuEventsPerBuffer) {
|
||||
NpKitEvent& event = ctx->event_buffer[event_buffer_head];
|
||||
event.fields.type = type;
|
||||
event.fields.size = size;
|
||||
event.fields.rsvd = rsvd;
|
||||
event.fields.timestamp = timestamp;
|
||||
ctx->event_buffer_head++;
|
||||
}
|
||||
}
|
||||
|
||||
static void CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id);
|
||||
|
||||
static uint64_t* GetCpuTimestamp();
|
||||
|
||||
private:
|
||||
static void CpuTimestampUpdateThread();
|
||||
|
||||
// 64K * 512 * 16B = 512MB per GPU
|
||||
static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 16;
|
||||
|
||||
// 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU
|
||||
static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;
|
||||
|
||||
static NpKitEvent** gpu_event_buffers_;
|
||||
static NpKitEvent** cpu_event_buffers_;
|
||||
|
||||
static NpKitEventCollectContext* gpu_collect_contexts_;
|
||||
static NpKitEventCollectContext* cpu_collect_contexts_;
|
||||
static uint64_t* cpu_timestamp_;
|
||||
|
||||
static uint64_t rank_;
|
||||
|
||||
static std::thread* cpu_timestamp_update_thread_;
|
||||
static volatile bool cpu_timestamp_update_thread_should_stop_;
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,98 @@
|
||||
#ifndef NPKIT_EVENT_H_
|
||||
#define NPKIT_EVENT_H_
|
||||
|
||||
// NPKit event type identifiers. The values are consecutive (0x0 .. 0x4F) and are stored
// in the 8-bit `type` field of NpKitEvent, so every ID must stay below 0x100. Changing a
// value changes the meaning of previously dumped traces.
#define NPKIT_EVENT_INVALID 0x0
|
||||
|
||||
// AllReduce ENTRY/EXIT pairs, one per algorithm variant named in the identifier.
#define NPKIT_EVENT_ALL_REDUCE_RING_ENTRY 0x1
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_EXIT 0x2
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY 0x3
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT 0x4
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY 0x5
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT 0x6
|
||||
|
||||
// ENTRY/EXIT pairs named after the Primitives methods (send, recv, copySend, ...);
// the non-DIRECT ones are emitted by the LL Primitives wrappers of the same name.
#define NPKIT_EVENT_COPY_SEND_ENTRY 0x7
|
||||
#define NPKIT_EVENT_COPY_SEND_EXIT 0x8
|
||||
#define NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY 0x9
|
||||
#define NPKIT_EVENT_DIRECT_COPY_SEND_EXIT 0xA
|
||||
#define NPKIT_EVENT_DIRECT_RECV_ENTRY 0xB
|
||||
#define NPKIT_EVENT_DIRECT_RECV_EXIT 0xC
|
||||
#define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY 0xD
|
||||
#define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT 0xE
|
||||
#define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0xF
|
||||
#define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x10
|
||||
#define NPKIT_EVENT_DIRECT_SEND_ENTRY 0x11
|
||||
#define NPKIT_EVENT_DIRECT_SEND_EXIT 0x12
|
||||
#define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY 0x13
|
||||
#define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT 0x14
|
||||
#define NPKIT_EVENT_RECV_ENTRY 0x15
|
||||
#define NPKIT_EVENT_RECV_EXIT 0x16
|
||||
#define NPKIT_EVENT_RECV_COPY_SEND_ENTRY 0x17
|
||||
#define NPKIT_EVENT_RECV_COPY_SEND_EXIT 0x18
|
||||
#define NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY 0x19
|
||||
#define NPKIT_EVENT_RECV_REDUCE_COPY_EXIT 0x1A
|
||||
#define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY 0x1B
|
||||
#define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT 0x1C
|
||||
#define NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY 0x1D
|
||||
#define NPKIT_EVENT_RECV_REDUCE_SEND_EXIT 0x1E
|
||||
#define NPKIT_EVENT_SEND_ENTRY 0x1F
|
||||
#define NPKIT_EVENT_SEND_EXIT 0x20
|
||||
#define NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY 0x21
|
||||
#define NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT 0x22
|
||||
|
||||
// Simple-protocol internals; the REDUCE_OR_COPY_MULTI pair brackets ReduceOrCopyMulti calls.
#define NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY 0x23
|
||||
#define NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT 0x24
|
||||
#define NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY 0x25
|
||||
#define NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT 0x26
|
||||
|
||||
// LL-protocol internals (DATA_PROCESS_EXIT is emitted at the end of LLGenericOp's data loop).
#define NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY 0x27
|
||||
#define NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT 0x28
|
||||
#define NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY 0x29
|
||||
#define NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT 0x2A
|
||||
|
||||
// LL128-protocol counterparts of the LL group above (by name).
#define NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY 0x2B
|
||||
#define NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT 0x2C
|
||||
#define NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY 0x2D
|
||||
#define NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT 0x2E
|
||||
|
||||
// Network send events (presumably recorded on the host proxy side -- confirm in proxy code).
#define NPKIT_EVENT_NET_SEND_ENTRY 0x2F
|
||||
#define NPKIT_EVENT_NET_SEND_EXIT 0x30
|
||||
|
||||
// Network receive events (presumably recorded on the host proxy side -- confirm in proxy code).
#define NPKIT_EVENT_NET_RECV_ENTRY 0x31
|
||||
#define NPKIT_EVENT_NET_RECV_EXIT 0x32
|
||||
|
||||
// Clock-correlation markers: TIME_SYNC_GPU carries the GPU realtime counter, TIME_SYNC_CPU
// carries the value read from the host-published CPU timestamp word.
#define NPKIT_EVENT_TIME_SYNC_GPU 0x33
|
||||
#define NPKIT_EVENT_TIME_SYNC_CPU 0x34
|
||||
|
||||
// Per-step events inside ring AllReduce (step named in the identifier).
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY 0x35
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT 0x36
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY 0x37
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT 0x38
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x39
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x3A
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x3B
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT 0x3C
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY 0x3D
|
||||
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT 0x3E
|
||||
|
||||
// Per-phase events inside tree (up/down) AllReduce.
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY 0x3F
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT 0x40
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY 0x41
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT 0x42
|
||||
|
||||
// Per-phase events inside tree (split) AllReduce.
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY 0x43
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT 0x44
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY 0x45
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT 0x46
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY 0x47
|
||||
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT 0x48
|
||||
|
||||
// Point-to-point send/recv kernel events (emitted by RunWork<ncclFuncSendRecv>::runSend/runRecv).
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY 0x49
|
||||
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT 0x4A
|
||||
#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY 0x4B
|
||||
#define NPKIT_EVENT_SEND_RECV_SEND_EXIT 0x4C
|
||||
#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY 0x4D
|
||||
#define NPKIT_EVENT_SEND_RECV_RECV_EXIT 0x4E
|
||||
|
||||
// NOTE(review): unlike the IDs above this name has no _EVENT_ infix; it mirrors the
// ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME compile switch -- confirm whether it is
// ever used as an event type.
#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4F
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,25 @@
|
||||
#ifndef NPKIT_STRUCT_H_
|
||||
#define NPKIT_STRUCT_H_
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#pragma pack(push, 1)
|
||||
|
||||
// One trace record. `fields` is the structured view; `bits` exposes the same storage as
// two raw 64-bit words.
union NpKitEvent {
  uint64_t bits[2];
  struct {
    uint64_t type : 8;    // event type ID (NPKIT_EVENT_* value)
    uint64_t size : 32;   // caller-supplied size value
    uint64_t rsvd : 24;   // caller-supplied auxiliary value
    uint64_t timestamp;   // caller-supplied timestamp
  } fields;
};

// Buffer sizing and the dump format assume a fixed 16-byte record; fail the build if a
// compiler lays the bit-fields out differently.
static_assert(sizeof(NpKitEvent) == 16, "NpKitEvent must be exactly 16 bytes");
|
||||
|
||||
// Write cursor for one event buffer: where the events live and how many have been stored.
struct NpKitEventCollectContext {
|
||||
// Start of the event array this context appends to.
NpKitEvent* event_buffer;
|
||||
// Index of the next free slot in event_buffer (equals the number of events recorded).
uint64_t event_buffer_head;
|
||||
};
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
#endif
|
||||
@@ -58,6 +58,10 @@ struct ncclProxySubArgs {
|
||||
uint64_t end;
|
||||
void* requests[NCCL_STEPS];
|
||||
void* profilingEvents[NCCL_STEPS];
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
int npKitSizesFifo[NCCL_STEPS];
|
||||
#endif
|
||||
};
|
||||
|
||||
struct ncclProxyArgs {
|
||||
|
||||
@@ -17,6 +17,9 @@
|
||||
#include "enqueue.h"
|
||||
#include "graph.h"
|
||||
#include "argcheck.h"
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
@@ -501,6 +504,13 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
|
||||
}
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
// Init NPKit
|
||||
NCCLCHECK(NpKit::Init(comm->rank));
|
||||
comm->hostDevComm.npKitEventCollectContexts = NpKit::GetGpuEventCollectContexts();
|
||||
comm->hostDevComm.cpuTimestamp = NpKit::GetCpuTimestamp();
|
||||
#endif
|
||||
|
||||
// Duplicate the dev comm on the device
|
||||
NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
|
||||
return ncclSuccess;
|
||||
@@ -1399,6 +1409,17 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
if (savedDevice != commDevice)
|
||||
CUDACHECK(hipSetDevice(savedDevice));
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
// Dump NPKit events and shutdown
|
||||
const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR");
|
||||
if (npkitDumpDir == nullptr) {
|
||||
WARN("NPKIT_DUMP_DIR is empty");
|
||||
} else {
|
||||
NCCLCHECK(NpKit::Dump(npkitDumpDir));
|
||||
}
|
||||
NCCLCHECK(NpKit::Shutdown());
|
||||
#endif
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,169 @@
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "alloc.h"
|
||||
#include "npkit/npkit.h"
|
||||
|
||||
// Out-of-class definitions of NpKit's static state; everything starts empty until Init().

// Rank recorded by Init(); used by Dump() in output file names.
uint64_t NpKit::rank_ = 0;
|
||||
|
||||
// Host arrays of per-buffer event storage pointers, filled in by Init().
NpKitEvent** NpKit::gpu_event_buffers_ = nullptr;
|
||||
NpKitEvent** NpKit::cpu_event_buffers_ = nullptr;
|
||||
|
||||
// Collect contexts (buffer pointer + head index), one per event buffer.
NpKitEventCollectContext* NpKit::gpu_collect_contexts_ = nullptr;
|
||||
NpKitEventCollectContext* NpKit::cpu_collect_contexts_ = nullptr;
|
||||
// Timestamp word allocated with ncclCudaHostCalloc and refreshed by CpuTimestampUpdateThread().
uint64_t* NpKit::cpu_timestamp_ = nullptr;
|
||||
|
||||
// Timestamp publisher thread created by Init(), and the flag its loop polls to exit.
std::thread* NpKit::cpu_timestamp_update_thread_ = nullptr;
|
||||
volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false;
|
||||
|
||||
void NpKit::CpuTimestampUpdateThread() {
|
||||
uint64_t init_system_clock = std::chrono::system_clock::now().time_since_epoch().count();
|
||||
uint64_t init_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
uint64_t curr_steady_clock = 0;
|
||||
volatile uint64_t* volatile_cpu_timestamp_ = cpu_timestamp_;
|
||||
while (!cpu_timestamp_update_thread_should_stop_) {
|
||||
curr_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
|
||||
*volatile_cpu_timestamp_ = init_system_clock + (curr_steady_clock - init_steady_clock);
|
||||
}
|
||||
}
|
||||
|
||||
// Sets up all NPKit state for this process:
//   1. remembers `rank` (used later in dump file names),
//   2. allocates kNumGpuEventBuffers GPU event buffers plus one collect context each,
//   3. allocates kNumCpuEventBuffers host event buffers plus one collect context each,
//   4. allocates the CPU timestamp word and starts the thread that keeps it updated.
// Returns ncclSuccess, or propagates the first allocation/copy failure via NCCLCHECK
// (allocations made before the failure are not released here).
ncclResult_t NpKit::Init(int rank) {
  rank_ = rank;

  // Template context: head index 0, buffer pointer patched per buffer below.
  NpKitEventCollectContext fresh_ctx;
  fresh_ctx.event_buffer_head = 0;

  // GPU side: the pointer table is a host array; each buffer and the context array are
  // allocated with ncclCudaCalloc, so the contexts are filled in via ncclCudaMemcpy.
  NCCLCHECK(ncclCalloc(&gpu_event_buffers_, kNumGpuEventBuffers));
  NCCLCHECK(ncclCudaCalloc(&gpu_collect_contexts_, kNumGpuEventBuffers));
  for (uint64_t buf = 0; buf < kNumGpuEventBuffers; buf++) {
    NCCLCHECK(ncclCudaCalloc(gpu_event_buffers_ + buf, kMaxNumGpuEventsPerBuffer));
    fresh_ctx.event_buffer = gpu_event_buffers_[buf];
    NCCLCHECK(ncclCudaMemcpy(gpu_collect_contexts_ + buf, &fresh_ctx, 1));
  }

  // CPU side: everything is plain host memory, so contexts are assigned directly.
  NCCLCHECK(ncclCalloc(&cpu_event_buffers_, kNumCpuEventBuffers));
  NCCLCHECK(ncclCalloc(&cpu_collect_contexts_, kNumCpuEventBuffers));
  for (uint64_t buf = 0; buf < kNumCpuEventBuffers; buf++) {
    NCCLCHECK(ncclCalloc(cpu_event_buffers_ + buf, kMaxNumCpuEventsPerBuffer));
    fresh_ctx.event_buffer = cpu_event_buffers_[buf];
    cpu_collect_contexts_[buf] = fresh_ctx;
  }

  // Timestamp word (allocated with ncclCudaHostCalloc) and its publisher thread.
  NCCLCHECK(ncclCudaHostCalloc(&cpu_timestamp_, 1));
  cpu_timestamp_update_thread_should_stop_ = false;
  cpu_timestamp_update_thread_ = new std::thread(CpuTimestampUpdateThread);

  return ncclSuccess;
}
|
||||
|
||||
// Writes every collected NPKit trace, plus clock metadata, into `dump_dir`.
//
// Files produced (<rank> is the value stored in rank_):
//   cpu_events_rank_<rank>_channel_<i>  binary NpKitEvent records, one file per CPU buffer
//   cpu_clock_period_num_rank_<rank>    text, std::chrono::steady_clock period numerator
//   cpu_clock_period_den_rank_<rank>    text, std::chrono::steady_clock period denominator
//   gpu_events_rank_<rank>_buf_<i>      binary NpKitEvent records, one file per GPU buffer
//   gpu_clock_rate_rank_<rank>          text, hard-coded clock rate in kHz
//
// dump_dir: directory that receives the files; it is not created here.
//
// Returns ncclSuccess, or the error propagated by NCCLCHECK when a GPU copy
// fails. File open/write failures are not detected (unchanged best-effort
// behaviour).
ncclResult_t NpKit::Dump(const std::string& dump_dir) {
  const std::string rank_str = std::to_string(rank_);

  // Writes `num_bytes` raw bytes to `path`, truncating any existing file.
  const auto write_binary = [](const std::string& path, const char* bytes, uint64_t num_bytes) {
    std::fstream out(path, std::ios::out | std::ios::binary);
    out.write(bytes, num_bytes);
    out.close();
  };

  // Writes `text` to `path` in text mode, truncating any existing file.
  const auto write_text = [](const std::string& path, const std::string& text) {
    std::fstream out(path, std::ios::out);
    out.write(text.c_str(), text.length());
    out.close();
  };

  // Dump CPU events
  for (uint64_t i = 0; i < kNumCpuEventBuffers; i++) {
    const std::string path = dump_dir + "/cpu_events_rank_" + rank_str + "_channel_" + std::to_string(i);
    write_binary(path, reinterpret_cast<const char*>(cpu_event_buffers_[i]),
                 cpu_collect_contexts_[i].event_buffer_head * sizeof(NpKitEvent));
  }

  // Dump CPU clock info
  write_text(dump_dir + "/cpu_clock_period_num_rank_" + rank_str,
             std::to_string(std::chrono::steady_clock::duration::period::num));
  write_text(dump_dir + "/cpu_clock_period_den_rank_" + rank_str,
             std::to_string(std::chrono::steady_clock::duration::period::den));

  // Dump GPU events. The event payload is staged through cpu_event_buffers_[0],
  // which is safe only because CPU channel 0 has already been written above;
  // its in-memory contents are overwritten from here on.
  //
  // The per-buffer context is copied into a local instead of
  // cpu_collect_contexts_[0]: overwriting the live CPU context would replace
  // its host buffer pointer with a GPU buffer pointer and its head index with
  // a GPU count, corrupting any later CollectCpuEvent on channel 0.
  auto gpu_ctx = cpu_collect_contexts_[0];  // local copy; only used as a typed landing slot
  for (uint64_t i = 0; i < kNumGpuEventBuffers; i++) {
    const std::string path = dump_dir + "/gpu_events_rank_" + rank_str + "_buf_" + std::to_string(i);
    NCCLCHECK(ncclCudaMemcpy(cpu_event_buffers_[0], gpu_event_buffers_[i], kMaxNumGpuEventsPerBuffer));
    NCCLCHECK(ncclCudaMemcpy(&gpu_ctx, gpu_collect_contexts_ + i, 1));
    write_binary(path, reinterpret_cast<const char*>(cpu_event_buffers_[0]),
                 gpu_ctx.event_buffer_head * sizeof(NpKitEvent));
  }

  // Dump GPU clockRate
  constexpr int vega_gpu_rtc_freq_in_khz = 25000;
  write_text(dump_dir + "/gpu_clock_rate_rank_" + rank_str, std::to_string(vega_gpu_rtc_freq_in_khz));

  return ncclSuccess;
}
|
||||
|
||||
// Stops the CPU timestamp thread and releases everything Init allocated.
//
// Returns ncclSuccess, or the first error propagated by CUDACHECK/NCCLCHECK;
// on such an early return the remaining resources are not released.
ncclResult_t NpKit::Shutdown() {
  // Stop CPU timestamp updating thread. The std::thread object itself was
  // allocated with `new` in Init, so it must be deleted after joining;
  // otherwise it leaks on every Init/Shutdown cycle.
  cpu_timestamp_update_thread_should_stop_ = true;
  if (cpu_timestamp_update_thread_ != nullptr) {
    cpu_timestamp_update_thread_->join();
    delete cpu_timestamp_update_thread_;
    cpu_timestamp_update_thread_ = nullptr;
  }

  // Free CPU event data structures
  for (uint64_t i = 0; i < kNumCpuEventBuffers; i++) {
    free(cpu_event_buffers_[i]);
  }
  free(cpu_event_buffers_);
  free(cpu_collect_contexts_);

  // Free GPU event data structures. The table of buffer pointers is released
  // with free(), while each buffer and the context array go through hipFree.
  for (uint64_t i = 0; i < kNumGpuEventBuffers; i++) {
    CUDACHECK(hipFree(gpu_event_buffers_[i]));
  }
  free(gpu_event_buffers_);
  CUDACHECK(hipFree(gpu_collect_contexts_));

  // Free timestamp (only after the updater thread has been joined, since
  // that thread writes to it).
  NCCLCHECK(ncclCudaHostFree(cpu_timestamp_));

  return ncclSuccess;
}
|
||||
|
||||
// Returns the array of GPU collect contexts set up by Init (one entry per
// GPU event buffer). Ownership stays with NpKit.
NpKitEventCollectContext* NpKit::GetGpuEventCollectContexts() {
  NpKitEventCollectContext* const contexts = gpu_collect_contexts_;
  return contexts;
}
|
||||
|
||||
void NpKit::CollectCpuEvent(uint8_t type, uint32_t size, uint32_t rsvd, uint64_t timestamp, int channel_id) {
|
||||
uint64_t event_buffer_head = cpu_collect_contexts_[channel_id].event_buffer_head;
|
||||
if (event_buffer_head < kMaxNumCpuEventsPerBuffer) {
|
||||
NpKitEvent& event = cpu_collect_contexts_[channel_id].event_buffer[event_buffer_head];
|
||||
event.fields.type = type;
|
||||
event.fields.size = size;
|
||||
event.fields.rsvd = rsvd;
|
||||
event.fields.timestamp = timestamp;
|
||||
cpu_collect_contexts_[channel_id].event_buffer_head++;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t* NpKit::GetCpuTimestamp() {
|
||||
return cpu_timestamp_;
|
||||
}
|
||||
@@ -14,6 +14,9 @@
|
||||
#include "gdrwrap.h"
|
||||
#include "shm.h"
|
||||
#include "profiler.h"
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
|
||||
static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large");
|
||||
|
||||
@@ -777,7 +780,16 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
|
||||
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
static int g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
|
||||
static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt++;
|
||||
#endif
|
||||
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
struct ncclProxySubArgs* sub = args->subs+s;
|
||||
@@ -831,6 +843,11 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
|
||||
// We have something to receive, let's check if it's completely ready.
|
||||
int size = sizesFifo[buffSlot];
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
sub->npKitSizesFifo[buffSlot] = size;
|
||||
#endif
|
||||
|
||||
char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
|
||||
int ready = 1;
|
||||
if (p == NCCL_PROTO_LL128) {
|
||||
@@ -865,6 +882,22 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
// Data is ready, try to send.
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] != NULL) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_NET_SEND_ENTRY,
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt,
|
||||
#else
|
||||
size,
|
||||
#endif
|
||||
uint64_t(sub->requests+buffSlot)/sizeof(void*),
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
|
||||
sizesFifo[buffSlot] = -1;
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
@@ -883,6 +916,22 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||
NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_NET_SEND_EXIT,
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt,
|
||||
#else
|
||||
sub->npKitSizesFifo[buffSlot],
|
||||
#endif
|
||||
uint64_t(sub->requests+buffSlot)/sizeof(void*),
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
|
||||
sub->done += args->sliceSteps;
|
||||
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
|
||||
@@ -908,6 +957,11 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
}
|
||||
|
||||
static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt++;
|
||||
#endif
|
||||
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
// Initialize subs and group them by same recvComm.
|
||||
void* recvComm;
|
||||
@@ -989,6 +1043,22 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
if (*requestPtr) {
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup+i;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_NET_RECV_ENTRY,
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt,
|
||||
#else
|
||||
sizes[i],
|
||||
#endif
|
||||
uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
sub->posted += args->sliceSteps;
|
||||
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
|
||||
}
|
||||
@@ -1014,6 +1084,22 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_NET_RECV_EXIT,
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt,
|
||||
#else
|
||||
sizes[i],
|
||||
#endif
|
||||
uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
sub->received += args->sliceSteps;
|
||||
for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
|
||||
if (step < sub->nsteps) {
|
||||
|
||||
Reference in New Issue
Block a user