Define ncclShmem as global shared (#618)
* Use global defined shared memory * Add --hipcc-func-supp to compile option * Force inline some device functions * Add back threadfence
Этот коммит содержится в:
@@ -300,7 +300,7 @@ foreach(target ${AMDGPU_TARGETS})
|
||||
endforeach()
|
||||
|
||||
if("${HIP_COMPILER}" MATCHES "clang")
|
||||
target_compile_options(rccl PRIVATE -fvisibility=hidden)
|
||||
target_compile_options(rccl PRIVATE -fvisibility=hidden --hipcc-func-supp)
|
||||
foreach(target ${AMDGPU_TARGETS})
|
||||
target_compile_options(rccl PRIVATE -fgpu-rdc)
|
||||
endforeach()
|
||||
|
||||
@@ -16,12 +16,12 @@ namespace {
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclRing *ring = &ncclShmem->channel.ring;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const int *ringRanks = ring->userRanks;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
|
||||
const int nranks = ncclShmem->comm.nRanks;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*int(chunkSize);
|
||||
const ssize_t size = args->count;
|
||||
|
||||
@@ -79,7 +79,7 @@ namespace {
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
using Proto = ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
}
|
||||
@@ -87,14 +87,14 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -20,10 +20,10 @@ namespace {
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclRing *ring = &ncclShmem->channel.ring;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
int ringIx = ring->index;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
|
||||
const int nranks = ncclShmem->comm.nRanks;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*nranks*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
|
||||
@@ -33,23 +33,23 @@ namespace {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (tid == 0) {
|
||||
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -102,7 +102,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -112,7 +112,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -121,7 +121,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY)
|
||||
if (tid == 0 && nranks > 2) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -136,7 +136,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT)
|
||||
if (tid == 0 && nranks > 2) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -149,7 +149,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -159,14 +159,14 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY)
|
||||
if (tid == 0 && nranks > 2) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -182,14 +182,14 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
|
||||
if (tid == 0 && nranks > 2) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -203,7 +203,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -212,7 +212,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -224,7 +224,7 @@ namespace {
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclTree *tree = &ncclShmem->channel.tree;
|
||||
ncclTree *tree = &ncclShmem.channel.tree;
|
||||
ssize_t chunkSize = int(
|
||||
Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize
|
||||
/* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T));
|
||||
@@ -240,23 +240,23 @@ namespace {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (tid == 0) {
|
||||
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -276,7 +276,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -306,7 +306,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -325,7 +325,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -355,7 +355,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -364,7 +364,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -376,7 +376,7 @@ namespace {
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclTree *tree = (args->pad_0 == 2) ? &ncclShmem->channel.binTree : &ncclShmem->channel.tree;
|
||||
ncclTree *tree = (args->pad_0 == 2) ? &ncclShmem.channel.binTree : &ncclShmem.channel.tree;
|
||||
ssize_t chunkSize = int(
|
||||
Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
|
||||
: Proto::calcBytePerStep()/sizeof(T));
|
||||
@@ -410,23 +410,23 @@ namespace {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (isNpKitThread) {
|
||||
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -447,7 +447,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -461,7 +461,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -487,7 +487,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -510,7 +510,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -529,7 +529,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -552,7 +552,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -561,7 +561,7 @@ namespace {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -570,7 +570,7 @@ namespace {
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
}
|
||||
@@ -578,7 +578,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(args);
|
||||
}
|
||||
};
|
||||
@@ -590,7 +590,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
struct ncclDirect* tree = &ncclShmem->channel.collTree;
|
||||
struct ncclDirect* tree = &ncclShmem.channel.collTree;
|
||||
const ssize_t chunkSize = int(args->lastChunkSize);
|
||||
const ssize_t size = args->count;
|
||||
const ssize_t loopSize = nChannels*tree->nHeads*chunkSize;
|
||||
@@ -683,14 +683,14 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
if (args->pad_0 == 0) runTreeUpDown<T, RedOp, ProtoLL>(args);
|
||||
else runTreeSplit<T, RedOp, ProtoLL>(args);
|
||||
}
|
||||
@@ -698,7 +698,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
//LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
|
||||
}
|
||||
@@ -706,7 +706,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runTreeSplit<T, RedOp, ProtoLL128>(args);
|
||||
//LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
|
||||
}
|
||||
|
||||
@@ -14,8 +14,8 @@ namespace {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nranks = ncclShmem->comm.nRanks;
|
||||
const ncclRing *ring = &ncclShmem->channel.ring;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const int num_bi_rings = args->pivotA2ANumBiRings;
|
||||
const int num_uni_rings = num_bi_rings * 2;
|
||||
const int num_chunks = args->nChannels / 2;
|
||||
@@ -71,7 +71,7 @@ namespace {
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncAllToAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
using Proto = ProtoSimple<ALLTOALL_PIVOT_CHUNKSTEPS/ALLTOALL_PIVOT_SLICESTEPS, ALLTOALL_PIVOT_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ namespace {
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclRing *ring = &ncclShmem->channel.ring;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1));
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
@@ -61,7 +61,7 @@ namespace {
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
using Proto = ProtoSimple<BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
}
|
||||
@@ -69,14 +69,14 @@ struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -188,7 +188,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs_ll128[]{
|
||||
|
||||
template<unsigned short f, unsigned short l, bool u>
|
||||
struct Caller {
|
||||
static __device__ __host__
|
||||
static __forceinline__ __device__ __host__
|
||||
void call(unsigned short funcIndex) noexcept
|
||||
{
|
||||
constexpr unsigned short m = f + (l - f) / 2;
|
||||
@@ -199,7 +199,7 @@ struct Caller {
|
||||
|
||||
template<unsigned short f, bool u>
|
||||
struct Caller<f, f + 1, u>{
|
||||
static __device__ __host__
|
||||
static __forceinline__ __device__ __host__
|
||||
void call(unsigned short funcIndex) noexcept { if (u) ncclFuncs_ll128[f](); else ncclFuncs[f](); }
|
||||
};
|
||||
|
||||
@@ -207,7 +207,7 @@ static_assert(FUNC_INDEX_P2P == 2710, "Wrong P2P function index");
|
||||
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 2711, "Wrong AllToAllPivot function index");
|
||||
|
||||
template<bool USING_LL128>
|
||||
inline
|
||||
__forceinline__
|
||||
__device__
|
||||
void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
|
||||
#if defined(BUILD_ALLREDUCE_ONLY)
|
||||
@@ -320,14 +320,14 @@ class ncclFunction {
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
#define traceColl(launch_type) { \
|
||||
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->funcIndex = shmem.work.header.funcIndex; \
|
||||
collTrace->funcIndex = ncclShmem.work.header.funcIndex; \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \
|
||||
if (shmem.work.header.type == ncclWorkTypeP2p) { \
|
||||
struct ncclWorkElemP2p *p2pElems = shmem.work.p2pElems; \
|
||||
if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
|
||||
struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
|
||||
collTrace->p2p[0].connIndex = 0; \
|
||||
collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
|
||||
collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \
|
||||
@@ -341,8 +341,8 @@ class ncclFunction {
|
||||
collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \
|
||||
collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
|
||||
} else if (shmem.work.header.type == ncclWorkTypeColl) { \
|
||||
struct ncclWorkElem *elems = shmem.work.elems; \
|
||||
} else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \
|
||||
struct ncclWorkElem *elems = ncclShmem.work.elems; \
|
||||
collTrace->opCount = elems[0].opCount; \
|
||||
collTrace->coll.nWarps = elems[0].nWarps; \
|
||||
collTrace->coll.bid = elems[0].bid; \
|
||||
@@ -355,23 +355,23 @@ class ncclFunction {
|
||||
traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
|
||||
}
|
||||
#define traceKernelEnd() { \
|
||||
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceKernelEndType; \
|
||||
}
|
||||
#define traceAbort() { \
|
||||
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceAbortType; \
|
||||
}
|
||||
// traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
|
||||
#define traceData(data2, data4, data8_0, data8_1) { \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem->comm.collTrace+pos; \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->funcIndex = data2; \
|
||||
@@ -408,14 +408,14 @@ struct ncclShmemData {
|
||||
struct ncclProf prof;
|
||||
#endif
|
||||
};
|
||||
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
|
||||
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "ncclShmem.work needs to be 16B aligned");
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
#define __insert_timestamp(line_num) do { \
|
||||
if (shmem.prof.count < PROFILE_NUM_ITEMS) { \
|
||||
shmem.prof.elem[shmem.prof.count].line = line_num; \
|
||||
shmem.prof.elem[shmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
shmem.prof.count++; \
|
||||
if (ncclShmem.prof.count < PROFILE_NUM_ITEMS) { \
|
||||
ncclShmem.prof.elem[ncclShmem.prof.count].line = line_num; \
|
||||
ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
ncclShmem.prof.count++; \
|
||||
} \
|
||||
} while(0);
|
||||
#else
|
||||
@@ -459,7 +459,7 @@ struct RunWork {
|
||||
}
|
||||
};
|
||||
|
||||
static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
|
||||
static __forceinline__ __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
|
||||
if (we->isUsed && we->redOpArgIsPtr) {
|
||||
/* redOpArg is a pointer to the scalar value, so we'll dereference it
|
||||
* here so that redOpArg holds the bits of the scalar going forward.
|
||||
@@ -480,19 +480,17 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
|
||||
}
|
||||
}
|
||||
|
||||
extern __device__ struct ncclShmemData *ncclShmem;
|
||||
extern __shared__ ncclShmemData ncclShmem;
|
||||
|
||||
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE, bool USING_LL128>
|
||||
__device__ void ncclKernel(
|
||||
__forceinline__ __device__ void ncclKernel(
|
||||
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
|
||||
) {
|
||||
int tid = threadIdx.x;
|
||||
__shared__ struct ncclShmemData shmem;
|
||||
ncclShmem = &shmem;
|
||||
if (tid == 0) {
|
||||
for (auto i = 0; i < NCCL_MAX_GROUPS; i++) {
|
||||
shmem.groups[i].barrier = 0;
|
||||
for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0;
|
||||
ncclShmem.groups[i].barrier = 0;
|
||||
for (auto j = 0; j < NCCL_MAX_GROUPS; j++) ncclShmem.groups[i].barrier_next[j] = 0;
|
||||
}
|
||||
}
|
||||
// To map blockId to channelId, we need the n'th set bit of channelMask which
|
||||
@@ -501,18 +499,18 @@ __device__ void ncclKernel(
|
||||
int x = tid;
|
||||
if (channelMask & (1ull<<x)) {
|
||||
int y = __popcll(channelMask & ((1ull<<x)-1));
|
||||
if (blockIdx.x == y) shmem.channelId = x;
|
||||
if (blockIdx.x == y) ncclShmem.channelId = x;
|
||||
}
|
||||
if (32 < MAXCHANNELS) {
|
||||
x = 32 + tid;
|
||||
if (channelMask & (1ull<<x)) {
|
||||
int y = __popcll(channelMask & ((1ull<<x)-1));
|
||||
if (blockIdx.x == y) shmem.channelId = x;
|
||||
if (blockIdx.x == y) ncclShmem.channelId = x;
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads(); // publish shmem.channelId
|
||||
int channelId = shmem.channelId;
|
||||
__syncthreads(); // publish ncclShmem.channelId
|
||||
int channelId = ncclShmem.channelId;
|
||||
|
||||
if (true) {
|
||||
void *dst, *src;
|
||||
@@ -520,20 +518,20 @@ __device__ void ncclKernel(
|
||||
// Use first 3 warps to load comm, channel, and work into shmem
|
||||
switch (tid/WARP_SIZE) {
|
||||
case 0:
|
||||
dst = &shmem.comm;
|
||||
dst = &ncclShmem.comm;
|
||||
src = comm;
|
||||
bytes = sizeof(ncclDevComm);
|
||||
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
|
||||
break;
|
||||
case 1:
|
||||
// Get address of channel without incurring indirect load from ncclDevComm::channels
|
||||
dst = &shmem.channel;
|
||||
dst = &ncclShmem.channel;
|
||||
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
|
||||
bytes = sizeof(ncclDevChannel);
|
||||
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
|
||||
break;
|
||||
case 2:
|
||||
dst = &shmem.work;
|
||||
dst = &ncclShmem.work;
|
||||
src = workHead + blockIdx.x;
|
||||
bytes = sizeof(ncclWork);
|
||||
static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
|
||||
@@ -547,8 +545,8 @@ __device__ void ncclKernel(
|
||||
__syncthreads(); // publish shmem
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) {
|
||||
shmem.prof.count = 0;
|
||||
shmem.prof.seq = shmem.comm.devProf[blockIdx.x].seq;
|
||||
ncclShmem.prof.count = 0;
|
||||
ncclShmem.prof.seq = ncclShmem.comm.devProf[blockIdx.x].seq;
|
||||
}
|
||||
#endif
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
@@ -557,34 +555,34 @@ __device__ void ncclKernel(
|
||||
|
||||
while (true) {
|
||||
// Notify host that all fifo reads are complete.
|
||||
if (tid == 0 && shmem.work.header.isLast && shmem.work.header.inFifo) {
|
||||
*shmem.channel.workFifoDone = shmem.work.header.doneAcks;
|
||||
if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) {
|
||||
*ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks;
|
||||
}
|
||||
|
||||
__syncwarp();
|
||||
if (shmem.work.header.type == ncclWorkTypeColl) {
|
||||
if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&shmem.work.elems[tid]);
|
||||
} else if (shmem.work.header.type == ncclWorkTypeRegColl) {
|
||||
if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&shmem.work.regElems[tid].elem);
|
||||
if (ncclShmem.work.header.type == ncclWorkTypeColl) {
|
||||
if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]);
|
||||
} else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) {
|
||||
if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
if (shmem.work.header.funcIndex == FnIndex) {
|
||||
RunWork<Fn, T, RedOp, Algo, Proto>().run(&shmem.work);
|
||||
if (ncclShmem.work.header.funcIndex == FnIndex) {
|
||||
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
|
||||
} else {
|
||||
NCCL_CALL_FUNCTIONS<USING_LL128>(shmem.work.header.funcIndex);
|
||||
NCCL_CALL_FUNCTIONS<USING_LL128>(ncclShmem.work.header.funcIndex);
|
||||
}
|
||||
|
||||
int workIxNext = shmem.work.header.workNext;
|
||||
int workIxNext = ncclShmem.work.header.workNext;
|
||||
__syncthreads();
|
||||
if (shmem.work.header.isLast) break;
|
||||
if (ncclShmem.work.header.isLast) break;
|
||||
|
||||
copyToShmem16(tid, &shmem.work, workHead + workIxNext, sizeof(ncclWork));
|
||||
copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork));
|
||||
|
||||
{ // Check whether the last operation was aborted and make sure all threads exit
|
||||
int aborted = tid == 0 ? *comm->abortFlag : 0;
|
||||
if (__any(aborted)) { // publish shmem.work
|
||||
if (__any(aborted)) { // publish ncclShmem.work
|
||||
traceAbort();
|
||||
break;
|
||||
}
|
||||
@@ -593,10 +591,10 @@ __device__ void ncclKernel(
|
||||
}
|
||||
if (COLLTRACE && tid == 0) traceKernelEnd();
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (shmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
|
||||
if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
|
||||
__syncthreads();
|
||||
copyToShmem16(tid, shmem.comm.devProf+MAXCHANNELS*shmem.prof.seq+blockIdx.x, &shmem.prof, sizeof(struct ncclProf));
|
||||
if (tid == 0) shmem.comm.devProf[blockIdx.x].seq++;
|
||||
copyToShmem16(tid, ncclShmem.comm.devProf+MAXCHANNELS*ncclShmem.prof.seq+blockIdx.x, &ncclShmem.prof, sizeof(struct ncclProf));
|
||||
if (tid == 0) ncclShmem.comm.devProf[blockIdx.x].seq++;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -626,7 +624,7 @@ __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(st
|
||||
/* Functions for aggregation case */
|
||||
#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
|
||||
__device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
|
||||
RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem->work); \
|
||||
RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
|
||||
}
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#include "collectives.h"
|
||||
#include "common.h"
|
||||
|
||||
__device__ struct ncclShmemData* ncclShmem;
|
||||
__shared__ ncclShmemData ncclShmem;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
namespace {
|
||||
template<typename T, typename RedOp>
|
||||
__device__ __attribute__((noinline)) void oneRankReduce() {
|
||||
ncclWork *w = &ncclShmem->work;
|
||||
ncclWork *w = &ncclShmem.work;
|
||||
int tid = threadIdx.x;
|
||||
int tn = blockDim.x;
|
||||
#pragma unroll 1
|
||||
|
||||
@@ -45,7 +45,7 @@ struct ProtoSimple {
|
||||
|
||||
// Data bytes (no flags etc) in one step of the fifo queue.
|
||||
__device__ static int calcBytePerStep() {
|
||||
return ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
return ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
}
|
||||
// Granularity of data bytes transferred per thread.
|
||||
__device__ static int calcBytePerGrain() {
|
||||
@@ -63,7 +63,7 @@ struct ProtoLL {
|
||||
|
||||
// Data bytes (no flags etc) in one step of the fifo queue.
|
||||
__device__ static int calcBytePerStep() {
|
||||
return ncclShmem->comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data
|
||||
return ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data
|
||||
}
|
||||
// Granularity of data bytes transferred per thread.
|
||||
__device__ static int calcBytePerGrain() {
|
||||
@@ -81,7 +81,7 @@ struct ProtoLL128 {
|
||||
|
||||
// Data bytes (no flags etc) in one step of the fifo queue.
|
||||
__device__ static int calcBytePerStep() {
|
||||
return (ncclShmem->comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS;
|
||||
return (ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS;
|
||||
}
|
||||
// Granularity of data bytes transferred per thread.
|
||||
__device__ static int calcBytePerGrain() {
|
||||
|
||||
@@ -83,7 +83,7 @@ private:
|
||||
inline __device__ int checkAbort(int &spins, int send) {
|
||||
spins++;
|
||||
if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||
abort = __atomic_load_n((ncclShmem->comm.abortFlag), __ATOMIC_SEQ_CST);
|
||||
abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
|
||||
spins = 0;
|
||||
}
|
||||
return abort;
|
||||
@@ -93,7 +93,7 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY, nbytes, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
if (sendConnHeadPtr) {
|
||||
@@ -114,7 +114,7 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT, nbytes, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -376,7 +376,7 @@ private:
|
||||
npKitWaitRecvTotalTime = 0;
|
||||
npKitWaitRecvDataProcessSize = nelem*sizeof(T);
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY,
|
||||
npKitWaitRecvDataProcessSize, 0, __builtin_amdgcn_s_memrealtime(), ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
npKitWaitRecvDataProcessSize, 0, __builtin_amdgcn_s_memrealtime(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -446,7 +446,7 @@ private:
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT,
|
||||
npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -494,11 +494,11 @@ private:
|
||||
):
|
||||
redOp(redOpArg),
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group&(uint16_t)0xFFFF),
|
||||
stepLines(ncclShmem->comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
|
||||
barriers = &ncclShmem->groups[this->group].barrier;
|
||||
barrier_next = ncclShmem->groups[this->group].barrier_next;
|
||||
stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
|
||||
barriers = &ncclShmem.groups[this->group].barrier;
|
||||
barrier_next = ncclShmem.groups[this->group].barrier_next;
|
||||
|
||||
auto *channel = &ncclShmem->channel;
|
||||
auto *channel = &ncclShmem.channel;
|
||||
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
|
||||
int nrecv=0, nsend=0;
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
|
||||
@@ -539,14 +539,14 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -554,14 +554,14 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -569,14 +569,14 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -584,14 +584,14 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -599,14 +599,14 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -614,14 +614,14 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -629,14 +629,14 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -644,14 +644,14 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -78,7 +78,7 @@ private:
|
||||
inline __device__ int checkAbort(int &spins, int i, int send) {
|
||||
spins++;
|
||||
if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||
abort = __atomic_load_n(ncclShmem->comm.abortFlag, __ATOMIC_SEQ_CST);
|
||||
abort = __atomic_load_n(ncclShmem.comm.abortFlag, __ATOMIC_SEQ_CST);
|
||||
spins = 0;
|
||||
}
|
||||
return abort;
|
||||
@@ -408,11 +408,11 @@ public:
|
||||
redOp(redOpArg),
|
||||
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
|
||||
flagThread((tid%4)==3), group(group&(uint16_t)0xFFFF),
|
||||
stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
|
||||
barriers = &ncclShmem->groups[this->group].barrier;
|
||||
barrier_next = ncclShmem->groups[this->group].barrier_next;
|
||||
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
|
||||
barriers = &ncclShmem.groups[this->group].barrier;
|
||||
barrier_next = ncclShmem.groups[this->group].barrier_next;
|
||||
|
||||
auto *channel = &ncclShmem->channel;
|
||||
auto *channel = &ncclShmem.channel;
|
||||
int nrecv=0, nsend=0;
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
|
||||
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
|
||||
|
||||
@@ -90,7 +90,7 @@ private:
|
||||
inline __device__ bool checkAbort(int &spins) {
|
||||
spins++;
|
||||
if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
|
||||
flags |= atomicAdd_system((unsigned int *)ncclShmem->comm.abortFlag, 0) ? Aborted : 0;
|
||||
flags |= atomicAdd_system((unsigned int *)ncclShmem.comm.abortFlag, 0) ? Aborted : 0;
|
||||
spins = 0;
|
||||
}
|
||||
return flags & Aborted;
|
||||
@@ -108,7 +108,7 @@ private:
|
||||
__builtin_amdgcn_s_sleep(8);
|
||||
connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
|
||||
if (checkAbort(spins)) break;
|
||||
//if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem->comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
|
||||
//if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
|
||||
if (spins == 0) traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
|
||||
}
|
||||
__asm__ __volatile__("s_wakeup");
|
||||
@@ -118,8 +118,8 @@ private:
|
||||
if (isSendNotRecv && (flags & SizesFifoEnabled))
|
||||
__atomic_store_n((connSizesFifoPtr+step%NCCL_STEPS), nelts*sizeof(T), __ATOMIC_SEQ_CST);
|
||||
|
||||
void **ptrs = isSendNotRecv ? (ncclShmem->groups[group].dsts + Dst)
|
||||
: (ncclShmem->groups[group].srcs + Src);
|
||||
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
|
||||
: (ncclShmem.groups[group].srcs + Src);
|
||||
if (flags & OffsFifoEnabled)
|
||||
ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
|
||||
else if (isSendNotRecv && DirectSend) {
|
||||
@@ -200,19 +200,19 @@ private:
|
||||
do {
|
||||
sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
|
||||
if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput)))
|
||||
ncclShmem->groups[group].srcs[0] = userBuff + srcIx + offset;
|
||||
ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset;
|
||||
if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
|
||||
ncclShmem->groups[group].dsts[0] = userBuff + dstIx + offset;
|
||||
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
|
||||
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
|
||||
subBarrier();
|
||||
if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
|
||||
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
|
||||
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
|
||||
if (Send) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -225,8 +225,8 @@ private:
|
||||
// (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0).
|
||||
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, (1-Send)+MaxSend, 0>
|
||||
(tid, nworkers, nullptr, false,
|
||||
1, (T const**)ncclShmem->groups[group].srcs,
|
||||
fan.nsend(), (T**)ncclShmem->groups[group].dsts+1,
|
||||
1, (T const**)ncclShmem.groups[group].srcs,
|
||||
fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
|
||||
sliceSize);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
@@ -239,18 +239,18 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem->groups[group].dsts[Dst] == nullptr) {
|
||||
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
|
||||
// For broadcast in CollNet to do empty send
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -261,9 +261,9 @@ private:
|
||||
#endif
|
||||
|
||||
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
|
||||
(tid, nworkers, ncclShmem->redOpArgs, postOp,
|
||||
Recv, (T const**)ncclShmem->groups[group].srcs,
|
||||
Dst, (T**)ncclShmem->groups[group].dsts,
|
||||
(tid, nworkers, ncclShmem.redOpArgs, postOp,
|
||||
Recv, (T const**)ncclShmem.groups[group].srcs,
|
||||
Dst, (T**)ncclShmem.groups[group].dsts,
|
||||
sliceSize);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
@@ -276,7 +276,7 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -285,7 +285,7 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -298,9 +298,9 @@ private:
|
||||
constexpr int PreOpN = SrcBuf != Input ? 0 :
|
||||
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
|
||||
ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpN>
|
||||
(tid, nworkers, ncclShmem->redOpArgs, postOp,
|
||||
Recv*fan.nrecv()+Src, (T const**)ncclShmem->groups[group].srcs,
|
||||
Send*fan.nsend()+Dst, (T**)ncclShmem->groups[group].dsts,
|
||||
(tid, nworkers, ncclShmem.redOpArgs, postOp,
|
||||
Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
|
||||
Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
|
||||
sliceSize);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
|
||||
@@ -313,15 +313,13 @@ private:
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
barrier(); // This barrier has a counterpart in following loop
|
||||
#if defined(__gfx1030__)
|
||||
if (Send && (flags & RolePostSend) && index == 0) __threadfence_system();
|
||||
#endif
|
||||
if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && index == 0) __threadfence_system();
|
||||
__syncwarp();
|
||||
postPeer<Recv, Send>();
|
||||
offset += sliceSize;
|
||||
@@ -341,9 +339,7 @@ private:
|
||||
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
|
||||
}
|
||||
barrier(); // Has couterpart in preceding worker-only loop.
|
||||
#if defined(__gfx1030__)
|
||||
if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
|
||||
#endif
|
||||
if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
|
||||
__syncwarp();
|
||||
postPeer<Recv, Send>();
|
||||
offset += sliceSize;
|
||||
@@ -370,8 +366,8 @@ private:
|
||||
if (Send) {
|
||||
// Scatter pre-scales data of input buffer only in non-Direct case
|
||||
constexpr int PreOpN = DirectSend ? 0 : 1;
|
||||
if (flags & RoleInput) ncclShmem->groups[group].srcs[0] = userBuff + inpIx + offset;
|
||||
if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] = 0; // Skip the threadfence
|
||||
if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
|
||||
if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] = 0; // Skip the threadfence
|
||||
// realSize is not accurate here; but intra-node does not rely on sizes FIFO
|
||||
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
|
||||
subBarrier();
|
||||
@@ -382,22 +378,22 @@ private:
|
||||
int peerOffset = i*peerElem;
|
||||
// Skip the data I am responsible of reducing myself
|
||||
if (skip >= 0 && i >= skip) peerOffset += peerElem;
|
||||
const T* src0 = (T*)ncclShmem->groups[group].srcs[0] + peerOffset;
|
||||
const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset;
|
||||
int realPeerSize = min(realSize, totalElem-peerOffset);
|
||||
if (realPeerSize > 0 && ncclShmem->groups[group].dsts[i] != nullptr) {
|
||||
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem->redOpArgs, false, 1, &src0, 1, (T**)ncclShmem->groups[group].dsts+i, realPeerSize);
|
||||
if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
|
||||
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize);
|
||||
// Mark for threadfence at the end
|
||||
if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] += realPeerSize;
|
||||
if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize;
|
||||
}
|
||||
}
|
||||
} else if (Recv) {
|
||||
if (flags & RoleOutput) ncclShmem->groups[group].dsts[0] = userBuff + outIx + offset;
|
||||
if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
|
||||
int peerOffset = index*peerElem;
|
||||
if (skip >= 0 && index >= skip) peerOffset += peerElem;
|
||||
// Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
|
||||
waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+peerOffset, offset, realSize);
|
||||
subBarrier();
|
||||
if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
|
||||
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
|
||||
// Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv
|
||||
// Do nothing
|
||||
} else {
|
||||
@@ -406,16 +402,16 @@ private:
|
||||
int i = (j+shift)%fan.nrecv();
|
||||
peerOffset = i*peerElem;
|
||||
if (skip >= 0 && i >= skip) peerOffset += peerElem;
|
||||
T* dst0 = (T*)ncclShmem->groups[group].dsts[0] + peerOffset;
|
||||
T* dst0 = (T*)ncclShmem.groups[group].dsts[0] + peerOffset;
|
||||
int realPeerSize = min(realSize, totalElem-peerOffset);
|
||||
if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>(tid, nworkers, ncclShmem->redOpArgs, postOp, 1, (const T**)ncclShmem->groups[group].srcs+i, 1, &dst0, realPeerSize);
|
||||
if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>(tid, nworkers, ncclShmem.redOpArgs, postOp, 1, (const T**)ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
// If we indeed send something, threadfence
|
||||
if (Send && (flags & RolePostSend) && ncclShmem->groups[group].totalSendSize[slice] > 0 && index == 0)
|
||||
if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0)
|
||||
__threadfence_system();
|
||||
__syncwarp();
|
||||
postPeer<Recv, Send>();
|
||||
@@ -433,7 +429,7 @@ private:
|
||||
atomicExch_system((unsigned long long *)connStepPtr, step); // Return credits in case we rounded up.
|
||||
}
|
||||
if (flags & RoleWaitRecv) {
|
||||
ncclShmem->groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
|
||||
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
|
||||
connStepPtr = conn->tail;
|
||||
connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
|
||||
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
|
||||
@@ -473,7 +469,7 @@ private:
|
||||
next_hdp_reg = conn->next_hdp_reg;
|
||||
}
|
||||
if (flags & RoleWaitSend) {
|
||||
ncclShmem->groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
|
||||
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
|
||||
connStepPtr = conn->head;
|
||||
connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
|
||||
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
|
||||
@@ -508,20 +504,20 @@ private:
|
||||
}
|
||||
|
||||
public:
|
||||
__device__ Primitives(
|
||||
__forceinline__ __device__ Primitives(
|
||||
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
|
||||
):
|
||||
tid(tid),
|
||||
stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
|
||||
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
|
||||
|
||||
// For send operations, we need an extra warp to overlap the threadfence and the copy
|
||||
this->nthreads = nthreads;
|
||||
this->nworkers = nthreads;
|
||||
this->group = group & (uint16_t)0xFFFF;
|
||||
int connIndex = group >> 16;
|
||||
barriers = &ncclShmem->groups[this->group].barrier;
|
||||
barrier_next = ncclShmem->groups[this->group].barrier_next;
|
||||
barriers = &ncclShmem.groups[this->group].barrier;
|
||||
barrier_next = ncclShmem.groups[this->group].barrier_next;
|
||||
|
||||
int nrecv=0, nsend=0;
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
|
||||
@@ -551,30 +547,30 @@ private:
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
|
||||
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
|
||||
|
||||
loadRecvConn(&ncclShmem->channel.peers[peer], connIndex, e);
|
||||
loadSendConn(&ncclShmem->channel.peers[peer], connIndex, e);
|
||||
loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e);
|
||||
loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e);
|
||||
|
||||
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
|
||||
}
|
||||
|
||||
__device__ ~Primitives() {
|
||||
// Ensure ncclShmem->groups[].send/recvConns are available
|
||||
__forceinline__ __device__ ~Primitives() {
|
||||
// Ensure ncclShmem.groups[].send/recvConns are available
|
||||
if (!(flags & ThreadsSynced))
|
||||
barrier();
|
||||
// Save steps for the next operation
|
||||
if (flags & (RolePostSend|RolePostRecv)) {
|
||||
auto *conns = (flags & RolePostSend) ? ncclShmem->groups[group].sendConns : ncclShmem->groups[group].recvConns;
|
||||
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
|
||||
conns[index]->step = step;
|
||||
}
|
||||
// Make sure all threads are done writing back conn->step and done using
|
||||
// ncclShmem->groups[group]
|
||||
// ncclShmem.groups[group]
|
||||
barrier();
|
||||
}
|
||||
|
||||
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
|
||||
if (flags & RoleInput) {
|
||||
userBuff = (T*)inputBuf;
|
||||
ncclShmem->redOpArgs[0] = redOpArg; // scaler for local input
|
||||
ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input
|
||||
}
|
||||
if (flags & RoleOutput) userBuff = (T*)outputBuf;
|
||||
bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
|
||||
@@ -585,7 +581,7 @@ private:
|
||||
|
||||
if (Direct && recvProvider) {
|
||||
int spins = 0;
|
||||
void *volatile *slot = ncclShmem->groups[group].recvConns[index]->ptrExchange;
|
||||
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
|
||||
// Wait for consumer to consume previous value before trampling it.
|
||||
while ((void *)atomicAdd_system((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
|
||||
directBuff = (T*)outputBuf;
|
||||
@@ -596,7 +592,7 @@ private:
|
||||
}
|
||||
if (Direct && sendAcceptor) {
|
||||
int spins = 0;
|
||||
void *volatile *slot = ncclShmem->groups[group].sendConns[index]->ptrExchange;
|
||||
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
|
||||
void *ptr;
|
||||
while (true) {
|
||||
ptr = (void *)atomicAdd_system((unsigned long long *) slot,0);
|
||||
@@ -608,9 +604,9 @@ private:
|
||||
}
|
||||
if (Direct && sendProvider) {
|
||||
int spins = 0;
|
||||
void *volatile *slot = ncclShmem->groups[group].sendConns[index]->ptrExchange;
|
||||
volatile uint64_t* argSlot0 = ncclShmem->groups[group].sendConns[index]->redOpArgExchange;
|
||||
volatile uint64_t* argSlot1 = ncclShmem->groups[group].sendConns[index]->redOpArgExchange+1;
|
||||
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
|
||||
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
|
||||
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
|
||||
// Wait for consumer to consume previous value before trampling it.
|
||||
while (((void *)atomicAdd_system((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
|
||||
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
|
||||
@@ -626,9 +622,9 @@ private:
|
||||
}
|
||||
if (Direct && recvAcceptor) {
|
||||
int spins = 0;
|
||||
void *volatile *slot = ncclShmem->groups[group].recvConns[index]->ptrExchange;
|
||||
volatile uint64_t* argSlot0 = ncclShmem->groups[group].recvConns[index]->redOpArgExchange;
|
||||
volatile uint64_t* argSlot1 = ncclShmem->groups[group].recvConns[index]->redOpArgExchange+1;
|
||||
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
|
||||
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
|
||||
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
|
||||
void *ptr;
|
||||
while (true) {
|
||||
ptr = (void *)atomicAdd_system((unsigned long long *) slot,0);
|
||||
@@ -644,7 +640,7 @@ private:
|
||||
arg1 = *argSlot1;
|
||||
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
|
||||
}
|
||||
ncclShmem->redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
|
||||
ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
|
||||
}
|
||||
*argSlot0 = 0; *argSlot1 = 0;
|
||||
*slot = nullptr;
|
||||
|
||||
@@ -16,13 +16,13 @@ namespace {
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclRing *ring = &ncclShmem->channel.ring;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
|
||||
const int nranks = ncclShmem->comm.nRanks;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
const int rank = ncclShmem->comm.rank;
|
||||
const int rank = ncclShmem.comm.rank;
|
||||
const int prevRank = ring->userRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
|
||||
@@ -71,7 +71,7 @@ namespace {
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
using Proto = ProtoSimple<REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
}
|
||||
@@ -79,14 +79,14 @@ struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -16,12 +16,12 @@ namespace {
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclRing *ring = &ncclShmem->channel.ring;
|
||||
ncclRing *ring = &ncclShmem.channel.ring;
|
||||
int const *ringRanks = ring->userRanks;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
|
||||
const int nranks = ncclShmem->comm.nRanks;
|
||||
const int nranks = ncclShmem.comm.nRanks;
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
|
||||
@@ -69,7 +69,7 @@ namespace {
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
using Proto = ProtoSimple<REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS>;
|
||||
runRing<T, RedOp, Proto>(args);
|
||||
}
|
||||
@@ -77,14 +77,14 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROT
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL>(args);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
|
||||
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
|
||||
__device__ __forceinline__ void run(ncclWorkElem *args) {
|
||||
runRing<T, RedOp, ProtoLL128>(args);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -25,20 +25,20 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (isNpKitThread) {
|
||||
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (args->peer == ncclShmem->comm.rank) {
|
||||
if (args->peer == ncclShmem.comm.rank) {
|
||||
struct ncclWorkElemP2p* recvArgs = args-1;
|
||||
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
|
||||
if (buff != recvBuff) {
|
||||
@@ -46,14 +46,14 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -62,14 +62,14 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -90,7 +90,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -105,7 +105,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -120,20 +120,20 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
|
||||
if (isNpKitThread) {
|
||||
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
|
||||
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (args->peer != ncclShmem->comm.rank) {
|
||||
if (args->peer != ncclShmem.comm.rank) {
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
|
||||
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
|
||||
@@ -151,7 +151,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
prims.npKitDataProcessTotalTime = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -166,14 +166,14 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void run(ncclWork *work) {
|
||||
__device__ __attribute__((noinline)) void run(ncclWork *work) {
|
||||
struct ncclWorkElemP2p* args = work->p2pElems;
|
||||
int ngroups = args->ngroups;
|
||||
int tid = threadIdx.x;
|
||||
|
||||
Ссылка в новой задаче
Block a user