Define ncclShmem as global shared (#618)

* Use global defined shared memory

* Add --hipcc-func-supp to compile option

* Force inline some device functions

* Add back threadfence
Этот коммит содержится в:
Wenkai Du
2022-09-20 09:00:20 -07:00
коммит произвёл GitHub
родитель 05cc7bd850
Коммит 9e6c87a2bf
15 изменённых файлов: 231 добавлений и 237 удалений
+1 -1
Просмотреть файл
@@ -300,7 +300,7 @@ foreach(target ${AMDGPU_TARGETS})
endforeach()
if("${HIP_COMPILER}" MATCHES "clang")
target_compile_options(rccl PRIVATE -fvisibility=hidden)
target_compile_options(rccl PRIVATE -fvisibility=hidden --hipcc-func-supp)
foreach(target ${AMDGPU_TARGETS})
target_compile_options(rccl PRIVATE -fgpu-rdc)
endforeach()
+5 -5
Просмотреть файл
@@ -16,12 +16,12 @@ namespace {
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem->channel.ring;
ncclRing *ring = &ncclShmem.channel.ring;
const int *ringRanks = ring->userRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
const int nranks = ncclShmem->comm.nRanks;
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*int(chunkSize);
const ssize_t size = args->count;
@@ -79,7 +79,7 @@ namespace {
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
@@ -87,14 +87,14 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
}
};
+46 -46
Просмотреть файл
@@ -20,10 +20,10 @@ namespace {
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem->channel.ring;
ncclRing *ring = &ncclShmem.channel.ring;
int ringIx = ring->index;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
const int nranks = ncclShmem->comm.nRanks;
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*nranks*chunkSize;
const ssize_t size = args->count;
@@ -33,23 +33,23 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -102,7 +102,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -112,7 +112,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -121,7 +121,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -136,7 +136,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -149,7 +149,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -159,14 +159,14 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -182,14 +182,14 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -203,7 +203,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -212,7 +212,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -224,7 +224,7 @@ namespace {
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem->channel.tree;
ncclTree *tree = &ncclShmem.channel.tree;
ssize_t chunkSize = int(
Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize
/* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T));
@@ -240,23 +240,23 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -276,7 +276,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -306,7 +306,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -325,7 +325,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -355,7 +355,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -364,7 +364,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -376,7 +376,7 @@ namespace {
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclTree *tree = (args->pad_0 == 2) ? &ncclShmem->channel.binTree : &ncclShmem->channel.tree;
ncclTree *tree = (args->pad_0 == 2) ? &ncclShmem.channel.binTree : &ncclShmem.channel.tree;
ssize_t chunkSize = int(
Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
: Proto::calcBytePerStep()/sizeof(T));
@@ -410,23 +410,23 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -447,7 +447,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -461,7 +461,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -487,7 +487,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -510,7 +510,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -529,7 +529,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -552,7 +552,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -561,7 +561,7 @@ namespace {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -570,7 +570,7 @@ namespace {
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
@@ -578,7 +578,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(args);
}
};
@@ -590,7 +590,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclDirect* tree = &ncclShmem->channel.collTree;
struct ncclDirect* tree = &ncclShmem.channel.collTree;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*tree->nHeads*chunkSize;
@@ -683,14 +683,14 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
if (args->pad_0 == 0) runTreeUpDown<T, RedOp, ProtoLL>(args);
else runTreeSplit<T, RedOp, ProtoLL>(args);
}
@@ -698,7 +698,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
//LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
}
@@ -706,7 +706,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runTreeSplit<T, RedOp, ProtoLL128>(args);
//LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
}
+3 -3
Просмотреть файл
@@ -14,8 +14,8 @@ namespace {
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nranks = ncclShmem->comm.nRanks;
const ncclRing *ring = &ncclShmem->channel.ring;
const int nranks = ncclShmem.comm.nRanks;
const ncclRing *ring = &ncclShmem.channel.ring;
const int num_bi_rings = args->pivotA2ANumBiRings;
const int num_uni_rings = num_bi_rings * 2;
const int num_chunks = args->nChannels / 2;
@@ -71,7 +71,7 @@ namespace {
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllToAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<ALLTOALL_PIVOT_CHUNKSTEPS/ALLTOALL_PIVOT_SLICESTEPS, ALLTOALL_PIVOT_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
+4 -4
Просмотреть файл
@@ -15,7 +15,7 @@ namespace {
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem->channel.ring;
ncclRing *ring = &ncclShmem.channel.ring;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1));
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const ssize_t loopSize = nChannels*chunkSize;
@@ -61,7 +61,7 @@ namespace {
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
@@ -69,14 +69,14 @@ struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SI
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
}
};
+52 -54
Просмотреть файл
@@ -188,7 +188,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs_ll128[]{
template<unsigned short f, unsigned short l, bool u>
struct Caller {
static __device__ __host__
static __forceinline__ __device__ __host__
void call(unsigned short funcIndex) noexcept
{
constexpr unsigned short m = f + (l - f) / 2;
@@ -199,7 +199,7 @@ struct Caller {
template<unsigned short f, bool u>
struct Caller<f, f + 1, u>{
static __device__ __host__
static __forceinline__ __device__ __host__
void call(unsigned short funcIndex) noexcept { if (u) ncclFuncs_ll128[f](); else ncclFuncs[f](); }
};
@@ -207,7 +207,7 @@ static_assert(FUNC_INDEX_P2P == 2710, "Wrong P2P function index");
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 2711, "Wrong AllToAllPivot function index");
template<bool USING_LL128>
inline
__forceinline__
__device__
void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
#if defined(BUILD_ALLREDUCE_ONLY)
@@ -320,14 +320,14 @@ class ncclFunction {
#ifdef ENABLE_COLLTRACE
#define traceColl(launch_type) { \
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->bid = blockIdx.x; \
collTrace->funcIndex = shmem.work.header.funcIndex; \
collTrace->funcIndex = ncclShmem.work.header.funcIndex; \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \
if (shmem.work.header.type == ncclWorkTypeP2p) { \
struct ncclWorkElemP2p *p2pElems = shmem.work.p2pElems; \
if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
collTrace->p2p[0].connIndex = 0; \
collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \
@@ -341,8 +341,8 @@ class ncclFunction {
collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \
collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
} else if (shmem.work.header.type == ncclWorkTypeColl) { \
struct ncclWorkElem *elems = shmem.work.elems; \
} else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \
struct ncclWorkElem *elems = ncclShmem.work.elems; \
collTrace->opCount = elems[0].opCount; \
collTrace->coll.nWarps = elems[0].nWarps; \
collTrace->coll.bid = elems[0].bid; \
@@ -355,23 +355,23 @@ class ncclFunction {
traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
}
#define traceKernelEnd() { \
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->bid = blockIdx.x; \
collTrace->type = ncclCollTraceKernelEndType; \
}
#define traceAbort() { \
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->bid = blockIdx.x; \
collTrace->type = ncclCollTraceAbortType; \
}
// traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
#define traceData(data2, data4, data8_0, data8_1) { \
uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem->comm.collTrace+pos; \
uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \
collTrace->bid = blockIdx.x; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->funcIndex = data2; \
@@ -408,14 +408,14 @@ struct ncclShmemData {
struct ncclProf prof;
#endif
};
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "ncclShmem.work needs to be 16B aligned");
#ifdef ENABLE_PROFILING
#define __insert_timestamp(line_num) do { \
if (shmem.prof.count < PROFILE_NUM_ITEMS) { \
shmem.prof.elem[shmem.prof.count].line = line_num; \
shmem.prof.elem[shmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
shmem.prof.count++; \
if (ncclShmem.prof.count < PROFILE_NUM_ITEMS) { \
ncclShmem.prof.elem[ncclShmem.prof.count].line = line_num; \
ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
ncclShmem.prof.count++; \
} \
} while(0);
#else
@@ -459,7 +459,7 @@ struct RunWork {
}
};
static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
static __forceinline__ __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
if (we->isUsed && we->redOpArgIsPtr) {
/* redOpArg is a pointer to the scalar value, so we'll dereference it
* here so that redOpArg holds the bits of the scalar going forward.
@@ -480,19 +480,17 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
}
}
extern __device__ struct ncclShmemData *ncclShmem;
extern __shared__ ncclShmemData ncclShmem;
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE, bool USING_LL128>
__device__ void ncclKernel(
__forceinline__ __device__ void ncclKernel(
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
) {
int tid = threadIdx.x;
__shared__ struct ncclShmemData shmem;
ncclShmem = &shmem;
if (tid == 0) {
for (auto i = 0; i < NCCL_MAX_GROUPS; i++) {
shmem.groups[i].barrier = 0;
for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0;
ncclShmem.groups[i].barrier = 0;
for (auto j = 0; j < NCCL_MAX_GROUPS; j++) ncclShmem.groups[i].barrier_next[j] = 0;
}
}
// To map blockId to channelId, we need the n'th set bit of channelMask which
@@ -501,18 +499,18 @@ __device__ void ncclKernel(
int x = tid;
if (channelMask & (1ull<<x)) {
int y = __popcll(channelMask & ((1ull<<x)-1));
if (blockIdx.x == y) shmem.channelId = x;
if (blockIdx.x == y) ncclShmem.channelId = x;
}
if (32 < MAXCHANNELS) {
x = 32 + tid;
if (channelMask & (1ull<<x)) {
int y = __popcll(channelMask & ((1ull<<x)-1));
if (blockIdx.x == y) shmem.channelId = x;
if (blockIdx.x == y) ncclShmem.channelId = x;
}
}
}
__syncthreads(); // publish shmem.channelId
int channelId = shmem.channelId;
__syncthreads(); // publish ncclShmem.channelId
int channelId = ncclShmem.channelId;
if (true) {
void *dst, *src;
@@ -520,20 +518,20 @@ __device__ void ncclKernel(
// Use first 3 warps to load comm, channel, and work into shmem
switch (tid/WARP_SIZE) {
case 0:
dst = &shmem.comm;
dst = &ncclShmem.comm;
src = comm;
bytes = sizeof(ncclDevComm);
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
break;
case 1:
// Get address of channel without incurring indirect load from ncclDevComm::channels
dst = &shmem.channel;
dst = &ncclShmem.channel;
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
bytes = sizeof(ncclDevChannel);
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
break;
case 2:
dst = &shmem.work;
dst = &ncclShmem.work;
src = workHead + blockIdx.x;
bytes = sizeof(ncclWork);
static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
@@ -547,8 +545,8 @@ __device__ void ncclKernel(
__syncthreads(); // publish shmem
#ifdef ENABLE_PROFILING
if (tid == 0) {
shmem.prof.count = 0;
shmem.prof.seq = shmem.comm.devProf[blockIdx.x].seq;
ncclShmem.prof.count = 0;
ncclShmem.prof.seq = ncclShmem.comm.devProf[blockIdx.x].seq;
}
#endif
if (tid == 0) __insert_timestamp(__LINE__);
@@ -557,34 +555,34 @@ __device__ void ncclKernel(
while (true) {
// Notify host that all fifo reads are complete.
if (tid == 0 && shmem.work.header.isLast && shmem.work.header.inFifo) {
*shmem.channel.workFifoDone = shmem.work.header.doneAcks;
if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) {
*ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks;
}
__syncwarp();
if (shmem.work.header.type == ncclWorkTypeColl) {
if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&shmem.work.elems[tid]);
} else if (shmem.work.header.type == ncclWorkTypeRegColl) {
if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&shmem.work.regElems[tid].elem);
if (ncclShmem.work.header.type == ncclWorkTypeColl) {
if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]);
} else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) {
if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem);
}
__syncthreads();
if (tid == 0) __insert_timestamp(__LINE__);
if (shmem.work.header.funcIndex == FnIndex) {
RunWork<Fn, T, RedOp, Algo, Proto>().run(&shmem.work);
if (ncclShmem.work.header.funcIndex == FnIndex) {
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
} else {
NCCL_CALL_FUNCTIONS<USING_LL128>(shmem.work.header.funcIndex);
NCCL_CALL_FUNCTIONS<USING_LL128>(ncclShmem.work.header.funcIndex);
}
int workIxNext = shmem.work.header.workNext;
int workIxNext = ncclShmem.work.header.workNext;
__syncthreads();
if (shmem.work.header.isLast) break;
if (ncclShmem.work.header.isLast) break;
copyToShmem16(tid, &shmem.work, workHead + workIxNext, sizeof(ncclWork));
copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork));
{ // Check whether the last operation was aborted and make sure all threads exit
int aborted = tid == 0 ? *comm->abortFlag : 0;
if (__any(aborted)) { // publish shmem.work
if (__any(aborted)) { // publish ncclShmem.work
traceAbort();
break;
}
@@ -593,10 +591,10 @@ __device__ void ncclKernel(
}
if (COLLTRACE && tid == 0) traceKernelEnd();
#ifdef ENABLE_PROFILING
if (shmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
__syncthreads();
copyToShmem16(tid, shmem.comm.devProf+MAXCHANNELS*shmem.prof.seq+blockIdx.x, &shmem.prof, sizeof(struct ncclProf));
if (tid == 0) shmem.comm.devProf[blockIdx.x].seq++;
copyToShmem16(tid, ncclShmem.comm.devProf+MAXCHANNELS*ncclShmem.prof.seq+blockIdx.x, &ncclShmem.prof, sizeof(struct ncclProf));
if (tid == 0) ncclShmem.comm.devProf[blockIdx.x].seq++;
}
#endif
}
@@ -626,7 +624,7 @@ __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(st
/* Functions for aggregation case */
#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
__device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem->work); \
RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
}
// Only generate inline kernels for LL
+1 -1
Просмотреть файл
@@ -9,7 +9,7 @@
#include "collectives.h"
#include "common.h"
__device__ struct ncclShmemData* ncclShmem;
__shared__ ncclShmemData ncclShmem;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
+1 -1
Просмотреть файл
@@ -13,7 +13,7 @@
namespace {
template<typename T, typename RedOp>
__device__ __attribute__((noinline)) void oneRankReduce() {
ncclWork *w = &ncclShmem->work;
ncclWork *w = &ncclShmem.work;
int tid = threadIdx.x;
int tn = blockDim.x;
#pragma unroll 1
+3 -3
Просмотреть файл
@@ -45,7 +45,7 @@ struct ProtoSimple {
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
return ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
return ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
}
// Granularity of data bytes transferred per thread.
__device__ static int calcBytePerGrain() {
@@ -63,7 +63,7 @@ struct ProtoLL {
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
return ncclShmem->comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data
return ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data
}
// Granularity of data bytes transferred per thread.
__device__ static int calcBytePerGrain() {
@@ -81,7 +81,7 @@ struct ProtoLL128 {
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
return (ncclShmem->comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS;
return (ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS;
}
// Granularity of data bytes transferred per thread.
__device__ static int calcBytePerGrain() {
+25 -25
Просмотреть файл
@@ -83,7 +83,7 @@ private:
inline __device__ int checkAbort(int &spins, int send) {
spins++;
if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
abort = __atomic_load_n((ncclShmem->comm.abortFlag), __ATOMIC_SEQ_CST);
abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
spins = 0;
}
return abort;
@@ -93,7 +93,7 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY, nbytes, 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (sendConnHeadPtr) {
@@ -114,7 +114,7 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT, nbytes, 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
@@ -376,7 +376,7 @@ private:
npKitWaitRecvTotalTime = 0;
npKitWaitRecvDataProcessSize = nelem*sizeof(T);
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY,
npKitWaitRecvDataProcessSize, 0, __builtin_amdgcn_s_memrealtime(), ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
npKitWaitRecvDataProcessSize, 0, __builtin_amdgcn_s_memrealtime(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -446,7 +446,7 @@ private:
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT,
npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -494,11 +494,11 @@ private:
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group&(uint16_t)0xFFFF),
stepLines(ncclShmem->comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
barriers = &ncclShmem->groups[this->group].barrier;
barrier_next = ncclShmem->groups[this->group].barrier_next;
stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
barriers = &ncclShmem.groups[this->group].barrier;
barrier_next = ncclShmem.groups[this->group].barrier_next;
auto *channel = &ncclShmem->channel;
auto *channel = &ncclShmem.channel;
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
@@ -539,14 +539,14 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
@@ -554,14 +554,14 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
@@ -569,14 +569,14 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
@@ -584,14 +584,14 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
@@ -599,14 +599,14 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
@@ -614,14 +614,14 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
@@ -629,14 +629,14 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
@@ -644,14 +644,14 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
+5 -5
Просмотреть файл
@@ -78,7 +78,7 @@ private:
inline __device__ int checkAbort(int &spins, int i, int send) {
spins++;
if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
abort = __atomic_load_n(ncclShmem->comm.abortFlag, __ATOMIC_SEQ_CST);
abort = __atomic_load_n(ncclShmem.comm.abortFlag, __ATOMIC_SEQ_CST);
spins = 0;
}
return abort;
@@ -408,11 +408,11 @@ public:
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
flagThread((tid%4)==3), group(group&(uint16_t)0xFFFF),
stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
barriers = &ncclShmem->groups[this->group].barrier;
barrier_next = ncclShmem->groups[this->group].barrier_next;
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
barriers = &ncclShmem.groups[this->group].barrier;
barrier_next = ncclShmem.groups[this->group].barrier_next;
auto *channel = &ncclShmem->channel;
auto *channel = &ncclShmem.channel;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
+57 -61
Просмотреть файл
@@ -90,7 +90,7 @@ private:
inline __device__ bool checkAbort(int &spins) {
spins++;
if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
flags |= atomicAdd_system((unsigned int *)ncclShmem->comm.abortFlag, 0) ? Aborted : 0;
flags |= atomicAdd_system((unsigned int *)ncclShmem.comm.abortFlag, 0) ? Aborted : 0;
spins = 0;
}
return flags & Aborted;
@@ -108,7 +108,7 @@ private:
__builtin_amdgcn_s_sleep(8);
connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
if (checkAbort(spins)) break;
//if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem->comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
//if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
if (spins == 0) traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
}
__asm__ __volatile__("s_wakeup");
@@ -118,8 +118,8 @@ private:
if (isSendNotRecv && (flags & SizesFifoEnabled))
__atomic_store_n((connSizesFifoPtr+step%NCCL_STEPS), nelts*sizeof(T), __ATOMIC_SEQ_CST);
void **ptrs = isSendNotRecv ? (ncclShmem->groups[group].dsts + Dst)
: (ncclShmem->groups[group].srcs + Src);
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
: (ncclShmem.groups[group].srcs + Src);
if (flags & OffsFifoEnabled)
ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
else if (isSendNotRecv && DirectSend) {
@@ -200,19 +200,19 @@ private:
do {
sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput)))
ncclShmem->groups[group].srcs[0] = userBuff + srcIx + offset;
ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset;
if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
ncclShmem->groups[group].dsts[0] = userBuff + dstIx + offset;
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
subBarrier();
if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -225,8 +225,8 @@ private:
// (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0).
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, (1-Send)+MaxSend, 0>
(tid, nworkers, nullptr, false,
1, (T const**)ncclShmem->groups[group].srcs,
fan.nsend(), (T**)ncclShmem->groups[group].dsts+1,
1, (T const**)ncclShmem.groups[group].srcs,
fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
sliceSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
@@ -239,18 +239,18 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem->groups[group].dsts[Dst] == nullptr) {
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
// For broadcast in CollNet to do empty send
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -261,9 +261,9 @@ private:
#endif
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>
(tid, nworkers, ncclShmem->redOpArgs, postOp,
Recv, (T const**)ncclShmem->groups[group].srcs,
Dst, (T**)ncclShmem->groups[group].dsts,
(tid, nworkers, ncclShmem.redOpArgs, postOp,
Recv, (T const**)ncclShmem.groups[group].srcs,
Dst, (T**)ncclShmem.groups[group].dsts,
sliceSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
@@ -276,7 +276,7 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -285,7 +285,7 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -298,9 +298,9 @@ private:
constexpr int PreOpN = SrcBuf != Input ? 0 :
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
ReduceOrCopyMulti<Unroll, RedOp, T, Recv+Src, Recv*MaxRecv+Src, Send+Dst, Send*MaxSend+Dst, PreOpN>
(tid, nworkers, ncclShmem->redOpArgs, postOp,
Recv*fan.nrecv()+Src, (T const**)ncclShmem->groups[group].srcs,
Send*fan.nsend()+Dst, (T**)ncclShmem->groups[group].dsts,
(tid, nworkers, ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
sliceSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
@@ -313,15 +313,13 @@ private:
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
barrier(); // This barrier has a counterpart in following loop
#if defined(__gfx1030__)
if (Send && (flags & RolePostSend) && index == 0) __threadfence_system();
#endif
if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && index == 0) __threadfence_system();
__syncwarp();
postPeer<Recv, Send>();
offset += sliceSize;
@@ -341,9 +339,7 @@ private:
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
}
barrier(); // Has couterpart in preceding worker-only loop.
#if defined(__gfx1030__)
if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
#endif
if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system();
__syncwarp();
postPeer<Recv, Send>();
offset += sliceSize;
@@ -370,8 +366,8 @@ private:
if (Send) {
// Scatter pre-scales data of input buffer only in non-Direct case
constexpr int PreOpN = DirectSend ? 0 : 1;
if (flags & RoleInput) ncclShmem->groups[group].srcs[0] = userBuff + inpIx + offset;
if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] = 0; // Skip the threadfence
if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] = 0; // Skip the threadfence
// realSize is not accurate here; but intra-node does not rely on sizes FIFO
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
subBarrier();
@@ -382,22 +378,22 @@ private:
int peerOffset = i*peerElem;
// Skip the data I am responsible of reducing myself
if (skip >= 0 && i >= skip) peerOffset += peerElem;
const T* src0 = (T*)ncclShmem->groups[group].srcs[0] + peerOffset;
const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset;
int realPeerSize = min(realSize, totalElem-peerOffset);
if (realPeerSize > 0 && ncclShmem->groups[group].dsts[i] != nullptr) {
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem->redOpArgs, false, 1, &src0, 1, (T**)ncclShmem->groups[group].dsts+i, realPeerSize);
if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize);
// Mark for threadfence at the end
if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] += realPeerSize;
if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize;
}
}
} else if (Recv) {
if (flags & RoleOutput) ncclShmem->groups[group].dsts[0] = userBuff + outIx + offset;
if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
int peerOffset = index*peerElem;
if (skip >= 0 && index >= skip) peerOffset += peerElem;
// Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+peerOffset, offset, realSize);
subBarrier();
if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) {
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
// Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv
// Do nothing
} else {
@@ -406,16 +402,16 @@ private:
int i = (j+shift)%fan.nrecv();
peerOffset = i*peerElem;
if (skip >= 0 && i >= skip) peerOffset += peerElem;
T* dst0 = (T*)ncclShmem->groups[group].dsts[0] + peerOffset;
T* dst0 = (T*)ncclShmem.groups[group].dsts[0] + peerOffset;
int realPeerSize = min(realSize, totalElem-peerOffset);
if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>(tid, nworkers, ncclShmem->redOpArgs, postOp, 1, (const T**)ncclShmem->groups[group].srcs+i, 1, &dst0, realPeerSize);
if (realPeerSize > 0) ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, 0>(tid, nworkers, ncclShmem.redOpArgs, postOp, 1, (const T**)ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
}
}
}
}
barrier();
// If we indeed send something, threadfence
if (Send && (flags & RolePostSend) && ncclShmem->groups[group].totalSendSize[slice] > 0 && index == 0)
if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0)
__threadfence_system();
__syncwarp();
postPeer<Recv, Send>();
@@ -433,7 +429,7 @@ private:
atomicExch_system((unsigned long long *)connStepPtr, step); // Return credits in case we rounded up.
}
if (flags & RoleWaitRecv) {
ncclShmem->groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
connStepPtr = conn->tail;
connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
@@ -473,7 +469,7 @@ private:
next_hdp_reg = conn->next_hdp_reg;
}
if (flags & RoleWaitSend) {
ncclShmem->groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
connStepPtr = conn->head;
connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
@@ -508,20 +504,20 @@ private:
}
public:
__device__ Primitives(
__forceinline__ __device__ Primitives(
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
):
tid(tid),
stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
// For send operations, we need an extra warp to overlap the threadfence and the copy
this->nthreads = nthreads;
this->nworkers = nthreads;
this->group = group & (uint16_t)0xFFFF;
int connIndex = group >> 16;
barriers = &ncclShmem->groups[this->group].barrier;
barrier_next = ncclShmem->groups[this->group].barrier_next;
barriers = &ncclShmem.groups[this->group].barrier;
barrier_next = ncclShmem.groups[this->group].barrier_next;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
@@ -551,30 +547,30 @@ private:
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
loadRecvConn(&ncclShmem->channel.peers[peer], connIndex, e);
loadSendConn(&ncclShmem->channel.peers[peer], connIndex, e);
loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e);
loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e);
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
}
__device__ ~Primitives() {
// Ensure ncclShmem->groups[].send/recvConns are available
__forceinline__ __device__ ~Primitives() {
// Ensure ncclShmem.groups[].send/recvConns are available
if (!(flags & ThreadsSynced))
barrier();
// Save steps for the next operation
if (flags & (RolePostSend|RolePostRecv)) {
auto *conns = (flags & RolePostSend) ? ncclShmem->groups[group].sendConns : ncclShmem->groups[group].recvConns;
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
conns[index]->step = step;
}
// Make sure all threads are done writing back conn->step and done using
// ncclShmem->groups[group]
// ncclShmem.groups[group]
barrier();
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
if (flags & RoleInput) {
userBuff = (T*)inputBuf;
ncclShmem->redOpArgs[0] = redOpArg; // scaler for local input
ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input
}
if (flags & RoleOutput) userBuff = (T*)outputBuf;
bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
@@ -585,7 +581,7 @@ private:
if (Direct && recvProvider) {
int spins = 0;
void *volatile *slot = ncclShmem->groups[group].recvConns[index]->ptrExchange;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
// Wait for consumer to consume previous value before trampling it.
while ((void *)atomicAdd_system((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
directBuff = (T*)outputBuf;
@@ -596,7 +592,7 @@ private:
}
if (Direct && sendAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem->groups[group].sendConns[index]->ptrExchange;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
void *ptr;
while (true) {
ptr = (void *)atomicAdd_system((unsigned long long *) slot,0);
@@ -608,9 +604,9 @@ private:
}
if (Direct && sendProvider) {
int spins = 0;
void *volatile *slot = ncclShmem->groups[group].sendConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem->groups[group].sendConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem->groups[group].sendConns[index]->redOpArgExchange+1;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
// Wait for consumer to consume previous value before trampling it.
while (((void *)atomicAdd_system((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
@@ -626,9 +622,9 @@ private:
}
if (Direct && recvAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem->groups[group].recvConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem->groups[group].recvConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem->groups[group].recvConns[index]->redOpArgExchange+1;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
void *ptr;
while (true) {
ptr = (void *)atomicAdd_system((unsigned long long *) slot,0);
@@ -644,7 +640,7 @@ private:
arg1 = *argSlot1;
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
}
ncclShmem->redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
}
*argSlot0 = 0; *argSlot1 = 0;
*slot = nullptr;
+6 -6
Просмотреть файл
@@ -16,13 +16,13 @@ namespace {
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem->channel.ring;
ncclRing *ring = &ncclShmem.channel.ring;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const int nranks = ncclShmem->comm.nRanks;
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
const int rank = ncclShmem->comm.rank;
const int rank = ncclShmem.comm.rank;
const int prevRank = ring->userRanks[nranks-1];
const int root = args->root;
@@ -71,7 +71,7 @@ namespace {
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
@@ -79,14 +79,14 @@ struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
}
};
+5 -5
Просмотреть файл
@@ -16,12 +16,12 @@ namespace {
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem->channel.ring;
ncclRing *ring = &ncclShmem.channel.ring;
int const *ringRanks = ring->userRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
const int nranks = ncclShmem->comm.nRanks;
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
@@ -69,7 +69,7 @@ namespace {
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
@@ -77,14 +77,14 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROT
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __attribute__((noinline)) void run(ncclWorkElem *args) {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
}
};
+17 -17
Просмотреть файл
@@ -25,20 +25,20 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (args->peer == ncclShmem->comm.rank) {
if (args->peer == ncclShmem.comm.rank) {
struct ncclWorkElemP2p* recvArgs = args-1;
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
if (buff != recvBuff) {
@@ -46,14 +46,14 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -62,14 +62,14 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -90,7 +90,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -105,7 +105,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -120,20 +120,20 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp;
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (args->peer != ncclShmem->comm.rank) {
if (args->peer != ncclShmem.comm.rank) {
using Proto = ProtoSimple<1, 1>;
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
@@ -151,7 +151,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
@@ -166,14 +166,14 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
}
__device__ __forceinline__ void run(ncclWork *work) {
__device__ __attribute__((noinline)) void run(ncclWork *work) {
struct ncclWorkElemP2p* args = work->p2pElems;
int ngroups = args->ngroups;
int tid = threadIdx.x;