diff --git a/CMakeLists.txt b/CMakeLists.txt index bdd8ed5b92..f71929e5c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -300,7 +300,7 @@ foreach(target ${AMDGPU_TARGETS}) endforeach() if("${HIP_COMPILER}" MATCHES "clang") - target_compile_options(rccl PRIVATE -fvisibility=hidden) + target_compile_options(rccl PRIVATE -fvisibility=hidden --hipcc-func-supp) foreach(target ${AMDGPU_TARGETS}) target_compile_options(rccl PRIVATE -fgpu-rdc) endforeach() diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index d83425895d..81a091cb23 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -16,12 +16,12 @@ namespace { const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; - ncclRing *ring = &ncclShmem->channel.ring; + ncclRing *ring = &ncclShmem.channel.ring; const int *ringRanks = ring->userRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1)); // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); - const int nranks = ncclShmem->comm.nRanks; + const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*int(chunkSize); const ssize_t size = args->count; @@ -79,7 +79,7 @@ namespace { template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } @@ -87,14 +87,14 @@ struct RunWorkElement struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index ea90cbd37f..f71953aa26 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -20,10 +20,10 @@ namespace { const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; - ncclRing *ring = &ncclShmem->channel.ring; + ncclRing *ring = &ncclShmem.channel.ring; int ringIx = ring->index; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1)); - const int nranks = ncclShmem->comm.nRanks; + const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*nranks*chunkSize; const ssize_t size = args->count; @@ -33,23 +33,23 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) if (tid == 0) { - uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp; + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -102,7 +102,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -112,7 +112,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -121,7 +121,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY) if (tid == 0 && nranks > 2) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -136,7 +136,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT) if (tid == 0 && nranks > 2) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -149,7 +149,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -159,14 +159,14 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY) if (tid == 0 && nranks > 2) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -182,14 +182,14 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT) if (tid == 0 && nranks > 2) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -203,7 +203,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -212,7 +212,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -224,7 +224,7 @@ namespace { const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; - ncclTree *tree = &ncclShmem->channel.tree; + ncclTree *tree = &ncclShmem.channel.tree; ssize_t chunkSize = int( Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize /* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T)); @@ -240,23 +240,23 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) if (tid == 0) { - uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp; + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -276,7 +276,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -306,7 +306,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -325,7 +325,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -355,7 +355,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -364,7 +364,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -376,7 +376,7 @@ namespace { const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; - ncclTree *tree = (args->pad_0 == 2) ? &ncclShmem->channel.binTree : &ncclShmem->channel.tree; + ncclTree *tree = (args->pad_0 == 2) ? &ncclShmem.channel.binTree : &ncclShmem.channel.tree; ssize_t chunkSize = int( Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize : Proto::calcBytePerStep()/sizeof(T)); @@ -410,23 +410,23 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) if (isNpKitThread) { - uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp; + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -447,7 +447,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -461,7 +461,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -487,7 +487,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -510,7 +510,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -529,7 +529,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -552,7 +552,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -561,7 +561,7 @@ namespace { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT, size*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -570,7 +570,7 @@ namespace { template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } @@ -578,7 +578,7 @@ struct RunWorkElement struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runTreeUpDown>(args); } }; @@ -590,7 +590,7 @@ struct RunWorkElementbid; const int nChannels = args->nChannels; - struct ncclDirect* tree = &ncclShmem->channel.collTree; + struct ncclDirect* tree = &ncclShmem.channel.collTree; const ssize_t chunkSize = int(args->lastChunkSize); const ssize_t size = args->count; const ssize_t loopSize = nChannels*tree->nHeads*chunkSize; @@ -683,14 +683,14 @@ struct RunWorkElement struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { if (args->pad_0 == 0) runTreeUpDown(args); else runTreeSplit(args); } @@ -698,7 +698,7 @@ struct RunWorkElement struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); //LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args); } @@ -706,7 +706,7 @@ struct RunWorkElement struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runTreeSplit(args); //LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args); } diff --git a/src/collectives/device/alltoall_pivot.h b/src/collectives/device/alltoall_pivot.h index 948bdc2827..0ffa7caee9 100644 --- a/src/collectives/device/alltoall_pivot.h +++ b/src/collectives/device/alltoall_pivot.h @@ -14,8 +14,8 @@ namespace { const int tid = threadIdx.x; const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; - const int nranks = ncclShmem->comm.nRanks; - const ncclRing *ring = &ncclShmem->channel.ring; + const int nranks = ncclShmem.comm.nRanks; + const ncclRing *ring = &ncclShmem.channel.ring; const int num_bi_rings = args->pivotA2ANumBiRings; const int num_uni_rings = num_bi_rings * 2; const int num_chunks = args->nChannels / 2; @@ -71,7 +71,7 @@ namespace { template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index c54a7d006c..85c1999ded 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -15,7 +15,7 @@ namespace { const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; - ncclRing *ring = &ncclShmem->channel.ring; + ncclRing *ring = &ncclShmem.channel.ring; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1)); const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const ssize_t loopSize = nChannels*chunkSize; @@ -61,7 +61,7 @@ namespace { template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } @@ -69,14 +69,14 @@ struct RunWorkElement struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index 569515c20a..448e2f7b45 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -188,7 +188,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs_ll128[]{ template struct Caller { - static __device__ __host__ + static __forceinline__ __device__ __host__ void call(unsigned short funcIndex) noexcept { constexpr unsigned short m = f + (l - f) / 2; @@ -199,7 +199,7 @@ struct Caller { template struct Caller{ - static __device__ __host__ + static __forceinline__ __device__ __host__ void call(unsigned short funcIndex) noexcept { if (u) ncclFuncs_ll128[f](); else ncclFuncs[f](); } }; @@ -207,7 +207,7 @@ static_assert(FUNC_INDEX_P2P == 2710, "Wrong P2P function index"); static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 2711, "Wrong AllToAllPivot function index"); template -inline +__forceinline__ __device__ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept { #if defined(BUILD_ALLREDUCE_ONLY) @@ -320,14 +320,14 @@ class ncclFunction { #ifdef ENABLE_COLLTRACE #define traceColl(launch_type) { \ - uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ - struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \ + uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ collTrace->bid = blockIdx.x; \ - collTrace->funcIndex = shmem.work.header.funcIndex; \ + collTrace->funcIndex = ncclShmem.work.header.funcIndex; \ asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \ - if (shmem.work.header.type == ncclWorkTypeP2p) { \ - struct ncclWorkElemP2p *p2pElems = shmem.work.p2pElems; \ + if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \ + struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \ collTrace->p2p[0].connIndex = 0; \ collTrace->p2pOpCount[0] = p2pElems[0].opCount; \ collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \ @@ -341,8 +341,8 @@ class ncclFunction { collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \ collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \ collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \ - } else if (shmem.work.header.type == ncclWorkTypeColl) { \ - struct ncclWorkElem *elems = shmem.work.elems; \ + } else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \ + struct ncclWorkElem *elems = ncclShmem.work.elems; \ collTrace->opCount = elems[0].opCount; \ collTrace->coll.nWarps = elems[0].nWarps; \ collTrace->coll.bid = elems[0].bid; \ @@ -355,23 +355,23 @@ class ncclFunction { traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \ } #define traceKernelEnd() { \ - uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ - struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \ + uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ collTrace->bid = blockIdx.x; \ collTrace->type = ncclCollTraceKernelEndType; \ } #define traceAbort() { \ - uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ - struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \ + uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ collTrace->bid = blockIdx.x; \ collTrace->type = ncclCollTraceAbortType; \ } // traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1) #define traceData(data2, data4, data8_0, data8_1) { \ - uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ - struct ncclCollTrace* collTrace = ncclShmem->comm.collTrace+pos; \ + uint32_t pos = __atomic_fetch_add(ncclShmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ + struct ncclCollTrace* collTrace = ncclShmem.comm.collTrace+pos; \ collTrace->bid = blockIdx.x; \ collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \ collTrace->funcIndex = data2; \ @@ -408,14 +408,14 @@ struct ncclShmemData { struct ncclProf prof; #endif }; -static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned"); +static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "ncclShmem.work needs to be 16B aligned"); #ifdef ENABLE_PROFILING #define __insert_timestamp(line_num) do { \ - if (shmem.prof.count < PROFILE_NUM_ITEMS) { \ - shmem.prof.elem[shmem.prof.count].line = line_num; \ - shmem.prof.elem[shmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \ - shmem.prof.count++; \ + if (ncclShmem.prof.count < PROFILE_NUM_ITEMS) { \ + ncclShmem.prof.elem[ncclShmem.prof.count].line = line_num; \ + ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \ + ncclShmem.prof.count++; \ } \ } while(0); #else @@ -459,7 +459,7 @@ struct RunWork { } }; -static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { +static __forceinline__ __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { if (we->isUsed && we->redOpArgIsPtr) { /* redOpArg is a pointer to the scalar value, so we'll dereference it * here so that redOpArg holds the bits of the scalar going forward. @@ -480,19 +480,17 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { } } -extern __device__ struct ncclShmemData *ncclShmem; +extern __shared__ ncclShmemData ncclShmem; template -__device__ void ncclKernel( +__forceinline__ __device__ void ncclKernel( struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead ) { int tid = threadIdx.x; - __shared__ struct ncclShmemData shmem; - ncclShmem = &shmem; if (tid == 0) { for (auto i = 0; i < NCCL_MAX_GROUPS; i++) { - shmem.groups[i].barrier = 0; - for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0; + ncclShmem.groups[i].barrier = 0; + for (auto j = 0; j < NCCL_MAX_GROUPS; j++) ncclShmem.groups[i].barrier_next[j] = 0; } } // To map blockId to channelId, we need the n'th set bit of channelMask which @@ -501,18 +499,18 @@ __device__ void ncclKernel( int x = tid; if (channelMask & (1ull<channels[channelId]; bytes = sizeof(ncclDevChannel); static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn."); break; case 2: - dst = &shmem.work; + dst = &ncclShmem.work; src = workHead + blockIdx.x; bytes = sizeof(ncclWork); static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn."); @@ -547,8 +545,8 @@ __device__ void ncclKernel( __syncthreads(); // publish shmem #ifdef ENABLE_PROFILING if (tid == 0) { - shmem.prof.count = 0; - shmem.prof.seq = shmem.comm.devProf[blockIdx.x].seq; + ncclShmem.prof.count = 0; + ncclShmem.prof.seq = ncclShmem.comm.devProf[blockIdx.x].seq; } #endif if (tid == 0) __insert_timestamp(__LINE__); @@ -557,34 +555,34 @@ __device__ void ncclKernel( while (true) { // Notify host that all fifo reads are complete. - if (tid == 0 && shmem.work.header.isLast && shmem.work.header.inFifo) { - *shmem.channel.workFifoDone = shmem.work.header.doneAcks; + if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) { + *ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks; } __syncwarp(); - if (shmem.work.header.type == ncclWorkTypeColl) { - if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&shmem.work.elems[tid]); - } else if (shmem.work.header.type == ncclWorkTypeRegColl) { - if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&shmem.work.regElems[tid].elem); + if (ncclShmem.work.header.type == ncclWorkTypeColl) { + if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]); + } else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) { + if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem); } __syncthreads(); if (tid == 0) __insert_timestamp(__LINE__); - if (shmem.work.header.funcIndex == FnIndex) { - RunWork().run(&shmem.work); + if (ncclShmem.work.header.funcIndex == FnIndex) { + RunWork().run(&ncclShmem.work); } else { - NCCL_CALL_FUNCTIONS(shmem.work.header.funcIndex); + NCCL_CALL_FUNCTIONS(ncclShmem.work.header.funcIndex); } - int workIxNext = shmem.work.header.workNext; + int workIxNext = ncclShmem.work.header.workNext; __syncthreads(); - if (shmem.work.header.isLast) break; + if (ncclShmem.work.header.isLast) break; - copyToShmem16(tid, &shmem.work, workHead + workIxNext, sizeof(ncclWork)); + copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork)); { // Check whether the last operation was aborted and make sure all threads exit int aborted = tid == 0 ? *comm->abortFlag : 0; - if (__any(aborted)) { // publish shmem.work + if (__any(aborted)) { // publish ncclShmem.work traceAbort(); break; } @@ -593,10 +591,10 @@ __device__ void ncclKernel( } if (COLLTRACE && tid == 0) traceKernelEnd(); #ifdef ENABLE_PROFILING - if (shmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) { + if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) { __syncthreads(); - copyToShmem16(tid, shmem.comm.devProf+MAXCHANNELS*shmem.prof.seq+blockIdx.x, &shmem.prof, sizeof(struct ncclProf)); - if (tid == 0) shmem.comm.devProf[blockIdx.x].seq++; + copyToShmem16(tid, ncclShmem.comm.devProf+MAXCHANNELS*ncclShmem.prof.seq+blockIdx.x, &ncclShmem.prof, sizeof(struct ncclProf)); + if (tid == 0) ncclShmem.comm.devProf[blockIdx.x].seq++; } #endif } @@ -626,7 +624,7 @@ __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(st /* Functions for aggregation case */ #define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ - RunWork, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem->work); \ + RunWork, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \ } // Only generate inline kernels for LL diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index e9958db464..c29744c975 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -9,7 +9,7 @@ #include "collectives.h" #include "common.h" -__device__ struct ncclShmemData* ncclShmem; +__shared__ ncclShmemData ncclShmem; #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu index d10345e7d2..256b236386 100644 --- a/src/collectives/device/onerank_reduce.cu +++ b/src/collectives/device/onerank_reduce.cu @@ -13,7 +13,7 @@ namespace { template __device__ __attribute__((noinline)) void oneRankReduce() { - ncclWork *w = &ncclShmem->work; + ncclWork *w = &ncclShmem.work; int tid = threadIdx.x; int tn = blockDim.x; #pragma unroll 1 diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index 3246988342..3f6897de81 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -45,7 +45,7 @@ struct ProtoSimple { // Data bytes (no flags etc) in one step of the fifo queue. __device__ static int calcBytePerStep() { - return ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + return ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; } // Granularity of data bytes transferred per thread. __device__ static int calcBytePerGrain() { @@ -63,7 +63,7 @@ struct ProtoLL { // Data bytes (no flags etc) in one step of the fifo queue. __device__ static int calcBytePerStep() { - return ncclShmem->comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data + return ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data } // Granularity of data bytes transferred per thread. __device__ static int calcBytePerGrain() { @@ -81,7 +81,7 @@ struct ProtoLL128 { // Data bytes (no flags etc) in one step of the fifo queue. __device__ static int calcBytePerStep() { - return (ncclShmem->comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS; + return (ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS; } // Granularity of data bytes transferred per thread. __device__ static int calcBytePerGrain() { diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h index 4e91e188c5..0c96ca9022 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/collectives/device/prims_ll.h @@ -83,7 +83,7 @@ private: inline __device__ int checkAbort(int &spins, int send) { spins++; if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = __atomic_load_n((ncclShmem->comm.abortFlag), __ATOMIC_SEQ_CST); + abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST); spins = 0; } return abort; @@ -93,7 +93,7 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY, nbytes, 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif if (sendConnHeadPtr) { @@ -114,7 +114,7 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT, nbytes, 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } @@ -376,7 +376,7 @@ private: npKitWaitRecvTotalTime = 0; npKitWaitRecvDataProcessSize = nelem*sizeof(T); NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY, - npKitWaitRecvDataProcessSize, 0, __builtin_amdgcn_s_memrealtime(), ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + npKitWaitRecvDataProcessSize, 0, __builtin_amdgcn_s_memrealtime(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -446,7 +446,7 @@ private: if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT, npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -494,11 +494,11 @@ private: ): redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group&(uint16_t)0xFFFF), - stepLines(ncclShmem->comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) { - barriers = &ncclShmem->groups[this->group].barrier; - barrier_next = ncclShmem->groups[this->group].barrier_next; + stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) { + barriers = &ncclShmem.groups[this->group].barrier; + barrier_next = ncclShmem.groups[this->group].barrier_next; - auto *channel = &ncclShmem->channel; + auto *channel = &ncclShmem.channel; // If we are going to support oneshot collNet + LL, then we would need to add connector index here int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { @@ -539,14 +539,14 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } @@ -554,14 +554,14 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } @@ -569,14 +569,14 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } @@ -584,14 +584,14 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } @@ -599,14 +599,14 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } @@ -614,14 +614,14 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } @@ -629,14 +629,14 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } @@ -644,14 +644,14 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index 58653a7c9e..f5d7f31a66 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -78,7 +78,7 @@ private: inline __device__ int checkAbort(int &spins, int i, int send) { spins++; if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = __atomic_load_n(ncclShmem->comm.abortFlag, __ATOMIC_SEQ_CST); + abort = __atomic_load_n(ncclShmem.comm.abortFlag, __ATOMIC_SEQ_CST); spins = 0; } return abort; @@ -408,11 +408,11 @@ public: redOp(redOpArg), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%4)==3), group(group&(uint16_t)0xFFFF), - stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) { - barriers = &ncclShmem->groups[this->group].barrier; - barrier_next = ncclShmem->groups[this->group].barrier_next; + stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) { + barriers = &ncclShmem.groups[this->group].barrier; + barrier_next = ncclShmem.groups[this->group].barrier_next; - auto *channel = &ncclShmem->channel; + auto *channel = &ncclShmem.channel; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) { loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv); diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h index 24012f1dad..e268d4f306 100644 --- a/src/collectives/device/prims_simple.h +++ b/src/collectives/device/prims_simple.h @@ -90,7 +90,7 @@ private: inline __device__ bool checkAbort(int &spins) { spins++; if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - flags |= atomicAdd_system((unsigned int *)ncclShmem->comm.abortFlag, 0) ? Aborted : 0; + flags |= atomicAdd_system((unsigned int *)ncclShmem.comm.abortFlag, 0) ? Aborted : 0; spins = 0; } return flags & Aborted; @@ -108,7 +108,7 @@ private: __builtin_amdgcn_s_sleep(8); connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0); if (checkAbort(spins)) break; - //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem->comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); + //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); if (spins == 0) traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); } __asm__ __volatile__("s_wakeup"); @@ -118,8 +118,8 @@ private: if (isSendNotRecv && (flags & SizesFifoEnabled)) __atomic_store_n((connSizesFifoPtr+step%NCCL_STEPS), nelts*sizeof(T), __ATOMIC_SEQ_CST); - void **ptrs = isSendNotRecv ? (ncclShmem->groups[group].dsts + Dst) - : (ncclShmem->groups[group].srcs + Src); + void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst) + : (ncclShmem.groups[group].srcs + Src); if (flags & OffsFifoEnabled) ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T); else if (isSendNotRecv && DirectSend) { @@ -200,19 +200,19 @@ private: do { sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset; if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput))) - ncclShmem->groups[group].srcs[0] = userBuff + srcIx + offset; + ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset; if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput))) - ncclShmem->groups[group].dsts[0] = userBuff + dstIx + offset; + ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset; waitPeer(dstIx, remoteIx, offset, sliceSize); subBarrier(); - if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) { + if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy if (Send) { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -225,8 +225,8 @@ private: // (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0). ReduceOrCopyMulti (tid, nworkers, nullptr, false, - 1, (T const**)ncclShmem->groups[group].srcs, - fan.nsend(), (T**)ncclShmem->groups[group].dsts+1, + 1, (T const**)ncclShmem.groups[group].srcs, + fan.nsend(), (T**)ncclShmem.groups[group].dsts+1, sliceSize); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) @@ -239,18 +239,18 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } - } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem->groups[group].dsts[Dst] == nullptr) { + } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) { // For broadcast in CollNet to do empty send #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -261,9 +261,9 @@ private: #endif ReduceOrCopyMulti - (tid, nworkers, ncclShmem->redOpArgs, postOp, - Recv, (T const**)ncclShmem->groups[group].srcs, - Dst, (T**)ncclShmem->groups[group].dsts, + (tid, nworkers, ncclShmem.redOpArgs, postOp, + Recv, (T const**)ncclShmem.groups[group].srcs, + Dst, (T**)ncclShmem.groups[group].dsts, sliceSize); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) @@ -276,7 +276,7 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -285,7 +285,7 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -298,9 +298,9 @@ private: constexpr int PreOpN = SrcBuf != Input ? 0 : DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1; ReduceOrCopyMulti - (tid, nworkers, ncclShmem->redOpArgs, postOp, - Recv*fan.nrecv()+Src, (T const**)ncclShmem->groups[group].srcs, - Send*fan.nsend()+Dst, (T**)ncclShmem->groups[group].dsts, + (tid, nworkers, ncclShmem.redOpArgs, postOp, + Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs, + Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts, sliceSize); #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME) @@ -313,15 +313,13 @@ private: #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) if (tid == 0) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } barrier(); // This barrier has a counterpart in following loop -#if defined(__gfx1030__) - if (Send && (flags & RolePostSend) && index == 0) __threadfence_system(); -#endif + if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && index == 0) __threadfence_system(); __syncwarp(); postPeer(); offset += sliceSize; @@ -341,9 +339,7 @@ private: waitPeer(0, 0, 0, 0); } barrier(); // Has couterpart in preceding worker-only loop. -#if defined(__gfx1030__) - if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system(); -#endif + if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system(); __syncwarp(); postPeer(); offset += sliceSize; @@ -370,8 +366,8 @@ private: if (Send) { // Scatter pre-scales data of input buffer only in non-Direct case constexpr int PreOpN = DirectSend ? 0 : 1; - if (flags & RoleInput) ncclShmem->groups[group].srcs[0] = userBuff + inpIx + offset; - if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] = 0; // Skip the threadfence + if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset; + if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] = 0; // Skip the threadfence // realSize is not accurate here; but intra-node does not rely on sizes FIFO waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize); subBarrier(); @@ -382,22 +378,22 @@ private: int peerOffset = i*peerElem; // Skip the data I am responsible of reducing myself if (skip >= 0 && i >= skip) peerOffset += peerElem; - const T* src0 = (T*)ncclShmem->groups[group].srcs[0] + peerOffset; + const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset; int realPeerSize = min(realSize, totalElem-peerOffset); - if (realPeerSize > 0 && ncclShmem->groups[group].dsts[i] != nullptr) { - ReduceOrCopyMulti(tid, nworkers, ncclShmem->redOpArgs, false, 1, &src0, 1, (T**)ncclShmem->groups[group].dsts+i, realPeerSize); + if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) { + ReduceOrCopyMulti(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize); // Mark for threadfence at the end - if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] += realPeerSize; + if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize; } } } else if (Recv) { - if (flags & RoleOutput) ncclShmem->groups[group].dsts[0] = userBuff + outIx + offset; + if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset; int peerOffset = index*peerElem; if (skip >= 0 && index >= skip) peerOffset += peerElem; // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer waitPeer(outIx, outIx+peerOffset, offset, realSize); subBarrier(); - if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) { + if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) { // Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv // Do nothing } else { @@ -406,16 +402,16 @@ private: int i = (j+shift)%fan.nrecv(); peerOffset = i*peerElem; if (skip >= 0 && i >= skip) peerOffset += peerElem; - T* dst0 = (T*)ncclShmem->groups[group].dsts[0] + peerOffset; + T* dst0 = (T*)ncclShmem.groups[group].dsts[0] + peerOffset; int realPeerSize = min(realSize, totalElem-peerOffset); - if (realPeerSize > 0) ReduceOrCopyMulti(tid, nworkers, ncclShmem->redOpArgs, postOp, 1, (const T**)ncclShmem->groups[group].srcs+i, 1, &dst0, realPeerSize); + if (realPeerSize > 0) ReduceOrCopyMulti(tid, nworkers, ncclShmem.redOpArgs, postOp, 1, (const T**)ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize); } } } } barrier(); // If we indeed send something, threadfence - if (Send && (flags & RolePostSend) && ncclShmem->groups[group].totalSendSize[slice] > 0 && index == 0) + if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0) __threadfence_system(); __syncwarp(); postPeer(); @@ -433,7 +429,7 @@ private: atomicExch_system((unsigned long long *)connStepPtr, step); // Return credits in case we rounded up. } if (flags & RoleWaitRecv) { - ncclShmem->groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs() + ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs() connStepPtr = conn->tail; connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0); flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0; @@ -473,7 +469,7 @@ private: next_hdp_reg = conn->next_hdp_reg; } if (flags & RoleWaitSend) { - ncclShmem->groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs() + ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs() connStepPtr = conn->head; connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0); flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0; @@ -508,20 +504,20 @@ private: } public: - __device__ Primitives( + __forceinline__ __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr ): tid(tid), - stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) { + stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) { // For send operations, we need an extra warp to overlap the threadfence and the copy this->nthreads = nthreads; this->nworkers = nthreads; this->group = group & (uint16_t)0xFFFF; int connIndex = group >> 16; - barriers = &ncclShmem->groups[this->group].barrier; - barrier_next = ncclShmem->groups[this->group].barrier_next; + barriers = &ncclShmem.groups[this->group].barrier; + barrier_next = ncclShmem.groups[this->group].barrier_next; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++; @@ -551,30 +547,30 @@ private: if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; - loadRecvConn(&ncclShmem->channel.peers[peer], connIndex, e); - loadSendConn(&ncclShmem->channel.peers[peer], connIndex, e); + loadRecvConn(&ncclShmem.channel.peers[peer], connIndex, e); + loadSendConn(&ncclShmem.channel.peers[peer], connIndex, e); setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e); } - __device__ ~Primitives() { - // Ensure ncclShmem->groups[].send/recvConns are available + __forceinline__ __device__ ~Primitives() { + // Ensure ncclShmem.groups[].send/recvConns are available if (!(flags & ThreadsSynced)) barrier(); // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) { - auto *conns = (flags & RolePostSend) ? ncclShmem->groups[group].sendConns : ncclShmem->groups[group].recvConns; + auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns; conns[index]->step = step; } // Make sure all threads are done writing back conn->step and done using - // ncclShmem->groups[group] + // ncclShmem.groups[group] barrier(); } __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) { if (flags & RoleInput) { userBuff = (T*)inputBuf; - ncclShmem->redOpArgs[0] = redOpArg; // scaler for local input + ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input } if (flags & RoleOutput) userBuff = (T*)outputBuf; bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite); @@ -585,7 +581,7 @@ private: if (Direct && recvProvider) { int spins = 0; - void *volatile *slot = ncclShmem->groups[group].recvConns[index]->ptrExchange; + void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; // Wait for consumer to consume previous value before trampling it. while ((void *)atomicAdd_system((unsigned long long *) slot,0) != nullptr && !checkAbort(spins)); directBuff = (T*)outputBuf; @@ -596,7 +592,7 @@ private: } if (Direct && sendAcceptor) { int spins = 0; - void *volatile *slot = ncclShmem->groups[group].sendConns[index]->ptrExchange; + void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; void *ptr; while (true) { ptr = (void *)atomicAdd_system((unsigned long long *) slot,0); @@ -608,9 +604,9 @@ private: } if (Direct && sendProvider) { int spins = 0; - void *volatile *slot = ncclShmem->groups[group].sendConns[index]->ptrExchange; - volatile uint64_t* argSlot0 = ncclShmem->groups[group].sendConns[index]->redOpArgExchange; - volatile uint64_t* argSlot1 = ncclShmem->groups[group].sendConns[index]->redOpArgExchange+1; + void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange; + volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange; + volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1; // Wait for consumer to consume previous value before trampling it. while (((void *)atomicAdd_system((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins)); // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) @@ -626,9 +622,9 @@ private: } if (Direct && recvAcceptor) { int spins = 0; - void *volatile *slot = ncclShmem->groups[group].recvConns[index]->ptrExchange; - volatile uint64_t* argSlot0 = ncclShmem->groups[group].recvConns[index]->redOpArgExchange; - volatile uint64_t* argSlot1 = ncclShmem->groups[group].recvConns[index]->redOpArgExchange+1; + void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; + volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange; + volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1; void *ptr; while (true) { ptr = (void *)atomicAdd_system((unsigned long long *) slot,0); @@ -644,7 +640,7 @@ private: arg1 = *argSlot1; if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; } - ncclShmem->redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff); + ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff); } *argSlot0 = 0; *argSlot1 = 0; *slot = nullptr; diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index 08bb7f9f1b..ac40072921 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -16,13 +16,13 @@ namespace { const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; - ncclRing *ring = &ncclShmem->channel.ring; + ncclRing *ring = &ncclShmem.channel.ring; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1)); const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); - const int nranks = ncclShmem->comm.nRanks; + const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*chunkSize; const ssize_t size = args->count; - const int rank = ncclShmem->comm.rank; + const int rank = ncclShmem.comm.rank; const int prevRank = ring->userRanks[nranks-1]; const int root = args->root; @@ -71,7 +71,7 @@ namespace { template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } @@ -79,14 +79,14 @@ struct RunWorkElement struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index 975c4533ba..9600a900dc 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -16,12 +16,12 @@ namespace { const int nthreads = args->nWarps*WARP_SIZE; const int bid = args->bid; const int nChannels = args->nChannels; - ncclRing *ring = &ncclShmem->channel.ring; + ncclRing *ring = &ncclShmem.channel.ring; int const *ringRanks = ring->userRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1)); // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); - const int nranks = ncclShmem->comm.nRanks; + const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*chunkSize; const ssize_t size = args->count; @@ -69,7 +69,7 @@ namespace { template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { using Proto = ProtoSimple; runRing(args); } @@ -77,14 +77,14 @@ struct RunWorkElement struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; template struct RunWorkElement { - __device__ __attribute__((noinline)) void run(ncclWorkElem *args) { + __device__ __forceinline__ void run(ncclWorkElem *args) { runRing(args); } }; diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index c6f082f859..ef5b26aeae 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -25,20 +25,20 @@ struct RunWork { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) if (isNpKitThread) { - uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp; + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif - if (args->peer == ncclShmem->comm.rank) { + if (args->peer == ncclShmem.comm.rank) { struct ncclWorkElemP2p* recvArgs = args-1; void* recvBuff = reinterpret_cast(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32); if (buff != recvBuff) { @@ -46,14 +46,14 @@ struct RunWork { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -62,14 +62,14 @@ struct RunWork { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -90,7 +90,7 @@ struct RunWork { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -105,7 +105,7 @@ struct RunWork { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif @@ -120,20 +120,20 @@ struct RunWork { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) if (isNpKitThread) { - uint64_t* cpuTimestamp = ncclShmem->comm.cpuTimestamp; + uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp; NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif - if (args->peer != ncclShmem->comm.rank) { + if (args->peer != ncclShmem.comm.rank) { using Proto = ProtoSimple<1, 1>; void* buff = reinterpret_cast(uintptr_t(args->buffHi32)<<32 | args->buffLo32); ssize_t count = reinterpret_cast(size_t(args->countHi32)<<32 | args->countLo32); @@ -151,7 +151,7 @@ struct RunWork { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); prims.npKitDataProcessTotalTime = 0; } #endif @@ -166,14 +166,14 @@ struct RunWork { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT) if (isNpKitThread) { NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, __builtin_amdgcn_s_memrealtime(), - ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx); + ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx); } #endif } } - __device__ __forceinline__ void run(ncclWork *work) { + __device__ __attribute__((noinline)) void run(ncclWork *work) { struct ncclWorkElemP2p* args = work->p2pElems; int ngroups = args->ngroups; int tid = threadIdx.x;