From ec6efa9b265cc5eaa9965aef50bd20abe1e19619 Mon Sep 17 00:00:00 2001 From: Arm Patinyasakdikul Date: Fri, 6 Jun 2025 07:34:43 -0500 Subject: [PATCH] Remove 'warpSize' compiler constant as it is deprecated in ROCm 7.0. (#1720) * Remove 'warpSize' compiler constant as it is deprecated in ROCm 7.0. * Create ncclShmemScratchWarpSize on host side for enqueue.cc. * Update src/enqueue.cc Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com> * address comments * fix number of threads --------- Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com> --- src/enqueue.cc | 57 ++++++++++++++++++++++++++++++++++---------- src/include/device.h | 24 +++++++++++++++---- 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/src/enqueue.cc b/src/enqueue.cc index 522bb18354..b4949e5206 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -52,6 +52,31 @@ static ncclKernelMatch const ncclKerns[3] = { }; #endif +static int rcclProtoGrainSize(int proto, ncclComm *comm){ + switch (proto) { + case NCCL_PROTO_LL: return 16; + case NCCL_PROTO_LL128: return comm->WarpSize*(NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS)*NCCL_LL128_DATAELEMS*sizeof(uint64_t); + case NCCL_PROTO_SIMPLE: return 512; + default: return -1; + } +} + +/* Copy of ncclShmemScratchWarpSize */ +constexpr int rcclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH, int WarpSize = 32) { + return (max_constexpr( + /*LL */0, + /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WarpSize)*sizeof(uint64_t), + /*SIMPLE*/(ncclCollUnroll(cudaArch)*WarpSize + 1)*16, + // NVLS needs an extra 16B to read unaligned data. + /*NVLS */WarpSize*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16 + ) + 15) & -16; // pad to 16 bytes +} + +/* Copy of ncclShmemDynamicSize */ +constexpr int rcclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH, int WarpSize = 32) { + return cudaArch < 700 ? 0 : rcclShmemScratchWarpSize(cudaArch, WarpSize)*(NCCL_MAX_NTHREADS/WarpSize); +} + NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); // Returns maximum kernel stack size of all CUDA kernels @@ -62,7 +87,12 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma if (maxStackSize) *maxStackSize = 0; int carveout = ncclParamL1SharedMemoryCarveout(); - int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); + + int WarpSize = -1; + int cudaDev = -1; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUDACHECK(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, cudaDev)); + int ncclMaxSharedMem = rcclShmemDynamicSize(cudaArch, WarpSize); for (int k=0; k < KernelCount; k++) { void* fn = ncclKerns[k].kernelFn; @@ -197,7 +227,7 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) { size_t workBytes = plan->workBytes; size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); - plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MIN_NTHREADS); + plan->threadPerBlock = std::max(plan->threadPerBlock, 256 /*NCCL_MIN_NTHREADS*/); // If we can fit everything into the kernel args we do so. if (sizeof(ncclDevKernelArgs) + batchBytes + workBytes <= comm->workArgsBytes) { @@ -668,7 +698,8 @@ static ncclResult_t scheduleCollTasksToPlan( } uint32_t chunkSize, directFlags=0; - size_t grainSize = ncclProtoGrainSize(task->protocol); + size_t grainSize = rcclProtoGrainSize(task->protocol, comm); + if (countLo != 0) { NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countLo, &chunkSize, &directFlags, &proxyOpLo)); devWork->cbd.chunkGrainsLo = chunkSize/grainSize; @@ -750,7 +781,7 @@ static ncclResult_t scheduleCollTasksToPlan( plan->channelMask.masks[maskIdx] |= (1ull<channelMask.masks[channelId/64] |= (2ull<channelHi) - (1ull<channelLo); - plan->threadPerBlock = std::max(plan->threadPerBlock, 3*plan->comm->WarpSize); + plan->threadPerBlock = std::max(plan->threadPerBlock, 192 /* 3*WARP_SIZE */); if (!plan->kernelSpecialized) { plan->kernelFn = ncclKerns[ncclGetKernelIndex(comm)].kernelFn; plan->kernelSpecialized = ncclKerns[ncclGetKernelIndex(comm)].specialized; @@ -775,9 +806,9 @@ static ncclResult_t scheduleCollTasksToPlan( ncclProtoToString(task->protocol), (long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi, (long)devWork->cbd.countLo, (long)devWork->cbd.countMid, (long)devWork->cbd.countHi, - int(devWork->cbd.chunkGrainsLo*ncclProtoGrainSize(task->protocol)), - int(devWork->cbd.chunkGrainsMid*ncclProtoGrainSize(task->protocol)), - int(devWork->cbd.chunkGrainsHi*ncclProtoGrainSize(task->protocol))); + int(devWork->cbd.chunkGrainsLo*rcclProtoGrainSize(task->protocol, comm)), + int(devWork->cbd.chunkGrainsMid*rcclProtoGrainSize(task->protocol, comm)), + int(devWork->cbd.chunkGrainsHi*rcclProtoGrainSize(task->protocol), comm)); } } @@ -1516,7 +1547,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan void* sym = plan->kernelFn; dim3 grid = {(unsigned)nChannels, 1, 1}; dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; - int smem = ncclShmemDynamicSize(comm->cudaArch); + int smem = rcclShmemDynamicSize(comm->cudaArch, comm->WarpSize); cudaStream_t launchStream = planner->streams->stream; void* extra[] = {plan->kernelArgs, &plan->kernelArgsSize}; @@ -1790,11 +1821,11 @@ static ncclResult_t topoGetAlgoInfo( } } if (info->protocol == NCCL_PROTO_SIMPLE) { - if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync + if (info->algorithm == NCCL_ALGO_RING) nt += comm->WarpSize; // Extra warp for sync // More threads or sync warps needed due to split thread model - if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE; + if (info->algorithm == NCCL_ALGO_TREE) nt += 4*comm->WarpSize; } - nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt; + nt = nt/comm->WarpSize < 3 ? 3*comm->WarpSize : nt; #endif if (info->func == ncclFuncAllReduce && comm->topo->pivotA2ANumBiRings == 3) { static int userTuneInput = -2; @@ -1826,7 +1857,7 @@ static ncclResult_t topoGetAlgoInfo( } if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always. if (info->algorithm == NCCL_ALGO_PAT) nt = NCCL_MAX_NTHREADS; - info->nWarps = nt/WARP_SIZE; + info->nWarps = nt/comm->WarpSize; return ncclSuccess; } @@ -1871,7 +1902,7 @@ static ncclResult_t calcCollChunking( /*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp ) { ncclPattern_t pattern; - size_t grainSize = ncclProtoGrainSize(info->protocol); + size_t grainSize = rcclProtoGrainSize(info->protocol, comm); switch (info->func) { case ncclFuncBroadcast: diff --git a/src/include/device.h b/src/include/device.h index a240df6e2d..d9141d58f3 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -67,7 +67,21 @@ union ncclLLFifoLine { int4 i4; }; -#define WARP_SIZE warpSize +#if __HIP_DEVICE_COMPILE__ + #if defined(__GFX9__) + #define WARP_SIZE 64 + #else + #define WARP_SIZE 32 + #endif +#else + /* IMPORTANT: + * WARP_SIZE should NEVER be referenced by host code in RCCL. It is defined here + * solely as a workaround to allow RCCL to compile, since the host still compiles __device__ functions, + * and WARP_SIZE needs to be defined. These __device__ functions will not be called from the host. + * The host warp size is handled in src/enqueue.cc by calling hipDeviceGetAttributes(). */ + #define WARP_SIZE 32 +#endif + #define MAXCHANNELS 128 #define CHANNEL_LIMIT 16 #define NCCL_MAX_LOCAL_RANKS 72 @@ -316,7 +330,7 @@ struct alignas(16) ncclDevWorkColl { }; -__host__ __device__ constexpr int ncclProtoGrainSize(int proto) { +__device__ constexpr int ncclProtoGrainSize(int proto) { return proto == NCCL_PROTO_LL ? 16 : proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) : proto == NCCL_PROTO_SIMPLE ? 512 : @@ -324,7 +338,7 @@ __host__ __device__ constexpr int ncclProtoGrainSize(int proto) { } template -__host__ __device__ inline void ncclCollCbdPart( +__device__ inline void ncclCollCbdPart( struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize, Int* count, Int* partOffset, Int* partCount, Int* chunkCount ) { @@ -618,7 +632,7 @@ __host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = } // The amount of dynamic shmem per warp -__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) { +__device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) { return (max_constexpr( /*LL */0, /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t), @@ -629,7 +643,7 @@ __host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_C } // The amount of dynamic shmem per block -__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) { +__device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) { return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE); }