Remove 'warpSize' compiler constant as it is deprecated in ROCm 7.0. (#1720)

* Remove 'warpSize' compiler constant as it is deprecated in ROCm 7.0.

* Create ncclShmemScratchWarpSize on host side for enqueue.cc.

* Update src/enqueue.cc

Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com>

* address comments

* fix number of threads

---------

Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com>
This commit is contained in:
Arm Patinyasakdikul
2025-06-06 07:34:43 -05:00
committed by GitHub
parent d5b5f6b159
commit ec6efa9b26
2 changed files with 63 additions and 18 deletions
+44 -13
View File
@@ -52,6 +52,31 @@ static ncclKernelMatch const ncclKerns[3] = {
};
#endif
static int rcclProtoGrainSize(int proto, ncclComm *comm){
switch (proto) {
case NCCL_PROTO_LL: return 16;
case NCCL_PROTO_LL128: return comm->WarpSize*(NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS)*NCCL_LL128_DATAELEMS*sizeof(uint64_t);
case NCCL_PROTO_SIMPLE: return 512;
default: return -1;
}
}
/* Copy of ncclShmemScratchWarpSize */
constexpr int rcclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH, int WarpSize = 32) {
return (max_constexpr<int>(
/*LL */0,
/*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WarpSize)*sizeof(uint64_t),
/*SIMPLE*/(ncclCollUnroll(cudaArch)*WarpSize + 1)*16,
// NVLS needs an extra 16B to read unaligned data.
/*NVLS */WarpSize*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
) + 15) & -16; // pad to 16 bytes
}
/* Copy of ncclShmemDynamicSize */
constexpr int rcclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH, int WarpSize = 32) {
return cudaArch < 700 ? 0 : rcclShmemScratchWarpSize(cudaArch, WarpSize)*(NCCL_MAX_NTHREADS/WarpSize);
}
NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
// Returns maximum kernel stack size of all CUDA kernels
@@ -62,7 +87,12 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
if (maxStackSize) *maxStackSize = 0;
int carveout = ncclParamL1SharedMemoryCarveout();
int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch);
int WarpSize = -1;
int cudaDev = -1;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, cudaDev));
int ncclMaxSharedMem = rcclShmemDynamicSize(cudaArch, WarpSize);
for (int k=0; k < KernelCount; k++) {
void* fn = ncclKerns[k].kernelFn;
@@ -197,7 +227,7 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) {
size_t workBytes = plan->workBytes;
size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MIN_NTHREADS);
plan->threadPerBlock = std::max(plan->threadPerBlock, 256 /*NCCL_MIN_NTHREADS*/);
// If we can fit everything into the kernel args we do so.
if (sizeof(ncclDevKernelArgs) + batchBytes + workBytes <= comm->workArgsBytes) {
@@ -668,7 +698,8 @@ static ncclResult_t scheduleCollTasksToPlan(
}
uint32_t chunkSize, directFlags=0;
size_t grainSize = ncclProtoGrainSize(task->protocol);
size_t grainSize = rcclProtoGrainSize(task->protocol, comm);
if (countLo != 0) {
NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countLo, &chunkSize, &directFlags, &proxyOpLo));
devWork->cbd.chunkGrainsLo = chunkSize/grainSize;
@@ -750,7 +781,7 @@ static ncclResult_t scheduleCollTasksToPlan(
plan->channelMask.masks[maskIdx] |= (1ull<<relativeIdx);
}
//plan->channelMask.masks[channelId/64] |= (2ull<<devWork->channelHi) - (1ull<<devWork->channelLo);
plan->threadPerBlock = std::max(plan->threadPerBlock, 3*plan->comm->WarpSize);
plan->threadPerBlock = std::max(plan->threadPerBlock, 192 /* 3*WARP_SIZE */);
if (!plan->kernelSpecialized) {
plan->kernelFn = ncclKerns[ncclGetKernelIndex(comm)].kernelFn;
plan->kernelSpecialized = ncclKerns[ncclGetKernelIndex(comm)].specialized;
@@ -775,9 +806,9 @@ static ncclResult_t scheduleCollTasksToPlan(
ncclProtoToString(task->protocol),
(long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi,
(long)devWork->cbd.countLo, (long)devWork->cbd.countMid, (long)devWork->cbd.countHi,
int(devWork->cbd.chunkGrainsLo*ncclProtoGrainSize(task->protocol)),
int(devWork->cbd.chunkGrainsMid*ncclProtoGrainSize(task->protocol)),
int(devWork->cbd.chunkGrainsHi*ncclProtoGrainSize(task->protocol)));
int(devWork->cbd.chunkGrainsLo*rcclProtoGrainSize(task->protocol, comm)),
int(devWork->cbd.chunkGrainsMid*rcclProtoGrainSize(task->protocol, comm)),
int(devWork->cbd.chunkGrainsHi*rcclProtoGrainSize(task->protocol), comm));
}
}
@@ -1516,7 +1547,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
void* sym = plan->kernelFn;
dim3 grid = {(unsigned)nChannels, 1, 1};
dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
int smem = ncclShmemDynamicSize(comm->cudaArch);
int smem = rcclShmemDynamicSize(comm->cudaArch, comm->WarpSize);
cudaStream_t launchStream = planner->streams->stream;
void* extra[] = {plan->kernelArgs, &plan->kernelArgsSize};
@@ -1790,11 +1821,11 @@ static ncclResult_t topoGetAlgoInfo(
}
}
if (info->protocol == NCCL_PROTO_SIMPLE) {
if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync
if (info->algorithm == NCCL_ALGO_RING) nt += comm->WarpSize; // Extra warp for sync
// More threads or sync warps needed due to split thread model
if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_TREE) nt += 4*comm->WarpSize;
}
nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
nt = nt/comm->WarpSize < 3 ? 3*comm->WarpSize : nt;
#endif
if (info->func == ncclFuncAllReduce && comm->topo->pivotA2ANumBiRings == 3) {
static int userTuneInput = -2;
@@ -1826,7 +1857,7 @@ static ncclResult_t topoGetAlgoInfo(
}
if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always.
if (info->algorithm == NCCL_ALGO_PAT) nt = NCCL_MAX_NTHREADS;
info->nWarps = nt/WARP_SIZE;
info->nWarps = nt/comm->WarpSize;
return ncclSuccess;
}
@@ -1871,7 +1902,7 @@ static ncclResult_t calcCollChunking(
/*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp
) {
ncclPattern_t pattern;
size_t grainSize = ncclProtoGrainSize(info->protocol);
size_t grainSize = rcclProtoGrainSize(info->protocol, comm);
switch (info->func) {
case ncclFuncBroadcast:
+19 -5
View File
@@ -67,7 +67,21 @@ union ncclLLFifoLine {
int4 i4;
};
#define WARP_SIZE warpSize
#if __HIP_DEVICE_COMPILE__
#if defined(__GFX9__)
#define WARP_SIZE 64
#else
#define WARP_SIZE 32
#endif
#else
/* IMPORTANT:
* WARP_SIZE should NEVER be referenced by host code in RCCL. It is defined here
* solely as a workaround to allow RCCL to compile, since the host still compiles __device__ functions,
* and WARP_SIZE needs to be defined. These __device__ functions will not be called from the host.
* The host warp size is handled in src/enqueue.cc by calling hipDeviceGetAttributes(). */
#define WARP_SIZE 32
#endif
#define MAXCHANNELS 128
#define CHANNEL_LIMIT 16
#define NCCL_MAX_LOCAL_RANKS 72
@@ -316,7 +330,7 @@ struct alignas(16) ncclDevWorkColl {
};
__host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
__device__ constexpr int ncclProtoGrainSize(int proto) {
return proto == NCCL_PROTO_LL ? 16 :
proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) :
proto == NCCL_PROTO_SIMPLE ? 512 :
@@ -324,7 +338,7 @@ __host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
}
template<typename Int>
__host__ __device__ inline void ncclCollCbdPart(
__device__ inline void ncclCollCbdPart(
struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize,
Int* count, Int* partOffset, Int* partCount, Int* chunkCount
) {
@@ -618,7 +632,7 @@ __host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch =
}
// The amount of dynamic shmem per warp
__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
__device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
return (max_constexpr<int>(
/*LL */0,
/*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
@@ -629,7 +643,7 @@ __host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_C
}
// The amount of dynamic shmem per block
__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
__device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
}