Remove 'warpSize' compiler constant as it is deprecated in ROCm 7.0. (#1720)
* Remove 'warpSize' compiler constant as it is deprecated in ROCm 7.0. * Create ncclShmemScratchWarpSize on host side for enqueue.cc. * Update src/enqueue.cc Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com> * address comments * fix number of threads --------- Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
d5b5f6b159
commit
ec6efa9b26
+44
-13
@@ -52,6 +52,31 @@ static ncclKernelMatch const ncclKerns[3] = {
|
||||
};
|
||||
#endif
|
||||
|
||||
static int rcclProtoGrainSize(int proto, ncclComm *comm){
|
||||
switch (proto) {
|
||||
case NCCL_PROTO_LL: return 16;
|
||||
case NCCL_PROTO_LL128: return comm->WarpSize*(NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS)*NCCL_LL128_DATAELEMS*sizeof(uint64_t);
|
||||
case NCCL_PROTO_SIMPLE: return 512;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Copy of ncclShmemScratchWarpSize */
|
||||
constexpr int rcclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH, int WarpSize = 32) {
|
||||
return (max_constexpr<int>(
|
||||
/*LL */0,
|
||||
/*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WarpSize)*sizeof(uint64_t),
|
||||
/*SIMPLE*/(ncclCollUnroll(cudaArch)*WarpSize + 1)*16,
|
||||
// NVLS needs an extra 16B to read unaligned data.
|
||||
/*NVLS */WarpSize*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
|
||||
) + 15) & -16; // pad to 16 bytes
|
||||
}
|
||||
|
||||
/* Copy of ncclShmemDynamicSize */
|
||||
constexpr int rcclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH, int WarpSize = 32) {
|
||||
return cudaArch < 700 ? 0 : rcclShmemScratchWarpSize(cudaArch, WarpSize)*(NCCL_MAX_NTHREADS/WarpSize);
|
||||
}
|
||||
|
||||
NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
|
||||
|
||||
// Returns maximum kernel stack size of all CUDA kernels
|
||||
@@ -62,7 +87,12 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
|
||||
|
||||
if (maxStackSize) *maxStackSize = 0;
|
||||
int carveout = ncclParamL1SharedMemoryCarveout();
|
||||
int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch);
|
||||
|
||||
int WarpSize = -1;
|
||||
int cudaDev = -1;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUDACHECK(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, cudaDev));
|
||||
int ncclMaxSharedMem = rcclShmemDynamicSize(cudaArch, WarpSize);
|
||||
|
||||
for (int k=0; k < KernelCount; k++) {
|
||||
void* fn = ncclKerns[k].kernelFn;
|
||||
@@ -197,7 +227,7 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) {
|
||||
size_t workBytes = plan->workBytes;
|
||||
size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
|
||||
|
||||
plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MIN_NTHREADS);
|
||||
plan->threadPerBlock = std::max(plan->threadPerBlock, 256 /*NCCL_MIN_NTHREADS*/);
|
||||
|
||||
// If we can fit everything into the kernel args we do so.
|
||||
if (sizeof(ncclDevKernelArgs) + batchBytes + workBytes <= comm->workArgsBytes) {
|
||||
@@ -668,7 +698,8 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
}
|
||||
|
||||
uint32_t chunkSize, directFlags=0;
|
||||
size_t grainSize = ncclProtoGrainSize(task->protocol);
|
||||
size_t grainSize = rcclProtoGrainSize(task->protocol, comm);
|
||||
|
||||
if (countLo != 0) {
|
||||
NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countLo, &chunkSize, &directFlags, &proxyOpLo));
|
||||
devWork->cbd.chunkGrainsLo = chunkSize/grainSize;
|
||||
@@ -750,7 +781,7 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
plan->channelMask.masks[maskIdx] |= (1ull<<relativeIdx);
|
||||
}
|
||||
//plan->channelMask.masks[channelId/64] |= (2ull<<devWork->channelHi) - (1ull<<devWork->channelLo);
|
||||
plan->threadPerBlock = std::max(plan->threadPerBlock, 3*plan->comm->WarpSize);
|
||||
plan->threadPerBlock = std::max(plan->threadPerBlock, 192 /* 3*WARP_SIZE */);
|
||||
if (!plan->kernelSpecialized) {
|
||||
plan->kernelFn = ncclKerns[ncclGetKernelIndex(comm)].kernelFn;
|
||||
plan->kernelSpecialized = ncclKerns[ncclGetKernelIndex(comm)].specialized;
|
||||
@@ -775,9 +806,9 @@ static ncclResult_t scheduleCollTasksToPlan(
|
||||
ncclProtoToString(task->protocol),
|
||||
(long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi,
|
||||
(long)devWork->cbd.countLo, (long)devWork->cbd.countMid, (long)devWork->cbd.countHi,
|
||||
int(devWork->cbd.chunkGrainsLo*ncclProtoGrainSize(task->protocol)),
|
||||
int(devWork->cbd.chunkGrainsMid*ncclProtoGrainSize(task->protocol)),
|
||||
int(devWork->cbd.chunkGrainsHi*ncclProtoGrainSize(task->protocol)));
|
||||
int(devWork->cbd.chunkGrainsLo*rcclProtoGrainSize(task->protocol, comm)),
|
||||
int(devWork->cbd.chunkGrainsMid*rcclProtoGrainSize(task->protocol, comm)),
|
||||
int(devWork->cbd.chunkGrainsHi*rcclProtoGrainSize(task->protocol), comm));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1516,7 +1547,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
|
||||
void* sym = plan->kernelFn;
|
||||
dim3 grid = {(unsigned)nChannels, 1, 1};
|
||||
dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
|
||||
int smem = ncclShmemDynamicSize(comm->cudaArch);
|
||||
int smem = rcclShmemDynamicSize(comm->cudaArch, comm->WarpSize);
|
||||
cudaStream_t launchStream = planner->streams->stream;
|
||||
void* extra[] = {plan->kernelArgs, &plan->kernelArgsSize};
|
||||
|
||||
@@ -1790,11 +1821,11 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
}
|
||||
}
|
||||
if (info->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync
|
||||
if (info->algorithm == NCCL_ALGO_RING) nt += comm->WarpSize; // Extra warp for sync
|
||||
// More threads or sync warps needed due to split thread model
|
||||
if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE;
|
||||
if (info->algorithm == NCCL_ALGO_TREE) nt += 4*comm->WarpSize;
|
||||
}
|
||||
nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
|
||||
nt = nt/comm->WarpSize < 3 ? 3*comm->WarpSize : nt;
|
||||
#endif
|
||||
if (info->func == ncclFuncAllReduce && comm->topo->pivotA2ANumBiRings == 3) {
|
||||
static int userTuneInput = -2;
|
||||
@@ -1826,7 +1857,7 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
}
|
||||
if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always.
|
||||
if (info->algorithm == NCCL_ALGO_PAT) nt = NCCL_MAX_NTHREADS;
|
||||
info->nWarps = nt/WARP_SIZE;
|
||||
info->nWarps = nt/comm->WarpSize;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -1871,7 +1902,7 @@ static ncclResult_t calcCollChunking(
|
||||
/*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp
|
||||
) {
|
||||
ncclPattern_t pattern;
|
||||
size_t grainSize = ncclProtoGrainSize(info->protocol);
|
||||
size_t grainSize = rcclProtoGrainSize(info->protocol, comm);
|
||||
|
||||
switch (info->func) {
|
||||
case ncclFuncBroadcast:
|
||||
|
||||
+19
-5
@@ -67,7 +67,21 @@ union ncclLLFifoLine {
|
||||
int4 i4;
|
||||
};
|
||||
|
||||
#define WARP_SIZE warpSize
|
||||
#if __HIP_DEVICE_COMPILE__
|
||||
#if defined(__GFX9__)
|
||||
#define WARP_SIZE 64
|
||||
#else
|
||||
#define WARP_SIZE 32
|
||||
#endif
|
||||
#else
|
||||
/* IMPORTANT:
|
||||
* WARP_SIZE should NEVER be referenced by host code in RCCL. It is defined here
|
||||
* solely as a workaround to allow RCCL to compile, since the host still compiles __device__ functions,
|
||||
* and WARP_SIZE needs to be defined. These __device__ functions will not be called from the host.
|
||||
* The host warp size is handled in src/enqueue.cc by calling hipDeviceGetAttributes(). */
|
||||
#define WARP_SIZE 32
|
||||
#endif
|
||||
|
||||
#define MAXCHANNELS 128
|
||||
#define CHANNEL_LIMIT 16
|
||||
#define NCCL_MAX_LOCAL_RANKS 72
|
||||
@@ -316,7 +330,7 @@ struct alignas(16) ncclDevWorkColl {
|
||||
};
|
||||
|
||||
|
||||
__host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
|
||||
__device__ constexpr int ncclProtoGrainSize(int proto) {
|
||||
return proto == NCCL_PROTO_LL ? 16 :
|
||||
proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) :
|
||||
proto == NCCL_PROTO_SIMPLE ? 512 :
|
||||
@@ -324,7 +338,7 @@ __host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
|
||||
}
|
||||
|
||||
template<typename Int>
|
||||
__host__ __device__ inline void ncclCollCbdPart(
|
||||
__device__ inline void ncclCollCbdPart(
|
||||
struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize,
|
||||
Int* count, Int* partOffset, Int* partCount, Int* chunkCount
|
||||
) {
|
||||
@@ -618,7 +632,7 @@ __host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch =
|
||||
}
|
||||
|
||||
// The amount of dynamic shmem per warp
|
||||
__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
|
||||
__device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
|
||||
return (max_constexpr<int>(
|
||||
/*LL */0,
|
||||
/*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
|
||||
@@ -629,7 +643,7 @@ __host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_C
|
||||
}
|
||||
|
||||
// The amount of dynamic shmem per block
|
||||
__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
|
||||
__device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
|
||||
return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user