Remove 'warpSize' compiler constant as it is deprecated in ROCm 7.0. (#1720)

* Remove 'warpSize' compiler constant as it is deprecated in ROCm 7.0. * Create ncclShmemScratchWarpSize on host side for enqueue.cc. * Update src/enqueue.cc Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com> * address comments * fix number of threads --------- Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com>
2025-06-06 07:34:43 -05:00
parent d5b5f6b159
commit ec6efa9b26
2 changed files with 63 additions and 18 deletions
@@ -52,6 +52,31 @@ static ncclKernelMatch const ncclKerns[3] = {
 };
 #endif

+static int rcclProtoGrainSize(int proto, ncclComm *comm){
+  switch (proto) {
+    case NCCL_PROTO_LL: return 16;
+    case NCCL_PROTO_LL128: return comm->WarpSize*(NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS)*NCCL_LL128_DATAELEMS*sizeof(uint64_t);
+    case NCCL_PROTO_SIMPLE: return 512;
+    default: return -1;
+  }
+}
+
+/* Copy of ncclShmemScratchWarpSize */
+constexpr int rcclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH, int WarpSize = 32) {
+  return (max_constexpr<int>(
+      /*LL    */0,
+      /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WarpSize)*sizeof(uint64_t),
+      /*SIMPLE*/(ncclCollUnroll(cudaArch)*WarpSize + 1)*16,
+      // NVLS needs an extra 16B to read unaligned data.
+      /*NVLS  */WarpSize*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
+    ) + 15) & -16; // pad to 16 bytes
+}
+
+/* Copy of ncclShmemDynamicSize */
+constexpr int rcclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH, int WarpSize = 32) {
+  return cudaArch < 700 ? 0 : rcclShmemScratchWarpSize(cudaArch, WarpSize)*(NCCL_MAX_NTHREADS/WarpSize);
+}
+
 NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);

 // Returns maximum kernel stack size of all CUDA kernels
@@ -62,7 +87,12 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma

  if (maxStackSize) *maxStackSize = 0;
  int carveout = ncclParamL1SharedMemoryCarveout();
-  int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch);
+
+  int WarpSize = -1;
+  int cudaDev = -1;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, cudaDev));
+  int ncclMaxSharedMem = rcclShmemDynamicSize(cudaArch, WarpSize);

  for (int k=0; k < KernelCount; k++) {
    void* fn = ncclKerns[k].kernelFn;
@@ -197,7 +227,7 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  size_t workBytes = plan->workBytes;
  size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);

-  plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MIN_NTHREADS);
+  plan->threadPerBlock = std::max(plan->threadPerBlock, 256 /*NCCL_MIN_NTHREADS*/);

  // If we can fit everything into the kernel args we do so.
  if (sizeof(ncclDevKernelArgs) + batchBytes + workBytes <= comm->workArgsBytes) {
@@ -668,7 +698,8 @@ static ncclResult_t scheduleCollTasksToPlan(
      }

      uint32_t chunkSize, directFlags=0;
-      size_t grainSize = ncclProtoGrainSize(task->protocol);
+      size_t grainSize = rcclProtoGrainSize(task->protocol, comm);
+
      if (countLo != 0) {
        NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countLo, &chunkSize, &directFlags, &proxyOpLo));
        devWork->cbd.chunkGrainsLo = chunkSize/grainSize;
@@ -750,7 +781,7 @@ static ncclResult_t scheduleCollTasksToPlan(
      plan->channelMask.masks[maskIdx] |= (1ull<<relativeIdx);
    }
    //plan->channelMask.masks[channelId/64] |= (2ull<<devWork->channelHi) - (1ull<<devWork->channelLo);
-    plan->threadPerBlock = std::max(plan->threadPerBlock, 3*plan->comm->WarpSize);
+    plan->threadPerBlock = std::max(plan->threadPerBlock, 192 /* 3*WARP_SIZE */);
    if (!plan->kernelSpecialized) {
      plan->kernelFn = ncclKerns[ncclGetKernelIndex(comm)].kernelFn;
      plan->kernelSpecialized = ncclKerns[ncclGetKernelIndex(comm)].specialized;
@@ -775,9 +806,9 @@ static ncclResult_t scheduleCollTasksToPlan(
          ncclProtoToString(task->protocol),
          (long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi,
          (long)devWork->cbd.countLo, (long)devWork->cbd.countMid, (long)devWork->cbd.countHi,
-          int(devWork->cbd.chunkGrainsLo*ncclProtoGrainSize(task->protocol)),
-          int(devWork->cbd.chunkGrainsMid*ncclProtoGrainSize(task->protocol)),
-          int(devWork->cbd.chunkGrainsHi*ncclProtoGrainSize(task->protocol)));
+          int(devWork->cbd.chunkGrainsLo*rcclProtoGrainSize(task->protocol, comm)),
+          int(devWork->cbd.chunkGrainsMid*rcclProtoGrainSize(task->protocol, comm)),
+          int(devWork->cbd.chunkGrainsHi*rcclProtoGrainSize(task->protocol), comm));
      }
    }

@@ -1516,7 +1547,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
  void* sym = plan->kernelFn;
  dim3 grid = {(unsigned)nChannels, 1, 1};
  dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
-  int smem = ncclShmemDynamicSize(comm->cudaArch);
+  int smem = rcclShmemDynamicSize(comm->cudaArch, comm->WarpSize);
  cudaStream_t launchStream = planner->streams->stream;
  void* extra[] = {plan->kernelArgs, &plan->kernelArgsSize};

@@ -1790,11 +1821,11 @@ static ncclResult_t topoGetAlgoInfo(
    }
  }
  if (info->protocol == NCCL_PROTO_SIMPLE) {
-    if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync
+    if (info->algorithm == NCCL_ALGO_RING) nt += comm->WarpSize; // Extra warp for sync
    // More threads or sync warps needed due to split thread model
-    if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE;
+    if (info->algorithm == NCCL_ALGO_TREE) nt += 4*comm->WarpSize;
  }
-  nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
+  nt = nt/comm->WarpSize < 3 ? 3*comm->WarpSize : nt;
 #endif
  if (info->func == ncclFuncAllReduce && comm->topo->pivotA2ANumBiRings == 3) {
    static int userTuneInput = -2;
@@ -1826,7 +1857,7 @@ static ncclResult_t topoGetAlgoInfo(
  }
  if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always.
  if (info->algorithm == NCCL_ALGO_PAT) nt = NCCL_MAX_NTHREADS;
-  info->nWarps = nt/WARP_SIZE;
+  info->nWarps = nt/comm->WarpSize;
  return ncclSuccess;
 }

@@ -1871,7 +1902,7 @@ static ncclResult_t calcCollChunking(
    /*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp
  ) {
  ncclPattern_t pattern;
-  size_t grainSize = ncclProtoGrainSize(info->protocol);
+  size_t grainSize = rcclProtoGrainSize(info->protocol, comm);

  switch (info->func) {
  case ncclFuncBroadcast:
@@ -67,7 +67,21 @@ union ncclLLFifoLine {
  int4 i4;
 };

-#define WARP_SIZE warpSize
+#if __HIP_DEVICE_COMPILE__
+  #if defined(__GFX9__)
+  #define WARP_SIZE 64
+  #else
+  #define WARP_SIZE 32
+  #endif
+#else
+ /* IMPORTANT:
+  * WARP_SIZE should NEVER be referenced by host code in RCCL. It is defined here
+  * solely as a workaround to allow RCCL to compile, since the host still compiles __device__ functions,
+  * and WARP_SIZE needs to be defined. These __device__ functions will not be called from the host.
+  * The host warp size is handled in src/enqueue.cc by calling hipDeviceGetAttributes(). */
+  #define WARP_SIZE 32
+#endif
+
 #define MAXCHANNELS 128
 #define CHANNEL_LIMIT 16
 #define NCCL_MAX_LOCAL_RANKS 72
@@ -316,7 +330,7 @@ struct alignas(16) ncclDevWorkColl {
 };


-__host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
+__device__ constexpr int ncclProtoGrainSize(int proto) {
  return proto == NCCL_PROTO_LL ? 16 :
        proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) :
        proto == NCCL_PROTO_SIMPLE ? 512 :
@@ -324,7 +338,7 @@ __host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
 }

 template<typename Int>
-__host__ __device__ inline void ncclCollCbdPart(
+__device__ inline void ncclCollCbdPart(
    struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize,
    Int* count, Int* partOffset, Int* partCount, Int* chunkCount
  ) {
@@ -618,7 +632,7 @@ __host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch =
 }

 // The amount of dynamic shmem per warp
-__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
+__device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
  return (max_constexpr<int>(
      /*LL    */0,
      /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
@@ -629,7 +643,7 @@ __host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_C
 }

 // The amount of dynamic shmem per block
-__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
+__device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
  return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
 }