From ec6efa9b265cc5eaa9965aef50bd20abe1e19619 Mon Sep 17 00:00:00 2001
From: Arm Patinyasakdikul <arm.patinyasakdikul@amd.com>
Date: Fri, 6 Jun 2025 07:34:43 -0500
Subject: [PATCH] Remove 'warpSize' compiler constant as it is deprecated in
 ROCm 7.0. (#1720)

* Remove 'warpSize' compiler constant as it is deprecated in ROCm 7.0.

* Create ncclShmemScratchWarpSize on host side for enqueue.cc.

* Update src/enqueue.cc

Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com>

* address comments

* fix number of threads

---------

Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com>
---
 src/enqueue.cc       | 57 ++++++++++++++++++++++++++++++++++----------
 src/include/device.h | 24 +++++++++++++++----
 2 files changed, 63 insertions(+), 18 deletions(-)
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 522bb18354..b4949e5206 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -52,6 +52,31 @@ static ncclKernelMatch const ncclKerns[3] = {
 };
 #endif
 
+static int rcclProtoGrainSize(int proto, ncclComm *comm){
+  switch (proto) {
+    case NCCL_PROTO_LL: return 16;
+    case NCCL_PROTO_LL128: return comm->WarpSize*(NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS)*NCCL_LL128_DATAELEMS*sizeof(uint64_t);
+    case NCCL_PROTO_SIMPLE: return 512;
+    default: return -1;
+  }
+}
+
+/* Copy of ncclShmemScratchWarpSize */
+constexpr int rcclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH, int WarpSize = 32) {
+  return (max_constexpr<int>(
+      /*LL    */0,
+      /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WarpSize)*sizeof(uint64_t),
+      /*SIMPLE*/(ncclCollUnroll(cudaArch)*WarpSize + 1)*16,
+      // NVLS needs an extra 16B to read unaligned data.
+      /*NVLS  */WarpSize*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
+    ) + 15) & -16; // pad to 16 bytes
+}
+
+/* Copy of ncclShmemDynamicSize */
+constexpr int rcclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH, int WarpSize = 32) {
+  return cudaArch < 700 ? 0 : rcclShmemScratchWarpSize(cudaArch, WarpSize)*(NCCL_MAX_NTHREADS/WarpSize);
+}
+
 NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
 
 // Returns maximum kernel stack size of all CUDA kernels
@@ -62,7 +87,12 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
 
   if (maxStackSize) *maxStackSize = 0;
   int carveout = ncclParamL1SharedMemoryCarveout();
-  int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch);
+
+  int WarpSize = -1;
+  int cudaDev = -1;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, cudaDev));
+  int ncclMaxSharedMem = rcclShmemDynamicSize(cudaArch, WarpSize);
 
   for (int k=0; k < KernelCount; k++) {
     void* fn = ncclKerns[k].kernelFn;
@@ -197,7 +227,7 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) {
   size_t workBytes = plan->workBytes;
   size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch);
 
-  plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MIN_NTHREADS);
+  plan->threadPerBlock = std::max(plan->threadPerBlock, 256 /*NCCL_MIN_NTHREADS*/);
 
   // If we can fit everything into the kernel args we do so.
   if (sizeof(ncclDevKernelArgs) + batchBytes + workBytes <= comm->workArgsBytes) {
@@ -668,7 +698,8 @@ static ncclResult_t scheduleCollTasksToPlan(
       }
 
       uint32_t chunkSize, directFlags=0;
-      size_t grainSize = ncclProtoGrainSize(task->protocol);
+      size_t grainSize = rcclProtoGrainSize(task->protocol, comm);
+
       if (countLo != 0) {
         NCCLCHECK(calcCollChunking(comm, task, /*nChannels=*/1, globalBytesPerElement*countLo, &chunkSize, &directFlags, &proxyOpLo));
         devWork->cbd.chunkGrainsLo = chunkSize/grainSize;
@@ -750,7 +781,7 @@ static ncclResult_t scheduleCollTasksToPlan(
       plan->channelMask.masks[maskIdx] |= (1ull<<relativeIdx);
     }
     //plan->channelMask.masks[channelId/64] |= (2ull<<devWork->channelHi) - (1ull<<devWork->channelLo);
-    plan->threadPerBlock = std::max(plan->threadPerBlock, 3*plan->comm->WarpSize);
+    plan->threadPerBlock = std::max(plan->threadPerBlock, 192 /* 3*WARP_SIZE */);
     if (!plan->kernelSpecialized) {
       plan->kernelFn = ncclKerns[ncclGetKernelIndex(comm)].kernelFn;
       plan->kernelSpecialized = ncclKerns[ncclGetKernelIndex(comm)].specialized;
@@ -775,9 +806,9 @@ static ncclResult_t scheduleCollTasksToPlan(
           ncclProtoToString(task->protocol),
           (long)task->count, task->devFuncId, devWork->channelLo, devWork->channelHi,
           (long)devWork->cbd.countLo, (long)devWork->cbd.countMid, (long)devWork->cbd.countHi,
-          int(devWork->cbd.chunkGrainsLo*ncclProtoGrainSize(task->protocol)),
-          int(devWork->cbd.chunkGrainsMid*ncclProtoGrainSize(task->protocol)),
-          int(devWork->cbd.chunkGrainsHi*ncclProtoGrainSize(task->protocol)));
+          int(devWork->cbd.chunkGrainsLo*rcclProtoGrainSize(task->protocol, comm)),
+          int(devWork->cbd.chunkGrainsMid*rcclProtoGrainSize(task->protocol, comm)),
+          int(devWork->cbd.chunkGrainsHi*rcclProtoGrainSize(task->protocol), comm));
       }
     }
 
@@ -1516,7 +1547,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
   void* sym = plan->kernelFn;
   dim3 grid = {(unsigned)nChannels, 1, 1};
   dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
-  int smem = ncclShmemDynamicSize(comm->cudaArch);
+  int smem = rcclShmemDynamicSize(comm->cudaArch, comm->WarpSize);
   cudaStream_t launchStream = planner->streams->stream;
   void* extra[] = {plan->kernelArgs, &plan->kernelArgsSize};
 
@@ -1790,11 +1821,11 @@ static ncclResult_t topoGetAlgoInfo(
     }
   }
   if (info->protocol == NCCL_PROTO_SIMPLE) {
-    if (info->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync
+    if (info->algorithm == NCCL_ALGO_RING) nt += comm->WarpSize; // Extra warp for sync
     // More threads or sync warps needed due to split thread model
-    if (info->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE;
+    if (info->algorithm == NCCL_ALGO_TREE) nt += 4*comm->WarpSize;
   }
-  nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
+  nt = nt/comm->WarpSize < 3 ? 3*comm->WarpSize : nt;
 #endif
   if (info->func == ncclFuncAllReduce && comm->topo->pivotA2ANumBiRings == 3) {
     static int userTuneInput = -2;
@@ -1826,7 +1857,7 @@ static ncclResult_t topoGetAlgoInfo(
   }
   if (info->algorithm == NCCL_ALGO_TREE) nt = NCCL_MAX_NTHREADS; // Tree now uses all threads always.
   if (info->algorithm == NCCL_ALGO_PAT) nt = NCCL_MAX_NTHREADS;
-  info->nWarps = nt/WARP_SIZE;
+  info->nWarps = nt/comm->WarpSize;
   return ncclSuccess;
 }
 
@@ -1871,7 +1902,7 @@ static ncclResult_t calcCollChunking(
     /*outputs*/uint32_t* outChunkSize, uint32_t* outDirectFlags, struct ncclProxyOp* proxyOp
   ) {
   ncclPattern_t pattern;
-  size_t grainSize = ncclProtoGrainSize(info->protocol);
+  size_t grainSize = rcclProtoGrainSize(info->protocol, comm);
 
   switch (info->func) {
   case ncclFuncBroadcast:
diff --git a/src/include/device.h b/src/include/device.h
index a240df6e2d..d9141d58f3 100644
--- a/src/include/device.h
+++ b/src/include/device.h
@@ -67,7 +67,21 @@ union ncclLLFifoLine {
   int4 i4;
 };
 
-#define WARP_SIZE warpSize
+#if __HIP_DEVICE_COMPILE__
+  #if defined(__GFX9__)
+  #define WARP_SIZE 64
+  #else
+  #define WARP_SIZE 32
+  #endif
+#else
+ /* IMPORTANT:
+  * WARP_SIZE should NEVER be referenced by host code in RCCL. It is defined here
+  * solely as a workaround to allow RCCL to compile, since the host still compiles __device__ functions,
+  * and WARP_SIZE needs to be defined. These __device__ functions will not be called from the host.
+  * The host warp size is handled in src/enqueue.cc by calling hipDeviceGetAttributes(). */
+  #define WARP_SIZE 32
+#endif
+
 #define MAXCHANNELS 128
 #define CHANNEL_LIMIT 16
 #define NCCL_MAX_LOCAL_RANKS 72
@@ -316,7 +330,7 @@ struct alignas(16) ncclDevWorkColl {
 };
 
 
-__host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
+__device__ constexpr int ncclProtoGrainSize(int proto) {
   return proto == NCCL_PROTO_LL ? 16 :
         proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) :
         proto == NCCL_PROTO_SIMPLE ? 512 :
@@ -324,7 +338,7 @@ __host__ __device__ constexpr int ncclProtoGrainSize(int proto) {
 }
 
 template<typename Int>
-__host__ __device__ inline void ncclCollCbdPart(
+__device__ inline void ncclCollCbdPart(
     struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize,
     Int* count, Int* partOffset, Int* partCount, Int* chunkCount
   ) {
@@ -618,7 +632,7 @@ __host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch =
 }
 
 // The amount of dynamic shmem per warp
-__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
+__device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
   return (max_constexpr<int>(
       /*LL    */0,
       /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
@@ -629,7 +643,7 @@ __host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_C
 }
 
 // The amount of dynamic shmem per block
-__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
+__device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
   return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
 }