Re-apply unroll=1 and 112 channels for gfx950 (#1706)

* Reapply "[SRC] Enable unroll=1 for gfx950 (#1602)" (#1667) This reverts commit a6972c0d09. * Reapply "[GRAPH] Increase default nChannels to 112 for gfx950 (#1596)" (#1620) This reverts commit 1a2eca1756. [ROCm/rccl commit: 12517a957e]
2025-05-28 14:58:10 -05:00
@@ -5,12 +5,15 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 ## Unreleased - RCCL 2.25.1 for ROCm 7.0.0

 ### Resolved issues
+
 * Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call.
 * Fixed unit test failures in tests ending with `ManagedMem` and `ManagedMemGraph` suffixes.

 ### Added

 * Added new GPU target `gfx950`.
+* Added support for `unroll=1` in device-code generation to improve performance
+* Set a default of 112 channels for a single node with `8 * gfx950`
 * Added MSCCL support for multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the
  environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.

@@ -17,14 +17,20 @@ struct RunWorkNop {
  __device__ void run() {}
 };

-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
+  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args);
+}
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args);
 }
 __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args);
 }
 #ifdef ENABLE_COLLTRACE
-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
+  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args);
+}
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args);
 }
 __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
@@ -573,15 +573,19 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
      SpecializedRunWorkBatch().run();
    } else {
 #ifdef USE_INDIRECT_FUNCTION_CALL
-      if (COLL_UNROLL == 4)
+      if (COLL_UNROLL == 1)
+        ncclDevFuncTable_1[ncclShmem.funcId]();
+      else if (COLL_UNROLL == 2)
+        ncclDevFuncTable_2[ncclShmem.funcId]();
+      else
        ncclDevFuncTable_4[ncclShmem.funcId]();
-      else
-        ncclDevFuncTable[ncclShmem.funcId]();
 #else
-      if (COLL_UNROLL == 4)
-        NCCL_CALL_FUNCTIONS_4(ncclShmem.funcId);
+      if (COLL_UNROLL == 1)
+        NCCL_CALL_FUNCTIONS_1(ncclShmem.funcId);
+      else if (COLL_UNROLL == 2)
+        NCCL_CALL_FUNCTIONS_2(ncclShmem.funcId);
      else
-        NCCL_CALL_FUNCTIONS(ncclShmem.funcId);
+        NCCL_CALL_FUNCTIONS_4(ncclShmem.funcId);
 #endif
    }

@@ -624,10 +628,12 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
 #endif
 }

-__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
+__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
+__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
 __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
 #ifdef ENABLE_COLLTRACE
-__global__ void ncclDevKernelDebug_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
+__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
+__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
 __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
 #endif

@@ -263,9 +263,29 @@ __device__ __forceinline__ void reduceCopy(
    }
  }

+/*
+ * For gfx90a,
+* Before we had `Unroll/2*(16/sizeof(T))/2`, which does not work with unroll=1
+* as unroll=1; `Unroll/2` = 0, which results in the above expression to 0, and is not supported
+* This was reformulated to `(Unroll*4 + sizeof(T) - 1)/sizeof(T)`
+*
+* Before: `Unroll/2*(16/sizeof(T))/2`
+*         sizeof(T)
+* unroll  1   2   4   8
+*   4     16  8   4   2
+*   2     8   4   2   1
+*   1     0   0   0   0
+*
+* After: `(Unroll*4 + sizeof(T) - 1)/sizeof(T)`
+*         sizeof(T)
+* unroll  1   2   4   8
+*   4     16  8   4   2
+*   2     8   4   2   1
+*   1     4   2   1   1
+*/
 #if defined(__gfx90a__)
  if (MinSrcs > 1) {
-    reduceCopyPacks<RedFn, T, Unroll/2*(16/sizeof(T))/2, sizeof(T),
+    reduceCopyPacks<RedFn, T, (Unroll*4 + sizeof(T) - 1)/sizeof(T), sizeof(T),
    MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
    (nThreads, thread, redArg, preOpArgs, postOp,
     nSrcs, srcPtrFn, nDsts, dstPtrFn, nBytesBehind, nBytesAhead);
@@ -307,4 +327,4 @@ __device__ __forceinline__ void reduceCopy(
     nDsts, [=]__device__(int i) { return dstPtrs[i]; }, nElts);
 }

-#endif // COMMON_KERNEL_H_
+#endif // COMMON_KERNEL_H_
@@ -9,7 +9,7 @@ all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
 all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
 all_protos = ["LL","LL128","SIMPLE"]
 all_algos =  ["TREE","RING"]
-all_unroll = ["2", "4"]
+all_unroll = ["1", "2", "4"]

 all_params = [all_colls, all_algos, all_protos, all_redops, all_tys, all_unroll]

@@ -164,7 +164,9 @@ def calc_unroll_for_local_arch():
  # Homogeneous system is required to build for only 1 varient of unroll factor
  if len(gfx_targets) == 1:
    gfx_name, cu_count = gfx_targets[0]
-    if "gfx908" == gfx_name or (gfx_name in ["gfx942", "gfx950"] and cu_count > 80):
+    if "gfx950" == gfx_name:
+        return 1
+    elif "gfx908" == gfx_name or ("gfx942" == gfx_name and cu_count > 80):
      return 2
    else:
      return 4
@@ -318,21 +320,38 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
  out("\n")

  out("typedef void(*ncclDevFuncPtr_t)();\n\n")
-  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n")
-  index = 0
+  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_1[] = {\n")
+  index1 = 0
+  for fn in primary_funcs:
+    coll, algo, proto, redop, ty, unroll = fn
+    if unroll != "1": continue
+    sym = paste("_", "ncclDevFunc", *fn)
+    if fn[2] == "LL128":
+      out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
+      out("/*%4d*/ %s,\n#else\n" % (index1, sym))
+      fn_ll = fn[:2] + ("LL",) + fn[3:]
+      sym_ll = paste("_", "ncclDevFunc", *fn_ll)
+      out("/*%4d*/ %s,\n#endif\n" % (index1, sym_ll))
+    else:
+      out("/*%4d*/ %s,\n" % (index1, sym))
+    index1 += 1
+  out("nullptr};\n")
+  out("\n")
+  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_2[] = {\n")
+  index2 = 0
  for fn in primary_funcs:
    coll, algo, proto, redop, ty, unroll = fn
    if unroll != "2": continue
    sym = paste("_", "ncclDevFunc", *fn)
    if fn[2] == "LL128":
      out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
-      out("/*%4d*/ %s,\n#else\n" % (index, sym))
+      out("/*%4d*/ %s,\n#else\n" % (index2, sym))
      fn_ll = fn[:2] + ("LL",) + fn[3:]
      sym_ll = paste("_", "ncclDevFunc", *fn_ll)
-      out("/*%4d*/ %s,\n#endif\n" % (index, sym_ll))
+      out("/*%4d*/ %s,\n#endif\n" % (index2, sym_ll))
    else:
-      out("/*%4d*/ %s,\n" % (index, sym))
-    index += 1
+      out("/*%4d*/ %s,\n" % (index2, sym))
+    index2 += 1
  out("nullptr};\n")
  out("\n")
  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_4[] = {\n")
@@ -355,22 +374,40 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
  
  if not is_ifc:
    out("template<unsigned short f, unsigned short l>\n"
-      "struct Caller {\n"
+      "struct Caller1 {\n"
      "  static __forceinline__ __device__ __host__\n"
-      "  void call(unsigned short funcIndex) noexcept\n"
+      "  void call1(unsigned short funcIndex) noexcept\n"
      "  {\n"
      "    constexpr unsigned short m = f + (l - f) / 2;\n"
-      "    return (funcIndex < m) ? Caller<f, m>::call(funcIndex) : Caller<m, l>::call(funcIndex);\n"
+      "    return (funcIndex < m) ? Caller1<f, m>::call1(funcIndex) : Caller1<m, l>::call1(funcIndex);\n"
      "  }\n"
      "};\n"
      "\n"
      "template<unsigned short f>\n"
-      "struct Caller<f, f + 1>{\n"
+      "struct Caller1<f, f + 1>{\n"
      "  static __forceinline__ __device__ __host__\n"
-      "  void call(unsigned short funcIndex) noexcept { ncclDevFuncTable[f](); }\n"
+      "  void call1(unsigned short funcIndex) noexcept { ncclDevFuncTable_1[f](); }\n"
      "};\n")
-    out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {\n")
-    out(f"  Caller<0, {index}>::call(funcIndex);\n")
+    out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept {\n")
+    out(f"  Caller1<0, {index1}>::call1(funcIndex);\n")
+    out("}\n\n")
+    out("template<unsigned short f, unsigned short l>\n"
+      "struct Caller2 {\n"
+      "  static __forceinline__ __device__ __host__\n"
+      "  void call2(unsigned short funcIndex) noexcept\n"
+      "  {\n"
+      "    constexpr unsigned short m = f + (l - f) / 2;\n"
+      "    return (funcIndex < m) ? Caller2<f, m>::call2(funcIndex) : Caller2<m, l>::call2(funcIndex);\n"
+      "  }\n"
+      "};\n"
+      "\n"
+      "template<unsigned short f>\n"
+      "struct Caller2<f, f + 1>{\n"
+      "  static __forceinline__ __device__ __host__\n"
+      "  void call2(unsigned short funcIndex) noexcept { ncclDevFuncTable_2[f](); }\n"
+      "};\n")
+    out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept {\n")
+    out(f"  Caller2<0, {index2}>::call2(funcIndex);\n")
    out("}\n\n")
    out("template<unsigned short f, unsigned short l>\n"
      "struct Caller4 {\n"
@@ -422,7 +459,7 @@ with open(os.path.join(gensrc, "host_table.cpp"), "w") as f:
  # The mapping from function rows to valid primary function ids.
  out("extern int const ncclDevFuncRowToId[] = {\n")
  index = 0
-  for fn in func_rows[:len(func_rows)//2]:
+  for fn in func_rows[:len(func_rows)//3]:
    fn_id, comment = -1, ""
    if fn is not None:
      fn_id = primary_to_index[equivalent_primary(*fn)]
@@ -11,7 +11,9 @@
 #include "common.h"
 #include <cuda_runtime.h>

-#if defined(__gfx908__) || defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx950__)
+#define COLL_UNROLL 1
+#elif defined(__gfx908__) || defined(__gfx942__)
 #define COLL_UNROLL 2
 #else
 #define COLL_UNROLL 4
@@ -34,17 +34,20 @@ struct ncclKernelMatch {
 };

 #ifdef ENABLE_COLLTRACE
-#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll + ((p_comm)->collTraceEnabled ? 2 : 0))
-static ncclKernelMatch const ncclKerns[4] = {
-  {(void *)ncclDevKernel_Generic, true},
+#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll + ((p_comm)->collTraceEnabled ? 3 : 0))
+static ncclKernelMatch const ncclKerns[6] = {
+  {(void *)ncclDevKernel_Generic_1, true},
+  {(void *)ncclDevKernel_Generic_2, true},
  {(void *)ncclDevKernel_Generic_4, true},
-  {(void *)ncclDevKernelDebug_Generic, true},
+  {(void *)ncclDevKernelDebug_Generic_1, true},
+  {(void *)ncclDevKernelDebug_Generic_2, true},
  {(void *)ncclDevKernelDebug_Generic_4, true}
 };
 #else
 #define ncclGetKernelIndex(p_comm) ((p_comm)->unroll)
-static ncclKernelMatch const ncclKerns[2] = {
-  {(void*)ncclDevKernel_Generic, true},
+static ncclKernelMatch const ncclKerns[3] = {
+  {(void*)ncclDevKernel_Generic_1, true},
+  {(void*)ncclDevKernel_Generic_2, true},
  {(void*)ncclDevKernel_Generic_4, true}
 };
 #endif
@@ -74,9 +74,10 @@ typedef enum {

 #define NCCL_ALGO_PROTO_IGNORE -1.0

-#define NCCL_NUM_UNROLLS 2 // 2/4
-#define NCCL_UNROLL_2 0
-#define NCCL_UNROLL_4 1
+#define NCCL_NUM_UNROLLS 3 // 1/2/4
+#define NCCL_UNROLL_1 0
+#define NCCL_UNROLL_2 1
+#define NCCL_UNROLL_4 2

 #define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16/rccl_float8/rccl_bfloat8
 #endif
@@ -101,8 +101,9 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) {
 ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
  hipDeviceProp_t devProp;
  CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
-  if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") || IsArchMatch(devProp.gcnArchName, "gfx950"))
-    && devProp.multiProcessorCount > 80))
+  if(IsArchMatch(devProp.gcnArchName, "gfx950"))
+    comm->unroll = NCCL_UNROLL_1;
+  else if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") && devProp.multiProcessorCount > 80)))
    comm->unroll = NCCL_UNROLL_2;
  else
    comm->unroll = NCCL_UNROLL_4;
@@ -1348,7 +1349,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
    allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels);
  if (ringGraph->nChannels > MAXCHANNELS/2)
    allGather3Data[rank].nc = 1;
-  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) {
    // Multi-node MI300A
    int managed = 0;
    CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
@@ -1365,6 +1366,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
        allGather3Data[rank].nc = 4;
    }
  }
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
+    allGather3Data[rank].nc = 4;
+  }

  allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
  comm->topo->ll128Enabled =  comm->topo->ll128Enabled || rcclParamLL128ForceEnable();