From 329e13efff8b59fc2e11af0afa23237cee100165 Mon Sep 17 00:00:00 2001
From: Nilesh M Negi <Nilesh.Negi@amd.com>
Date: Wed, 30 Apr 2025 23:33:08 -0500
Subject: [PATCH] Revert "[SRC] Enable unroll=1 for gfx950 (#1602)" (#1667)

* Revert "[SRC] Enable unroll=1 for gfx950 (#1602)"
This reverts commit 307bc10781083f393ce50ababa2d1befb8c4e772.

* Update Changelog

---------

Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
---
 CHANGELOG.md               | 36 +++++++++++++++-----
 src/device/common.cu       | 10 ++----
 src/device/common.h        | 22 +++++-------
 src/device/common_kernel.h | 24 ++-----------
 src/device/generate.py     | 69 +++++++++-----------------------------
 src/device/onerank.cu      |  4 +--
 src/enqueue.cc             | 15 ++++-----
 src/include/nccl_common.h  |  7 ++--
 src/init.cc                |  5 ++-
 9 files changed, 67 insertions(+), 125 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 952591a967..2b360cfb3a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,26 +2,44 @@
 
 Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
 
-## Unreleased - RCCL 2.22.3 for ROCm 6.5.0
+## Unreleased - RCCL 2.24.3 for ROCm 6.5.0
 
 ### Added
 
-* Added new GPU target `gfx950`
-* Added support for `unroll=1` in device-code generation to improve performance
+* Added new GPU target `gfx950`.
+
+### Changed
+
+* Compatibility with NCCL 2.24.3
 
 ### Known issue
 
-* Using more than 64 channels can cause a segmentation fault when multiple different collectives are used in the same `ncclGroup()` call
+* Using more than 64 channels can cause a segmentation fault when multiple collectives are used in the same `ncclGroup()` call.
 
-## Unreleased - RCCL 2.22.3 for ROCm 6.4.0
+## Unreleased - RCCL 2.23.4 for ROCm 6.4.1
 
 ### Added
 
-* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters
-* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data ibv_post_sends
-* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`)
 * Added MSCCL support for AllGather single node and multinode (i.e., 8, 16 and 32 GPUs). To enable on multinode, set the
-  environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is 12292 * sizeof(datatype) * nGPUs 
+  environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
+* Added synchronization before destroying proxy thread to fix a rare hang caused by early termination.
+
+### Changed
+
+* Compatibility with NCCL 2.23.4
+
+### Resolved issues
+
+* Fixed the accuracy issue for MSCCLPP `allreduce7` kernel in graph mode.
+* Fixed IntraNet performance.
+
+## RCCL 2.22.3 for ROCm 6.4.0
+
+### Added
+
+* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters.
+* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data `ibv_post_sends`.
+* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`).
 
 ### Changed
 
diff --git a/src/device/common.cu b/src/device/common.cu
index 36d396fbb8..e5645905b8 100644
--- a/src/device/common.cu
+++ b/src/device/common.cu
@@ -17,20 +17,14 @@ struct RunWorkNop {
   __device__ void run() {}
 };
 
-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
-  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args);
-}
-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
   ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args);
 }
 __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
   ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args);
 }
 #ifdef ENABLE_COLLTRACE
-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
-  ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args);
-}
-__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
+__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
   ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args);
 }
 __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
diff --git a/src/device/common.h b/src/device/common.h
index 97eb1a04d3..c90d3e78d5 100644
--- a/src/device/common.h
+++ b/src/device/common.h
@@ -566,19 +566,15 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
       SpecializedRunWorkBatch().run();
     } else {
 #ifdef USE_INDIRECT_FUNCTION_CALL
-      if (COLL_UNROLL == 1)
-        ncclDevFuncTable_1[ncclShmem.funcId]();
-      else if (COLL_UNROLL == 2)
-        ncclDevFuncTable_2[ncclShmem.funcId]();
-      else
+      if (COLL_UNROLL == 4)
         ncclDevFuncTable_4[ncclShmem.funcId]();
-#else
-      if (COLL_UNROLL == 1)
-        NCCL_CALL_FUNCTIONS_1(ncclShmem.funcId);
-      else if (COLL_UNROLL == 2)
-        NCCL_CALL_FUNCTIONS_2(ncclShmem.funcId);
       else
+        ncclDevFuncTable[ncclShmem.funcId]();
+#else
+      if (COLL_UNROLL == 4)
         NCCL_CALL_FUNCTIONS_4(ncclShmem.funcId);
+      else
+        NCCL_CALL_FUNCTIONS(ncclShmem.funcId);
 #endif
     }
 
@@ -619,12 +615,10 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
 #endif
 }
 
-__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
-__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
+__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
 __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
 #ifdef ENABLE_COLLTRACE
-__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
-__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
+__global__ void ncclDevKernelDebug_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
 __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
 #endif
 
diff --git a/src/device/common_kernel.h b/src/device/common_kernel.h
index a601618aef..203e107cd4 100644
--- a/src/device/common_kernel.h
+++ b/src/device/common_kernel.h
@@ -263,29 +263,9 @@ __device__ __forceinline__ void reduceCopy(
     }
   }
 
-/*
- * For gfx90a,
-* Before we had `Unroll/2*(16/sizeof(T))/2`, which does not work with unroll=1
-* as unroll=1; `Unroll/2` = 0, which results in the above expression to 0, and is not supported
-* This was reformulated to `(Unroll*4 + sizeof(T) - 1)/sizeof(T)`
-*
-* Before: `Unroll/2*(16/sizeof(T))/2`
-*         sizeof(T)
-* unroll  1   2   4   8
-*   4     16  8   4   2
-*   2     8   4   2   1
-*   1     0   0   0   0
-*
-* After: `(Unroll*4 + sizeof(T) - 1)/sizeof(T)`
-*         sizeof(T)
-* unroll  1   2   4   8
-*   4     16  8   4   2
-*   2     8   4   2   1
-*   1     4   2   1   1
-*/
 #if defined(__gfx90a__)
   if (MinSrcs > 1) {
-    reduceCopyPacks<RedFn, T, (Unroll*4 + sizeof(T) - 1)/sizeof(T), sizeof(T),
+    reduceCopyPacks<RedFn, T, Unroll/2*(16/sizeof(T))/2, sizeof(T),
     MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
     (nThreads, thread, redArg, preOpArgs, postOp,
      nSrcs, srcPtrFn, nDsts, dstPtrFn, nBytesBehind, nBytesAhead);
@@ -327,4 +307,4 @@ __device__ __forceinline__ void reduceCopy(
      nDsts, [=]__device__(int i) { return dstPtrs[i]; }, nElts);
 }
 
-#endif // COMMON_KERNEL_H_
+#endif // COMMON_KERNEL_H_
\ No newline at end of file
diff --git a/src/device/generate.py b/src/device/generate.py
index bcc15aefb6..23f1f0ee3d 100755
--- a/src/device/generate.py
+++ b/src/device/generate.py
@@ -9,7 +9,7 @@ all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
 all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
 all_protos = ["LL","LL128","SIMPLE"]
 all_algos =  ["TREE","RING"]
-all_unroll = ["1", "2", "4"]
+all_unroll = ["2", "4"]
 
 all_params = [all_colls, all_algos, all_protos, all_redops, all_tys, all_unroll]
 
@@ -164,9 +164,7 @@ def calc_unroll_for_local_arch():
   # Homogeneous system is required to build for only 1 varient of unroll factor
   if len(gfx_targets) == 1:
     gfx_name, cu_count = gfx_targets[0]
-    if "gfx950" == gfx_name:
-        return 1
-    elif "gfx908" == gfx_name or ("gfx942" == gfx_name and cu_count > 80):
+    if "gfx908" == gfx_name or (gfx_name in ["gfx942", "gfx950"] and cu_count > 80):
       return 2
     else:
       return 4
@@ -320,38 +318,21 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
   out("\n")
 
   out("typedef void(*ncclDevFuncPtr_t)();\n\n")
-  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_1[] = {\n")
-  index1 = 0
-  for fn in primary_funcs:
-    coll, algo, proto, redop, ty, unroll = fn
-    if unroll != "1": continue
-    sym = paste("_", "ncclDevFunc", *fn)
-    if fn[2] == "LL128":
-      out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
-      out("/*%4d*/ %s,\n#else\n" % (index1, sym))
-      fn_ll = fn[:2] + ("LL",) + fn[3:]
-      sym_ll = paste("_", "ncclDevFunc", *fn_ll)
-      out("/*%4d*/ %s,\n#endif\n" % (index1, sym_ll))
-    else:
-      out("/*%4d*/ %s,\n" % (index1, sym))
-    index1 += 1
-  out("nullptr};\n")
-  out("\n")
-  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_2[] = {\n")
-  index2 = 0
+  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n")
+  index = 0
   for fn in primary_funcs:
     coll, algo, proto, redop, ty, unroll = fn
     if unroll != "2": continue
     sym = paste("_", "ncclDevFunc", *fn)
     if fn[2] == "LL128":
       out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
-      out("/*%4d*/ %s,\n#else\n" % (index2, sym))
+      out("/*%4d*/ %s,\n#else\n" % (index, sym))
       fn_ll = fn[:2] + ("LL",) + fn[3:]
       sym_ll = paste("_", "ncclDevFunc", *fn_ll)
-      out("/*%4d*/ %s,\n#endif\n" % (index2, sym_ll))
+      out("/*%4d*/ %s,\n#endif\n" % (index, sym_ll))
     else:
-      out("/*%4d*/ %s,\n" % (index2, sym))
-    index2 += 1
+      out("/*%4d*/ %s,\n" % (index, sym))
+    index += 1
   out("nullptr};\n")
   out("\n")
   out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_4[] = {\n")
@@ -374,40 +355,22 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
   
   if not is_ifc:
     out("template<unsigned short f, unsigned short l>\n"
-      "struct Caller1 {\n"
+      "struct Caller {\n"
       "  static __forceinline__ __device__ __host__\n"
-      "  void call1(unsigned short funcIndex) noexcept\n"
+      "  void call(unsigned short funcIndex) noexcept\n"
       "  {\n"
       "    constexpr unsigned short m = f + (l - f) / 2;\n"
-      "    return (funcIndex < m) ? Caller1<f, m>::call1(funcIndex) : Caller1<m, l>::call1(funcIndex);\n"
+      "    return (funcIndex < m) ? Caller<f, m>::call(funcIndex) : Caller<m, l>::call(funcIndex);\n"
       "  }\n"
       "};\n"
       "\n"
       "template<unsigned short f>\n"
-      "struct Caller1<f, f + 1>{\n"
+      "struct Caller<f, f + 1>{\n"
       "  static __forceinline__ __device__ __host__\n"
-      "  void call1(unsigned short funcIndex) noexcept { ncclDevFuncTable_1[f](); }\n"
+      "  void call(unsigned short funcIndex) noexcept { ncclDevFuncTable[f](); }\n"
       "};\n")
-    out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept {\n")
-    out(f"  Caller1<0, {index1}>::call1(funcIndex);\n")
-    out("}\n\n")
-    out("template<unsigned short f, unsigned short l>\n"
-      "struct Caller2 {\n"
-      "  static __forceinline__ __device__ __host__\n"
-      "  void call2(unsigned short funcIndex) noexcept\n"
-      "  {\n"
-      "    constexpr unsigned short m = f + (l - f) / 2;\n"
-      "    return (funcIndex < m) ? Caller2<f, m>::call2(funcIndex) : Caller2<m, l>::call2(funcIndex);\n"
-      "  }\n"
-      "};\n"
-      "\n"
-      "template<unsigned short f>\n"
-      "struct Caller2<f, f + 1>{\n"
-      "  static __forceinline__ __device__ __host__\n"
-      "  void call2(unsigned short funcIndex) noexcept { ncclDevFuncTable_2[f](); }\n"
-      "};\n")
-    out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept {\n")
-    out(f"  Caller2<0, {index2}>::call2(funcIndex);\n")
+    out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {\n")
+    out(f"  Caller<0, {index}>::call(funcIndex);\n")
     out("}\n\n")
     out("template<unsigned short f, unsigned short l>\n"
       "struct Caller4 {\n"
@@ -459,7 +422,7 @@ with open(os.path.join(gensrc, "host_table.cpp"), "w") as f:
   # The mapping from function rows to valid primary function ids.
   out("extern int const ncclDevFuncRowToId[] = {\n")
   index = 0
-  for fn in func_rows[:len(func_rows)//3]:
+  for fn in func_rows[:len(func_rows)//2]:
     fn_id, comment = -1, ""
     if fn is not None:
       fn_id = primary_to_index[equivalent_primary(*fn)]
diff --git a/src/device/onerank.cu b/src/device/onerank.cu
index 25bb2ea442..61eef8f620 100644
--- a/src/device/onerank.cu
+++ b/src/device/onerank.cu
@@ -11,9 +11,7 @@
 #include "common.h"
 #include <cuda_runtime.h>
 
-#if defined(__gfx950__)
-#define COLL_UNROLL 1
-#elif defined(__gfx908__) || defined(__gfx942__)
+#if defined(__gfx908__) || defined(__gfx942__) || defined(__gfx950__)
 #define COLL_UNROLL 2
 #else
 #define COLL_UNROLL 4
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 08ce05f82b..1500a610b7 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -34,20 +34,17 @@ struct ncclKernelMatch {
 };
 
 #ifdef ENABLE_COLLTRACE
-#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll + ((p_comm)->collTraceEnabled ? 3 : 0))
-static ncclKernelMatch const ncclKerns[6] = {
-  {(void *)ncclDevKernel_Generic_1, true},
-  {(void *)ncclDevKernel_Generic_2, true},
+#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll + ((p_comm)->collTraceEnabled ? 2 : 0))
+static ncclKernelMatch const ncclKerns[4] = {
+  {(void *)ncclDevKernel_Generic, true},
   {(void *)ncclDevKernel_Generic_4, true},
-  {(void *)ncclDevKernelDebug_Generic_1, true},
-  {(void *)ncclDevKernelDebug_Generic_2, true},
+  {(void *)ncclDevKernelDebug_Generic, true},
   {(void *)ncclDevKernelDebug_Generic_4, true}
 };
 #else
 #define ncclGetKernelIndex(p_comm) ((p_comm)->unroll)
-static ncclKernelMatch const ncclKerns[3] = {
-  {(void*)ncclDevKernel_Generic_1, true},
-  {(void*)ncclDevKernel_Generic_2, true},
+static ncclKernelMatch const ncclKerns[2] = {
+  {(void*)ncclDevKernel_Generic, true},
   {(void*)ncclDevKernel_Generic_4, true}
 };
 #endif
diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h
index 6a9d18d3b5..e533ac2375 100644
--- a/src/include/nccl_common.h
+++ b/src/include/nccl_common.h
@@ -74,10 +74,9 @@ typedef enum {
 
 #define NCCL_ALGO_PROTO_IGNORE -1.0
 
-#define NCCL_NUM_UNROLLS 3 // 1/2/4
-#define NCCL_UNROLL_1 0
-#define NCCL_UNROLL_2 1
-#define NCCL_UNROLL_4 2
+#define NCCL_NUM_UNROLLS 2 // 2/4
+#define NCCL_UNROLL_2 0
+#define NCCL_UNROLL_4 1
 
 #define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16/rccl_float8/rccl_bfloat8
 #endif
diff --git a/src/init.cc b/src/init.cc
index e149d0ce9f..41d81a2cc8 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -99,9 +99,8 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) {
 ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
   hipDeviceProp_t devProp;
   CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
-  if(IsArchMatch(devProp.gcnArchName, "gfx950"))
-    comm->unroll = NCCL_UNROLL_1;
-  else if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") && devProp.multiProcessorCount > 80)))
+  if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") || IsArchMatch(devProp.gcnArchName, "gfx950"))
+    && devProp.multiProcessorCount > 80))
     comm->unroll = NCCL_UNROLL_2;
   else
     comm->unroll = NCCL_UNROLL_4;