From 329e13efff8b59fc2e11af0afa23237cee100165 Mon Sep 17 00:00:00 2001 From: Nilesh M Negi Date: Wed, 30 Apr 2025 23:33:08 -0500 Subject: [PATCH] Revert "[SRC] Enable unroll=1 for gfx950 (#1602)" (#1667) * Revert "[SRC] Enable unroll=1 for gfx950 (#1602)" This reverts commit 307bc10781083f393ce50ababa2d1befb8c4e772. * Update Changelog --------- Signed-off-by: nileshnegi --- CHANGELOG.md | 36 +++++++++++++++----- src/device/common.cu | 10 ++---- src/device/common.h | 22 +++++------- src/device/common_kernel.h | 24 ++----------- src/device/generate.py | 69 +++++++++----------------------------- src/device/onerank.cu | 4 +-- src/enqueue.cc | 15 ++++----- src/include/nccl_common.h | 7 ++-- src/init.cc | 5 ++- 9 files changed, 67 insertions(+), 125 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 952591a967..2b360cfb3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,26 +2,44 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io) -## Unreleased - RCCL 2.22.3 for ROCm 6.5.0 +## Unreleased - RCCL 2.24.3 for ROCm 6.5.0 ### Added -* Added new GPU target `gfx950` -* Added support for `unroll=1` in device-code generation to improve performance +* Added new GPU target `gfx950`. + +### Changed + +* Compatibility with NCCL 2.24.3 ### Known issue -* Using more than 64 channels can cause a segmentation fault when multiple different collectives are used in the same `ncclGroup()` call +* Using more than 64 channels can cause a segmentation fault when multiple collectives are used in the same `ncclGroup()` call. -## Unreleased - RCCL 2.22.3 for ROCm 6.4.0 +## Unreleased - RCCL 2.23.4 for ROCm 6.4.1 ### Added -* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters -* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data ibv_post_sends -* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`) * Added MSCCL support for AllGather single node and multinode (i.e., 8, 16 and 32 GPUs). To enable on multinode, set the - environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is 12292 * sizeof(datatype) * nGPUs + environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`. +* Added synchronization before destroying proxy thread to fix a rare hang caused by early termination. + +### Changed + +* Compatibility with NCCL 2.23.4 + +### Resolved issues + +* Fixed the accuracy issue for MSCCLPP `allreduce7` kernel in graph mode. +* Fixed IntraNet performance. + +## RCCL 2.22.3 for ROCm 6.4.0 + +### Added + +* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters. +* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data `ibv_post_sends`. +* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`). ### Changed diff --git a/src/device/common.cu b/src/device/common.cu index 36d396fbb8..e5645905b8 100644 --- a/src/device/common.cu +++ b/src/device/common.cu @@ -17,20 +17,14 @@ struct RunWorkNop { __device__ void run() {} }; -__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { - ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args); -} -__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { +__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args); } __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args); } #ifdef ENABLE_COLLTRACE -__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { - ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args); -} -__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { +__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args); } __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { diff --git a/src/device/common.h b/src/device/common.h index 97eb1a04d3..c90d3e78d5 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -566,19 +566,15 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a SpecializedRunWorkBatch().run(); } else { #ifdef USE_INDIRECT_FUNCTION_CALL - if (COLL_UNROLL == 1) - ncclDevFuncTable_1[ncclShmem.funcId](); - else if (COLL_UNROLL == 2) - ncclDevFuncTable_2[ncclShmem.funcId](); - else + if (COLL_UNROLL == 4) ncclDevFuncTable_4[ncclShmem.funcId](); -#else - if (COLL_UNROLL == 1) - NCCL_CALL_FUNCTIONS_1(ncclShmem.funcId); - else if (COLL_UNROLL == 2) - NCCL_CALL_FUNCTIONS_2(ncclShmem.funcId); else + ncclDevFuncTable[ncclShmem.funcId](); +#else + if (COLL_UNROLL == 4) NCCL_CALL_FUNCTIONS_4(ncclShmem.funcId); + else + NCCL_CALL_FUNCTIONS(ncclShmem.funcId); #endif } @@ -619,12 +615,10 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a #endif } -__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); -__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); +__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); #ifdef ENABLE_COLLTRACE -__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); -__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); +__global__ void ncclDevKernelDebug_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); #endif diff --git a/src/device/common_kernel.h b/src/device/common_kernel.h index a601618aef..203e107cd4 100644 --- a/src/device/common_kernel.h +++ b/src/device/common_kernel.h @@ -263,29 +263,9 @@ __device__ __forceinline__ void reduceCopy( } } -/* - * For gfx90a, -* Before we had `Unroll/2*(16/sizeof(T))/2`, which does not work with unroll=1 -* as unroll=1; `Unroll/2` = 0, which results in the above expression to 0, and is not supported -* This was reformulated to `(Unroll*4 + sizeof(T) - 1)/sizeof(T)` -* -* Before: `Unroll/2*(16/sizeof(T))/2` -* sizeof(T) -* unroll 1 2 4 8 -* 4 16 8 4 2 -* 2 8 4 2 1 -* 1 0 0 0 0 -* -* After: `(Unroll*4 + sizeof(T) - 1)/sizeof(T)` -* sizeof(T) -* unroll 1 2 4 8 -* 4 16 8 4 2 -* 2 8 4 2 1 -* 1 4 2 1 1 -*/ #if defined(__gfx90a__) if (MinSrcs > 1) { - reduceCopyPacks (nThreads, thread, redArg, preOpArgs, postOp, nSrcs, srcPtrFn, nDsts, dstPtrFn, nBytesBehind, nBytesAhead); @@ -327,4 +307,4 @@ __device__ __forceinline__ void reduceCopy( nDsts, [=]__device__(int i) { return dstPtrs[i]; }, nElts); } -#endif // COMMON_KERNEL_H_ +#endif // COMMON_KERNEL_H_ \ No newline at end of file diff --git a/src/device/generate.py b/src/device/generate.py index bcc15aefb6..23f1f0ee3d 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -9,7 +9,7 @@ all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"] all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"] all_protos = ["LL","LL128","SIMPLE"] all_algos = ["TREE","RING"] -all_unroll = ["1", "2", "4"] +all_unroll = ["2", "4"] all_params = [all_colls, all_algos, all_protos, all_redops, all_tys, all_unroll] @@ -164,9 +164,7 @@ def calc_unroll_for_local_arch(): # Homogeneous system is required to build for only 1 varient of unroll factor if len(gfx_targets) == 1: gfx_name, cu_count = gfx_targets[0] - if "gfx950" == gfx_name: - return 1 - elif "gfx908" == gfx_name or ("gfx942" == gfx_name and cu_count > 80): + if "gfx908" == gfx_name or (gfx_name in ["gfx942", "gfx950"] and cu_count > 80): return 2 else: return 4 @@ -320,38 +318,21 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f: out("\n") out("typedef void(*ncclDevFuncPtr_t)();\n\n") - out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_1[] = {\n") - index1 = 0 - for fn in primary_funcs: - coll, algo, proto, redop, ty, unroll = fn - if unroll != "1": continue - sym = paste("_", "ncclDevFunc", *fn) - if fn[2] == "LL128": - out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n") - out("/*%4d*/ %s,\n#else\n" % (index1, sym)) - fn_ll = fn[:2] + ("LL",) + fn[3:] - sym_ll = paste("_", "ncclDevFunc", *fn_ll) - out("/*%4d*/ %s,\n#endif\n" % (index1, sym_ll)) - else: - out("/*%4d*/ %s,\n" % (index1, sym)) - index1 += 1 - out("nullptr};\n") - out("\n") - out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_2[] = {\n") - index2 = 0 + out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n") + index = 0 for fn in primary_funcs: coll, algo, proto, redop, ty, unroll = fn if unroll != "2": continue sym = paste("_", "ncclDevFunc", *fn) if fn[2] == "LL128": out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n") - out("/*%4d*/ %s,\n#else\n" % (index2, sym)) + out("/*%4d*/ %s,\n#else\n" % (index, sym)) fn_ll = fn[:2] + ("LL",) + fn[3:] sym_ll = paste("_", "ncclDevFunc", *fn_ll) - out("/*%4d*/ %s,\n#endif\n" % (index2, sym_ll)) + out("/*%4d*/ %s,\n#endif\n" % (index, sym_ll)) else: - out("/*%4d*/ %s,\n" % (index2, sym)) - index2 += 1 + out("/*%4d*/ %s,\n" % (index, sym)) + index += 1 out("nullptr};\n") out("\n") out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_4[] = {\n") @@ -374,40 +355,22 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f: if not is_ifc: out("template\n" - "struct Caller1 {\n" + "struct Caller {\n" " static __forceinline__ __device__ __host__\n" - " void call1(unsigned short funcIndex) noexcept\n" + " void call(unsigned short funcIndex) noexcept\n" " {\n" " constexpr unsigned short m = f + (l - f) / 2;\n" - " return (funcIndex < m) ? Caller1::call1(funcIndex) : Caller1::call1(funcIndex);\n" + " return (funcIndex < m) ? Caller::call(funcIndex) : Caller::call(funcIndex);\n" " }\n" "};\n" "\n" "template\n" - "struct Caller1{\n" + "struct Caller{\n" " static __forceinline__ __device__ __host__\n" - " void call1(unsigned short funcIndex) noexcept { ncclDevFuncTable_1[f](); }\n" + " void call(unsigned short funcIndex) noexcept { ncclDevFuncTable[f](); }\n" "};\n") - out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept {\n") - out(f" Caller1<0, {index1}>::call1(funcIndex);\n") - out("}\n\n") - out("template\n" - "struct Caller2 {\n" - " static __forceinline__ __device__ __host__\n" - " void call2(unsigned short funcIndex) noexcept\n" - " {\n" - " constexpr unsigned short m = f + (l - f) / 2;\n" - " return (funcIndex < m) ? Caller2::call2(funcIndex) : Caller2::call2(funcIndex);\n" - " }\n" - "};\n" - "\n" - "template\n" - "struct Caller2{\n" - " static __forceinline__ __device__ __host__\n" - " void call2(unsigned short funcIndex) noexcept { ncclDevFuncTable_2[f](); }\n" - "};\n") - out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept {\n") - out(f" Caller2<0, {index2}>::call2(funcIndex);\n") + out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {\n") + out(f" Caller<0, {index}>::call(funcIndex);\n") out("}\n\n") out("template\n" "struct Caller4 {\n" @@ -459,7 +422,7 @@ with open(os.path.join(gensrc, "host_table.cpp"), "w") as f: # The mapping from function rows to valid primary function ids. out("extern int const ncclDevFuncRowToId[] = {\n") index = 0 - for fn in func_rows[:len(func_rows)//3]: + for fn in func_rows[:len(func_rows)//2]: fn_id, comment = -1, "" if fn is not None: fn_id = primary_to_index[equivalent_primary(*fn)] diff --git a/src/device/onerank.cu b/src/device/onerank.cu index 25bb2ea442..61eef8f620 100644 --- a/src/device/onerank.cu +++ b/src/device/onerank.cu @@ -11,9 +11,7 @@ #include "common.h" #include -#if defined(__gfx950__) -#define COLL_UNROLL 1 -#elif defined(__gfx908__) || defined(__gfx942__) +#if defined(__gfx908__) || defined(__gfx942__) || defined(__gfx950__) #define COLL_UNROLL 2 #else #define COLL_UNROLL 4 diff --git a/src/enqueue.cc b/src/enqueue.cc index 08ce05f82b..1500a610b7 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -34,20 +34,17 @@ struct ncclKernelMatch { }; #ifdef ENABLE_COLLTRACE -#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll + ((p_comm)->collTraceEnabled ? 3 : 0)) -static ncclKernelMatch const ncclKerns[6] = { - {(void *)ncclDevKernel_Generic_1, true}, - {(void *)ncclDevKernel_Generic_2, true}, +#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll + ((p_comm)->collTraceEnabled ? 2 : 0)) +static ncclKernelMatch const ncclKerns[4] = { + {(void *)ncclDevKernel_Generic, true}, {(void *)ncclDevKernel_Generic_4, true}, - {(void *)ncclDevKernelDebug_Generic_1, true}, - {(void *)ncclDevKernelDebug_Generic_2, true}, + {(void *)ncclDevKernelDebug_Generic, true}, {(void *)ncclDevKernelDebug_Generic_4, true} }; #else #define ncclGetKernelIndex(p_comm) ((p_comm)->unroll) -static ncclKernelMatch const ncclKerns[3] = { - {(void*)ncclDevKernel_Generic_1, true}, - {(void*)ncclDevKernel_Generic_2, true}, +static ncclKernelMatch const ncclKerns[2] = { + {(void*)ncclDevKernel_Generic, true}, {(void*)ncclDevKernel_Generic_4, true} }; #endif diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index 6a9d18d3b5..e533ac2375 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -74,10 +74,9 @@ typedef enum { #define NCCL_ALGO_PROTO_IGNORE -1.0 -#define NCCL_NUM_UNROLLS 3 // 1/2/4 -#define NCCL_UNROLL_1 0 -#define NCCL_UNROLL_2 1 -#define NCCL_UNROLL_4 2 +#define NCCL_NUM_UNROLLS 2 // 2/4 +#define NCCL_UNROLL_2 0 +#define NCCL_UNROLL_4 1 #define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16/rccl_float8/rccl_bfloat8 #endif diff --git a/src/init.cc b/src/init.cc index e149d0ce9f..41d81a2cc8 100644 --- a/src/init.cc +++ b/src/init.cc @@ -99,9 +99,8 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) { ncclResult_t commSetUnrollFactor(struct ncclComm* comm) { hipDeviceProp_t devProp; CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev)); - if(IsArchMatch(devProp.gcnArchName, "gfx950")) - comm->unroll = NCCL_UNROLL_1; - else if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") && devProp.multiProcessorCount > 80))) + if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") || IsArchMatch(devProp.gcnArchName, "gfx950")) + && devProp.multiProcessorCount > 80)) comm->unroll = NCCL_UNROLL_2; else comm->unroll = NCCL_UNROLL_4;