diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md index 91c5a3b857..f97d4fe066 100644 --- a/projects/rccl/CHANGELOG.md +++ b/projects/rccl/CHANGELOG.md @@ -5,12 +5,15 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: ## Unreleased - RCCL 2.25.1 for ROCm 7.0.0 ### Resolved issues + * Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call. * Fixed unit test failures in tests ending with `ManagedMem` and `ManagedMemGraph` suffixes. ### Added * Added new GPU target `gfx950`. +* Added support for `unroll=1` in device-code generation to improve performance +* Set a default of 112 channels for a single node with `8 * gfx950` * Added MSCCL support for multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`. diff --git a/projects/rccl/src/device/common.cu b/projects/rccl/src/device/common.cu index e5645905b8..36d396fbb8 100644 --- a/projects/rccl/src/device/common.cu +++ b/projects/rccl/src/device/common.cu @@ -17,14 +17,20 @@ struct RunWorkNop { __device__ void run() {} }; -__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { +__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { + ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args); +} +__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args); } __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args); } #ifdef ENABLE_COLLTRACE -__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { +__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { + ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args); +} +__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args); } __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { diff --git a/projects/rccl/src/device/common.h b/projects/rccl/src/device/common.h index 26ac84af9c..472b923fd1 100644 --- a/projects/rccl/src/device/common.h +++ b/projects/rccl/src/device/common.h @@ -573,15 +573,19 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a SpecializedRunWorkBatch().run(); } else { #ifdef USE_INDIRECT_FUNCTION_CALL - if (COLL_UNROLL == 4) + if (COLL_UNROLL == 1) + ncclDevFuncTable_1[ncclShmem.funcId](); + else if (COLL_UNROLL == 2) + ncclDevFuncTable_2[ncclShmem.funcId](); + else ncclDevFuncTable_4[ncclShmem.funcId](); - else - ncclDevFuncTable[ncclShmem.funcId](); #else - if (COLL_UNROLL == 4) - NCCL_CALL_FUNCTIONS_4(ncclShmem.funcId); + if (COLL_UNROLL == 1) + NCCL_CALL_FUNCTIONS_1(ncclShmem.funcId); + else if (COLL_UNROLL == 2) + NCCL_CALL_FUNCTIONS_2(ncclShmem.funcId); else - NCCL_CALL_FUNCTIONS(ncclShmem.funcId); + NCCL_CALL_FUNCTIONS_4(ncclShmem.funcId); #endif } @@ -624,10 +628,12 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a #endif } -__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); +__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); +__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); #ifdef ENABLE_COLLTRACE -__global__ void ncclDevKernelDebug_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); +__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); +__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); #endif diff --git a/projects/rccl/src/device/common_kernel.h b/projects/rccl/src/device/common_kernel.h index 203e107cd4..a601618aef 100644 --- a/projects/rccl/src/device/common_kernel.h +++ b/projects/rccl/src/device/common_kernel.h @@ -263,9 +263,29 @@ __device__ __forceinline__ void reduceCopy( } } +/* + * For gfx90a, +* Before we had `Unroll/2*(16/sizeof(T))/2`, which does not work with unroll=1 +* as unroll=1; `Unroll/2` = 0, which results in the above expression to 0, and is not supported +* This was reformulated to `(Unroll*4 + sizeof(T) - 1)/sizeof(T)` +* +* Before: `Unroll/2*(16/sizeof(T))/2` +* sizeof(T) +* unroll 1 2 4 8 +* 4 16 8 4 2 +* 2 8 4 2 1 +* 1 0 0 0 0 +* +* After: `(Unroll*4 + sizeof(T) - 1)/sizeof(T)` +* sizeof(T) +* unroll 1 2 4 8 +* 4 16 8 4 2 +* 2 8 4 2 1 +* 1 4 2 1 1 +*/ #if defined(__gfx90a__) if (MinSrcs > 1) { - reduceCopyPacks (nThreads, thread, redArg, preOpArgs, postOp, nSrcs, srcPtrFn, nDsts, dstPtrFn, nBytesBehind, nBytesAhead); @@ -307,4 +327,4 @@ __device__ __forceinline__ void reduceCopy( nDsts, [=]__device__(int i) { return dstPtrs[i]; }, nElts); } -#endif // COMMON_KERNEL_H_ \ No newline at end of file +#endif // COMMON_KERNEL_H_ diff --git a/projects/rccl/src/device/generate.py b/projects/rccl/src/device/generate.py index 23f1f0ee3d..bcc15aefb6 100755 --- a/projects/rccl/src/device/generate.py +++ b/projects/rccl/src/device/generate.py @@ -9,7 +9,7 @@ all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"] all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"] all_protos = ["LL","LL128","SIMPLE"] all_algos = ["TREE","RING"] -all_unroll = ["2", "4"] +all_unroll = ["1", "2", "4"] all_params = [all_colls, all_algos, all_protos, all_redops, all_tys, all_unroll] @@ -164,7 +164,9 @@ def calc_unroll_for_local_arch(): # Homogeneous system is required to build for only 1 varient of unroll factor if len(gfx_targets) == 1: gfx_name, cu_count = gfx_targets[0] - if "gfx908" == gfx_name or (gfx_name in ["gfx942", "gfx950"] and cu_count > 80): + if "gfx950" == gfx_name: + return 1 + elif "gfx908" == gfx_name or ("gfx942" == gfx_name and cu_count > 80): return 2 else: return 4 @@ -318,21 +320,38 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f: out("\n") out("typedef void(*ncclDevFuncPtr_t)();\n\n") - out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n") - index = 0 + out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_1[] = {\n") + index1 = 0 + for fn in primary_funcs: + coll, algo, proto, redop, ty, unroll = fn + if unroll != "1": continue + sym = paste("_", "ncclDevFunc", *fn) + if fn[2] == "LL128": + out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n") + out("/*%4d*/ %s,\n#else\n" % (index1, sym)) + fn_ll = fn[:2] + ("LL",) + fn[3:] + sym_ll = paste("_", "ncclDevFunc", *fn_ll) + out("/*%4d*/ %s,\n#endif\n" % (index1, sym_ll)) + else: + out("/*%4d*/ %s,\n" % (index1, sym)) + index1 += 1 + out("nullptr};\n") + out("\n") + out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_2[] = {\n") + index2 = 0 for fn in primary_funcs: coll, algo, proto, redop, ty, unroll = fn if unroll != "2": continue sym = paste("_", "ncclDevFunc", *fn) if fn[2] == "LL128": out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n") - out("/*%4d*/ %s,\n#else\n" % (index, sym)) + out("/*%4d*/ %s,\n#else\n" % (index2, sym)) fn_ll = fn[:2] + ("LL",) + fn[3:] sym_ll = paste("_", "ncclDevFunc", *fn_ll) - out("/*%4d*/ %s,\n#endif\n" % (index, sym_ll)) + out("/*%4d*/ %s,\n#endif\n" % (index2, sym_ll)) else: - out("/*%4d*/ %s,\n" % (index, sym)) - index += 1 + out("/*%4d*/ %s,\n" % (index2, sym)) + index2 += 1 out("nullptr};\n") out("\n") out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_4[] = {\n") @@ -355,22 +374,40 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f: if not is_ifc: out("template\n" - "struct Caller {\n" + "struct Caller1 {\n" " static __forceinline__ __device__ __host__\n" - " void call(unsigned short funcIndex) noexcept\n" + " void call1(unsigned short funcIndex) noexcept\n" " {\n" " constexpr unsigned short m = f + (l - f) / 2;\n" - " return (funcIndex < m) ? Caller::call(funcIndex) : Caller::call(funcIndex);\n" + " return (funcIndex < m) ? Caller1::call1(funcIndex) : Caller1::call1(funcIndex);\n" " }\n" "};\n" "\n" "template\n" - "struct Caller{\n" + "struct Caller1{\n" " static __forceinline__ __device__ __host__\n" - " void call(unsigned short funcIndex) noexcept { ncclDevFuncTable[f](); }\n" + " void call1(unsigned short funcIndex) noexcept { ncclDevFuncTable_1[f](); }\n" "};\n") - out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {\n") - out(f" Caller<0, {index}>::call(funcIndex);\n") + out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept {\n") + out(f" Caller1<0, {index1}>::call1(funcIndex);\n") + out("}\n\n") + out("template\n" + "struct Caller2 {\n" + " static __forceinline__ __device__ __host__\n" + " void call2(unsigned short funcIndex) noexcept\n" + " {\n" + " constexpr unsigned short m = f + (l - f) / 2;\n" + " return (funcIndex < m) ? Caller2::call2(funcIndex) : Caller2::call2(funcIndex);\n" + " }\n" + "};\n" + "\n" + "template\n" + "struct Caller2{\n" + " static __forceinline__ __device__ __host__\n" + " void call2(unsigned short funcIndex) noexcept { ncclDevFuncTable_2[f](); }\n" + "};\n") + out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept {\n") + out(f" Caller2<0, {index2}>::call2(funcIndex);\n") out("}\n\n") out("template\n" "struct Caller4 {\n" @@ -422,7 +459,7 @@ with open(os.path.join(gensrc, "host_table.cpp"), "w") as f: # The mapping from function rows to valid primary function ids. out("extern int const ncclDevFuncRowToId[] = {\n") index = 0 - for fn in func_rows[:len(func_rows)//2]: + for fn in func_rows[:len(func_rows)//3]: fn_id, comment = -1, "" if fn is not None: fn_id = primary_to_index[equivalent_primary(*fn)] diff --git a/projects/rccl/src/device/onerank.cu b/projects/rccl/src/device/onerank.cu index 61eef8f620..25bb2ea442 100644 --- a/projects/rccl/src/device/onerank.cu +++ b/projects/rccl/src/device/onerank.cu @@ -11,7 +11,9 @@ #include "common.h" #include -#if defined(__gfx908__) || defined(__gfx942__) || defined(__gfx950__) +#if defined(__gfx950__) +#define COLL_UNROLL 1 +#elif defined(__gfx908__) || defined(__gfx942__) #define COLL_UNROLL 2 #else #define COLL_UNROLL 4 diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index ebad6f68fe..98c862ba7f 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -34,17 +34,20 @@ struct ncclKernelMatch { }; #ifdef ENABLE_COLLTRACE -#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll + ((p_comm)->collTraceEnabled ? 2 : 0)) -static ncclKernelMatch const ncclKerns[4] = { - {(void *)ncclDevKernel_Generic, true}, +#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll + ((p_comm)->collTraceEnabled ? 3 : 0)) +static ncclKernelMatch const ncclKerns[6] = { + {(void *)ncclDevKernel_Generic_1, true}, + {(void *)ncclDevKernel_Generic_2, true}, {(void *)ncclDevKernel_Generic_4, true}, - {(void *)ncclDevKernelDebug_Generic, true}, + {(void *)ncclDevKernelDebug_Generic_1, true}, + {(void *)ncclDevKernelDebug_Generic_2, true}, {(void *)ncclDevKernelDebug_Generic_4, true} }; #else #define ncclGetKernelIndex(p_comm) ((p_comm)->unroll) -static ncclKernelMatch const ncclKerns[2] = { - {(void*)ncclDevKernel_Generic, true}, +static ncclKernelMatch const ncclKerns[3] = { + {(void*)ncclDevKernel_Generic_1, true}, + {(void*)ncclDevKernel_Generic_2, true}, {(void*)ncclDevKernel_Generic_4, true} }; #endif diff --git a/projects/rccl/src/include/nccl_common.h b/projects/rccl/src/include/nccl_common.h index e533ac2375..6a9d18d3b5 100644 --- a/projects/rccl/src/include/nccl_common.h +++ b/projects/rccl/src/include/nccl_common.h @@ -74,9 +74,10 @@ typedef enum { #define NCCL_ALGO_PROTO_IGNORE -1.0 -#define NCCL_NUM_UNROLLS 2 // 2/4 -#define NCCL_UNROLL_2 0 -#define NCCL_UNROLL_4 1 +#define NCCL_NUM_UNROLLS 3 // 1/2/4 +#define NCCL_UNROLL_1 0 +#define NCCL_UNROLL_2 1 +#define NCCL_UNROLL_4 2 #define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16/rccl_float8/rccl_bfloat8 #endif diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 9feaeadac8..cda7f1359f 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -101,8 +101,9 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) { ncclResult_t commSetUnrollFactor(struct ncclComm* comm) { hipDeviceProp_t devProp; CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev)); - if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") || IsArchMatch(devProp.gcnArchName, "gfx950")) - && devProp.multiProcessorCount > 80)) + if(IsArchMatch(devProp.gcnArchName, "gfx950")) + comm->unroll = NCCL_UNROLL_1; + else if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") && devProp.multiProcessorCount > 80))) comm->unroll = NCCL_UNROLL_2; else comm->unroll = NCCL_UNROLL_4; @@ -1348,7 +1349,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph->nChannels); if (ringGraph->nChannels > MAXCHANNELS/2) allGather3Data[rank].nc = 1; - if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) { + if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx942")) { // Multi-node MI300A int managed = 0; CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0)); @@ -1365,6 +1366,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].nc = 4; } } + if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) { + allGather3Data[rank].nc = 4; + } allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable(); comm->topo->ll128Enabled = comm->topo->ll128Enabled || rcclParamLL128ForceEnable();