* Revert "[SRC] Enable unroll=1 for gfx950 (#1602)"
This reverts commit 307bc10781.
* Update Changelog
---------
Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
This commit is contained in:
+27
-9
@@ -2,26 +2,44 @@
|
||||
|
||||
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
|
||||
|
||||
## Unreleased - RCCL 2.22.3 for ROCm 6.5.0
|
||||
## Unreleased - RCCL 2.24.3 for ROCm 6.5.0
|
||||
|
||||
### Added
|
||||
|
||||
* Added new GPU target `gfx950`
|
||||
* Added support for `unroll=1` in device-code generation to improve performance
|
||||
* Added new GPU target `gfx950`.
|
||||
|
||||
### Changed
|
||||
|
||||
* Compatibility with NCCL 2.24.3
|
||||
|
||||
### Known issue
|
||||
|
||||
* Using more than 64 channels can cause a segmentation fault when multiple different collectives are used in the same `ncclGroup()` call
|
||||
* Using more than 64 channels can cause a segmentation fault when multiple collectives are used in the same `ncclGroup()` call.
|
||||
|
||||
## Unreleased - RCCL 2.22.3 for ROCm 6.4.0
|
||||
## Unreleased - RCCL 2.23.4 for ROCm 6.4.1
|
||||
|
||||
### Added
|
||||
|
||||
* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters
|
||||
* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data ibv_post_sends
|
||||
* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`)
|
||||
* Added MSCCL support for AllGather single node and multinode (i.e., 8, 16 and 32 GPUs). To enable on multinode, set the
|
||||
environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is 12292 * sizeof(datatype) * nGPUs
|
||||
environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
|
||||
* Added synchronization before destroying proxy thread to fix a rare hang caused by early termination.
|
||||
|
||||
### Changed
|
||||
|
||||
* Compatibility with NCCL 2.23.4
|
||||
|
||||
### Resolved issues
|
||||
|
||||
* Fixed the accuracy issue for MSCCLPP `allreduce7` kernel in graph mode.
|
||||
* Fixed IntraNet performance.
|
||||
|
||||
## RCCL 2.22.3 for ROCm 6.4.0
|
||||
|
||||
### Added
|
||||
|
||||
* `RCCL_SOCKET_REUSEADDR` and `RCCL_SOCKET_LINGER` environment parameters.
|
||||
* Setting `NCCL_DEBUG=TRACE NCCL_DEBUG_SUBSYS=VERBS` will generate traces for fifo and data `ibv_post_sends`.
|
||||
* Added `--log-trace` flag to enable traces through the install.sh script (e.g. `./install.sh --log-trace`).
|
||||
|
||||
### Changed
|
||||
|
||||
|
||||
@@ -17,20 +17,14 @@ struct RunWorkNop {
|
||||
__device__ void run() {}
|
||||
};
|
||||
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/1>(&args4K.args);
|
||||
}
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/2>(&args4K.args);
|
||||
}
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/false, /*Unroll*/4>(&args4K.args);
|
||||
}
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/1>(&args4K.args);
|
||||
}
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
ncclKernelMain<-1, RunWorkNop, /*COLLTRACE*/true, /*Unroll*/2>(&args4K.args);
|
||||
}
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
|
||||
|
||||
@@ -566,19 +566,15 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
SpecializedRunWorkBatch().run();
|
||||
} else {
|
||||
#ifdef USE_INDIRECT_FUNCTION_CALL
|
||||
if (COLL_UNROLL == 1)
|
||||
ncclDevFuncTable_1[ncclShmem.funcId]();
|
||||
else if (COLL_UNROLL == 2)
|
||||
ncclDevFuncTable_2[ncclShmem.funcId]();
|
||||
else
|
||||
if (COLL_UNROLL == 4)
|
||||
ncclDevFuncTable_4[ncclShmem.funcId]();
|
||||
#else
|
||||
if (COLL_UNROLL == 1)
|
||||
NCCL_CALL_FUNCTIONS_1(ncclShmem.funcId);
|
||||
else if (COLL_UNROLL == 2)
|
||||
NCCL_CALL_FUNCTIONS_2(ncclShmem.funcId);
|
||||
else
|
||||
ncclDevFuncTable[ncclShmem.funcId]();
|
||||
#else
|
||||
if (COLL_UNROLL == 4)
|
||||
NCCL_CALL_FUNCTIONS_4(ncclShmem.funcId);
|
||||
else
|
||||
NCCL_CALL_FUNCTIONS(ncclShmem.funcId);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -619,12 +615,10 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ void ncclDevKernel_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernel_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernel_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
__global__ void ncclDevKernelDebug_Generic_1(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernelDebug_Generic_2(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernelDebug_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
__global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
|
||||
#endif
|
||||
|
||||
|
||||
@@ -263,29 +263,9 @@ __device__ __forceinline__ void reduceCopy(
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* For gfx90a,
|
||||
* Before we had `Unroll/2*(16/sizeof(T))/2`, which does not work with unroll=1
|
||||
* as unroll=1; `Unroll/2` = 0, which results in the above expression to 0, and is not supported
|
||||
* This was reformulated to `(Unroll*4 + sizeof(T) - 1)/sizeof(T)`
|
||||
*
|
||||
* Before: `Unroll/2*(16/sizeof(T))/2`
|
||||
* sizeof(T)
|
||||
* unroll 1 2 4 8
|
||||
* 4 16 8 4 2
|
||||
* 2 8 4 2 1
|
||||
* 1 0 0 0 0
|
||||
*
|
||||
* After: `(Unroll*4 + sizeof(T) - 1)/sizeof(T)`
|
||||
* sizeof(T)
|
||||
* unroll 1 2 4 8
|
||||
* 4 16 8 4 2
|
||||
* 2 8 4 2 1
|
||||
* 1 4 2 1 1
|
||||
*/
|
||||
#if defined(__gfx90a__)
|
||||
if (MinSrcs > 1) {
|
||||
reduceCopyPacks<RedFn, T, (Unroll*4 + sizeof(T) - 1)/sizeof(T), sizeof(T),
|
||||
reduceCopyPacks<RedFn, T, Unroll/2*(16/sizeof(T))/2, sizeof(T),
|
||||
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts, MaxDsts, PreOpSrcs>
|
||||
(nThreads, thread, redArg, preOpArgs, postOp,
|
||||
nSrcs, srcPtrFn, nDsts, dstPtrFn, nBytesBehind, nBytesAhead);
|
||||
@@ -327,4 +307,4 @@ __device__ __forceinline__ void reduceCopy(
|
||||
nDsts, [=]__device__(int i) { return dstPtrs[i]; }, nElts);
|
||||
}
|
||||
|
||||
#endif // COMMON_KERNEL_H_
|
||||
#endif // COMMON_KERNEL_H_
|
||||
+16
-53
@@ -9,7 +9,7 @@ all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
|
||||
all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
|
||||
all_protos = ["LL","LL128","SIMPLE"]
|
||||
all_algos = ["TREE","RING"]
|
||||
all_unroll = ["1", "2", "4"]
|
||||
all_unroll = ["2", "4"]
|
||||
|
||||
all_params = [all_colls, all_algos, all_protos, all_redops, all_tys, all_unroll]
|
||||
|
||||
@@ -164,9 +164,7 @@ def calc_unroll_for_local_arch():
|
||||
# Homogeneous system is required to build for only 1 varient of unroll factor
|
||||
if len(gfx_targets) == 1:
|
||||
gfx_name, cu_count = gfx_targets[0]
|
||||
if "gfx950" == gfx_name:
|
||||
return 1
|
||||
elif "gfx908" == gfx_name or ("gfx942" == gfx_name and cu_count > 80):
|
||||
if "gfx908" == gfx_name or (gfx_name in ["gfx942", "gfx950"] and cu_count > 80):
|
||||
return 2
|
||||
else:
|
||||
return 4
|
||||
@@ -320,38 +318,21 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
|
||||
out("\n")
|
||||
|
||||
out("typedef void(*ncclDevFuncPtr_t)();\n\n")
|
||||
out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_1[] = {\n")
|
||||
index1 = 0
|
||||
for fn in primary_funcs:
|
||||
coll, algo, proto, redop, ty, unroll = fn
|
||||
if unroll != "1": continue
|
||||
sym = paste("_", "ncclDevFunc", *fn)
|
||||
if fn[2] == "LL128":
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
|
||||
out("/*%4d*/ %s,\n#else\n" % (index1, sym))
|
||||
fn_ll = fn[:2] + ("LL",) + fn[3:]
|
||||
sym_ll = paste("_", "ncclDevFunc", *fn_ll)
|
||||
out("/*%4d*/ %s,\n#endif\n" % (index1, sym_ll))
|
||||
else:
|
||||
out("/*%4d*/ %s,\n" % (index1, sym))
|
||||
index1 += 1
|
||||
out("nullptr};\n")
|
||||
out("\n")
|
||||
out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_2[] = {\n")
|
||||
index2 = 0
|
||||
out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n")
|
||||
index = 0
|
||||
for fn in primary_funcs:
|
||||
coll, algo, proto, redop, ty, unroll = fn
|
||||
if unroll != "2": continue
|
||||
sym = paste("_", "ncclDevFunc", *fn)
|
||||
if fn[2] == "LL128":
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
|
||||
out("/*%4d*/ %s,\n#else\n" % (index2, sym))
|
||||
out("/*%4d*/ %s,\n#else\n" % (index, sym))
|
||||
fn_ll = fn[:2] + ("LL",) + fn[3:]
|
||||
sym_ll = paste("_", "ncclDevFunc", *fn_ll)
|
||||
out("/*%4d*/ %s,\n#endif\n" % (index2, sym_ll))
|
||||
out("/*%4d*/ %s,\n#endif\n" % (index, sym_ll))
|
||||
else:
|
||||
out("/*%4d*/ %s,\n" % (index2, sym))
|
||||
index2 += 1
|
||||
out("/*%4d*/ %s,\n" % (index, sym))
|
||||
index += 1
|
||||
out("nullptr};\n")
|
||||
out("\n")
|
||||
out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable_4[] = {\n")
|
||||
@@ -374,40 +355,22 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
|
||||
|
||||
if not is_ifc:
|
||||
out("template<unsigned short f, unsigned short l>\n"
|
||||
"struct Caller1 {\n"
|
||||
"struct Caller {\n"
|
||||
" static __forceinline__ __device__ __host__\n"
|
||||
" void call1(unsigned short funcIndex) noexcept\n"
|
||||
" void call(unsigned short funcIndex) noexcept\n"
|
||||
" {\n"
|
||||
" constexpr unsigned short m = f + (l - f) / 2;\n"
|
||||
" return (funcIndex < m) ? Caller1<f, m>::call1(funcIndex) : Caller1<m, l>::call1(funcIndex);\n"
|
||||
" return (funcIndex < m) ? Caller<f, m>::call(funcIndex) : Caller<m, l>::call(funcIndex);\n"
|
||||
" }\n"
|
||||
"};\n"
|
||||
"\n"
|
||||
"template<unsigned short f>\n"
|
||||
"struct Caller1<f, f + 1>{\n"
|
||||
"struct Caller<f, f + 1>{\n"
|
||||
" static __forceinline__ __device__ __host__\n"
|
||||
" void call1(unsigned short funcIndex) noexcept { ncclDevFuncTable_1[f](); }\n"
|
||||
" void call(unsigned short funcIndex) noexcept { ncclDevFuncTable[f](); }\n"
|
||||
"};\n")
|
||||
out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept {\n")
|
||||
out(f" Caller1<0, {index1}>::call1(funcIndex);\n")
|
||||
out("}\n\n")
|
||||
out("template<unsigned short f, unsigned short l>\n"
|
||||
"struct Caller2 {\n"
|
||||
" static __forceinline__ __device__ __host__\n"
|
||||
" void call2(unsigned short funcIndex) noexcept\n"
|
||||
" {\n"
|
||||
" constexpr unsigned short m = f + (l - f) / 2;\n"
|
||||
" return (funcIndex < m) ? Caller2<f, m>::call2(funcIndex) : Caller2<m, l>::call2(funcIndex);\n"
|
||||
" }\n"
|
||||
"};\n"
|
||||
"\n"
|
||||
"template<unsigned short f>\n"
|
||||
"struct Caller2<f, f + 1>{\n"
|
||||
" static __forceinline__ __device__ __host__\n"
|
||||
" void call2(unsigned short funcIndex) noexcept { ncclDevFuncTable_2[f](); }\n"
|
||||
"};\n")
|
||||
out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept {\n")
|
||||
out(f" Caller2<0, {index2}>::call2(funcIndex);\n")
|
||||
out("__forceinline__ __device__ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {\n")
|
||||
out(f" Caller<0, {index}>::call(funcIndex);\n")
|
||||
out("}\n\n")
|
||||
out("template<unsigned short f, unsigned short l>\n"
|
||||
"struct Caller4 {\n"
|
||||
@@ -459,7 +422,7 @@ with open(os.path.join(gensrc, "host_table.cpp"), "w") as f:
|
||||
# The mapping from function rows to valid primary function ids.
|
||||
out("extern int const ncclDevFuncRowToId[] = {\n")
|
||||
index = 0
|
||||
for fn in func_rows[:len(func_rows)//3]:
|
||||
for fn in func_rows[:len(func_rows)//2]:
|
||||
fn_id, comment = -1, ""
|
||||
if fn is not None:
|
||||
fn_id = primary_to_index[equivalent_primary(*fn)]
|
||||
|
||||
@@ -11,9 +11,7 @@
|
||||
#include "common.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#if defined(__gfx950__)
|
||||
#define COLL_UNROLL 1
|
||||
#elif defined(__gfx908__) || defined(__gfx942__)
|
||||
#if defined(__gfx908__) || defined(__gfx942__) || defined(__gfx950__)
|
||||
#define COLL_UNROLL 2
|
||||
#else
|
||||
#define COLL_UNROLL 4
|
||||
|
||||
+6
-9
@@ -34,20 +34,17 @@ struct ncclKernelMatch {
|
||||
};
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll + ((p_comm)->collTraceEnabled ? 3 : 0))
|
||||
static ncclKernelMatch const ncclKerns[6] = {
|
||||
{(void *)ncclDevKernel_Generic_1, true},
|
||||
{(void *)ncclDevKernel_Generic_2, true},
|
||||
#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll + ((p_comm)->collTraceEnabled ? 2 : 0))
|
||||
static ncclKernelMatch const ncclKerns[4] = {
|
||||
{(void *)ncclDevKernel_Generic, true},
|
||||
{(void *)ncclDevKernel_Generic_4, true},
|
||||
{(void *)ncclDevKernelDebug_Generic_1, true},
|
||||
{(void *)ncclDevKernelDebug_Generic_2, true},
|
||||
{(void *)ncclDevKernelDebug_Generic, true},
|
||||
{(void *)ncclDevKernelDebug_Generic_4, true}
|
||||
};
|
||||
#else
|
||||
#define ncclGetKernelIndex(p_comm) ((p_comm)->unroll)
|
||||
static ncclKernelMatch const ncclKerns[3] = {
|
||||
{(void*)ncclDevKernel_Generic_1, true},
|
||||
{(void*)ncclDevKernel_Generic_2, true},
|
||||
static ncclKernelMatch const ncclKerns[2] = {
|
||||
{(void*)ncclDevKernel_Generic, true},
|
||||
{(void*)ncclDevKernel_Generic_4, true}
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -74,10 +74,9 @@ typedef enum {
|
||||
|
||||
#define NCCL_ALGO_PROTO_IGNORE -1.0
|
||||
|
||||
#define NCCL_NUM_UNROLLS 3 // 1/2/4
|
||||
#define NCCL_UNROLL_1 0
|
||||
#define NCCL_UNROLL_2 1
|
||||
#define NCCL_UNROLL_4 2
|
||||
#define NCCL_NUM_UNROLLS 2 // 2/4
|
||||
#define NCCL_UNROLL_2 0
|
||||
#define NCCL_UNROLL_4 1
|
||||
|
||||
#define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16/rccl_float8/rccl_bfloat8
|
||||
#endif
|
||||
|
||||
+2
-3
@@ -99,9 +99,8 @@ static uint64_t hashUniqueId(ncclUniqueId const &id) {
|
||||
ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
|
||||
hipDeviceProp_t devProp;
|
||||
CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
|
||||
if(IsArchMatch(devProp.gcnArchName, "gfx950"))
|
||||
comm->unroll = NCCL_UNROLL_1;
|
||||
else if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") && devProp.multiProcessorCount > 80)))
|
||||
if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") || IsArchMatch(devProp.gcnArchName, "gfx950"))
|
||||
&& devProp.multiProcessorCount > 80))
|
||||
comm->unroll = NCCL_UNROLL_2;
|
||||
else
|
||||
comm->unroll = NCCL_UNROLL_4;
|
||||
|
||||
Viittaa uudesa ongelmassa
Block a user