[BUILD] Enable LL128 on gfx950 (#1731)
* [BUILD] Enable LL128 on gfx950 * Modify comment in src/rccl_wrap.cc * Update CHANGELOG Signed-off-by: nileshnegi <Nilesh.Negi@amd.com> Co-authored-by: corey-derochie-amd <161367113+corey-derochie-amd@users.noreply.github.com>
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
2ac44cfe4e
Коммит
ef5b4ff630
+6
-6
@@ -8,16 +8,16 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
|
||||
|
||||
* Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call.
|
||||
* Fixed unit test failures in tests ending with `ManagedMem` and `ManagedMemGraph` suffixes.
|
||||
* Suboptimal algorithmic switching point for AllReduce on MI300x
|
||||
* Suboptimal algorithmic switching point for AllReduce on MI300x.
|
||||
* Fixed the known issue "When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault." with a design change to use `comm` instead of `rank` for `mscclStatus`. The global map from `comm` to `mscclStatus` is still not thread-safe, but should be explicitly protected by mutexes for reads and writes. This is tested for correctness, but there is a plan to use a thread-safe map data structure in upcoming changes.
|
||||
|
||||
### Added
|
||||
|
||||
* Added new GPU target `gfx950`.
|
||||
* Added support for `unroll=1` in device-code generation to improve performance
|
||||
* Set a default of 112 channels for a single node with `8 * gfx950`
|
||||
* Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the
|
||||
environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
|
||||
* Added support for `unroll=1` in device-code generation to improve performance.
|
||||
* Set a default of 112 channels for a single node with `8 * gfx950`.
|
||||
* Enabled LL128 protocol on `gfx950`.
|
||||
* Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
|
||||
* Thread thresholds for LL/LL128 are selected in Tuning Models for the MI300X. This impacts the number of channels used for AG and RS. Channel tuning model is bypassed if `NCCL_THREAD_THRESHOLDS`, `NCCL_MIN_NCHANNELS`, or `NCCL_MAX_NCHANNELS` are set.
|
||||
* Multi-node tuning for AllGather, AllReduce, and ReduceScatter that leverages LL/LL64/LL128 protocol to use nontemporal vector load/store for tunable message size ranges.
|
||||
* LL/LL128 usage ranges for AR, AG, and RS are part of the tuning models, which enable architecture-specific tuning in conjunction with the existing Rome Models scheme in RCCL.
|
||||
@@ -310,4 +310,4 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
|
||||
### Changed
|
||||
- Switched to hip-clang as default compiler
|
||||
### Deprecated
|
||||
- Deprecated hcc build
|
||||
- Deprecated hcc build
|
||||
|
||||
@@ -310,7 +310,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
|
||||
for fn in primary_funcs:
|
||||
sym = paste("_", "ncclDevFunc", *fn)
|
||||
if fn[2] == "LL128":
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n")
|
||||
out("%s %s();\n#else\n" % (func_declaration, sym))
|
||||
fn_ll = fn[:2] + ("LL",) + fn[3:]
|
||||
sym_ll = paste("_", "ncclDevFunc", *fn_ll)
|
||||
@@ -327,7 +327,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
|
||||
if unroll != "1": continue
|
||||
sym = paste("_", "ncclDevFunc", *fn)
|
||||
if fn[2] == "LL128":
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n")
|
||||
out("/*%4d*/ %s,\n#else\n" % (index1, sym))
|
||||
fn_ll = fn[:2] + ("LL",) + fn[3:]
|
||||
sym_ll = paste("_", "ncclDevFunc", *fn_ll)
|
||||
@@ -344,7 +344,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
|
||||
if unroll != "2": continue
|
||||
sym = paste("_", "ncclDevFunc", *fn)
|
||||
if fn[2] == "LL128":
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n")
|
||||
out("/*%4d*/ %s,\n#else\n" % (index2, sym))
|
||||
fn_ll = fn[:2] + ("LL",) + fn[3:]
|
||||
sym_ll = paste("_", "ncclDevFunc", *fn_ll)
|
||||
@@ -361,7 +361,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f:
|
||||
if unroll != "4": continue
|
||||
sym = paste("_", "ncclDevFunc", *fn)
|
||||
if fn[2] == "LL128":
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n")
|
||||
out("/*%4d*/ %s,\n#else\n" % (index4, sym))
|
||||
fn_ll = fn[:2] + ("LL",) + fn[3:]
|
||||
sym_ll = paste("_", "ncclDevFunc", *fn_ll)
|
||||
@@ -531,7 +531,7 @@ for name in name_to_funcs.keys():
|
||||
(coll, algo, proto, redop, ty, unroll) = fn
|
||||
sym = paste("_", coll, algo, proto, redop, ty, unroll)
|
||||
if proto == "LL128":
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n")
|
||||
out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n")
|
||||
out(
|
||||
"DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {unroll})\n"
|
||||
.format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
|
||||
|
||||
@@ -657,7 +657,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
// Enable LL128 by default only on gfx90a, gfx942, and gfx950 with available tuning table
|
||||
pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
|
||||
((IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") ||
|
||||
IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942")) && comm->topo->ll128Enabled) ? 1 : 0;
|
||||
IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
|
||||
IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->ll128Enabled) ? 1 : 0;
|
||||
#else
|
||||
pEnable = 0;
|
||||
#endif
|
||||
|
||||
@@ -59,9 +59,10 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} else if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942")) {
|
||||
// Warn that model detection for MI300 (or future others) did not work as expected
|
||||
// Add supported archs to this condition as they come (e.g. gfx950)
|
||||
} else if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
|
||||
IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
|
||||
// Warn that model detection for the above listed architectures did not work as expected
|
||||
// Add supported archs to this condition as they come
|
||||
// Also make sure the tuning_model and model detection are updated for new archs
|
||||
static bool failedWarn = false;
|
||||
if (!failedWarn) {
|
||||
@@ -120,4 +121,4 @@ ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count,
|
||||
RCCL_STATIC_EXPOSE_CHECK();
|
||||
maxCount = ncclFuncMaxSendRecvCount(func, nRanks, count);
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user