diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md index 6b2d2e26ad..2e6c9fa02e 100644 --- a/projects/rccl/CHANGELOG.md +++ b/projects/rccl/CHANGELOG.md @@ -8,16 +8,16 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: * Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call. * Fixed unit test failures in tests ending with `ManagedMem` and `ManagedMemGraph` suffixes. -* Suboptimal algorithmic switching point for AllReduce on MI300x +* Suboptimal algorithmic switching point for AllReduce on MI300x. * Fixed the known issue "When splitting a communicator using `ncclCommSplit` in some GPU configurations, MSCCL initialization can cause a segmentation fault." with a design change to use `comm` instead of `rank` for `mscclStatus`. The Global map for `comm` to `mscclStatus` is still not thread safe but should be explicitly handled by mutexes for read writes. This is tested for correctness, but there is a plan to use a thread-safe map data structure in upcoming changes. ### Added * Added new GPU target `gfx950`. -* Added support for `unroll=1` in device-code generation to improve performance -* Set a default of 112 channels for a single node with `8 * gfx950` -* Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the - environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`. +* Added support for `unroll=1` in device-code generation to improve performance. +* Set a default of 112 channels for a single node with `8 * gfx950`. +* Enabled LL128 protocol on `gfx950`. +* Added MSCCL support for AllGather multinode gfx942/gfx950 (i.e., 16 and 32 GPUs). To enable, set the environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`. 
* Thread thresholds for LL/LL128 are selected in Tuning Models for the MI300X. This impacts the number of channels used for AG and RS. Channel tuning model is bypassed if `NCCL_THREAD_THRESHOLDS`, `NCCL_MIN_NCHANNELS', or 'NCCL_MAX_NCHANNELS` are set. * Multi-node tuning for AllGather, AllReduce, and ReduceScatter that leverages LL/LL64/LL128 protocol to use nontemporal vector load/store for tunable message size ranges. * LL/LL128 usage ranges for AR, AG, and RS are part of the tuning models, which enable architecture-specific tuning in conjunction with the existing Rome Models scheme in RCCL. @@ -310,4 +310,4 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: ### Changed - Switched to hip-clang as default compiler ### Deprecated -- Deprecated hcc build \ No newline at end of file +- Deprecated hcc build diff --git a/projects/rccl/src/device/generate.py b/projects/rccl/src/device/generate.py index bcc15aefb6..58096432f9 100755 --- a/projects/rccl/src/device/generate.py +++ b/projects/rccl/src/device/generate.py @@ -310,7 +310,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f: for fn in primary_funcs: sym = paste("_", "ncclDevFunc", *fn) if fn[2] == "LL128": - out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n") + out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n") out("%s %s();\n#else\n" % (func_declaration, sym)) fn_ll = fn[:2] + ("LL",) + fn[3:] sym_ll = paste("_", "ncclDevFunc", *fn_ll) @@ -327,7 +327,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f: if unroll != "1": continue sym = paste("_", "ncclDevFunc", *fn) if fn[2] == "LL128": - out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n") + out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n") out("/*%4d*/ %s,\n#else\n" % (index1, sym)) fn_ll = fn[:2] + ("LL",) + fn[3:] sym_ll = 
paste("_", "ncclDevFunc", *fn_ll) @@ -344,7 +344,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f: if unroll != "2": continue sym = paste("_", "ncclDevFunc", *fn) if fn[2] == "LL128": - out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n") + out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n") out("/*%4d*/ %s,\n#else\n" % (index2, sym)) fn_ll = fn[:2] + ("LL",) + fn[3:] sym_ll = paste("_", "ncclDevFunc", *fn_ll) @@ -361,7 +361,7 @@ with open(os.path.join(gensrc, "device_table.h"), "w") as f: if unroll != "4": continue sym = paste("_", "ncclDevFunc", *fn) if fn[2] == "LL128": - out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n") + out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n") out("/*%4d*/ %s,\n#else\n" % (index4, sym)) fn_ll = fn[:2] + ("LL",) + fn[3:] sym_ll = paste("_", "ncclDevFunc", *fn_ll) @@ -531,7 +531,7 @@ for name in name_to_funcs.keys(): (coll, algo, proto, redop, ty, unroll) = fn sym = paste("_", coll, algo, proto, redop, ty, unroll) if proto == "LL128": - out("#if (defined(__gfx90a__) || defined(__gfx942__)) && defined(ENABLE_LL128)\n") + out("#if (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && defined(ENABLE_LL128)\n") out( "DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {unroll})\n" .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty], diff --git a/projects/rccl/src/graph/tuning.cc b/projects/rccl/src/graph/tuning.cc index 4944ce3a9c..2b4ea5ecce 100644 --- a/projects/rccl/src/graph/tuning.cc +++ b/projects/rccl/src/graph/tuning.cc @@ -657,7 +657,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Enable LL128 by default only on gfx90a with available tuning table pEnable = (graphs[a]->typeInter <= PATH_PXB) && 
graphs[a]->typeIntra <= PATH_NVL && ((IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") || - IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942")) && comm->topo->ll128Enabled) ? 1 : 0; + IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || + IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) && comm->topo->ll128Enabled) ? 1 : 0; #else pEnable = 0; #endif diff --git a/projects/rccl/src/rccl_wrap.cc b/projects/rccl/src/rccl_wrap.cc index 397cbc7664..f77b9fb7ee 100644 --- a/projects/rccl/src/rccl_wrap.cc +++ b/projects/rccl/src/rccl_wrap.cc @@ -59,9 +59,10 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s } } #endif - } else if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942")) { - // Warn that model detection for MI300 (or future others) did not work as expected - // Add supported archs to this condition as they come (e.g. gfx950) + } else if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || + IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) { + // Warn that model detection for the above listed architectures did not work as expected + // Add supported archs to this condition as they come // Also make sure the tuning_model and model detection are updated for new archs static bool failedWarn = false; if (!failedWarn) { @@ -120,4 +121,4 @@ ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, RCCL_STATIC_EXPOSE_CHECK(); maxCount = ncclFuncMaxSendRecvCount(func, nRanks, count); return ncclSuccess; -} \ No newline at end of file +}