Set default max channels to 48 for MI350 multi-node (#2759)

* make 48 the default max channels for MI350

* address review comments

---------

Co-authored-by: Ghadeer Alabandi <abandiga@gmail.com>
Co-authored-by: systems-assistant[bot] <systems-assistant[bot]@users.noreply.github.com>
Co-authored-by: Corey Derochie <161367113+corey-derochie-amd@users.noreply.github.com>
Bu işleme şunda yer alıyor:
systems-assistant[bot]
2026-01-30 09:22:42 -06:00
işlemeyi yapan: GitHub
ebeveyn 13091e18ad
işleme 055909d335
+30 -3
Dosyayı Görüntüle
@@ -50,12 +50,39 @@ RCCL_PARAM(WarpSpeedEnable, "WARP_SPEED_ENABLE", 0);
#endif
// Byte threshold equal to 1 << 26 (64 MB).
// NOTE(review): judging by the name, this is the minimum message size for the
// warp-speed path — confirm against its users, which are not visible here.
#define RCCL_WARP_SPEED_MIN_BYTES (1ULL << 26) // 64 MB
// Declares the ReducedCuEnable parameter under the name "REDUCED_CU_ENABLE".
// NOTE(review): the trailing 0 is presumably the default value — confirm
// against the RCCL_PARAM macro definition.
RCCL_PARAM(ReducedCuEnable, "REDUCED_CU_ENABLE", 0);
/**
 * Caps the channel count for multi-node communicators running on gfx950.
 *
 * The cap applies only when the communicator spans more than one node and the
 * first GPU in the topology matches "gfx950"; otherwise nothing is changed.
 * It is applied regardless of the REDUCED_CU_ENABLE parameter.
 *
 * The cap is chosen as follows:
 *   - NCCL_MAX_NCHANNELS unset                     -> 48 (default)
 *   - NCCL_MAX_NCHANNELS a positive decimal number -> min(value, 64)
 *   - NCCL_MAX_NCHANNELS anything else             -> 48 (value ignored)
 *
 * @param comm  Communicator to inspect; comm->nChannels is lowered to the cap
 *              if it currently exceeds it (it is never raised).
 * @param nc    In/out channel count picked by the caller; lowered to the cap
 *              if it currently exceeds it (it is never raised).
 */
void rcclRestrictMaxChannels(struct ncclComm* comm, int& nc ) {
  // Default cap used when the user gave no usable NCCL_MAX_NCHANNELS.
  constexpr int kDefaultMaxChannels = 48;
  // 64 is the max number of channels for gfx950 multi-node.
  constexpr long kHardMaxChannels = 64;

  const bool isGfx950MultiNode =
      comm->nNodes > 1 && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950");
  if (!isGfx950MultiNode) {
    return;
  }

  int cap = kDefaultMaxChannels;
  const char* maxNChannelsStr = getenv("NCCL_MAX_NCHANNELS");
  if (maxNChannelsStr == nullptr) {
    // Default restriction for gfx950 multi-node when the user hasn't set a max.
    INFO(NCCL_TUNING, "RCCL MaxChannels: default capping to 48");
  } else {
    char* end = nullptr;
    const long userMax = strtol(maxNChannelsStr, &end, 10);
    // Valid only if at least one digit was consumed, nothing trails the
    // number, and the value is strictly positive.
    const bool valid = (end != maxNChannelsStr && *end == '\0' && userMax > 0);
    if (valid) {
      // Clamp in `long` first so the narrowing to int below is always safe.
      cap = static_cast<int>(std::min<long>(userMax, kHardMaxChannels));
      INFO(NCCL_TUNING, "RCCL MaxChannels is capped to: %i", cap);
    } else {
      // Invalid / non-positive value: treat as "unset" and apply default restriction.
      INFO(NCCL_TUNING, "RCCL MaxChannels: ignoring invalid NCCL_MAX_NCHANNELS='%s', default capping to 48", maxNChannelsStr);
    }
  }

  // Cap max channels, but don't permanently shrink comm->nChannels based on a
  // small-message tuning decision (which can legitimately pick nc=1): each
  // value is clamped against the cap independently.
  nc = std::min(nc, cap);
  comm->nChannels = std::min(comm->nChannels, cap);
}
static inline bool rcclCollSupportsRing(ncclFunc_t func) {