diff --git a/projects/rccl/src/rccl_wrap.cc b/projects/rccl/src/rccl_wrap.cc index 4d62da96d9..1293fcfe66 100644 --- a/projects/rccl/src/rccl_wrap.cc +++ b/projects/rccl/src/rccl_wrap.cc @@ -50,12 +50,39 @@ RCCL_PARAM(WarpSpeedEnable, "WARP_SPEED_ENABLE", 0); #endif #define RCCL_WARP_SPEED_MIN_BYTES (1ULL << 26) // 64 MB -RCCL_PARAM(ReducedCuEnable, "REDUCED_CU_ENABLE", 0); void rcclRestrictMaxChannels(struct ncclComm* comm, int& nc ) { - if (comm->nNodes > 1 && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && rcclParamReducedCuEnable() == 1) { - nc = comm->nChannels = std::min(nc, 48); + + if (comm->nNodes > 1 && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) { + const char* maxNChannelsStr = getenv("NCCL_MAX_NCHANNELS"); + + if (maxNChannelsStr) { + char* end = nullptr; + long userMax = strtol(maxNChannelsStr, &end, 10); + + const bool valid = (end != maxNChannelsStr && *end == '\0' && userMax > 0); + if (valid) { + // 64 is the max number of channels for gfx950 multi-node + userMax = std::min(userMax, 64); + const int cap = (int)userMax; + INFO(NCCL_TUNING, "RCCL MaxChannels is capped to: %i", cap); + // Cap max channels, but don't permanently shrink comm->nChannels + // based on a small-message tuning decision (which can legitimately pick nc=1). + nc = std::min(nc, cap); + comm->nChannels = std::min(comm->nChannels, cap); + } else { + // Invalid / non-positive value: treat as "unset" and apply default restriction. + INFO(NCCL_TUNING, "RCCL MaxChannels: ignoring invalid NCCL_MAX_NCHANNELS='%s', default capping to 48", maxNChannelsStr); + nc = std::min(nc, 48); + comm->nChannels = std::min(comm->nChannels, 48); + } + } else { + // Default restriction for gfx950 multi-node when user hasn't set a valid max. + nc = std::min(nc, 48); + comm->nChannels = std::min(comm->nChannels, 48); + INFO(NCCL_TUNING, "RCCL MaxChannels: default capping to 48"); } + } } static inline bool rcclCollSupportsRing(ncclFunc_t func) {