Set default max channels to 48 for MI350 multi-node (#2759)
* make 48 the default max channels for MI350
* address review comments

---------

Co-authored-by: Ghadeer Alabandi <abandiga@gmail.com>
Co-authored-by: systems-assistant[bot] <systems-assistant[bot]@users.noreply.github.com>
Co-authored-by: Corey Derochie <161367113+corey-derochie-amd@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
13091e18ad
commit
055909d335
@@ -50,11 +50,38 @@ RCCL_PARAM(WarpSpeedEnable, "WARP_SPEED_ENABLE", 0);
|
|||||||
#endif
|
#endif
|
||||||
#define RCCL_WARP_SPEED_MIN_BYTES (1ULL << 26) // 64 MB
|
#define RCCL_WARP_SPEED_MIN_BYTES (1ULL << 26) // 64 MB
|
||||||
|
|
||||||
RCCL_PARAM(ReducedCuEnable, "REDUCED_CU_ENABLE", 0);
|
|
||||||
|
|
||||||
void rcclRestrictMaxChannels(struct ncclComm* comm, int& nc ) {
|
void rcclRestrictMaxChannels(struct ncclComm* comm, int& nc ) {
|
||||||
if (comm->nNodes > 1 && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") && rcclParamReducedCuEnable() == 1) {
|
|
||||||
nc = comm->nChannels = std::min(nc, 48);
|
if (comm->nNodes > 1 && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
|
||||||
|
const char* maxNChannelsStr = getenv("NCCL_MAX_NCHANNELS");
|
||||||
|
|
||||||
|
if (maxNChannelsStr) {
|
||||||
|
char* end = nullptr;
|
||||||
|
long userMax = strtol(maxNChannelsStr, &end, 10);
|
||||||
|
|
||||||
|
const bool valid = (end != maxNChannelsStr && *end == '\0' && userMax > 0);
|
||||||
|
if (valid) {
|
||||||
|
// 64 is the max number of channels for gfx950 multi-node
|
||||||
|
userMax = std::min<long>(userMax, 64);
|
||||||
|
const int cap = (int)userMax;
|
||||||
|
INFO(NCCL_TUNING, "RCCL MaxChannels is capped to: %i", cap);
|
||||||
|
// Cap max channels, but don't permanently shrink comm->nChannels
|
||||||
|
// based on a small-message tuning decision (which can legitimately pick nc=1).
|
||||||
|
nc = std::min(nc, cap);
|
||||||
|
comm->nChannels = std::min(comm->nChannels, cap);
|
||||||
|
} else {
|
||||||
|
// Invalid / non-positive value: treat as "unset" and apply default restriction.
|
||||||
|
INFO(NCCL_TUNING, "RCCL MaxChannels: ignoring invalid NCCL_MAX_NCHANNELS='%s', default capping to 48", maxNChannelsStr);
|
||||||
|
nc = std::min(nc, 48);
|
||||||
|
comm->nChannels = std::min(comm->nChannels, 48);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Default restriction for gfx950 multi-node when user hasn't set a valid max.
|
||||||
|
nc = std::min(nc, 48);
|
||||||
|
comm->nChannels = std::min(comm->nChannels, 48);
|
||||||
|
INFO(NCCL_TUNING, "RCCL MaxChannels: default capping to 48");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user