Dosyalar
rocm-systems/projects/rccl/src/graph/connect.cc
T

.git-blame-ignore-revs dosyasındaki sürümler yok sayılıyor. Bunun yerine normal sorumlu görüntüsü için buraya tıklayın.

946 satır
38 KiB
C++
Ham Normal Görünüm Geçmiş

2019-11-19 14:57:39 -08:00
/*************************************************************************
2022-01-07 06:39:55 -08:00
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
2019-11-19 14:57:39 -08:00
*
* See LICENSE.txt for license information
************************************************************************/
#include "comm.h"
2024-06-11 01:28:01 -07:00
#include "device.h"
2019-11-19 14:57:39 -08:00
#include "graph.h"
2024-06-11 01:28:01 -07:00
#include "transport.h"
2019-11-19 14:57:39 -08:00
#include "trees.h"
#include "rings.h"
2022-01-07 06:39:55 -08:00
#include "topo.h"
2019-11-19 14:57:39 -08:00
#include "msccl/msccl_lifecycle.h"
2019-11-19 14:57:39 -08:00
/******************************************************************/
/********************* Internode connection ***********************/
/******************************************************************/
2023-04-03 05:32:07 -07:00
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
2019-11-19 14:57:39 -08:00
int rank = comm->rank;
2022-01-07 06:39:55 -08:00
int localRanks = comm->topo->nodes[GPU].count;
2019-11-19 14:57:39 -08:00
int nChannels = comm->nChannels;
2025-09-02 13:21:14 -07:00
topoRanks->crossNicRing = graphs[NCCL_ALGO_RING]->crossNic;
2024-02-05 05:06:02 -08:00
topoRanks->nvlsHeadNum = 0;
2019-11-19 14:57:39 -08:00
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
channel->ring.prev = channel->ring.next = -1;
2020-09-04 14:35:05 -07:00
channel->tree.up = -1;
2022-08-18 02:53:17 -07:00
channel->collnetChain.up = -1;
2020-09-04 14:35:05 -07:00
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
2022-08-18 02:53:17 -07:00
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collnetChain.down[i] = -1;
channel->collnetDirect.out = -1;
channel->collnetDirect.headRank = -1;
channel->collnetDirect.nHeads = 0;
channel->collnetDirect.shift = 0;
2024-02-05 05:06:02 -08:00
for (int i=0; i<NCCL_MAX_DIRECT_ARITY+1; i++) channel->collnetDirect.heads[i] = -1;
2022-08-18 02:53:17 -07:00
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.up[i] = -1;
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collnetDirect.down[i] = -1;
2019-11-19 14:57:39 -08:00
2023-04-03 05:32:07 -07:00
int* ringIntra = graphs[NCCL_ALGO_RING]->intra+c*localRanks;
int* treeIntra = graphs[NCCL_ALGO_TREE]->intra+c*localRanks;
int* collNetIntra = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra+c*localRanks;
2019-11-19 14:57:39 -08:00
for (int i=0; i<localRanks; i++) {
if (ringIntra[i] == rank) {
topoRanks->ringRecv[c] = ringIntra[0];
topoRanks->ringSend[c] = ringIntra[localRanks-1];
2024-02-05 05:06:02 -08:00
topoRanks->ringPrev[c] = (i == 0) ? -1 : ringIntra[i-1];
topoRanks->ringNext[c] = (i == localRanks-1) ? -1 : ringIntra[i+1];
2019-11-19 14:57:39 -08:00
}
if (treeIntra[i] == rank) {
2020-09-04 14:35:05 -07:00
int parentIndex = 0;
2023-04-03 05:32:07 -07:00
int child0Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
int child1Index = graphs[NCCL_ALGO_TREE]->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0;
2019-11-19 14:57:39 -08:00
2020-09-04 14:35:05 -07:00
topoRanks->treeToParent[c] = treeIntra[parentIndex];
topoRanks->treeToChild0[c] = treeIntra[child0Index];
topoRanks->treeToChild1[c] = treeIntra[child1Index];
channel->tree.up = i == 0 ? -1 : treeIntra[i-1];
channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
2022-10-25 00:55:55 -07:00
}
if (collNetIntra[i] == rank) {
channel->collnetChain.up = i == 0 ? comm->nRanks : collNetIntra[i-1];
channel->collnetChain.down[0] = i == localRanks-1 ? -1 : collNetIntra[i+1];
2019-11-19 14:57:39 -08:00
}
2021-03-25 20:59:32 -07:00
}
}
2024-02-05 05:06:02 -08:00
// Duplicate channels trees
2021-03-25 20:59:32 -07:00
struct ncclChannel* channel0 = comm->channels;
2022-01-14 10:03:30 -08:00
struct ncclChannel* channel1 = (nChannels > MAXCHANNELS/2) ? 0 : channel0+nChannels;
if (channel1) memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
2024-02-05 05:06:02 -08:00
// Get nvls heads and the number of heads. Duplicate head is not allowed.
for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
bool addHead = true;
2025-01-27 03:30:22 -08:00
int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
2024-02-05 05:06:02 -08:00
for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
addHead = false;
break;
}
}
if (addHead) {
topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
}
}
2024-06-11 01:28:01 -07:00
memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum);
2019-11-19 14:57:39 -08:00
return ncclSuccess;
}
2023-09-15 15:01:33 -06:00
bool isRankHere(const char* s, int start, int end, int rank) {
if (end <= start || start < 0 || end < 0)
return false;
int num = 0;
while (start < end) {
char currChar = s[start];
if (isdigit(currChar)) {
num = num * 10 + (currChar - '0');
if (isdigit(s[start+1])) {
start++;
continue;
}
}
else if (currChar == '(' || currChar == ')') {
start++;
num = 0;
continue;
}
if (num == rank) return true;
start++;
}
return false;
}
ncclResult_t ncclTreeBasePostset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph) {
2025-08-05 17:36:23 -05:00
int x=0;
2023-09-15 15:01:33 -06:00
for (int i=0; treeGraph->treeBase[i][0]!=0; i++)
{
x=i+1;
}
2023-09-15 15:01:33 -06:00
if( treeGraph->treeBase[0][0] == 0) return ncclSuccess;
int nChannels = comm->nChannels;
2025-08-05 17:36:23 -05:00
//int localRanks = comm->topo->nodes[GPU].count; // unused variable - compiler warning
//new tree
2023-09-15 15:01:33 -06:00
for (int c=0; c<nChannels; c++) { // in here
int buff = c%x;
2023-09-15 15:01:33 -06:00
char tempString[NCCL_TOPO_MAX_NODES*4];
int ko=0;
while (treeGraph->treeBase[buff][ko] != 0) {
tempString[ko] = treeGraph->treeBase[buff][ko];
ko++;
2022-08-19 13:51:49 -06:00
}
2023-09-15 15:01:33 -06:00
tempString[ko]=0;
int start = 0;
int curRank = comm->rank;
2023-09-15 15:01:33 -06:00
struct ncclChannel* channel = comm->channels+c;
int end = 0;
while (tempString[end] != 0) end++;
int parent = -1;
// constructing a number from the continuous digits
while (start < end) {
int num = 0, num_found = 0;
start++;
while (start < end && tempString[start] != '('
&& tempString[start] != ')') {
int num_here = (int)(tempString[start] - '0');
num = num * 10 + num_here;
start = start + 1;
if (tempString[start] == '(' || tempString[start] == ')' || start == end) num_found = 1;
}
if (num_found != 0 && num == curRank) {
channel->tree.up = parent;
int depth = 0;
for (int childId = 0; childId < NCCL_MAX_TREE_ARITY; childId++) {
int or_start = start;
int child = -1;
channel->tree.down[childId] = -1;
if (or_start >= end -1) continue;
num=0;
or_start++;
while (tempString[or_start] != 0 && tempString[or_start] != '('
&& tempString[or_start] != ')') {
int num_here = (int)(tempString[or_start] - '0');
num = num * 10 + num_here;
or_start++;
}
child = num;
// find next child start
while (start < end) {
if (tempString[start] == '(' ) depth++;
else if(tempString[start] == ')') depth--;
if (depth == 0) break; // next child
start++;
}
start++;
channel->tree.down[childId] = child;
// get kids, update numbers, get out of this string
}
2023-09-15 15:01:33 -06:00
break;
}
else { //go to the next one
parent = num;
int start_c = start;
int end_c = start_c;
while (end_c < end) {
int depth = 0;
while (end_c < end) {
if (tempString[end_c] == '(' ) depth++;
else if(tempString[end_c] == ')') depth--;
if (depth == 0) break; // next child
end_c++;
}
if (isRankHere(tempString, start_c, end_c, curRank)) {
start = start_c;
end = end_c;
break;
}
else {
end_c++;
start_c = end_c;
}
}
}
}
2023-09-15 15:01:33 -06:00
}
return ncclSuccess;
}
2023-04-03 05:32:07 -07:00
static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
2019-11-19 14:57:39 -08:00
int nChannels = comm->nChannels;
int nNodes = comm->nNodes;
for (int c=0; c<nChannels; c++) {
2023-04-03 05:32:07 -07:00
int* recv = ringRecv+c*comm->nNodes;
int* send = ringSend+c*comm->nNodes;
2019-11-19 14:57:39 -08:00
int* prev = ringPrev+c*comm->nRanks;
int* next = ringNext+c*comm->nRanks;
for (int n=0; n<nNodes; n++) {
2023-04-03 05:32:07 -07:00
int recvRank = recv[n];
int prevSendRank = send[(n-1+nNodes)%nNodes];
2019-11-19 14:57:39 -08:00
prev[recvRank] = prevSendRank;
2023-04-03 05:32:07 -07:00
int sendRank = send[n];
int nextRecvRank = recv[(n+1)%nNodes];
2019-11-19 14:57:39 -08:00
next[sendRank] = nextRecvRank;
}
}
2024-04-15 12:03:57 -06:00
// [RCCL] Print off the recv/send local ranks per node, per channel
if (comm->rank == 0)
{
char buff[2048] = "";
int offset = 0;
int inc;
int numChannels = (nChannels > MAXCHANNELS/2) ? 2 * nChannels : nChannels;
for (int c = 0; c < numChannels; c++) {
sprintf(buff + offset, " %02d%n", c, &inc);
offset += inc;
}
INFO(NCCL_GRAPH, "[RINGS] %s", buff);
for (int n = 0; n < nNodes; n++) {
offset = 0;
for (int c = 0; c < nChannels; c++) {
int recvRank = comm->rankToLocalRank[ringRecv[c*comm->nNodes+n]];
int sendRank = comm->rankToLocalRank[ringSend[c*comm->nNodes+n]];
sprintf(buff + offset, " %02d->%02d%n", recvRank, sendRank, &inc);
offset += inc;
}
INFO(NCCL_GRAPH, "[RINGS] %s", buff);
}
}
2019-11-19 14:57:39 -08:00
return ncclSuccess;
}
2023-04-03 05:32:07 -07:00
static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes) {
for (int n=0; n<nNodes; n++) indexes[n] = ranks[n];
2019-11-19 14:57:39 -08:00
return ncclSuccess;
}
2020-09-04 14:35:05 -07:00
static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) {
if (u == -1) return ncclSuccess;
tree->up = indexes[u];
2019-11-19 14:57:39 -08:00
return ncclSuccess;
}
2020-09-04 14:35:05 -07:00
static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
if (d == -1) return ncclSuccess;
2019-11-19 14:57:39 -08:00
int x = 0;
2020-09-04 14:35:05 -07:00
while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
if (x == NCCL_MAX_TREE_ARITY) {
2021-02-09 15:34:08 -08:00
WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
2019-11-19 14:57:39 -08:00
return ncclInternalError;
}
2020-09-04 14:35:05 -07:00
tree->down[x] = indexes[d];
2019-11-19 14:57:39 -08:00
return ncclSuccess;
}
2023-04-03 05:32:07 -07:00
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
2025-03-20 09:34:53 -06:00
const int channelLimit = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? 2*CHANNEL_LIMIT : CHANNEL_LIMIT;
const int nChannels = (comm->nChannels > channelLimit) ? comm->nChannels / 2 : comm->nChannels;
const int nNodes = comm->nNodes, node = comm->node;
2019-11-19 14:57:39 -08:00
// Compute tree depth. Not an exact value but a good approximation in most
// cases
int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
2020-09-04 14:35:05 -07:00
int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType;
2023-04-03 05:32:07 -07:00
int* ttp, *ttc0, *ttc1;
2020-09-04 14:35:05 -07:00
NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType));
if (nChannels == comm->nChannels) {
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel0 = comm->channels+c;
struct ncclChannel* channel1 = channel0+nChannels;
ttp = treeToParent+c*comm->nNodes;
ttc0 = treeToChild0+c*comm->nNodes;
ttc1 = treeToChild1+c*comm->nNodes;
if (comm->rank == ttp[node]) {
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if (comm->rank == ttc0[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if (comm->rank == ttc1[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if (comm->rank == ttp[node] ||
comm->rank == ttc0[node] ||
comm->rank == ttc1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
}
channel0->tree.depth = channel1->tree.depth = depth;
}
} else {
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel0 = comm->channels+c;
ttp = treeToParent+c*comm->nNodes;
ttc0 = treeToChild0+c*comm->nNodes;
ttc1 = treeToChild1+c*comm->nNodes;
if (comm->rank == ttp[node]) {
NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ttc0 : ttc1, t0u));
}
if (comm->rank == ttc0[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d0));
}
if (comm->rank == ttc1[node]) {
NCCLCHECK(setTreeDown(&channel0->tree, ttp, t0d1));
}
if (comm->rank == ttp[node] ||
comm->rank == ttc0[node] ||
comm->rank == ttc1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]);
}
channel0->tree.depth = depth;
}
for (int c=nChannels; c<nChannels*2; c++) {
struct ncclChannel* channel1 = comm->channels+c;
ttp = treeToParent+c*comm->nNodes;
ttc0 = treeToChild0+c*comm->nNodes;
ttc1 = treeToChild1+c*comm->nNodes;
if (comm->rank == ttp[node]) {
NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ttc0 : ttc1, t1u));
}
if (comm->rank == ttc0[node]) {
NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d0));
}
if (comm->rank == ttc1[node]) {
NCCLCHECK(setTreeDown(&channel1->tree, ttp, t1d1));
}
if (comm->rank == ttp[node] ||
comm->rank == ttc0[node] ||
comm->rank == ttc1[node]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]);
}
channel1->tree.depth = depth;
}
2019-11-19 14:57:39 -08:00
}
return ncclSuccess;
}
2021-04-12 16:00:11 -07:00
static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) {
int rank = comm->rank;
int localRanks = comm->localRanks;
2023-06-13 00:19:57 -07:00
int nHeads = 0;
2021-04-12 16:00:11 -07:00
int *heads;
2023-06-13 00:19:57 -07:00
NCCLCHECK(ncclCalloc(&heads, localRanks));
2021-04-12 16:00:11 -07:00
// Find all head ranks
// Head index is always 0
2023-06-13 00:19:57 -07:00
for (int c=0; c<collNetGraph->nChannels; c++) {
2021-04-12 16:00:11 -07:00
int* collNetIntra = collNetGraph->intra+c*localRanks;
2023-06-13 00:19:57 -07:00
int head = collNetIntra[0];
for (int h=0; h<nHeads; h++) if (heads[h] == head) head = -1;
if (head != -1) heads[nHeads++] = collNetIntra[0];
2020-01-16 16:02:42 -08:00
}
2021-04-12 16:00:11 -07:00
// For all channels
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
char line[1024];
2024-06-11 01:28:01 -07:00
sprintf(line, "CollNetDirect channel %d rank %d ", c, rank);
2021-04-12 16:00:11 -07:00
int nDown = 0;
for (int i=0; i<nHeads; i++) {
if (rank == heads[i]) { // is head
2022-08-18 02:53:17 -07:00
channel->collnetDirect.headRank = i; // Mark the index for deciding offset in the CUDA kernel
channel->collnetDirect.out = comm->nRanks; // Set root of collnetDirect to id nranks
2021-04-12 16:00:11 -07:00
int* collNetIntra = collNetGraph->intra+i*localRanks;
sprintf(line+strlen(line), "down ");
for (int r=0; r<localRanks; r++) {
if (collNetIntra[r] == rank) continue;
2022-08-18 02:53:17 -07:00
channel->collnetDirect.down[nDown++] = collNetIntra[r]; // connect to all peers
2021-04-12 16:00:11 -07:00
sprintf(line+strlen(line), " %d ", collNetIntra[r]);
}
sprintf(line+strlen(line), "nDown %d ", nDown);
break;
}
2020-01-16 16:02:42 -08:00
}
2021-04-12 16:00:11 -07:00
// Connect to all heads
int nUp = 0;
sprintf(line+strlen(line), "up ");
for (int h=0; h<nHeads; h++) {
if (rank == heads[h]) continue;
2022-08-18 02:53:17 -07:00
channel->collnetDirect.up[nUp++] = heads[h];
2021-04-12 16:00:11 -07:00
sprintf(line+strlen(line), " %d ", heads[h]);
2020-01-16 16:02:42 -08:00
}
2024-02-05 05:06:02 -08:00
sprintf(line+strlen(line), "heads ");
{ // heads[] is the list of heads ordered in head order startubg with self
int h0 = (channel->collnetDirect.headRank == -1) ? 0 : channel->collnetDirect.headRank;
for (int h1=0; h1 < nHeads; h1++) {
int h = (h0+h1)%nHeads;
channel->collnetDirect.heads[h1] = heads[h];
sprintf(line+strlen(line), " %d ", heads[h]);
}
}
2022-08-18 02:53:17 -07:00
channel->collnetDirect.nHeads = nHeads;
2024-09-10 05:57:10 -07:00
// nHeads should always be greater than 0.
// coverity[divide_by_zero]
2022-08-18 02:53:17 -07:00
channel->collnetDirect.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
channel->collnetDirect.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
2021-04-12 16:00:11 -07:00
sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
2022-08-18 02:53:17 -07:00
sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift);
2021-04-12 16:00:11 -07:00
INFO(NCCL_GRAPH, "%s", line);
2020-01-16 16:02:42 -08:00
}
2021-04-26 14:24:50 -07:00
free(heads);
2020-01-16 16:02:42 -08:00
return ncclSuccess;
}
2024-02-05 05:06:02 -08:00
static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHeads) {
2023-04-03 05:32:07 -07:00
int headRank = -1;
if (nHeads == 0) {
comm->nvlsChannels = 0;
return ncclSuccess;
}
2024-02-05 05:06:02 -08:00
for (int h = 0; h < nHeads; h++) {
if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h;
}
2025-09-02 13:21:14 -07:00
for (int c=0; c<comm->nvlsChannels; c++) {
2023-04-03 05:32:07 -07:00
struct ncclChannel* channel = comm->channels+c;
channel->nvls.nHeads = nHeads;
for (int h=0; h<nHeads; h++) channel->nvls.up[h] = comm->nRanks+1+h;
for (int h=nHeads; h<NCCL_MAX_NVLS_ARITY; h++) channel->nvls.up[h] = -1;
channel->nvls.down = comm->nRanks+1+headRank;
channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
channel->nvls.headRank = headRank;
channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
2025-05-29 20:56:40 -07:00
if (comm->config.collnetEnable && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
2023-04-03 05:32:07 -07:00
}
2024-03-26 06:08:55 -07:00
if (comm->nNodes == 1) return ncclSuccess;
2023-04-03 05:32:07 -07:00
// Connect Trees
int tree0Parent, tree0Child0, tree0Child1, tree1Parent, tree1Child0, tree1Child1;
int pc0, pc1; // ignored
NCCLCHECK(ncclGetDtree(comm->nNodes, comm->node,
&tree0Parent, &tree0Child0, &tree0Child1, &pc0,
&tree1Parent, &tree1Child0, &tree1Child1, &pc1));
int* heads = NULL;
int treeUp[2] = { -1, -1 };
int treeDown0[2] = { -1, -1 };
int treeDown1[2] = { -1, -1 };
if (comm->node == 0) {
for (int h=0; h<nHeads; h++) {
char line[1024];
sprintf(line, "NVLS Head %2d:", h);
heads = nvlsHeads+h*comm->nNodes;
for (int n=0; n<comm->nNodes && n<20; n++) {
sprintf(line+strlen(line), " %2d", heads[n]);
}
INFO(NCCL_INIT, "%s", line);
}
}
// Find the heads where I'm the head rank and retain tree up/down
for (int h=0; h<nHeads; h++) {
heads = nvlsHeads+h*comm->nNodes;
if (heads[comm->node] == comm->rank) {
treeUp[0] = tree0Parent == -1 ? -1: heads[tree0Parent];
treeDown0[0] = tree0Child0 == -1 ? -1 : heads[tree0Child0];
treeDown1[0] = tree0Child1 == -1 ? -1 : heads[tree0Child1];
treeUp[1] = tree1Parent == -1 ? -1 : heads[tree1Parent];
treeDown0[1] = tree1Child0 == -1 ? -1 : heads[tree1Child0];
treeDown1[1] = tree1Child1 == -1 ? -1 : heads[tree1Child1];
break;
}
}
// Set prev/next in all channels (NVLS compute channels work
// orthogonally to NVLS search channels).
2025-09-02 13:21:14 -07:00
for (int c=0; c<comm->nvlsChannels; c++) {
2023-04-03 05:32:07 -07:00
struct ncclChannel* channel = comm->channels+c;
channel->nvls.treeUp = treeUp[c%2];
channel->nvls.treeDown[0] = channel->nvls.down;
int ix = 1;
if (treeDown0[c%2] != -1) channel->nvls.treeDown[ix++] = treeDown0[c%2];
if (treeDown1[c%2] != -1) channel->nvls.treeDown[ix] = treeDown1[c%2];
}
struct ncclNvls* nvls0 = &comm->channels[0].nvls;
struct ncclNvls* nvls1 = &comm->channels[1].nvls;
2024-03-26 06:08:55 -07:00
INFO(NCCL_GRAPH, "NVLS Trees : %d/%d/%d->%d->%d %d/%d/%d->%d->%d",
nvls0->treeDown[0], nvls0->treeDown[1], nvls0->treeDown[2], comm->rank, nvls0->treeUp,
nvls1->treeDown[0], nvls1->treeDown[1], nvls1->treeDown[2], comm->rank, nvls1->treeUp);
2023-04-03 05:32:07 -07:00
return ncclSuccess;
}
2019-11-19 14:57:39 -08:00
// Legacy naming
NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
// New naming
NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2);
NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
int ncclMinNchannels() {
2019-11-26 16:33:13 -08:00
int minNchannels = 2;
2019-11-19 14:57:39 -08:00
if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
if (minNchannels > MAXCHANNELS) {
2025-05-29 20:56:40 -07:00
INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
2019-11-19 14:57:39 -08:00
minNchannels = MAXCHANNELS;
}
if (minNchannels < 0) minNchannels = 0;
return minNchannels;
}
2024-06-11 01:28:01 -07:00
extern int64_t ncclParamWorkArgsBytes();
2019-11-19 14:57:39 -08:00
int ncclMaxNchannels() {
int maxNchannels = MAXCHANNELS;
if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings();
if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
2024-06-11 01:28:01 -07:00
maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes()));
2019-11-19 14:57:39 -08:00
if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
if (maxNchannels < 1) {
2025-05-29 20:56:40 -07:00
INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a maximum of %d channels, setting it to 1", maxNchannels);
2019-11-19 14:57:39 -08:00
maxNchannels = 1;
}
return maxNchannels;
}
2021-04-12 16:00:11 -07:00
static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
int nranks = comm->nRanks;
int c;
for (c=start; c<end; c++) {
memcpy(ringPrev+c*nranks, ringPrev+(c-start)*nranks, nranks*sizeof(int));
memcpy(ringNext+c*nranks, ringNext+(c-start)*nranks, nranks*sizeof(int));
memcpy(comm->channels+c, comm->channels+c-start, sizeof(struct ncclChannel));
}
return c;
}
2024-02-05 05:06:02 -08:00
void exchangeValues(int* v0, int* v1) {
int tmp = *v1;
*v1 = *v0;
*v0 = tmp;
}
2025-02-20 15:18:29 -07:00
int getTreeNodeParity(int treeDir, int nNodes, int node)
{
if (node == -1) return -1;
int parentNodes[2], child0Nodes[2], child1Nodes[2], childTypes[2];
ncclGetDtree(nNodes, node,
&parentNodes[0], &child0Nodes[0], &child1Nodes[0], &childTypes[0],
&parentNodes[1], &child0Nodes[1], &child1Nodes[1], &childTypes[1]);
// Uptree and downtree have different parity
if (parentNodes[treeDir] == -1) return treeDir;
// Recurse and swap parity if this is child that exits from 2nd intranode rank (childType == 0)
return ((childTypes[treeDir] + 1) + getTreeNodeParity(treeDir, nNodes, parentNodes[treeDir])) % 2;
}
// [RCCL] Build rail-optimized trees
ncclResult_t connectRailOptimizedTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1)
{
INFO(NCCL_GRAPH, "Building rail-optimized trees for %d nodes", comm->nNodes);
/* Rail-optimized trees are implemented in RCCL via a set of specially crafted complimentary
pairs of intra-node XGMI paths such that:
A) Complimentary pairs alternate their first two elements
B) Cover all the XGMI links exactly once
E.g: For MI300X
Path 1A 0 1 2 4 3 6 5 7 Path 2A 2 3 0 5 6 1 7 4
Path 1B 1 0 4 7 3 5 2 6 Path 2B 3 2 7 0 6 4 1 5
^ ^ ^ ^
Path 3A 4 5 1 6 0 3 7 2 Path 4A 6 7 5 3 4 0 2 1
Path 3B 5 4 6 2 0 7 1 3 Path 4B 7 6 3 1 4 2 5 0
^ ^ ^ ^
Due to the balanced tree pattern, the 1st rank in the Path gets connected to the first
rank of the path of the left child node, while the 2nd rank in the path connects to the first
rank of the right child node.
In order to avoid crossing rails, any time a right child is visited, the channel should
be swapped:
AB BA
/ \ / \
/ \ / \
AB BA BA AB
/ \ / \ / \ / \
AB BA BA AB BA AB AB BA
*/
const int nChannels = (comm->nChannels / 2);
const int nNodes = comm->nNodes, node = comm->node, rank = comm->rank;
const int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
const int nGpus = comm->topo->nodes[GPU].count;
struct ncclTopoGraph* treeGraph = &comm->graphs[NCCL_ALGO_TREE];
// Compute parent/child nodes for this current node, for uptree and downtree
int parentNodes[2], child0Nodes[2], child1Nodes[2], childTypes[2];
NCCLCHECK(ncclGetDtree(nNodes, node,
&parentNodes[0], &child0Nodes[0], &child1Nodes[0], &childTypes[0],
&parentNodes[1], &child0Nodes[1], &child1Nodes[1], &childTypes[1]));
// Loop over up-tree / down-tree
for (int treeDir = 0; treeDir < 2; treeDir++) {
// Collect the parent / child nodes for this tree direction
int parentNode = parentNodes[treeDir];
int child0Node = child0Nodes[treeDir];
int child1Node = child1Nodes[treeDir];
int* treeToChild = (childTypes[treeDir] == 0) ? treeToChild0 : treeToChild1;
// Compute the parity for nodes for this tree direction
int nodeParity = getTreeNodeParity(treeDir, nNodes, node);
int parentParity = getTreeNodeParity(treeDir, nNodes, parentNode);
int child0Parity = getTreeNodeParity(treeDir, nNodes, child0Node);
int child1Parity = getTreeNodeParity(treeDir, nNodes, child1Node);
// Loop over pairs of complimentary channels
for (int ch = 0; ch < nChannels; ch += 2) {
int ch0 = treeDir * nChannels + ch;
int ch1 = ch0 + 1;
ncclChannel* channel[2] = {&comm->channels[ch0], &comm->channels[ch1]};
channel[0]->tree.depth = channel[1]->tree.depth = depth;
// Determine ranks that connect to other nodes for each of the two channels
int rankToParent[2] = {treeToParent[ch0 * nNodes + node], treeToParent[ch1 * nNodes + node]};
int rankToChild0[2] = {treeToChild0[ch0 * nNodes + node], treeToChild0[ch1 * nNodes + node]};
int rankToChild1[2] = {treeToChild1[ch0 * nNodes + node], treeToChild1[ch1 * nNodes + node]};
// All ranks swizzle channels. This maintains internal rank structures setup during TopoPreset
if (nodeParity) {
std::swap(channel[0]->tree, channel[1]->tree);
std::swap(rankToParent[0], rankToParent[1]);
std::swap(rankToChild0[0], rankToChild0[1]);
std::swap(rankToChild1[0], rankToChild1[1]);
// Swap NICs
std::swap(treeGraph->inter[ch0 * 2 ], treeGraph->inter[ch1 * 2 ]);
std::swap(treeGraph->inter[ch0 * 2 + 1], treeGraph->inter[ch1 * 2 + 1]);
// Swap lines
for (int j = 0; j < nGpus; j++)
std::swap(treeGraph->intra[ch0 * nGpus + j], treeGraph->intra[ch1 * nGpus + j]);
}
// Connect ranks that connect to other nodes for each of the two channels
for (int i = 0; i < 2; i++) {
if (rank == rankToParent[i] && parentNode != -1) {
// Connect this rank to correct child rank on parent node
int parentChannel = (parentParity + i) % 2 == 0 ? ch0 : ch1;
channel[i]->tree.up = treeToChild[parentChannel * nNodes + parentNode];
}
if (rank == rankToChild0[i] && child0Node != -1) {
// Connect this rank to the parent rank on child0 node
int child0Channel = (child0Parity + i) % 2 == 0 ? ch0 : ch1;
setTreeDown(&channel[i]->tree, treeToParent + child0Channel * nNodes, child0Node);
}
if (rank == rankToChild1[i] && child1Node != -1) {
// Connect this rank to the parent rank on child1 node
int child1Channel = (child1Parity + i) % 2 == 0 ? ch0 : ch1;
setTreeDown(&channel[i]->tree, treeToParent + child1Channel * nNodes, child1Node);
}
if (rank == rankToParent[i] ||
rank == rankToChild0[i] ||
rank == rankToChild1[i]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", (i == 0 ? ch0 : ch1),
channel[i]->tree.up, rank,
channel[i]->tree.down[0],
channel[i]->tree.down[1],
channel[i]->tree.down[2]);
}
}
}
}
return ncclSuccess;
}
2024-06-11 01:28:01 -07:00
NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
2025-02-20 15:18:29 -07:00
RCCL_PARAM(OutputTrees, "OUTPUT_TREES", 0);
2024-06-11 01:28:01 -07:00
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent, int nc) {
2019-11-19 14:57:39 -08:00
// Gather data from all ranks
2024-09-10 05:57:10 -07:00
ncclResult_t ret = ncclSuccess;
int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL, *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;
2019-11-19 14:57:39 -08:00
int nranks = comm->nRanks;
2023-04-03 05:32:07 -07:00
int nNodes = comm->nNodes;
2019-11-19 14:57:39 -08:00
int nChannels = comm->nChannels;
2024-02-05 05:06:02 -08:00
int minHeadNum = INT_MAX;
2025-05-29 20:56:40 -07:00
int shared = parent && parent->nvlsSupport && parent->shareResources;
int maxChannels;
int minNchannels, maxNchannels;
int duplicateCount = 1;
#ifdef ENABLE_WARP_SPEED
const int wsEnabled = comm->topo->warpSpeedEnabled;
const bool singleNode = comm->nNodes == 1;
#endif
2023-04-03 05:32:07 -07:00
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
2024-09-10 05:57:10 -07:00
NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&ringNext, nranks*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&treeToParent, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&treeToChild0, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail);
NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail);
2024-02-05 05:06:02 -08:00
2025-09-02 13:21:14 -07:00
// Alternate rings to avoid crossing rails.
// CrossNic values could be not the same on all nodes as it depends on the number of net devs and the NVLink bandwidth.
// Therefore, it's only done if the rank obtained a solution with crossNic=2.
for (int r = 0; r < comm->nRanks; r++) {
if (allTopoRanks[r]->crossNicRing == 2 && (nChannels % 2) == 0 && (comm->rankToNode[r] % 2) == 1) {
// Exchange rings
for (int c=0; c<nChannels; c+=2) {
exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
2024-02-05 05:06:02 -08:00
}
}
}
2023-04-03 05:32:07 -07:00
for (int c=0; c<nChannels;c++) {
for (int n=0; n<nNodes; n++) {
int r = firstRanks[n];
ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
}
for (int r=0; r<nranks; r++) {
ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
2019-11-19 14:57:39 -08:00
}
}
2024-02-05 05:06:02 -08:00
for (int n = 0; n < nNodes; n++) {
int r = firstRanks[n];
if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
minHeadNum = allTopoRanks[r]->nvlsHeadNum;
}
for (int c = 0; c < minHeadNum; c++) {
for (int n = 0; n < nNodes; n++) {
2023-09-26 05:47:28 -07:00
int r = firstRanks[n];
2024-02-05 05:06:02 -08:00
nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
2023-09-26 05:47:28 -07:00
}
}
2019-11-19 14:57:39 -08:00
// Connect rings and trees. This should also duplicate the channels.
2023-04-03 05:32:07 -07:00
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
2025-02-20 15:18:29 -07:00
// [RCCL] Connect rail-optimized trees
if (comm->topo->useRailOptimizedTrees) {
NCCLCHECK(connectRailOptimizedTrees(comm, treeToParent, treeToChild0, treeToChild1));
} else {
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
}
// Dump graphviz-friendly trees
if (rcclParamOutputTrees()) {
int rank = comm->rank;
char color[8][16] =
{"red", "orange", "yellow", "yellowgreen", "green", "cyan", "deepskyblue", "violet"};
for (int i = 0; i < comm->nChannels; i++) {
INFO(NCCL_GRAPH, "[TREE] %d.%d [style=filled, fillcolor=%s]", i, rank, color[rank % comm->localRanks]);
for (int j = 0; j < 3; j++) {
if (comm->channels[i].tree.down[j] != -1) {
bool sameNode = (comm->rankToNode[rank] == comm->rankToNode[comm->channels[i].tree.down[j]]);
INFO(NCCL_GRAPH, "[TREE] %d.%d->%d.%d [style=%s,width=10,color=%s,label=\"%s\"]",
i, rank, i, comm->channels[i].tree.down[j],
sameNode ? "solid" : "dashed",
sameNode ? "black" : color[rank % comm->localRanks],
sameNode ? "" : (std::string("N") + std::to_string(graphs[NCCL_ALGO_TREE]->inter[i*2+1])).c_str());
}
}
}
}
2019-11-19 14:57:39 -08:00
// Only use full MAXCHANNELS for gfx942 (MI300X) and gfx950
maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))
? MAXCHANNELS : 2*CHANNEL_LIMIT;
#ifdef ENABLE_WARP_SPEED
if(!wsEnabled){
maxChannels = std::min((singleNode? RCCL_MI3XX_MAX_SINGLE_NODE_CHANNELS : RCCL_MI3XX_MAX_MULTI_NODE_CHANNELS), maxChannels);
}
#else
if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) {
2024-07-19 08:15:59 -07:00
maxChannels = std::min(64, maxChannels);
}
#endif
2020-03-02 12:51:29 -08:00
// Duplicate ringPrev/ringNext for ncclBuildRing
duplicateCount = maxChannels / nChannels;
if (duplicateCount > 1) {
int limit = duplicateCount;
for (int dup = 1; dup < limit; ++dup) {
memcpy(ringPrev + dup * nChannels * nranks, ringPrev, nChannels * nranks * sizeof(int));
memcpy(ringNext + dup * nChannels * nranks, ringNext, nChannels * nranks * sizeof(int));
}
}
// Get number of channels after duplication
maxNchannels = std::min((int)ncclMaxNchannels(), maxChannels);
nc = std::min(maxNchannels/comm->nChannels, nc);
nc *= comm->nChannels;
2024-02-05 05:06:02 -08:00
// Set ring prev/next for my rank
for (int c=0; c<nChannels; c++) {
struct ncclChannel* channel0 = comm->channels+c;
struct ncclChannel* channel1 = channel0+nChannels;
channel0->ring.prev = ringPrev[c*nranks+comm->rank];
channel0->ring.next = ringNext[c*nranks+comm->rank];
if (c + nChannels < MAXCHANNELS) {
channel1->ring.prev = channel0->ring.prev;
channel1->ring.next = channel0->ring.next;
}
2024-02-05 05:06:02 -08:00
}
2020-03-02 12:51:29 -08:00
// Duplication should be complete now
nChannels = comm->nChannels = std::min(maxChannels, (nChannels <= maxChannels/2) ? nChannels*2 : nChannels);
2021-04-12 16:00:11 -07:00
// Setup CollNet
2025-05-29 20:56:40 -07:00
if (comm->config.collnetEnable) {
2024-06-11 01:28:01 -07:00
struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
2021-04-12 16:00:11 -07:00
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
2024-06-11 01:28:01 -07:00
if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
int collNetNchannels = std::min(maxChannels, nChannels+nChannels/2);
2021-04-12 16:00:11 -07:00
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
}
2025-09-02 13:21:14 -07:00
for (int c = 0; c < comm->nChannels; c++) {
comm->channels[c].collnetChain.depth = comm->nRanks/comm->nNodes;
}
if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) {
NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail);
}
2021-04-12 16:00:11 -07:00
}
2023-04-03 05:32:07 -07:00
// Use 4 compute channels per search channel to reach peak BW on <8 PPN
if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= maxChannels) {
2023-04-03 05:32:07 -07:00
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
}
2025-02-20 15:18:29 -07:00
2024-06-11 01:28:01 -07:00
// Double the number of channels when using unpack networking (greater than 1 node)
// We won't automatically double past 16 channels, users can specify 32 if they want
if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
}
2023-04-03 05:32:07 -07:00
minNchannels = ncclMinNchannels();
2024-06-04 15:22:38 -05:00
if (comm->nNodes > 1) {
minNchannels = std::min(64, minNchannels);
2024-06-25 10:10:10 -05:00
}
if (comm->nRanks < 8 && 64 < minNchannels) {
minNchannels = 2;
WARN("NCCL_MIN_NCHANNELS set by environment is ignored due to less than 8 GPUs.");
2024-05-31 15:50:30 -05:00
}
2024-07-19 08:15:59 -07:00
if (minNchannels > maxChannels) {
minNchannels = 2;
WARN("NCCL_MIN_NCHANNELS set by environment is ignored due to greater than max allowed %d channels.", maxChannels);
}
if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled())) {
int mscclNumChannelsRequired = maxNchannels;
mscclSchedulerInit(comm, &mscclNumChannelsRequired);
if (comm->mscclCompatible) {
minNchannels = std::max(minNchannels, mscclNumChannelsRequired);
}
}
2019-11-19 14:57:39 -08:00
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them.
2023-04-03 05:32:07 -07:00
if (comm->sharedRes->owner != comm) {
/* child comm #channels cannot exceed top parent #channels. */
nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(minNchannels, std::max(nc, comm->config.minCTAs)), comm->sharedRes->tpNChannels), ringPrev, ringNext);
2023-04-03 05:32:07 -07:00
} else {
nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(minNchannels, std::max(nc, comm->config.minCTAs)), ringPrev, ringNext);
2023-04-03 05:32:07 -07:00
}
2019-11-19 14:57:39 -08:00
2024-02-05 05:06:02 -08:00
comm->collChannels = comm->nChannels;
2024-03-26 06:08:55 -07:00
#if CUDART_VERSION >= 12010
2024-02-05 05:06:02 -08:00
// Support maximal channel usage for aggregation
2024-03-26 06:08:55 -07:00
if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
comm->nvlsChannels = parent->nvlsResources->nChannels;
}
2024-09-10 05:57:10 -07:00
NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail);
2024-03-26 06:08:55 -07:00
#endif
if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
comm->collChannels = std::min(comm->collChannels, comm->nChannels);
}
2024-02-05 05:06:02 -08:00
2019-11-19 14:57:39 -08:00
// Create rings array and check all is fine
2024-09-10 05:57:10 -07:00
NCCLCHECKGOTO(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext), ret, fail);
exit:
if (ringRecv) free(ringRecv);
if (ringSend) free(ringSend);
if (ringPrev) free(ringPrev);
if (ringNext) free(ringNext);
if (treeToParent) free(treeToParent);
if (treeToChild0) free(treeToChild0);
if (treeToChild1) free(treeToChild1);
if (nvlsHeads) free(nvlsHeads);
return ret;
fail:
goto exit;
}