* Allow disabling rail-optimized trees via RCCL_DISABLE_RAIL_TREES, Graphviz-friendly output via RCCL_OUTPUT_TREES


[ROCm/rccl commit: ddc5d58b93]
Этот коммит содержится в:
gilbertlee-amd
2025-02-20 15:18:29 -07:00
коммит произвёл GitHub
родитель 5f3134db50
Коммит 4ca7e6873e
5 изменённых файлов: 270 добавлений и 5 удалений
+6
Просмотреть файл
@@ -13,6 +13,12 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
### Changed
* Compatibility with NCCL 2.22.3
* Added support for the rail-optimized tree algorithm for the MI300 series. This feature requires the use of all eight GPUs within
each node. It limits NIC traffic to use only GPUs of the same index across nodes and should not impact performance
on non-rail-optimized network topologies. The original method of building trees can be enabled by setting the
environment variable `RCCL_DISABLE_RAIL_TREES=1`.
* Additional debug information about how the trees are built can be logged to the GRAPH logging subsys by setting
`RCCL_OUTPUT_TREES=1`.
## RCCL 2.21.5 for ROCm 6.3.1
+171 -2
Просмотреть файл
@@ -568,7 +568,148 @@ void exchangeValues(int* v0, int* v1) {
*v0 = tmp;
}
int getTreeNodeParity(int treeDir, int nNodes, int node)
{
if (node == -1) return -1;
int parentNodes[2], child0Nodes[2], child1Nodes[2], childTypes[2];
ncclGetDtree(nNodes, node,
&parentNodes[0], &child0Nodes[0], &child1Nodes[0], &childTypes[0],
&parentNodes[1], &child0Nodes[1], &child1Nodes[1], &childTypes[1]);
// Uptree and downtree have different parity
if (parentNodes[treeDir] == -1) return treeDir;
// Recurse and swap parity if this is child that exits from 2nd intranode rank (childType == 0)
return ((childTypes[treeDir] + 1) + getTreeNodeParity(treeDir, nNodes, parentNodes[treeDir])) % 2;
}
// [RCCL] Build rail-optimized trees
ncclResult_t connectRailOptimizedTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1)
{
INFO(NCCL_GRAPH, "Building rail-optimized trees for %d nodes", comm->nNodes);
/* Rail-optimized trees are implemented in RCCL via a set of specially crafted complimentary
pairs of intra-node XGMI paths such that:
A) Complimentary pairs alternate their first two elements
B) Cover all the XGMI links exactly once
E.g: For MI300X
Path 1A 0 1 2 4 3 6 5 7 Path 2A 2 3 0 5 6 1 7 4
Path 1B 1 0 4 7 3 5 2 6 Path 2B 3 2 7 0 6 4 1 5
^ ^ ^ ^
Path 3A 4 5 1 6 0 3 7 2 Path 4A 6 7 5 3 4 0 2 1
Path 3B 5 4 6 2 0 7 1 3 Path 4B 7 6 3 1 4 2 5 0
^ ^ ^ ^
Due to the balanced tree pattern, the 1st rank in the Path gets connected to the first
rank of the path of the left child node, while the 2nd rank in the path connects to the first
rank of the right child node.
In order to avoid crossing rails, any time a right child is visited, the channel should
be swapped:
AB BA
/ \ / \
/ \ / \
AB BA BA AB
/ \ / \ / \ / \
AB BA BA AB BA AB AB BA
*/
const int nChannels = (comm->nChannels / 2);
const int nNodes = comm->nNodes, node = comm->node, rank = comm->rank;
const int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
const int nGpus = comm->topo->nodes[GPU].count;
struct ncclTopoGraph* treeGraph = &comm->graphs[NCCL_ALGO_TREE];
// Compute parent/child nodes for this current node, for uptree and downtree
int parentNodes[2], child0Nodes[2], child1Nodes[2], childTypes[2];
NCCLCHECK(ncclGetDtree(nNodes, node,
&parentNodes[0], &child0Nodes[0], &child1Nodes[0], &childTypes[0],
&parentNodes[1], &child0Nodes[1], &child1Nodes[1], &childTypes[1]));
// Loop over up-tree / down-tree
for (int treeDir = 0; treeDir < 2; treeDir++) {
// Collect the parent / child nodes for this tree direction
int parentNode = parentNodes[treeDir];
int child0Node = child0Nodes[treeDir];
int child1Node = child1Nodes[treeDir];
int* treeToChild = (childTypes[treeDir] == 0) ? treeToChild0 : treeToChild1;
// Compute the parity for nodes for this tree direction
int nodeParity = getTreeNodeParity(treeDir, nNodes, node);
int parentParity = getTreeNodeParity(treeDir, nNodes, parentNode);
int child0Parity = getTreeNodeParity(treeDir, nNodes, child0Node);
int child1Parity = getTreeNodeParity(treeDir, nNodes, child1Node);
// Loop over pairs of complimentary channels
for (int ch = 0; ch < nChannels; ch += 2) {
int ch0 = treeDir * nChannels + ch;
int ch1 = ch0 + 1;
ncclChannel* channel[2] = {&comm->channels[ch0], &comm->channels[ch1]};
channel[0]->tree.depth = channel[1]->tree.depth = depth;
// Determine ranks that connect to other nodes for each of the two channels
int rankToParent[2] = {treeToParent[ch0 * nNodes + node], treeToParent[ch1 * nNodes + node]};
int rankToChild0[2] = {treeToChild0[ch0 * nNodes + node], treeToChild0[ch1 * nNodes + node]};
int rankToChild1[2] = {treeToChild1[ch0 * nNodes + node], treeToChild1[ch1 * nNodes + node]};
// All ranks swizzle channels. This maintains internal rank structures setup during TopoPreset
if (nodeParity) {
std::swap(channel[0]->tree, channel[1]->tree);
std::swap(rankToParent[0], rankToParent[1]);
std::swap(rankToChild0[0], rankToChild0[1]);
std::swap(rankToChild1[0], rankToChild1[1]);
// Swap NICs
std::swap(treeGraph->inter[ch0 * 2 ], treeGraph->inter[ch1 * 2 ]);
std::swap(treeGraph->inter[ch0 * 2 + 1], treeGraph->inter[ch1 * 2 + 1]);
// Swap lines
for (int j = 0; j < nGpus; j++)
std::swap(treeGraph->intra[ch0 * nGpus + j], treeGraph->intra[ch1 * nGpus + j]);
}
// Connect ranks that connect to other nodes for each of the two channels
for (int i = 0; i < 2; i++) {
if (rank == rankToParent[i] && parentNode != -1) {
// Connect this rank to correct child rank on parent node
int parentChannel = (parentParity + i) % 2 == 0 ? ch0 : ch1;
channel[i]->tree.up = treeToChild[parentChannel * nNodes + parentNode];
}
if (rank == rankToChild0[i] && child0Node != -1) {
// Connect this rank to the parent rank on child0 node
int child0Channel = (child0Parity + i) % 2 == 0 ? ch0 : ch1;
setTreeDown(&channel[i]->tree, treeToParent + child0Channel * nNodes, child0Node);
}
if (rank == rankToChild1[i] && child1Node != -1) {
// Connect this rank to the parent rank on child1 node
int child1Channel = (child1Parity + i) % 2 == 0 ? ch0 : ch1;
setTreeDown(&channel[i]->tree, treeToParent + child1Channel * nNodes, child1Node);
}
if (rank == rankToParent[i] ||
rank == rankToChild0[i] ||
rank == rankToChild1[i]) {
INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", (i == 0 ? ch0 : ch1),
channel[i]->tree.up, rank,
channel[i]->tree.down[0],
channel[i]->tree.down[1],
channel[i]->tree.down[2]);
}
}
}
}
return ncclSuccess;
}
NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
RCCL_PARAM(OutputTrees, "OUTPUT_TREES", 0);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent, int nc) {
// Gather data from all ranks
@@ -578,6 +719,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
int nChannels = comm->nChannels;
int minHeadNum = INT_MAX;
int shared = parent && parent->nvlsSupport && parent->config.splitShare;
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringSend, nNodes*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
@@ -632,7 +774,34 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
// Connect rings and trees. This should also duplicate the channels.
NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext));
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
// [RCCL] Connect rail-optimized trees
if (comm->topo->useRailOptimizedTrees) {
NCCLCHECK(connectRailOptimizedTrees(comm, treeToParent, treeToChild0, treeToChild1));
} else {
NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));
}
// Dump graphviz-friendly trees
if (rcclParamOutputTrees()) {
int rank = comm->rank;
char color[8][16] =
{"red", "orange", "yellow", "yellowgreen", "green", "cyan", "deepskyblue", "violet"};
for (int i = 0; i < comm->nChannels; i++) {
INFO(NCCL_GRAPH, "[TREE] %d.%d [style=filled, fillcolor=%s]", i, rank, color[rank % comm->localRanks]);
for (int j = 0; j < 3; j++) {
if (comm->channels[i].tree.down[j] != -1) {
bool sameNode = (comm->rankToNode[rank] == comm->rankToNode[comm->channels[i].tree.down[j]]);
INFO(NCCL_GRAPH, "[TREE] %d.%d->%d.%d [style=%s,width=10,color=%s,label=\"%s\"]",
i, rank, i, comm->channels[i].tree.down[j],
sameNode ? "solid" : "dashed",
sameNode ? "black" : color[rank % comm->localRanks],
sameNode ? "" : (std::string("N") + std::to_string(graphs[NCCL_ALGO_TREE]->inter[i*2+1])).c_str());
}
}
}
}
// Only use full MAXCHANNELS for gfx94x
int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ?
@@ -683,7 +852,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && 2*nChannels <= maxChannels) {
nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
}
// Double the number of channels when using unpack networking (greater than 1 node)
// We won't automatically double past 16 channels, users can specify 32 if they want
if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
+91 -2
Просмотреть файл
@@ -21,6 +21,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "core.h"
#include "comm.h"
#include "graph.h"
#include "topo.h"
#include "xml.h"
@@ -48,8 +49,7 @@ struct rcclRomeModel {
const char *ringTail1; // Lines to use for node N-1 if the total number of nodes is odd
const char *options;
const char *treeBase;
const char *treeRail;
};
static struct rcclRomeModel rome_model_22 = {
@@ -815,6 +815,78 @@ static struct rcclRomeModel rome_model_81 = {
"N1 1 0 2 4 3 5 7 6 N6|",
.options = "noCpuCheck=1,tuning=5,disableNumaMatching=1",
.treeRail = "N0 0 1 2 4 3 6 5 7 N1|"
"N1 1 0 4 7 3 5 2 6 N0|"
"N2 2 3 0 5 6 1 7 4 N3|"
"N3 3 2 7 0 6 4 1 5 N2|"
"N4 4 5 1 6 0 3 7 2 N5|"
"N5 5 4 6 2 0 7 1 3 N4|"
"N6 6 7 5 3 4 0 2 1 N7|"
"N7 7 6 3 1 4 2 5 0 N6|"
"N2 2 3 0 5 6 1 7 4 N3|"
"N3 3 2 7 0 6 4 1 5 N2|"
"N4 4 5 1 6 0 3 7 2 N5|"
"N5 5 4 6 2 0 7 1 3 N4|"
"N6 6 7 5 3 4 0 2 1 N7|"
"N7 7 6 3 1 4 2 5 0 N6|"
"N0 0 1 2 4 3 6 5 7 N1|"
"N1 1 0 4 7 3 5 2 6 N0|"
"N4 4 5 1 6 0 3 7 2 N5|"
"N5 5 4 6 2 0 7 1 3 N4|"
"N6 6 7 5 3 4 0 2 1 N7|"
"N7 7 6 3 1 4 2 5 0 N6|"
"N0 0 1 2 4 3 6 5 7 N1|"
"N1 1 0 4 7 3 5 2 6 N0|"
"N2 2 3 0 5 6 1 7 4 N3|"
"N3 3 2 7 0 6 4 1 5 N2|"
"N6 6 7 5 3 4 0 2 1 N7|"
"N7 7 6 3 1 4 2 5 0 N6|"
"N0 0 1 2 4 3 6 5 7 N1|"
"N1 1 0 4 7 3 5 2 6 N0|"
"N2 2 3 0 5 6 1 7 4 N3|"
"N3 3 2 7 0 6 4 1 5 N2|"
"N4 4 5 1 6 0 3 7 2 N5|"
"N5 5 4 6 2 0 7 1 3 N4|"
"N0 0 1 2 4 3 6 5 7 N1|"
"N1 1 0 4 7 3 5 2 6 N0|"
"N2 2 3 0 5 6 1 7 4 N3|"
"N3 3 2 7 0 6 4 1 5 N2|"
"N4 4 5 1 6 0 3 7 2 N5|"
"N5 5 4 6 2 0 7 1 3 N4|"
"N6 6 7 5 3 4 0 2 1 N7|"
"N7 7 6 3 1 4 2 5 0 N6|"
"N2 2 3 0 5 6 1 7 4 N3|"
"N3 3 2 7 0 6 4 1 5 N2|"
"N4 4 5 1 6 0 3 7 2 N5|"
"N5 5 4 6 2 0 7 1 3 N4|"
"N6 6 7 5 3 4 0 2 1 N7|"
"N7 7 6 3 1 4 2 5 0 N6|"
"N0 0 1 2 4 3 6 5 7 N1|"
"N1 1 0 4 7 3 5 2 6 N0|"
"N4 4 5 1 6 0 3 7 2 N5|"
"N5 5 4 6 2 0 7 1 3 N4|"
"N6 6 7 5 3 4 0 2 1 N7|"
"N7 7 6 3 1 4 2 5 0 N6|"
"N0 0 1 2 4 3 6 5 7 N1|"
"N1 1 0 4 7 3 5 2 6 N0|"
"N2 2 3 0 5 6 1 7 4 N3|"
"N3 3 2 7 0 6 4 1 5 N2|"
"N6 6 7 5 3 4 0 2 1 N7|"
"N7 7 6 3 1 4 2 5 0 N6|"
"N0 0 1 2 4 3 6 5 7 N1|"
"N1 1 0 4 7 3 5 2 6 N0|"
"N2 2 3 0 5 6 1 7 4 N3|"
"N3 3 2 7 0 6 4 1 5 N2|"
"N4 4 5 1 6 0 3 7 2 N5|"
"N5 5 4 6 2 0 7 1 3 N4|",
};
static struct rcclRomeModel rome_model_84 = {
@@ -1756,6 +1828,8 @@ int checkAlltoallWidth(struct rcclRomeModel *romeTopo) {
return width;
}
RCCL_PARAM(DisableRailTrees, "DISABLE_RAIL_TREES", 0);
ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, const char *ringBase) {
constexpr int NUMA_CPUS = 2;
constexpr int NUMA_GPUS = 4;
@@ -1908,6 +1982,21 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
}
break;
case NCCL_TOPO_PATTERN_BALANCED_TREE:
// Check if rail-optimized trees have been defined
system->useRailOptimizedTrees = false;
if (romeTopoModels[i].treeRail != nullptr && !rcclParamDisableRailTrees()) {
// If so, parse the lines in advanced
// These lines will be modified appropriately during ncclTopoPostset
NCCLCHECK(parseGraph(romeTopoModels[i].treeRail, system, graph, g8, nnets > 1 ? n : NULL, 0));
if (graph->nChannels) {
system->useRailOptimizedTrees = true;
return ncclSuccess;
}
}
// Fall back to looking for tree configuration from treeBase
if (romeTopoModels[i].treeBase != nullptr) {
NCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, rdm.data()));
if (graph->nChannels) return ncclSuccess;
+1
Просмотреть файл
@@ -192,6 +192,7 @@ struct ncclTopoSystem {
// [RCCL] Track hostIdx to support rail-optimized rings/trees
int hostIdx;
bool useRailOptimizedTrees;
};
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
+1 -1
Просмотреть файл
@@ -152,7 +152,7 @@ int main(int argc,char* argv[])
printf("Usage: ./topo_expl -m model_id [-n numNodes=1]\n");
printf("List of model_id:\n");
for (int i = 0; i < num_models; i++)
printf(" %2d: %s\n", i, model_descs[i].description);
printf(" %2d: %24s [%s]\n", i, model_descs[i].description, model_descs[i].filename);
exit(0);
}