diff --git a/projects/rccl/src/graph/paths.cc b/projects/rccl/src/graph/paths.cc index f8d8e89c91..8f96627f67 100644 --- a/projects/rccl/src/graph/paths.cc +++ b/projects/rccl/src/graph/paths.cc @@ -30,84 +30,6 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* return ncclInternalError; } -// [RCCL] -// This function traverses only XGMI links (including multi-GPU hops) and builds them into the -// topology system, which corresponds to how XGMI hardware operates -static ncclResult_t ncclTopoSetXgmi(struct ncclTopoSystem* system) -{ - // Compute paths to GPU g - for (int g=0; gnodes[GPU].count; g++) { - struct ncclTopoNode *baseNode = system->nodes[GPU].nodes+g; - - if (baseNode->paths[baseNode->type] == NULL) { - NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count)); - } - - // breadth-first search to set all paths to that node in the system - struct ncclTopoNodeList nodeList; - struct ncclTopoNodeList nextNodeList; - nodeList.count = 1; nodeList.list[0] = baseNode; - nextNodeList.count = 0; - struct ncclTopoLinkList* basePath; - NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); - basePath->count = 0; - basePath->width = LOC_WIDTH; - basePath->type = PATH_LOC; - - while (nodeList.count) { - nextNodeList.count = 0; - for (int n=0; ntype, baseNode->id, &path)); - for (int l=0; lnlinks; l++) { - struct ncclTopoLink* link = node->links+l; - struct ncclTopoNode* remNode = link->remNode; - - // Skip non-XGMI links - if (link->type != LINK_NVL) continue; - - if (remNode->paths[baseNode->type] == NULL) { - NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count)); - } - - struct ncclTopoLinkList* remPath; - NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); - float width = std::min(path->width, link->width); - if (remPath->width < width) { - // Find reverse link - for (int l=0; lnlinks; l++) { - if (remNode->links[l].remNode == node) { - remPath->list[0] = remNode->links+l; - break; - } - } - if (remPath->list[0] == NULL) { - WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx", - remNode->type, remNode->id, remNode->nlinks, node->type, node->id); - return ncclInternalError; - } - // Copy the rest of the path - for (int i=0; icount; i++) remPath->list[i+1] = path->list[i]; - remPath->count = path->count + 1; - remPath->width = width; - remPath->type = PATH_NVL; - - // Add to the list for the next iteration if not already in the list - // In this case, permit GPUs are intermediate XGMI steps - for (int i=0; ipaths[baseNode->type] == NULL) { NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count)); @@ -139,14 +61,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT struct ncclTopoLinkList* remPath; NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); float width = std::min(path->width, link->width); - - // [RCCL] Do not let XGMI paths be overwritten (even if PCIe path may be faster) - // Unless they are of shorter length - // if (remPath->width < width) { - bool notXGMI = remPath->type != PATH_NVL; - if (remPath->width < width && notXGMI) { - // [/RCCL] - + if (remPath->width < width) { // Find reverse link for (int l=0; lnlinks; l++) { if (remNode->links[l].remNode == node) { @@ -451,10 +366,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system)); } - // [RCCL] Add XGMI-only links between GPUs first before any other paths - NCCLCHECK(ncclTopoSetXgmi(system)); - // [/RCCL] - // Set direct paths from/to GPUs. for (int g=0; gnodes[GPU].count; g++) { // Compute paths to GPU g diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc index 6a0c824b64..56bbd4ca9b 100644 --- a/projects/rccl/src/graph/search.cc +++ b/projects/rccl/src/graph/search.cc @@ -159,9 +159,6 @@ struct ncclGpuScore { int intraNhops; int intraWidth; int interNhops; - // [RCCL] - int intraType; // New sort parameter to favor XGMI - // [/RCCL] int interPciWidth; int interWidth; // Most important }; @@ -172,9 +169,6 @@ static int cmpScore(const void * g1, const void * g2) { int d; if ((d = (s2->interWidth - s1->interWidth))) return d; if ((d = (s2->interPciWidth - s1->interPciWidth))) return d; - // [RCCL] - if ((d = (s1->intraType - s2->intraType))) return d; // Prefer XGMI over any other types - // [/RCCL] if ((d = (s1->interNhops - s2->interNhops))) return d; if ((d = (s2->intraWidth - s1->intraWidth))) return d; if ((d = (s1->intraNhops - s2->intraNhops))) return d; @@ -234,25 +228,11 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc for (int i=1; i graph->typeIntra) continue; // Skip if the intra path type is already slower than the current target - if (paths[g].width < graph->speedIntra) continue; - if (netPaths) - { - if (netPaths[g].type > graph->typeInter) continue; // Skip if the inter path type is already slower than the current target - if (netPaths[g].width < graph->speedInter) continue; - } - } - // [/RCCL] if (system->nodes[GPU].nodes[g].used & flag) continue; scores[count].g = g; scores[count].startIndex = i; scores[count].intraNhops = paths[g].count; scores[count].intraWidth = paths[g].width; - // [RCCL] - Add path type as sort factor - scores[count].intraType = paths[g].type; - // [/RCCL] if (netPaths) { scores[count].interNhops = netPaths[g].count; scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g); @@ -313,6 +293,35 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo return ncclSuccess; } +static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { + int ngpus = system->nodes[GPU].count; + int count = 0; + for (int c=0; cnChannels; c++) { + for (int i=0; iintra[ngpus*c+i]; + int n = graph->intra[ngpus*c+((i+1)%ngpus)]; + struct ncclTopoNode *node; + int j; + for (j=0; jnodes[GPU].nodes[j].gpu.rank == g) break; + if (jnodes[GPU].nodes+j; + for (int k = 0; knodes[GPU].count; k++) { + if (node->paths[GPU][k].count == 1) { + struct ncclTopoLink* link = node->paths[GPU][k].list[0]; + struct ncclTopoNode* remNode = link->remNode; + if (remNode->gpu.rank == n) { + if (link->type == LINK_NVL) + count ++; + } + } + } + } + } + } + return count; +} + ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) { // 1. Constraint to get the same nChannels between Rings and Trees if (graph->nChannels < graph->minChannels) return ncclSuccess; @@ -326,6 +335,9 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop // 3. Less hops (but not at the price of going cross NICs) if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1; + // 4. Prefer graph with more XGMI connections + if (graph->nChannels == refGraph->nChannels + && ncclTopoCountXGMI(system, refGraph) < ncclTopoCountXGMI(system, graph)) *copy = 1; return ncclSuccess; } @@ -451,9 +463,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo if (paths[i].count < paths[f].count) f = i; int t = 1 << 10; NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, f)); - // [RCCL] Event if forced order PCI is found, continue the search instead of ending early - // if (t == -1) *time = -1; - // [/RCCL] + if (t == -1) *time = -1; } // Then try the most local GPUs @@ -536,14 +546,6 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time); } else { // Intra-node only. - // [RCCL] - Instead of trying PCI ordering, or replaying, just go straight to searching - { - for (int g=0; gnodes[GPU].count; g++) { - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g)); - } - return ncclSuccess; - } - // [/RCCL] if (graph->nChannels == 0) { // Try PCI order first NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0)); diff --git a/projects/rccl/src/graph/topo.h b/projects/rccl/src/graph/topo.h index 11f44f6325..9379844bf2 100644 --- a/projects/rccl/src/graph/topo.h +++ b/projects/rccl/src/graph/topo.h @@ -23,7 +23,7 @@ #define P9_WIDTH 32.0 #define ARM_WIDTH 6.0 #define NET_WIDTH 12.0 // 100Gbit -#define VEGA_XGMI_WIDTH 20.0 +#define VEGA_XGMI_WIDTH 24.0 // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU // to GPU traffic consumes more PCI bandwidth.