Revert "Changes to topology based on XGMI (#272)"
This reverts commit 0a9adc16f4. [ROCm/rccl commit: e055229e56]
Этот коммит содержится в:
@@ -30,84 +30,6 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// [RCCL]
|
||||
// This function traverses only XGMI links (including multi-GPU hops) and builds them into the
|
||||
// topology system, which corresponds to how XGMI hardware operates
|
||||
static ncclResult_t ncclTopoSetXgmi(struct ncclTopoSystem* system)
|
||||
{
|
||||
// Compute paths to GPU g
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
struct ncclTopoNode *baseNode = system->nodes[GPU].nodes+g;
|
||||
|
||||
if (baseNode->paths[baseNode->type] == NULL) {
|
||||
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
||||
}
|
||||
|
||||
// breadth-first search to set all paths to that node in the system
|
||||
struct ncclTopoNodeList nodeList;
|
||||
struct ncclTopoNodeList nextNodeList;
|
||||
nodeList.count = 1; nodeList.list[0] = baseNode;
|
||||
nextNodeList.count = 0;
|
||||
struct ncclTopoLinkList* basePath;
|
||||
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
|
||||
basePath->count = 0;
|
||||
basePath->width = LOC_WIDTH;
|
||||
basePath->type = PATH_LOC;
|
||||
|
||||
while (nodeList.count) {
|
||||
nextNodeList.count = 0;
|
||||
for (int n=0; n<nodeList.count; n++) {
|
||||
struct ncclTopoNode* node = nodeList.list[n];
|
||||
struct ncclTopoLinkList* path;
|
||||
NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
|
||||
for (int l=0; l<node->nlinks; l++) {
|
||||
struct ncclTopoLink* link = node->links+l;
|
||||
struct ncclTopoNode* remNode = link->remNode;
|
||||
|
||||
// Skip non-XGMI links
|
||||
if (link->type != LINK_NVL) continue;
|
||||
|
||||
if (remNode->paths[baseNode->type] == NULL) {
|
||||
NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
||||
}
|
||||
|
||||
struct ncclTopoLinkList* remPath;
|
||||
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
|
||||
float width = std::min(path->width, link->width);
|
||||
if (remPath->width < width) {
|
||||
// Find reverse link
|
||||
for (int l=0; l<remNode->nlinks; l++) {
|
||||
if (remNode->links[l].remNode == node) {
|
||||
remPath->list[0] = remNode->links+l;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (remPath->list[0] == NULL) {
|
||||
WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
|
||||
remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
// Copy the rest of the path
|
||||
for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
|
||||
remPath->count = path->count + 1;
|
||||
remPath->width = width;
|
||||
remPath->type = PATH_NVL;
|
||||
|
||||
// Add to the list for the next iteration if not already in the list
|
||||
// In this case, permit GPUs are intermediate XGMI steps
|
||||
for (int i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) continue;
|
||||
nextNodeList.list[nextNodeList.count++] = remNode;
|
||||
}
|
||||
}
|
||||
}
|
||||
memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
// [/RCCL]
|
||||
|
||||
|
||||
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
|
||||
if (baseNode->paths[baseNode->type] == NULL) {
|
||||
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
|
||||
@@ -139,14 +61,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
|
||||
struct ncclTopoLinkList* remPath;
|
||||
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
|
||||
float width = std::min(path->width, link->width);
|
||||
|
||||
// [RCCL] Do not let XGMI paths be overwritten (even if PCIe path may be faster)
|
||||
// Unless they are of shorter length
|
||||
// if (remPath->width < width) {
|
||||
bool notXGMI = remPath->type != PATH_NVL;
|
||||
if (remPath->width < width && notXGMI) {
|
||||
// [/RCCL]
|
||||
|
||||
if (remPath->width < width) {
|
||||
// Find reverse link
|
||||
for (int l=0; l<remNode->nlinks; l++) {
|
||||
if (remNode->links[l].remNode == node) {
|
||||
@@ -451,10 +366,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
||||
NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
|
||||
}
|
||||
|
||||
// [RCCL] Add XGMI-only links between GPUs first before any other paths
|
||||
NCCLCHECK(ncclTopoSetXgmi(system));
|
||||
// [/RCCL]
|
||||
|
||||
// Set direct paths from/to GPUs.
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
// Compute paths to GPU g
|
||||
|
||||
@@ -159,9 +159,6 @@ struct ncclGpuScore {
|
||||
int intraNhops;
|
||||
int intraWidth;
|
||||
int interNhops;
|
||||
// [RCCL]
|
||||
int intraType; // New sort parameter to favor XGMI
|
||||
// [/RCCL]
|
||||
int interPciWidth;
|
||||
int interWidth; // Most important
|
||||
};
|
||||
@@ -172,9 +169,6 @@ static int cmpScore(const void * g1, const void * g2) {
|
||||
int d;
|
||||
if ((d = (s2->interWidth - s1->interWidth))) return d;
|
||||
if ((d = (s2->interPciWidth - s1->interPciWidth))) return d;
|
||||
// [RCCL]
|
||||
if ((d = (s1->intraType - s2->intraType))) return d; // Prefer XGMI over any other types
|
||||
// [/RCCL]
|
||||
if ((d = (s1->interNhops - s2->interNhops))) return d;
|
||||
if ((d = (s2->intraWidth - s1->intraWidth))) return d;
|
||||
if ((d = (s1->intraNhops - s2->intraNhops))) return d;
|
||||
@@ -234,25 +228,11 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
|
||||
for (int i=1; i<ngpus; i++) {
|
||||
int g = (start+i)%ngpus;
|
||||
if (paths[g].count == 0) continue; // There is no path to that GPU
|
||||
// [RCCL] - Prune earlier for performance
|
||||
{
|
||||
if (paths[g].type > graph->typeIntra) continue; // Skip if the intra path type is already slower than the current target
|
||||
if (paths[g].width < graph->speedIntra) continue;
|
||||
if (netPaths)
|
||||
{
|
||||
if (netPaths[g].type > graph->typeInter) continue; // Skip if the inter path type is already slower than the current target
|
||||
if (netPaths[g].width < graph->speedInter) continue;
|
||||
}
|
||||
}
|
||||
// [/RCCL]
|
||||
if (system->nodes[GPU].nodes[g].used & flag) continue;
|
||||
scores[count].g = g;
|
||||
scores[count].startIndex = i;
|
||||
scores[count].intraNhops = paths[g].count;
|
||||
scores[count].intraWidth = paths[g].width;
|
||||
// [RCCL] - Add path type as sort factor
|
||||
scores[count].intraType = paths[g].type;
|
||||
// [/RCCL]
|
||||
if (netPaths) {
|
||||
scores[count].interNhops = netPaths[g].count;
|
||||
scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
|
||||
@@ -313,6 +293,35 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int count = 0;
|
||||
for (int c=0; c<graph->nChannels; c++) {
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
int g = graph->intra[ngpus*c+i];
|
||||
int n = graph->intra[ngpus*c+((i+1)%ngpus)];
|
||||
struct ncclTopoNode *node;
|
||||
int j;
|
||||
for (j=0; j<ngpus; j++)
|
||||
if (system->nodes[GPU].nodes[j].gpu.rank == g) break;
|
||||
if (j<ngpus) {
|
||||
node = system->nodes[GPU].nodes+j;
|
||||
for (int k = 0; k<system->nodes[GPU].count; k++) {
|
||||
if (node->paths[GPU][k].count == 1) {
|
||||
struct ncclTopoLink* link = node->paths[GPU][k].list[0];
|
||||
struct ncclTopoNode* remNode = link->remNode;
|
||||
if (remNode->gpu.rank == n) {
|
||||
if (link->type == LINK_NVL)
|
||||
count ++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
|
||||
// 1. Constraint to get the same nChannels between Rings and Trees
|
||||
if (graph->nChannels < graph->minChannels) return ncclSuccess;
|
||||
@@ -326,6 +335,9 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
|
||||
// 3. Less hops (but not at the price of going cross NICs)
|
||||
if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
|
||||
|
||||
// 4. Prefer graph with more XGMI connections
|
||||
if (graph->nChannels == refGraph->nChannels
|
||||
&& ncclTopoCountXGMI(system, refGraph) < ncclTopoCountXGMI(system, graph)) *copy = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -451,9 +463,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
if (paths[i].count < paths[f].count) f = i;
|
||||
int t = 1 << 10;
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, f));
|
||||
// [RCCL] Even if forced order PCI is found, continue the search instead of ending early
|
||||
// if (t == -1) *time = -1;
|
||||
// [/RCCL]
|
||||
if (t == -1) *time = -1;
|
||||
}
|
||||
|
||||
// Then try the most local GPUs
|
||||
@@ -536,14 +546,6 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
|
||||
ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
|
||||
} else {
|
||||
// Intra-node only.
|
||||
// [RCCL] - Instead of trying PCI ordering, or replaying, just go straight to searching
|
||||
{
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
// [/RCCL]
|
||||
if (graph->nChannels == 0) {
|
||||
// Try PCI order first
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
#define P9_WIDTH 32.0
|
||||
#define ARM_WIDTH 6.0
|
||||
#define NET_WIDTH 12.0 // 100Gbit
|
||||
#define VEGA_XGMI_WIDTH 20.0
|
||||
#define VEGA_XGMI_WIDTH 24.0
|
||||
|
||||
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
|
||||
// to GPU traffic consumes more PCI bandwidth.
|
||||
|
||||
Ссылка в новой задаче
Block a user