Revert "Changes to topology based on XGMI (#272)"

This reverts commit 0a9adc16f4.


[ROCm/rccl commit: e055229e56]
Этот коммит содержится в:
Wenkai Du
2020-12-08 16:51:34 -05:00
родитель adff98765c
Коммит 34c6013299
3 изменённых файлов: 35 добавлений и 122 удалений
+1 -90
Просмотреть файл
@@ -30,84 +30,6 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
return ncclInternalError;
}
// [RCCL]
// Builds GPU paths that consist exclusively of XGMI links (multi-GPU hops included),
// which corresponds to how XGMI hardware operates.
//
// For every GPU g in `system`, a breadth-first search starts at g, follows only links
// whose type is LINK_NVL, and records on each node reached the list of links leading
// back to g, together with the bottleneck width of that list and the PATH_NVL type.
//
// Returns ncclSuccess when all GPUs were processed, or ncclInternalError when a node
// reached over a link has no link pointing back to the node it was reached from.
// Failures of ncclCalloc / getPath are handled through NCCLCHECK.
static ncclResult_t ncclTopoSetXgmi(struct ncclTopoSystem* system)
{
  // Compute paths to GPU g
  for (int g=0; g<system->nodes[GPU].count; g++) {
    struct ncclTopoNode *baseNode = system->nodes[GPU].nodes+g;
    if (baseNode->paths[baseNode->type] == NULL) {
      NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
    }

    // Breadth-first search to set all paths to that node in the system.
    // nodeList is the current frontier, nextNodeList the frontier being built.
    struct ncclTopoNodeList nodeList;
    struct ncclTopoNodeList nextNodeList;
    nodeList.count = 1; nodeList.list[0] = baseNode;
    nextNodeList.count = 0;

    // The path from the base node to itself is the empty, local path.
    struct ncclTopoLinkList* basePath;
    NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
    basePath->count = 0;
    basePath->width = LOC_WIDTH;
    basePath->type = PATH_LOC;

    while (nodeList.count) {
      nextNodeList.count = 0;
      for (int n=0; n<nodeList.count; n++) {
        struct ncclTopoNode* node = nodeList.list[n];
        struct ncclTopoLinkList* path;
        NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));

        for (int l=0; l<node->nlinks; l++) {
          struct ncclTopoLink* link = node->links+l;
          struct ncclTopoNode* remNode = link->remNode;

          // Skip non-XGMI links
          if (link->type != LINK_NVL) continue;

          if (remNode->paths[baseNode->type] == NULL) {
            NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
          }
          struct ncclTopoLinkList* remPath;
          NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));

          // Width of the candidate path is limited by its narrowest link.
          float width = std::min(path->width, link->width);
          if (remPath->width < width) {
            // Find reverse link. The result is kept in a local so that a stale
            // remPath->list[0] left by an earlier, narrower path cannot hide a
            // failed lookup.
            struct ncclTopoLink* reverseLink = NULL;
            for (int r=0; r<remNode->nlinks; r++) {
              if (remNode->links[r].remNode == node) {
                reverseLink = remNode->links+r;
                break;
              }
            }
            if (reverseLink == NULL) {
              WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
                   remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
              return ncclInternalError;
            }
            remPath->list[0] = reverseLink;

            // Copy the rest of the path
            for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
            remPath->count = path->count + 1;
            remPath->width = width;
            remPath->type = PATH_NVL;

            // Add to the list for the next iteration if not already in the list.
            // In this case, GPUs are permitted as intermediate XGMI steps.
            // (A bare `continue` inside the scan loop would only advance that scan
            // loop, so the membership result is tracked explicitly.)
            bool alreadyQueued = false;
            for (int i=0; i<nextNodeList.count; i++) {
              if (nextNodeList.list[i] == remNode) {
                alreadyQueued = true;
                break;
              }
            }
            if (!alreadyQueued) nextNodeList.list[nextNodeList.count++] = remNode;
          }
        }
      }
      // The freshly built frontier becomes the current one.
      memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
    }
  }
  return ncclSuccess;
}
// [/RCCL]
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
if (baseNode->paths[baseNode->type] == NULL) {
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
@@ -139,14 +61,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
struct ncclTopoLinkList* remPath;
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
float width = std::min(path->width, link->width);
// [RCCL] Do not let XGMI paths be overwritten (even if PCIe path may be faster)
// Unless they are of shorter length
// if (remPath->width < width) {
bool notXGMI = remPath->type != PATH_NVL;
if (remPath->width < width && notXGMI) {
// [/RCCL]
if (remPath->width < width) {
// Find reverse link
for (int l=0; l<remNode->nlinks; l++) {
if (remNode->links[l].remNode == node) {
@@ -451,10 +366,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
}
// [RCCL] Add XGMI-only links between GPUs first before any other paths
NCCLCHECK(ncclTopoSetXgmi(system));
// [/RCCL]
// Set direct paths from/to GPUs.
for (int g=0; g<system->nodes[GPU].count; g++) {
// Compute paths to GPU g
+33 -31
Просмотреть файл
@@ -159,9 +159,6 @@ struct ncclGpuScore {
int intraNhops;
int intraWidth;
int interNhops;
// [RCCL]
int intraType; // New sort parameter to favor XGMI
// [/RCCL]
int interPciWidth;
int interWidth; // Most important
};
@@ -172,9 +169,6 @@ static int cmpScore(const void * g1, const void * g2) {
int d;
if ((d = (s2->interWidth - s1->interWidth))) return d;
if ((d = (s2->interPciWidth - s1->interPciWidth))) return d;
// [RCCL]
if ((d = (s1->intraType - s2->intraType))) return d; // Prefer XGMI over any other types
// [/RCCL]
if ((d = (s1->interNhops - s2->interNhops))) return d;
if ((d = (s2->intraWidth - s1->intraWidth))) return d;
if ((d = (s1->intraNhops - s2->intraNhops))) return d;
@@ -234,25 +228,11 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
for (int i=1; i<ngpus; i++) {
int g = (start+i)%ngpus;
if (paths[g].count == 0) continue; // There is no path to that GPU
// [RCCL] - Prune earlier for performance
{
if (paths[g].type > graph->typeIntra) continue; // Skip if the intra path type is already slower than the current target
if (paths[g].width < graph->speedIntra) continue;
if (netPaths)
{
if (netPaths[g].type > graph->typeInter) continue; // Skip if the inter path type is already slower than the current target
if (netPaths[g].width < graph->speedInter) continue;
}
}
// [/RCCL]
if (system->nodes[GPU].nodes[g].used & flag) continue;
scores[count].g = g;
scores[count].startIndex = i;
scores[count].intraNhops = paths[g].count;
scores[count].intraWidth = paths[g].width;
// [RCCL] - Add path type as sort factor
scores[count].intraType = paths[g].type;
// [/RCCL]
if (netPaths) {
scores[count].interNhops = netPaths[g].count;
scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
@@ -313,6 +293,35 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo
return ncclSuccess;
}
// Counts how many consecutive rank pairs in the channels of `graph` are joined by a
// single-link GPU path whose link type is LINK_NVL.
//
// For each channel, every position i is paired with position (i+1) modulo the GPU
// count, so the last entry is paired with the first. A pair contributes to the total
// when the GPU node holding the first rank has a one-link path to a GPU whose rank is
// the second one and that link is of type LINK_NVL. Ranks that match no GPU node in
// `system` are ignored.
static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  const int ngpus = system->nodes[GPU].count;
  int xgmiLinks = 0;

  for (int chan = 0; chan < graph->nChannels; chan++) {
    const int base = ngpus*chan;
    for (int pos = 0; pos < ngpus; pos++) {
      const int curRank = graph->intra[base+pos];
      const int nextRank = graph->intra[base+((pos+1)%ngpus)];

      // Locate the GPU node that carries curRank.
      int idx = 0;
      while (idx < ngpus && system->nodes[GPU].nodes[idx].gpu.rank != curRank) idx++;
      if (idx >= ngpus) continue;  // rank not present in this system

      struct ncclTopoNode* src = system->nodes[GPU].nodes+idx;
      for (int dst = 0; dst < system->nodes[GPU].count; dst++) {
        // Only direct (single-link) paths are of interest.
        if (src->paths[GPU][dst].count != 1) continue;

        struct ncclTopoLink* hop = src->paths[GPU][dst].list[0];
        struct ncclTopoNode* peer = hop->remNode;
        if (peer->gpu.rank != nextRank) continue;
        if (hop->type == LINK_NVL) xgmiLinks++;
      }
    }
  }
  return xgmiLinks;
}
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
// 1. Constraint to get the same nChannels between Rings and Trees
if (graph->nChannels < graph->minChannels) return ncclSuccess;
@@ -326,6 +335,9 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
// 3. Less hops (but not at the price of going cross NICs)
if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
// 4. Prefer graph with more XGMI connections
if (graph->nChannels == refGraph->nChannels
&& ncclTopoCountXGMI(system, refGraph) < ncclTopoCountXGMI(system, graph)) *copy = 1;
return ncclSuccess;
}
@@ -451,9 +463,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
if (paths[i].count < paths[f].count) f = i;
int t = 1 << 10;
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, f));
// [RCCL] Even if forced order PCI is found, continue the search instead of ending early
// if (t == -1) *time = -1;
// [/RCCL]
if (t == -1) *time = -1;
}
// Then try the most local GPUs
@@ -536,14 +546,6 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
} else {
// Intra-node only.
// [RCCL] - Instead of trying PCI ordering, or replaying, just go straight to searching
{
for (int g=0; g<system->nodes[GPU].count; g++) {
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
}
return ncclSuccess;
}
// [/RCCL]
if (graph->nChannels == 0) {
// Try PCI order first
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
+1 -1
Просмотреть файл
@@ -23,7 +23,7 @@
#define P9_WIDTH 32.0
#define ARM_WIDTH 6.0
#define NET_WIDTH 12.0 // 100Gbit
#define VEGA_XGMI_WIDTH 20.0
#define VEGA_XGMI_WIDTH 24.0
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.