From 532b70afb6bda692a2aee98b1cf087c89549eeec Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Fri, 23 Aug 2024 08:45:43 +0800 Subject: [PATCH] Add new Rome model (#1304) * Add another rome model and override * Fix bug * Fix typo * Add ring * Update ring * Fix model matching * Clean up * Clean up * Reverse rings for NCCL_RINGS input * Only reverse NCCL_RINGS for ring graph * Fix mapping issue when using NCCL_RINGS * Add NCCL_RINGS_REMAP to handle inconsistent net names --- src/graph/rome_models.cc | 42 ++++++++++++++++++++++++++++++----- src/graph/rome_models.h | 2 +- src/graph/search.cc | 7 ++++-- src/graph/topo.cc | 1 + src/graph/topo.h | 1 + tools/scripts/topo_val.sh | 2 +- tools/topo_expl/topo_expl.cpp | 1 + 7 files changed, 46 insertions(+), 10 deletions(-) diff --git a/src/graph/rome_models.cc b/src/graph/rome_models.cc index 5b70ee1d40..752c6acb72 100644 --- a/src/graph/rome_models.cc +++ b/src/graph/rome_models.cc @@ -841,6 +841,18 @@ static struct rcclRomeModel rome_model_85 = { .options = "tuning=2", }; +static struct rcclRomeModel rome_model_87 = { + .nGpus = 8, .nCpus = 2, .nNics = 4, .nLinks = 7, + .gpuIds = { 0xa000, 0x80000, 0xa4000, 0xc8000, 0x10b000, 0x181000, 0x1a5000, 0x1c9000, }, + .nicIds = { 0xc9000, 0x1a2000, 0x108000, 0x81000, }, + .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, }, + .nicNuma = { 0, 1, 1, 0, }, + .connMatrix = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, }, + .gdrLevel = { PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PXB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PXB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, }, + .pattern = "4242", + .ringBase = "N3 0 1 2 3 4 5 6 7 
N1|N3 1 0 2 4 3 5 7 6 N1|N0 2 5 0 3 6 1 7 4 N2|N0 3 7 0 4 2 1 6 5 N2|N2 4 6 2 7 3 0 5 1 N3|N2 5 4 7 1 3 2 6 0 N3|N1 6 3 1 4 0 7 5 2 N0|N1 7 2 0 6 4 1 5 3 N0|N3 1 0 2 4 3 5 7 6 N1|N0 2 5 0 3 6 1 7 4 N2|N0 3 7 0 4 2 1 6 5 N2|N2 4 6 2 7 3 0 5 1 N3|N2 5 4 7 1 3 2 6 0 N3|N1 6 3 1 4 0 7 5 2 N0|N1 7 2 0 6 4 1 5 3 N0|N3 0 1 2 3 4 5 6 7 N1|N0 2 5 0 3 6 1 7 4 N2|N0 3 7 0 4 2 1 6 5 N2|N2 4 6 2 7 3 0 5 1 N3|N2 5 4 7 1 3 2 6 0 N3|N1 6 3 1 4 0 7 5 2 N0|N1 7 2 0 6 4 1 5 3 N0|N3 0 1 2 3 4 5 6 7 N1|N3 1 0 2 4 3 5 7 6 N1|N0 3 7 0 4 2 1 6 5 N2|N2 4 6 2 7 3 0 5 1 N3|N2 5 4 7 1 3 2 6 0 N3|N1 6 3 1 4 0 7 5 2 N0|N1 7 2 0 6 4 1 5 3 N0|N3 0 1 2 3 4 5 6 7 N1|N3 1 0 2 4 3 5 7 6 N1|N0 2 5 0 3 6 1 7 4 N2|N2 4 6 2 7 3 0 5 1 N3|N2 5 4 7 1 3 2 6 0 N3|N1 6 3 1 4 0 7 5 2 N0|N1 7 2 0 6 4 1 5 3 N0|N3 0 1 2 3 4 5 6 7 N1|N3 1 0 2 4 3 5 7 6 N1|N0 2 5 0 3 6 1 7 4 N2|N0 3 7 0 4 2 1 6 5 N2|N2 5 4 7 1 3 2 6 0 N3|N1 6 3 1 4 0 7 5 2 N0|N1 7 2 0 6 4 1 5 3 N0|N3 0 1 2 3 4 5 6 7 N1|N3 1 0 2 4 3 5 7 6 N1|N0 2 5 0 3 6 1 7 4 N2|N0 3 7 0 4 2 1 6 5 N2|N2 4 6 2 7 3 0 5 1 N3|N1 6 3 1 4 0 7 5 2 N0|N1 7 2 0 6 4 1 5 3 N0|N3 0 1 2 3 4 5 6 7 N1|N3 1 0 2 4 3 5 7 6 N1|N0 2 5 0 3 6 1 7 4 N2|N0 3 7 0 4 2 1 6 5 N2|N2 4 6 2 7 3 0 5 1 N3|N2 5 4 7 1 3 2 6 0 N3|N1 7 2 0 6 4 1 5 3 N0|N3 0 1 2 3 4 5 6 7 N1|N3 1 0 2 4 3 5 7 6 N1|N0 2 5 0 3 6 1 7 4 N2|N0 3 7 0 4 2 1 6 5 N2|N2 4 6 2 7 3 0 5 1 N3|N2 5 4 7 1 3 2 6 0 N3|N1 6 3 1 4 0 7 5 2 N0", + .options = "noCpuCheck=1,netOverride=1", +}; static struct rcclRomeModel romeTopoModels[] = { rome_model_22, /* 0 */ @@ -886,6 +898,7 @@ static struct rcclRomeModel romeTopoModels[] = { rome_model_81, /* 40 */ rome_model_84, /* 41 */ rome_model_85, /* 42 */ + rome_model_87, /* 43 */ }; /* Parse user defined rings. 
Format is like : @@ -1306,7 +1319,7 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo struct ncclNetId net_scores[NCCL_TOPO_MAX_NODES]; for (int i = 0; i < romeTopo->nNics; i ++) { net_scores[i].n = i; - net_scores[i].id = system->nodes[NET].nodes[i].id; + net_scores[i].id = system->nodes[NET].nodes[i].net.dev; } qsort(net_scores, romeTopo->nNics, sizeof(struct ncclNetId), cmpNets); @@ -1494,7 +1507,7 @@ static bool permuteNetIds(int *n, int *g, int s, int last, struct rcclRomeModel* } -ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { +ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, const char *ringBase) { static char ringRemap[64]; int i; @@ -1623,14 +1636,14 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* // Attempt to use rail-optimized rings if they exist if (system->nHosts % 2 == 0) { // For even number of nodes, alternate forward/reverse on ringBase - NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, system->hostIdx % 2)); + NCCLCHECK(parseGraph(ringBase != nullptr ? ringBase : romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, system->hostIdx % 2)); } else { // For odd number of nodes, check first to see if ringTail1 and ringTail2 are defined if (system->nHosts == 1 || romeTopoModels[i].ringTail1 == nullptr || romeTopoModels[i].ringTail2 == nullptr) { if (system->nHosts > 1) INFO(NCCL_GRAPH, "[WARN] Dropping back due to lack of support for odd-number of nodes for model index %d\n", i); - NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, system->hostIdx % 2)); + NCCLCHECK(parseGraph(ringBase != nullptr ? ringBase : romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? 
n : NULL, system->hostIdx % 2)); } else { @@ -1639,7 +1652,7 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* } else if (system->hostIdx == (system->nHosts - 2)) { NCCLCHECK(parseGraph(romeTopoModels[i].ringTail2, system, graph, g, nnets > 1 ? n : NULL, 0)); } else { - NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, system->hostIdx % 2)); + NCCLCHECK(parseGraph(ringBase != nullptr ? ringBase : romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, system->hostIdx % 2)); } } } @@ -1651,7 +1664,24 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* } // Fall back to tree from ringBase - NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, 0)); + NCCLCHECK(parseGraph(ringBase != nullptr ? ringBase : romeTopoModels[i].ringBase, system, graph, g, nnets > 1 ? n : NULL, 0)); + // Override GDR distance if requested + if (checkOption(romeTopoModels[i].options, "netOverride")) { + for (int i = 0; i < system->nodes[NET].count; i++) { + for (int j = 0; j < system->nodes[GPU].count; j++) { + if (system->nodes[GPU].nodes[j].paths[NET][i].type == PATH_PXB) { + int k; + for (k = 0; k < system->nodes[GPU].count; k++) { + if (k != j && + system->nodes[GPU].nodes[k].gpu.dev/2 == system->nodes[GPU].nodes[j].gpu.dev/2) + break; + } + if (k < system->nodes[GPU].count) + system->nodes[GPU].nodes[k].paths[NET][i].type = PATH_PXB; + } + } + } + } break; } return ncclSuccess; diff --git a/src/graph/rome_models.h b/src/graph/rome_models.h index 0a33a10e93..f54cb633a3 100644 --- a/src/graph/rome_models.h +++ b/src/graph/rome_models.h @@ -24,7 +24,7 @@ THE SOFTWARE. 
ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int* net_map, int reverse); ncclResult_t parseGraphLight(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map); -ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); +ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph2, const char *ringBase); ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); diff --git a/src/graph/search.cc b/src/graph/search.cc index 39f93a3bba..f1d18000bf 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -947,7 +947,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph NCCLCHECK(parseGraphLight(strTrees, system, graph, NULL)); system->treeDefined=true; } else { - NCCLCHECK(parseGraph(str, system, graph, NULL, NULL, false)); + // Reverse the parsed rings on odd-indexed hosts, but only for the ring pattern + NCCLCHECK(parseGraph(str, system, graph, NULL, NULL, + graph->pattern == NCCL_TOPO_PATTERN_RING ? 
system->hostIdx % 2 : 0)); int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (graph->nChannels && arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) { @@ -959,7 +961,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph NCCLCHECK(parseChordalRing(system, graph)); if (graph->nChannels) return ncclSuccess; // try to match Rome 4P2H - NCCLCHECK(parseRome4P2H(system, graph)); + const char *remap_str = getenv("NCCL_RINGS_REMAP"); + NCCLCHECK(parseRome4P2H(system, graph, remap_str)); if (graph->nChannels) return ncclSuccess; // try to match 1H16P diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 122f323b34..8327b6aa76 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -355,6 +355,7 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS)); NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "dev", &net->net.dev, 0)); net->net.busId = busId; ncclDebugNoWarn = 0; diff --git a/src/graph/topo.h b/src/graph/topo.h index d77eb2c67d..53028480e9 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -140,6 +140,7 @@ struct ncclTopoNode { int collSupport; int maxChannels; int64_t busId; + int dev; }net; struct { int arch; diff --git a/tools/scripts/topo_val.sh b/tools/scripts/topo_val.sh index 954a2be4a7..fd5c3d75c2 100755 --- a/tools/scripts/topo_val.sh +++ b/tools/scripts/topo_val.sh @@ -21,7 +21,7 @@ DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -for i in {0..82} +for i in {0..87} do if [[ $i -eq 50 ]] || [[ $i -eq 51 ]] then diff --git a/tools/topo_expl/topo_expl.cpp b/tools/topo_expl/topo_expl.cpp index c1b6ffb729..a525cf8288 100644 --- a/tools/topo_expl/topo_expl.cpp +++ 
b/tools/topo_expl/topo_expl.cpp @@ -159,6 +159,7 @@ NodeModelDesc model_descs[] = { {2, "topo_8p_940_16n.xml", "2 nodes gfx940 16 NICs"}, {2, "topo_8p1h_6.xml", "2 nodes 8P1H Alt."}, {5, "topo_8p_940.xml", "5 nodes gfx940 8P"}, + {2, "topo_8p_942.xml", "2 nodes gfx942 8P"}, }; NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);