From 1d2946ee4bb4c36896f3b9b0fb04c52564bb8520 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Wed, 31 Mar 2021 10:25:59 -0700 Subject: [PATCH] Rework network port trimming code (#338) * Rework network port trimming code * Move Rome related changes to separate source files --- CMakeLists.txt | 1 + src/graph/paths.cc | 23 ++ src/graph/rome_models.cc | 682 +++++++++++++++++++++++++++++++++++++++ src/graph/rome_models.h | 264 +-------------- src/graph/search.cc | 418 +----------------------- tools/topo_expl/Makefile | 2 +- 6 files changed, 714 insertions(+), 676 deletions(-) create mode 100755 src/graph/rome_models.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index a3511c3409..32605da9d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,6 +125,7 @@ set(CC_SOURCES src/graph/tuning.cc src/graph/topo.cc src/graph/xml.cc + src/graph/rome_models.cc src/collectives/all_reduce_api.cc src/collectives/all_gather_api.cc src/collectives/reduce_api.cc diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 92758c154a..db74694e1e 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -468,6 +468,28 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* NCCLCHECK(ncclTopoRemoveNode(system, GPU, g)); } + // trim low speed port on same NIC + for (int i = 0; i < system->nodes[NET].count; i ++) { + for (int j = 0; j < system->nodes[NET].count; j ++) { + if (i == j) continue; + if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) { + if (system->nodes[NET].nodes[i].net.width > system->nodes[NET].nodes[j].net.width) + system->nodes[NET].nodes[j].net.width = 0; + } + } + } + do { + int n; + for (n=0; nnodes[NET].count; n++) { + if (system->nodes[NET].nodes[n].net.width == 0) break; + } + if (nnodes[NET].count) { + NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); + } + else + break; + } while (system->nodes[NET].count); + int remove = 1; int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); @@ -494,6 +516,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* for (int n=system->nodes[NET].count-1; n>=0; n--) NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); } + free(domains); free(ids); return ncclSuccess; diff --git a/src/graph/rome_models.cc b/src/graph/rome_models.cc new file mode 100755 index 0000000000..bb88a7b8b6 --- /dev/null +++ b/src/graph/rome_models.cc @@ -0,0 +1,682 @@ +/* +Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "core.h" +#include "graph.h" +#include "topo.h" +#include "xml.h" +#include +#include +#include "rome_models.h" + +#define MAX_ROME_GPUS 16 +#define MAX_ROME_NICS 8 + +struct rcclRomeModel { + int nGpus; + int nCpus; + int nNics; + int nLinks; + int64_t gpuIds[MAX_ROME_GPUS]; + int64_t nicIds[MAX_ROME_NICS]; + int64_t gpuNuma[MAX_ROME_GPUS]; + int64_t nicNuma[MAX_ROME_NICS]; + int connMatrix[MAX_ROME_GPUS*MAX_ROME_GPUS]; + const char *pattern; + const char *ringBase; +}; + +static struct rcclRomeModel rome_model_22 = { + .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2, + .gpuIds = { 0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000, }, + .nicIds = { 0xe1000, }, + .gpuNuma = { 1, 0, 1, 2, 3, 1, 2, 3, }, + .nicNuma = { 2, }, + .connMatrix = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, }, + .pattern = "10302120", + .ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6", +}; + +static struct rcclRomeModel rome_model_25 = { + .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .nicIds = { 0x61000, 0xa1000, }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, }, + .nicNuma = { 0, 3, }, + .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, }, + .pattern = "11303011", + .ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0", +}; + +static struct rcclRomeModel rome_model_27 = { + .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .nicIds = { 0x61000, 0xa1000, }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, }, + .nicNuma = { 0, 3, }, + .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, }, + .pattern = "11303011", + .ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2", +}; + +static struct rcclRomeModel rome_model_29 = { + .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, + .nicIds = { 0xe1000, }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, }, + .nicNuma = { 2, }, + .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, + .pattern = "10302120", + .ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0", +}; + +static struct rcclRomeModel rome_model_31 = { + .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .nicIds = { 0x61000, 0xa1000, }, + .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, }, + .nicNuma = { 0, 6, }, + .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, }, + .pattern = "0110201010200110", + .ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3", +}; + +static struct rcclRomeModel rome_model_33 = { + .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .nicIds = { 0x61000, 0xa1000, }, + .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, }, + .nicNuma = { 0, 6, }, + .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, }, + .pattern = "0110201010200110", + .ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0", +}; + +static struct rcclRomeModel rome_model_30 = { + .nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .nicIds = { }, + .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, }, + .nicNuma = { }, + .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, }, + .pattern = "0010201010200010", + .ringBase = "3 0 1 2 6 7 5 4|2 1 0 3 7 6 4 5", +}; + +static struct rcclRomeModel rome_model_32 = { + .nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .nicIds = { }, + .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, }, + .nicNuma = { }, + .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, }, + .pattern = "0010201010200010", + .ringBase = "0 6 2 3 4 5 7 1|3 2 6 0 1 7 5 4", +}; + +static struct rcclRomeModel rome_model_24 = { + .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .nicIds = { }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, }, + .nicNuma = { }, + .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, }, + .pattern = "10303010", + .ringBase = "0 1 2 3 5 7 6 4|1 0 3 2 7 5 4 6", +}; + +static struct rcclRomeModel rome_model_26 = { + .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .nicIds = { }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, }, + .nicNuma = { }, + .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, }, + .pattern = "10303010", + .ringBase = "4 5 7 1 0 3 2 6|3 0 6 2 1 7 5 4", +}; + +static struct rcclRomeModel rome_model_23 = { + .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, + .nicIds = { }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, }, + .nicNuma = { }, + .connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, }, + .pattern = "10302020", + .ringBase = "1 7 6 4 5 2 0 3|2 5 3 0 4 6 7 1", +}; + +static struct rcclRomeModel rome_model_38 = { + .nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, + .nicIds = { }, + .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, }, + .nicNuma = { }, + .connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, }, + .pattern = "00102010002010", + .ringBase = "6 7 1 4 3 5 2 0|0 2 5 3 4 1 7 6", +}; + +static struct rcclRomeModel rome_model_28 = { + .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, + .nicIds = { }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, }, + .nicNuma = { }, + .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, + .pattern = "10302020", + .ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0", +}; + +static struct rcclRomeModel rome_model_40 = { + .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, + .nicIds = { 0xe1000, }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, }, + .nicNuma = { 2, }, + .connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, }, + .pattern = "10302120", + .ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5", +}; + +static struct rcclRomeModel rome_model_42 = { + .nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, + .nicIds = { 0xe1000, }, + .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, }, + .nicNuma = { 4, }, + .connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, }, + .pattern = "00102010012010", + .ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0", +}; + +static struct rcclRomeModel rome_model_44 = { + .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3, + .gpuIds = { 0x63000, 0x43000, 0x27000, 0x3000, 0xe3000, 0xc3000, 0xa3000, 0x83000, }, + .nicIds = { 0xc4000, }, + .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, }, + .nicNuma = { 2, }, + .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, + .pattern = "20202120", + .ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3", +}; + +static struct rcclRomeModel rome_model_45 = { + .nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 3, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, + .nicIds = { }, + .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, }, + .nicNuma = { }, + .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, + .pattern = "00102010002010", + .ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1", +}; + +static struct rcclRomeModel rome_model_46 = { + .nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, + .nicIds = { 0xe1000, }, + .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, }, + .nicNuma = { 4, }, + .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, + .pattern = "00102010012010", + .ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2", +}; + +static struct rcclRomeModel rome_model_48 = { + .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3, + .gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, }, + .nicIds = { }, + .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, }, + .nicNuma = { }, + .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, + .pattern = "20202020", + .ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0", +}; + +static struct rcclRomeModel rome_model_49 = { + .nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3, + .gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, }, + .nicIds = { 0x45000, 0x13000, 0xc6000, 0x85000, }, + .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, }, + .nicNuma = { 0, 1, 2, 3, }, + .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, + .pattern = "21212121", + .ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1", +}; + +static struct rcclRomeModel romeTopoModels[] = { + rome_model_22, + rome_model_25, + rome_model_27, + rome_model_29, + rome_model_31, + rome_model_33, + rome_model_30, + rome_model_32, + rome_model_24, + rome_model_26, + rome_model_23, + rome_model_38, + rome_model_28, + rome_model_40, + rome_model_42, + rome_model_44, + rome_model_45, + rome_model_46, + rome_model_48, + rome_model_49, +}; + +/* Parse user defined rings. Format is like : + * "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1" + * Network interfaces can be optionally specified by N prefix. + * Rings with a non-matching number of gpus are ignored so we can provide + * rings for multiple cases. + */ +ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map) { + int gpus[MAX_ROME_GPUS]; + int nChannels = 0; + int gpu = 0; + int offset = 0; + int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET + int nets[2]; + int net = 0; + int ngpus = system->nodes[GPU].count; + int nnets = system->nodes[NET].count; + do { + if (str[offset] == 'N') { + if (status == 0) { + status = 2; + } + } else { + int digit = str[offset] - '0'; + if (digit >= 0 && digit <= 9) { + if (status == 0) { + gpus[gpu] = digit; + status = 1; + } else if (status == 2) { + nets[net] = digit; + } + else{ + gpus[gpu] = gpus[gpu]*10+digit; + } + } else { + if (status == 1) { + gpu++; + if (gpu > MAX_ROME_GPUS) goto end; + } else if (status == 2) { + net++; + if (net > 2) goto end; + } + status = 0; + if (str[offset] == '|' || str[offset] == '\0') { + // Ignore if ngpus doesn't match + if (gpu != ngpus) goto newchannel; + // Ignore if nnets are not 0 or 2 + if (net && net != 2) goto newchannel; + + for (int r=0; r= ngpus) goto newchannel; + // Ignore if gpus are duplicate + for (int i=0; inodes[GPU].nodes[j].gpu.dev) + break; + if (j < ngpus) + graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank; + else + return ncclInternalError; + } + + if (net) { + if (nets[0] >= nnets || nets[1] >= nnets) goto newchannel; + graph->inter[nChannels*2] = system->nodes[NET].nodes[nets[0]].id; + graph->inter[nChannels*2+1] = system->nodes[NET].nodes[nets[1]].id; + } else if (nnets) { + graph->inter[nChannels*2] = system->nodes[NET].nodes[nChannels%nnets].id; + graph->inter[nChannels*2+1] = system->nodes[NET].nodes[(nChannels+1)%nnets].id; + } + nChannels++; +newchannel: + gpu = 0; + net = 0; + } + } + } + } while (str[offset++] != 0); +end: + graph->nChannels = nChannels; + graph->speedIntra = graph->speedInter = system->maxWidth; +#if 0 + for (int i=0; inChannels; i++) { + printf("%d: ", i); + printf ("NET/%d ", graph->inter[i*2]); + for (int j=0; jintra[i*ngpus+j]); + printf ("NET/%d ", graph->inter[i*2+1]); + printf("\n"); + } +#endif + return ncclSuccess; +} + +ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { + static const char *ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3"; + int id[8], dist[8]; + int i; + + int ngpus = system->nodes[GPU].count; + if (ngpus != 8) + return ncclSuccess; + // validate chordal ring and calculate distance + for (i=0; inodes[GPU].nodes+i; + if (node->paths[GPU] == NULL) continue; + int sum = ngpus*(ngpus-1)/2 - node->gpu.dev; + int count = 0; + for (int n = 0; nlinks; link->remNode; link++) { + if (link->remNode->gpu.dev == n) break; + } + if (!link->remNode) continue; + if (link->type != LINK_NVL) continue; + sum -= system->nodes[GPU].nodes[n].gpu.dev; + count ++; + } + if(count != ngpus-2 || sum < 0 || sum > ngpus-1) { + return ncclSuccess; + } + dist[i] = sum; + } + // remap GPU ids + for (i = 0; itype |= RCCL_TOPO_CR8G; + NCCLCHECK(parseGraph(ringBase, system, graph, id)); + if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) { + int *intra, *used; + graph->nChannels = system->nodes[NET].count; + NCCLCHECK(ncclCalloc(&intra, ngpus)); + NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count)); + for (int n = 0; n < system->nodes[NET].count; n++) { + graph->inter[n*2] = graph->inter[n*2+1] = n; + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + struct ncclTopoLinkList* paths = net->paths[GPU]; + // find the first unsed GPU that is closest to NIC + int f, m; + for (f = 0; f < ngpus; f++) { + int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break; + if(j >= n) break; + } + for (int i = 0; i < ngpus; i++) { + int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break; + if (j < n) continue; + if (paths[i].count < paths[f].count) f = i; + } + for (m = 0; mintra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break; + used[n] = graph->intra[n*ngpus+m]; + for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)]; + for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i]; + } + free(used); + free(intra); + } + return ncclSuccess; +} + +struct ncclGpuIdHIP { + int g; + int dev; +}; + +static int cmpIds(const void * g1, const void * g2) { + struct ncclGpuIdHIP *s1 = (struct ncclGpuIdHIP*)g1; + struct ncclGpuIdHIP *s2 = (struct ncclGpuIdHIP*)g2; + return s1->dev - s2->dev; +} + +static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRomeModel* romeTopo, char *pattern) { + pattern[0] = 0; // pattern will be NULL for invalid topology + romeTopo->nGpus = system->nodes[GPU].count; + romeTopo->nCpus = system->nodes[CPU].count; + romeTopo->nNics = system->nodes[NET].count; + romeTopo->nLinks = 0; + // sort GPU devices by HIP device ID + struct ncclGpuIdHIP scores[MAX_ROME_GPUS]; + for (int i = 0; i < romeTopo->nGpus; i ++) { + scores[i].g = i; + scores[i].dev = system->nodes[GPU].nodes[i].gpu.dev; + } + qsort(scores, romeTopo->nGpus, sizeof(struct ncclGpuIdHIP), cmpIds); + for (int i = 0; i < romeTopo->nGpus; i ++) { + int gpu, n, m, distance; + gpu = scores[i].g; + romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id; + m = 0; + distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count; + for (n = 1; n < romeTopo->nCpus; n++) { + if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) { + distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count; + m = n; + } + } + if (m < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id; + + struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu; + if (node->paths[GPU] == NULL) continue; + int count = 0; + for (n = 0; n < romeTopo->nGpus; n++) { + romeTopo->connMatrix[i*romeTopo->nGpus+n] = 0; + struct ncclTopoLink* link; + for (link = node->links; link->remNode; link++) { + if (link->remNode->gpu.dev == n) break; + } + if (!link->remNode) continue; + if (link->type != LINK_NVL) continue; + romeTopo->connMatrix[i*romeTopo->nGpus+n] = 1; + count ++; + } + if (romeTopo->nLinks < count) romeTopo->nLinks = count; + } + + for (int net = 0; net < romeTopo->nNics; net++) { + int n, m, distance; + m = 0; + distance = system->nodes[NET].nodes[net].paths[CPU][m].count; + for (n = 0; n < romeTopo->nCpus; n++) + if (system->nodes[NET].nodes[net].paths[CPU][n].count < distance) { + distance = system->nodes[NET].nodes[net].paths[CPU][n].count; + m = n; + } + if (m < romeTopo->nCpus) romeTopo->nicNuma[net] = system->nodes[CPU].nodes[m].id; + else return ncclSuccess; + } + + // number of GPUs and NICs on each numa node is used as first screening pattern + for (int i = 0; i < romeTopo->nCpus; i++) { + int g = 0, n = 0; + for (int j = 0; j < romeTopo->nGpus; j++) + if (romeTopo->gpuNuma[j] == i) g++; + for (int j = 0; j < romeTopo->nNics; j++) + if (romeTopo->nicNuma[j] == i) n++; + pattern[i*2] = '0' + g; + pattern[i*2+1] = '0' + n; + } + pattern[romeTopo->nCpus*2] = 0; + + const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE"); + if (romeModelFile) { + INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile); + FILE* file = fopen(romeModelFile, "w"); + if (file == NULL) { + WARN("Unable to open %s, not dumping Rome model.", romeModelFile); + return ncclSuccess; + } + fprintf(file, "static struct rcclRomeModel rome_model_ = {\n"); + fprintf(file, " .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks); + fprintf(file, " .gpuIds = { "); + for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]); + fprintf(file, "},\n"); + fprintf(file, " .nicIds = { "); + for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "0x%lx, ", romeTopo->nicIds[i]); + fprintf(file, "},\n"); + fprintf(file, " .gpuNuma = { "); + for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "%ld, ", romeTopo->gpuNuma[i]); + fprintf(file, "},\n"); + fprintf(file, " .nicNuma = { "); + for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "%ld, ", romeTopo->nicNuma[i]); + fprintf(file, "},\n"); + fprintf(file, " .connMatrix = { "); + for (int i = 0; i < romeTopo->nGpus; i ++) + for (int n = 0; n < romeTopo->nGpus; n++) fprintf(file, "%d, ", romeTopo->connMatrix[i*romeTopo->nGpus+n]); + fprintf(file, "},\n"); + fprintf(file, " .pattern = \"%s\",\n", pattern); + fprintf(file, " .ringBase = \"\",\n"); + fprintf(file, "};\n"); + fclose(file); + } + return ncclSuccess; +} + +static bool permuteGpuIds(int *g, int n, int last, struct rcclRomeModel* ref, struct rcclRomeModel* topo, int* time, bool nbio) { + (*time) ++; + if (n == last) { + int i, j; + // match GPU numa + for (i = 0; i < ref->nGpus; i++) + if (ref->gpuNuma[i] != topo->gpuNuma[g[i]]) break; + if (i < ref->nGpus) return false; + // match XGMI connection + for (i = 0; i < ref->nGpus; i++) { + for (j = 0; j < ref->nGpus; j++) { + if (ref->connMatrix[i*ref->nGpus+j] != topo->connMatrix[g[i]*ref->nGpus+g[j]]) break; + if ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0) break; + } + if (j < ref->nGpus) break; + } + if (i < ref->nGpus) return false; + // match NBIO + if (nbio) { + for (i = 0; i < ref->nGpus; i++) { + for (j = 0; j < ref->nGpus; j++) { + if (i == j) continue; + bool nbio_ref = (ref->gpuIds[i]&0xf0000) == (ref->gpuIds[j]&0xf0000); + bool nbio_topo = (topo->gpuIds[g[i]]&0xf0000) == (topo->gpuIds[g[j]]&0xf0000); + if (nbio_ref != nbio_topo) break; + if (nbio_ref && ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0)) break; + } + if (j < ref->nGpus) break; + } + if (i < ref->nGpus) return false; + } + return true; + } else { + for (int i = n; i <= last; i++) { + std::swap(g[n], g[i]); + if (permuteGpuIds(g, n+1, last, ref, topo, time, nbio)) return true; + std::swap(g[n], g[i]); + } + } + return false; +} + +ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { + static char ringRemap[64]; + int i; + + int ngpus = system->nodes[GPU].count; + int ncpus = system->nodes[CPU].count; + + // only valid on Rome + int arch, vendor, model; + NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); + if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME) + return ncclSuccess; + + // number of GPUs and NICs on each numa node is used as first screening pattern + struct rcclRomeModel romeTopo; + char pattern[256]; + NCCLCHECK(parseRomeSystem(system, &romeTopo, pattern)); + + // recognize system as Rome 4P2H even if no matching model + if (ngpus > 4 && romeTopo.nLinks) system->type |= RCCL_TOPO_4P2H_ROME; + + int g[MAX_ROME_GPUS]; + int time = 0; + struct timeval tvs, tve; + gettimeofday(&tvs, NULL); + + // check if GPUs are directly connected to CPU + bool match_nbio = true; + for (i = 0; i < romeTopo.nGpus; i++) { + int cpu, gpu; + NCCLCHECK(ncclTopoIdToIndex(system, CPU, romeTopo.gpuNuma[i], &cpu)); + NCCLCHECK(ncclTopoIdToIndex(system, GPU, romeTopo.gpuIds[i], &gpu)); + if (system->nodes[GPU].nodes[gpu].paths[CPU][cpu].count > 2) break; + } + if (i < romeTopo.nGpus) match_nbio = false; + + for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) { + if (romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus || + romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue; + if (strcmp(romeTopoModels[i].pattern, pattern)) continue; + for (int j = 0; j < ngpus; j++) g[j] = (j+2)%ngpus; + if (permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time, match_nbio)) break; + } + gettimeofday(&tve, NULL); + float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3; + if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) { + //printf("No solution in %.2fms (%d iter)\n", t, time); + return ncclSuccess; + } + + char line[1024]; + //sprintf(line, "Found matching Rome model index %d in %.2fms (%d iter) with GPU mapping: ", i, t, time); + sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i); + int offset = strlen(line); + for (int k = 0; k < ngpus; k++) { + sprintf(line+offset, "%d ", g[k]); + offset = strlen(line); + } + INFO(NCCL_GRAPH, "%s", line); + + // create 4P2H based on reference and remapped ids + NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g)); + return ncclSuccess; +} diff --git a/src/graph/rome_models.h b/src/graph/rome_models.h index 556e01977e..08526b953d 100755 --- a/src/graph/rome_models.h +++ b/src/graph/rome_models.h @@ -19,263 +19,11 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#ifndef RCCL_ROME_MODELS_H_ +#define RCCL_ROME_MODELS_H_ -#define MAX_ROME_GPUS 16 -#define MAX_ROME_NICS 8 +ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map); +ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); +ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); -struct rcclRomeModel { - int nGpus; - int nCpus; - int nNics; - int nLinks; - int64_t gpuIds[MAX_ROME_GPUS]; - int64_t nicIds[MAX_ROME_NICS]; - int64_t gpuNuma[MAX_ROME_GPUS]; - int64_t nicNuma[MAX_ROME_NICS]; - int connMatrix[MAX_ROME_GPUS*MAX_ROME_GPUS]; - const char *pattern; - const char *ringBase; -}; - -static struct rcclRomeModel rome_model_22 = { - .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2, - .gpuIds = { 0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000, }, - .nicIds = { 0xe1000, }, - .gpuNuma = { 1, 0, 1, 2, 3, 1, 2, 3, }, - .nicNuma = { 2, }, - .connMatrix = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, }, - .pattern = "10302120", - .ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6", -}; - -static struct rcclRomeModel rome_model_25 = { - .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, - .nicIds = { 0x61000, 0xa1000, }, - .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, }, - .nicNuma = { 0, 3, }, - .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, }, - .pattern = "11303011", - .ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0", -}; - -static struct rcclRomeModel rome_model_27 = { - .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, - .nicIds = { 0x61000, 0xa1000, }, - .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, }, - .nicNuma = { 0, 3, }, - .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, }, - .pattern = "11303011", - .ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2", -}; - -static struct rcclRomeModel rome_model_29 = { - .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, - .nicIds = { 0xe1000, }, - .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, }, - .nicNuma = { 2, }, - .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, - .pattern = "10302120", - .ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0", -}; - -static struct rcclRomeModel rome_model_31 = { - .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, - .nicIds = { 0x61000, 0xa1000, }, - .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, }, - .nicNuma = { 0, 6, }, - .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, }, - .pattern = "0110201010200110", - .ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3", -}; - -static struct rcclRomeModel rome_model_33 = { - .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, - .nicIds = { 0x61000, 0xa1000, }, - .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, }, - .nicNuma = { 0, 6, }, - .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, }, - .pattern = "0110201010200110", - .ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0", -}; - -static struct rcclRomeModel rome_model_30 = { - .nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, - .nicIds = { }, - .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, }, - .nicNuma = { }, - .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, }, - .pattern = "0010201010200010", - .ringBase = "3 0 1 2 6 7 5 4|2 1 0 3 7 6 4 5", -}; - -static struct rcclRomeModel rome_model_32 = { - .nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, - .nicIds = { }, - .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, }, - .nicNuma = { }, - .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, }, - .pattern = "0010201010200010", - .ringBase = "0 6 2 3 4 5 7 1|3 2 6 0 1 7 5 4", -}; - -static struct rcclRomeModel rome_model_24 = { - .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, - .nicIds = { }, - .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, }, - .nicNuma = { }, - .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, }, - .pattern = "10303010", - .ringBase = "0 1 2 3 5 7 6 4|1 0 3 2 7 5 4 6", -}; - -static struct rcclRomeModel rome_model_26 = { - .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, - .nicIds = { }, - .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, }, - .nicNuma = { }, - .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, }, - .pattern = "10303010", - .ringBase = "4 5 7 1 0 3 2 6|3 0 6 2 1 7 5 4", -}; - -static struct rcclRomeModel rome_model_23 = { - .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, - .nicIds = { }, - .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, }, - .nicNuma = { }, - .connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, }, - .pattern = "10302020", - .ringBase = "1 7 6 4 5 2 0 3|2 5 3 0 4 6 7 1", -}; - -static struct rcclRomeModel rome_model_38 = { - .nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 2, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, - .nicIds = { }, - .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, }, - .nicNuma = { }, - .connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, }, - .pattern = "00102010002010", - .ringBase = "6 7 1 4 3 5 2 0|0 2 5 3 4 1 7 6", -}; - -static struct rcclRomeModel rome_model_28 = { - .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, - .nicIds = { }, - .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, }, - .nicNuma = { }, - .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, - .pattern = "10302020", - .ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0", -}; - -static struct rcclRomeModel rome_model_40 = { - .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, - .nicIds = { 0xe1000, }, - .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, }, - .nicNuma = { 2, }, - .connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, }, - .pattern = "10302120", - .ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5", -}; - -static struct rcclRomeModel rome_model_42 = { - .nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, - .nicIds = { 0xe1000, }, - .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, }, - .nicNuma = { 4, }, - .connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, }, - .pattern = "00102010012010", - .ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0", -}; - -static struct rcclRomeModel rome_model_44 = { - .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3, - .gpuIds = { 0x63000, 0x43000, 0x27000, 0x3000, 0xe3000, 0xc3000, 0xa3000, 0x83000, }, - .nicIds = { 0xc4000, }, - .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, }, - .nicNuma = { 2, }, - .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, - .pattern = "20202120", - .ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3", -}; - -static struct rcclRomeModel rome_model_45 = { - .nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 3, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, - .nicIds = { }, - .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, }, - .nicNuma = { }, - .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, - .pattern = "00102010002010", - .ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1", -}; - -static struct rcclRomeModel rome_model_46 = { - .nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3, - .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, - .nicIds = { 0xe1000, }, - .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, }, - .nicNuma = { 4, }, - .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, - .pattern = "00102010012010", - .ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2", -}; - -static struct rcclRomeModel rome_model_48 = { - .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3, - .gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, }, - .nicIds = { }, - .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, }, - .nicNuma = { }, - .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, - .pattern = "20202020", - .ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0", -}; - -static struct rcclRomeModel rome_model_49 = { - .nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3, - .gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, }, - .nicIds = { 0x45000, 0x13000, 0xc6000, 0x85000, }, - .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, }, - .nicNuma = { 0, 1, 2, 3, }, - .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, - .pattern = "21212121", - .ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1", -}; - -static struct rcclRomeModel romeTopoModels[] = { - rome_model_22, - rome_model_25, - rome_model_27, - rome_model_29, - rome_model_31, - rome_model_33, - rome_model_30, - rome_model_32, - rome_model_24, - rome_model_26, - rome_model_23, - rome_model_38, - rome_model_28, - rome_model_40, - rome_model_42, - rome_model_44, - rome_model_45, - rome_model_46, - rome_model_48, - rome_model_49, -}; +#endif \ No newline at end of file diff --git a/src/graph/search.cc b/src/graph/search.cc index b4720e66fd..3ea5b7c06b 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -689,422 +689,6 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs return ncclSuccess; } -/* Parse user defined rings. Format is like : - * "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1" - * Network interfaces can be optionally specified by N prefix. - * Rings with a non-matching number of gpus are ignored so we can provide - * rings for multiple cases. - */ -static ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int nnets, int* net_map ) { - int gpus[MAX_ROME_GPUS]; - int nChannels = 0; - int gpu = 0; - int offset = 0; - int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET - int nets[2]; - int net = 0; - int ngpus = system->nodes[GPU].count; - do { - if (str[offset] == 'N') { - if (status == 0) { - status = 2; - } - } else { - int digit = str[offset] - '0'; - if (digit >= 0 && digit <= 9) { - if (status == 0) { - gpus[gpu] = digit; - status = 1; - } else if (status == 2) { - nets[net] = digit; - } - else{ - gpus[gpu] = gpus[gpu]*10+digit; - } - } else { - if (status == 1) { - gpu++; - if (gpu > MAX_ROME_GPUS) goto end; - } else if (status == 2) { - net++; - if (net > 2) goto end; - } - status = 0; - if (str[offset] == '|' || str[offset] == '\0') { - // Ignore if ngpus doesn't match - if (gpu != ngpus) goto newchannel; - // Ignore if nnets are not 0 or 2 - if (net && net != 2) goto newchannel; - - for (int r=0; r= ngpus) goto newchannel; - // Ignore if gpus are duplicate - for (int i=0; inodes[GPU].nodes[j].gpu.dev) - break; - if (j < ngpus) - graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank; - else - return ncclInternalError; - } - - if (net) { - if (nets[0] >= nnets || nets[1] >= nnets) goto newchannel; - graph->inter[nChannels*2] = nets[0]; - graph->inter[nChannels*2+1] = nets[1]; - } else if (net_map && nnets) { - graph->inter[nChannels*2] = net_map[nChannels%nnets]; - graph->inter[nChannels*2+1] = net_map[(nChannels+1)%nnets]; - } else if (nnets) { - graph->inter[nChannels*2] = nChannels%nnets; - graph->inter[nChannels*2+1] = (nChannels+1)%nnets; - } - nChannels++; -newchannel: - gpu = 0; - net = 0; - } - } - } - } while (str[offset++] != 0); -end: - graph->nChannels = nChannels; - graph->speedIntra = graph->speedInter = system->maxWidth; -#if 0 - for (int i=0; inChannels; i++) { - printf("%d: ", i); - printf ("NET/%d ", graph->inter[i*2]); - for (int j=0; jintra[i*ngpus+j]); - printf ("NET/%d ", graph->inter[i*2+1]); - printf("\n"); - } -#endif - return ncclSuccess; -} - -static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { - static const char *ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3"; - int id[8], dist[8]; - int i; - - int ngpus = system->nodes[GPU].count; - if (ngpus != 8) - return ncclSuccess; - // validate chordal ring and calculate distance - for (i=0; inodes[GPU].nodes+i; - if (node->paths[GPU] == NULL) continue; - int sum = ngpus*(ngpus-1)/2 - node->gpu.dev; - int count = 0; - for (int n = 0; nlinks; link->remNode; link++) { - if (link->remNode->gpu.dev == n) break; - } - if (!link->remNode) continue; - if (link->type != LINK_NVL) continue; - sum -= system->nodes[GPU].nodes[n].gpu.dev; - count ++; - } - if(count != ngpus-2 || sum < 0 || sum > ngpus-1) { - return ncclSuccess; - } - dist[i] = sum; - } - // remap GPU ids - for (i = 0; itype |= RCCL_TOPO_CR8G; - NCCLCHECK(parseGraph(ringBase, system, graph, id, 0, NULL)); - if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) { - int *intra, *used; - graph->nChannels = system->nodes[NET].count; - NCCLCHECK(ncclCalloc(&intra, ngpus)); - NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count)); - for (int n = 0; n < system->nodes[NET].count; n++) { - graph->inter[n*2] = graph->inter[n*2+1] = n; - struct ncclTopoNode* net = system->nodes[NET].nodes+n; - struct ncclTopoLinkList* paths = net->paths[GPU]; - // find the first unsed GPU that is closest to NIC - int f, m; - for (f = 0; f < ngpus; f++) { - int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break; - if(j >= n) break; - } - for (int i = 0; i < ngpus; i++) { - int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break; - if (j < n) continue; - if (paths[i].count < paths[f].count) f = i; - } - for (m = 0; mintra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break; - used[n] = graph->intra[n*ngpus+m]; - for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)]; - for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i]; - } - free(used); - free(intra); - } - return ncclSuccess; -} - -struct ncclGpuIdHIP { - int g; - int dev; -}; - -static int cmpIds(const void * g1, const void * g2) { - struct ncclGpuIdHIP *s1 = (struct ncclGpuIdHIP*)g1; - struct ncclGpuIdHIP *s2 = (struct ncclGpuIdHIP*)g2; - return s1->dev - s2->dev; -} - -static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRomeModel* romeTopo, char *pattern, int *net_map) { - pattern[0] = 0; // pattern will be NULL for invalid topology - romeTopo->nGpus = system->nodes[GPU].count; - romeTopo->nCpus = system->nodes[CPU].count; - romeTopo->nNics = 0; - romeTopo->nLinks = 0; - // sort GPU devices by HIP device ID - struct ncclGpuIdHIP scores[MAX_ROME_GPUS]; - for (int i = 0; i < romeTopo->nGpus; i ++) { - scores[i].g = i; - scores[i].dev = system->nodes[GPU].nodes[i].gpu.dev; - } - qsort(scores, romeTopo->nGpus, sizeof(struct ncclGpuIdHIP), cmpIds); - for (int i = 0; i < romeTopo->nGpus; i ++) { - int gpu, n, m, distance; - gpu = scores[i].g; - romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id; - m = 0; - distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count; - for (n = 1; n < romeTopo->nCpus; n++) { - if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) { - distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count; - m = n; - } - } - if (m < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id; - - struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu; - if (node->paths[GPU] == NULL) continue; - int count = 0; - for (n = 0; n < romeTopo->nGpus; n++) { - romeTopo->connMatrix[i*romeTopo->nGpus+n] = 0; - struct ncclTopoLink* link; - for (link = node->links; link->remNode; link++) { - if (link->remNode->gpu.dev == n) break; - } - if (!link->remNode) continue; - if (link->type != LINK_NVL) continue; - romeTopo->connMatrix[i*romeTopo->nGpus+n] = 1; - count ++; - } - if (romeTopo->nLinks < count) romeTopo->nLinks = count; - } - - // trim ports and create NET map - for (int i = 0; i < system->nodes[NET].count; i ++) { - int j; - for (j = 0; j < romeTopo->nNics; j++) { - if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[net_map[j]].net.asic) { - if (system->nodes[NET].nodes[i].net.width > system->nodes[NET].nodes[net_map[j]].net.width) - net_map[j] = i; - break; - } - } - if (j >= romeTopo->nNics) { - net_map[j] = i; - romeTopo->nicIds[romeTopo->nNics] = system->nodes[NET].nodes[i].net.busId; - (romeTopo->nNics)++; - if (romeTopo->nNics >= MAX_ROME_NICS) break; - } - } - - for (int i = 0; i < romeTopo->nNics; i ++) { - int net, n, m, distance; - NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net)); - m = 0; - distance = system->nodes[NET].nodes[net].paths[CPU][m].count; - for (n = 0; n < romeTopo->nCpus; n++) - if (system->nodes[NET].nodes[net].paths[CPU][n].count < distance) { - distance = system->nodes[NET].nodes[net].paths[CPU][n].count; - m = n; - } - if (m < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id; - else return ncclSuccess; - } - - // number of GPUs and NICs on each numa node is used as first screening pattern - for (int i = 0; i < romeTopo->nCpus; i++) { - int g = 0, n = 0; - for (int j = 0; j < romeTopo->nGpus; j++) - if (romeTopo->gpuNuma[j] == i) g++; - for (int j = 0; j < romeTopo->nNics; j++) - if (romeTopo->nicNuma[j] == i) n++; - pattern[i*2] = '0' + g; - pattern[i*2+1] = '0' + n; - } - pattern[romeTopo->nCpus*2] = 0; - - const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE"); - if (romeModelFile) { - INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile); - FILE* file = fopen(romeModelFile, "w"); - if (file == NULL) { - WARN("Unable to open %s, not dumping Rome model.", romeModelFile); - return ncclSuccess; - } - fprintf(file, "static struct rcclRomeModel rome_model_ = {\n"); - fprintf(file, " .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks); - fprintf(file, " .gpuIds = { "); - for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]); - fprintf(file, "},\n"); - fprintf(file, " .nicIds = { "); - for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "0x%lx, ", romeTopo->nicIds[i]); - fprintf(file, "},\n"); - fprintf(file, " .gpuNuma = { "); - for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "%ld, ", romeTopo->gpuNuma[i]); - fprintf(file, "},\n"); - fprintf(file, " .nicNuma = { "); - for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "%ld, ", romeTopo->nicNuma[i]); - fprintf(file, "},\n"); - fprintf(file, " .connMatrix = { "); - for (int i = 0; i < romeTopo->nGpus; i ++) - for (int n = 0; n < romeTopo->nGpus; n++) fprintf(file, "%d, ", romeTopo->connMatrix[i*romeTopo->nGpus+n]); - fprintf(file, "},\n"); - fprintf(file, " .pattern = \"%s\",\n", pattern); - fprintf(file, " .ringBase = \"\",\n"); - fprintf(file, "};\n"); - fclose(file); - } - return ncclSuccess; -} - -static bool permuteGpuIds(int *g, int n, int last, struct rcclRomeModel* ref, struct rcclRomeModel* topo, int* time, bool nbio) { - (*time) ++; - if (n == last) { - int i, j; - // match GPU numa - for (i = 0; i < ref->nGpus; i++) - if (ref->gpuNuma[i] != topo->gpuNuma[g[i]]) break; - if (i < ref->nGpus) return false; - // match XGMI connection - for (i = 0; i < ref->nGpus; i++) { - for (j = 0; j < ref->nGpus; j++) { - if (ref->connMatrix[i*ref->nGpus+j] != topo->connMatrix[g[i]*ref->nGpus+g[j]]) break; - if ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0) break; - } - if (j < ref->nGpus) break; - } - if (i < ref->nGpus) return false; - // match NBIO - if (nbio) { - for (i = 0; i < ref->nGpus; i++) { - for (j = 0; j < ref->nGpus; j++) { - if (i == j) continue; - bool nbio_ref = (ref->gpuIds[i]&0xf0000) == (ref->gpuIds[j]&0xf0000); - bool nbio_topo = (topo->gpuIds[g[i]]&0xf0000) == (topo->gpuIds[g[j]]&0xf0000); - if (nbio_ref != nbio_topo) break; - if (nbio_ref && ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0)) break; - } - if (j < ref->nGpus) break; - } - if (i < ref->nGpus) return false; - } - return true; - } else { - for (int i = n; i <= last; i++) { - std::swap(g[n], g[i]); - if (permuteGpuIds(g, n+1, last, ref, topo, time, nbio)) return true; - std::swap(g[n], g[i]); - } - } - return false; -} - -static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { - static char ringRemap[64]; - int i; - - int ngpus = system->nodes[GPU].count; - int ncpus = system->nodes[CPU].count; - - // only valid on Rome - int arch, vendor, model; - NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); - if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME) - return ncclSuccess; - - // number of GPUs and NICs on each numa node is used as first screening pattern - struct rcclRomeModel romeTopo; - char pattern[256]; - int net_map[MAX_ROME_NICS]; - NCCLCHECK(parseRomeSystem(system, &romeTopo, pattern, net_map)); - - // recognize system as Rome 4P2H even if no matching model - if (ngpus > 4 && romeTopo.nLinks) system->type |= RCCL_TOPO_4P2H_ROME; - - int g[MAX_ROME_GPUS]; - int time = 0; - struct timeval tvs, tve; - gettimeofday(&tvs, NULL); - - // check if GPUs are directly connected to CPU - bool match_nbio = true; - for (i = 0; i < romeTopo.nGpus; i++) { - int cpu, gpu; - NCCLCHECK(ncclTopoIdToIndex(system, CPU, romeTopo.gpuNuma[i], &cpu)); - NCCLCHECK(ncclTopoIdToIndex(system, GPU, romeTopo.gpuIds[i], &gpu)); - if (system->nodes[GPU].nodes[gpu].paths[CPU][cpu].count > 2) break; - } - if (i < romeTopo.nGpus) match_nbio = false; - - for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) { - if (romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus || - romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue; - if (strcmp(romeTopoModels[i].pattern, pattern)) continue; - for (int j = 0; j < ngpus; j++) g[j] = (j+2)%ngpus; - if (permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time, match_nbio)) break; - } - gettimeofday(&tve, NULL); - float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3; - if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) { - //printf("No solution in %.2fms (%d iter)\n", t, time); - return ncclSuccess; - } - - char line[1024]; - //sprintf(line, "Found matching Rome model index %d in %.2fms (%d iter) with GPU mapping: ", i, t, time); - sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i); - int offset = strlen(line); - for (int k = 0; k < ngpus; k++) { - sprintf(line+offset, "%d ", g[k]); - offset = strlen(line); - } - INFO(NCCL_GRAPH, "%s", line); - - // create 4P2H based on reference and remapped ids - NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, romeTopo.nNics, net_map)); - return ncclSuccess; -} - #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) float speedArray[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #else @@ -1141,7 +725,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph str = getenv("NCCL_RINGS"); if (str) { // user supplied topo - NCCLCHECK(parseGraph(str, system, graph, NULL, nnets, NULL)); + NCCLCHECK(parseGraph(str, system, graph, NULL)); if (graph->nChannels) { system->type |= RCCL_TOPO_4P2H_ROME; } diff --git a/tools/topo_expl/Makefile b/tools/topo_expl/Makefile index 0dee8416dd..7480cae94b 100644 --- a/tools/topo_expl/Makefile +++ b/tools/topo_expl/Makefile @@ -9,7 +9,7 @@ EXE = topo_expl CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE -lnuma files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc \ - ../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc + ../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc ../../src/graph/rome_models.cc all: $(EXE)