From edeea499b5f3a47c6161874c29fb76088c1d0d42 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Wed, 18 Oct 2023 17:28:01 -0700 Subject: [PATCH] Update rome models (#922) [ROCm/rccl commit: 4278a9918b442791e4ea7b8650eef688daf1b346] --- projects/rccl/src/graph/rome_models.cc | 20 +++++++++++++++++--- projects/rccl/src/graph/topo.cc | 4 ++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/projects/rccl/src/graph/rome_models.cc b/projects/rccl/src/graph/rome_models.cc index 3afc58ce55..e5015e08a8 100644 --- a/projects/rccl/src/graph/rome_models.cc +++ b/projects/rccl/src/graph/rome_models.cc @@ -591,7 +591,20 @@ static struct rcclRomeModel rome_model_82 = { .gdrLevel = { }, .pattern = "4040", .ringBase = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0", - .options = "noCpuCheck=1,mscclEnabled=1", + .options = "noCpuCheck=1,mscclEnabled=1,disableNumaMatching=1", +}; + +static struct rcclRomeModel rome_model_83 = { + .nGpus = 8, .nCpus = 2, .nNics = 8, .nLinks = 7, + .gpuIds = { 0xc000, 0x22000, 0x38000, 0x5c000, 0x9f000, 0xaf000, 0xbf000, 0xdf000, }, + .nicIds = { 0x7000, 0x1d000, 0x33000, 0x57000, 0x9a000, 0xaa000, 0xba000, 0xda000, }, + .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, }, + .nicNuma = { 0, 0, 0, 0, 1, 1, 1, 1, }, + .connMatrix = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, }, + .gdrLevel = { PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, PATH_SYS, PATH_SYS, PATH_SYS, PATH_SYS, PATH_PHB, PATH_PHB, PATH_PHB, PATH_PHB, }, + .pattern = "4444", + .ringBase = "N0 0 1 2 3 4 5 6 7 N7|N1 1 0 2 4 3 5 7 6 N6|N2 2 5 0 3 6 1 7 4 N4|N3 3 7 0 4 2 1 6 5 N5|N4 4 6 2 7 3 0 5 1 N1|N5 5 4 7 1 3 2 6 0 N0|N6 6 3 1 4 0 7 5 2 N2|N7 7 2 0 6 4 1 5 3 N3", + .options = "noCpuCheck=1,disableNumaMatching=1", }; static struct rcclRomeModel romeTopoModels[] = { @@ -636,7 +649,8 @@ static struct rcclRomeModel romeTopoModels[] = { rome_model_79, rome_model_80, rome_model_81, - rome_model_82 + rome_model_82, + rome_model_83, }; /* Parse user defined rings. Format is like : @@ -1281,7 +1295,7 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* if (!ignore_numa && strcmp(romeTopoModels[i].pattern, pattern)) continue; // permute GPU IDs for (int j = 0; j < ngpus; j++) g[j] = (j+2)%ngpus; - if (!permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time, match_nbio, ignore_numa)) continue; + if (!permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time, ignore_cpu ? false : match_nbio, ignore_numa)) continue; if (nnets > 1) { // permute NET IDs for (int j = 0; j < nnets; j++) n[j] = (j+2)%nnets; diff --git a/projects/rccl/src/graph/topo.cc b/projects/rccl/src/graph/topo.cc index 345b37b6ab..cf8913b7a3 100644 --- a/projects/rccl/src/graph/topo.cc +++ b/projects/rccl/src/graph/topo.cc @@ -376,6 +376,10 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s gpu->gpu.gcn = "gfx908"; } else if (strcmp(gpu->gpu.gcn, "910") == 0) { gpu->gpu.gcn = "gfx90a"; + } else if (strcmp(gpu->gpu.gcn, "940") == 0) { + gpu->gpu.gcn = "gfx940"; + } else if (strcmp(gpu->gpu.gcn, "941") == 0) { + gpu->gpu.gcn = "gfx941"; } rcclHipDeviceArch_t arch; NCCLCHECK(xmlGetAttrInt(xmlGpu, "arch", &arch.value));