Rework network port trimming code (#338)
* Rework network port trimming code * Move Rome related changes to separate source files
Αυτή η υποβολή περιλαμβάνεται σε:
@@ -125,6 +125,7 @@ set(CC_SOURCES
|
||||
src/graph/tuning.cc
|
||||
src/graph/topo.cc
|
||||
src/graph/xml.cc
|
||||
src/graph/rome_models.cc
|
||||
src/collectives/all_reduce_api.cc
|
||||
src/collectives/all_gather_api.cc
|
||||
src/collectives/reduce_api.cc
|
||||
|
||||
@@ -468,6 +468,28 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
|
||||
}
|
||||
|
||||
// trim low speed port on same NIC
|
||||
for (int i = 0; i < system->nodes[NET].count; i ++) {
|
||||
for (int j = 0; j < system->nodes[NET].count; j ++) {
|
||||
if (i == j) continue;
|
||||
if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) {
|
||||
if (system->nodes[NET].nodes[i].net.width > system->nodes[NET].nodes[j].net.width)
|
||||
system->nodes[NET].nodes[j].net.width = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
do {
|
||||
int n;
|
||||
for (n=0; n<system->nodes[NET].count; n++) {
|
||||
if (system->nodes[NET].nodes[n].net.width == 0) break;
|
||||
}
|
||||
if (n<system->nodes[NET].count) {
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
|
||||
}
|
||||
else
|
||||
break;
|
||||
} while (system->nodes[NET].count);
|
||||
|
||||
int remove = 1;
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
@@ -494,6 +516,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
for (int n=system->nodes[NET].count-1; n>=0; n--)
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
|
||||
}
|
||||
|
||||
free(domains);
|
||||
free(ids);
|
||||
return ncclSuccess;
|
||||
|
||||
Εκτελέσιμο αρχείο
+682
@@ -0,0 +1,682 @@
|
||||
/*
|
||||
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#include "core.h"
|
||||
#include "graph.h"
|
||||
#include "topo.h"
|
||||
#include "xml.h"
|
||||
#include <math.h>
|
||||
#include <sys/time.h>
|
||||
#include "rome_models.h"
|
||||
|
||||
#define MAX_ROME_GPUS 16
|
||||
#define MAX_ROME_NICS 8
|
||||
|
||||
// Description of a Rome-based node topology. The same layout is used for the
// built-in reference tables (rome_model_*) and for the topology that
// parseRomeSystem() derives from the running system.
struct rcclRomeModel {
|
||||
// Number of GPU nodes.
int nGpus;
|
||||
// Number of CPU (NUMA) nodes.
int nCpus;
|
||||
// Number of NET nodes.
int nNics;
|
||||
// Largest number of LINK_NVL GPU-to-GPU links found on any single GPU.
int nLinks;
|
||||
// Node ids of the GPUs; parseRomeSystem() stores them in ascending gpu.dev order.
int64_t gpuIds[MAX_ROME_GPUS];
|
||||
// Ids of the NICs, one entry per NIC.
int64_t nicIds[MAX_ROME_NICS];
|
||||
// For each GPU, the id of the CPU node with the smallest path count to it.
int64_t gpuNuma[MAX_ROME_GPUS];
|
||||
// For each NIC, the id of the CPU node with the smallest path count to it.
int64_t nicNuma[MAX_ROME_NICS];
|
||||
// Row-major nGpus x nGpus matrix (row stride is nGpus, not MAX_ROME_GPUS):
// 1 where a LINK_NVL link connects the two GPUs, 0 otherwise.
int connMatrix[MAX_ROME_GPUS*MAX_ROME_GPUS];
|
||||
// Screening string: two characters per CPU node, '0' + GPU count followed by
// '0' + NIC count for that node.
const char *pattern;
|
||||
// '|'-separated ring description that is handed to parseGraph().
const char *ringBase;
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_22 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2,
|
||||
.gpuIds = { 0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000, },
|
||||
.nicIds = { 0xe1000, },
|
||||
.gpuNuma = { 1, 0, 1, 2, 3, 1, 2, 3, },
|
||||
.nicNuma = { 2, },
|
||||
.connMatrix = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, },
|
||||
.pattern = "10302120",
|
||||
.ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_25 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { 0x61000, 0xa1000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
|
||||
.nicNuma = { 0, 3, },
|
||||
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
|
||||
.pattern = "11303011",
|
||||
.ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_27 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { 0x61000, 0xa1000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
|
||||
.nicNuma = { 0, 3, },
|
||||
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
|
||||
.pattern = "11303011",
|
||||
.ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_29 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { 0xe1000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { 2, },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "10302120",
|
||||
.ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_31 = {
|
||||
.nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { 0x61000, 0xa1000, },
|
||||
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
|
||||
.nicNuma = { 0, 6, },
|
||||
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
|
||||
.pattern = "0110201010200110",
|
||||
.ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_33 = {
|
||||
.nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { 0x61000, 0xa1000, },
|
||||
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
|
||||
.nicNuma = { 0, 6, },
|
||||
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
|
||||
.pattern = "0110201010200110",
|
||||
.ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_30 = {
|
||||
.nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
|
||||
.pattern = "0010201010200010",
|
||||
.ringBase = "3 0 1 2 6 7 5 4|2 1 0 3 7 6 4 5",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_32 = {
|
||||
.nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
|
||||
.pattern = "0010201010200010",
|
||||
.ringBase = "0 6 2 3 4 5 7 1|3 2 6 0 1 7 5 4",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_24 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
|
||||
.pattern = "10303010",
|
||||
.ringBase = "0 1 2 3 5 7 6 4|1 0 3 2 7 5 4 6",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_26 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
|
||||
.pattern = "10303010",
|
||||
.ringBase = "4 5 7 1 0 3 2 6|3 0 6 2 1 7 5 4",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_23 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, },
|
||||
.pattern = "10302020",
|
||||
.ringBase = "1 7 6 4 5 2 0 3|2 5 3 0 4 6 7 1",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_38 = {
|
||||
.nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, },
|
||||
.pattern = "00102010002010",
|
||||
.ringBase = "6 7 1 4 3 5 2 0|0 2 5 3 4 1 7 6",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_28 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "10302020",
|
||||
.ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_40 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { 0xe1000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { 2, },
|
||||
.connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, },
|
||||
.pattern = "10302120",
|
||||
.ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_42 = {
|
||||
.nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { 0xe1000, },
|
||||
.gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
|
||||
.nicNuma = { 4, },
|
||||
.connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, },
|
||||
.pattern = "00102010012010",
|
||||
.ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_44 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x63000, 0x43000, 0x27000, 0x3000, 0xe3000, 0xc3000, 0xa3000, 0x83000, },
|
||||
.nicIds = { 0xc4000, },
|
||||
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { 2, },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "20202120",
|
||||
.ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_45 = {
|
||||
.nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "00102010002010",
|
||||
.ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_46 = {
|
||||
.nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { 0xe1000, },
|
||||
.gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
|
||||
.nicNuma = { 4, },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "00102010012010",
|
||||
.ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_48 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
|
||||
.gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "20202020",
|
||||
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_49 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3,
|
||||
.gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
|
||||
.nicIds = { 0x45000, 0x13000, 0xc6000, 0x85000, },
|
||||
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { 0, 1, 2, 3, },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "21212121",
|
||||
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel romeTopoModels[] = {
|
||||
rome_model_22,
|
||||
rome_model_25,
|
||||
rome_model_27,
|
||||
rome_model_29,
|
||||
rome_model_31,
|
||||
rome_model_33,
|
||||
rome_model_30,
|
||||
rome_model_32,
|
||||
rome_model_24,
|
||||
rome_model_26,
|
||||
rome_model_23,
|
||||
rome_model_38,
|
||||
rome_model_28,
|
||||
rome_model_40,
|
||||
rome_model_42,
|
||||
rome_model_44,
|
||||
rome_model_45,
|
||||
rome_model_46,
|
||||
rome_model_48,
|
||||
rome_model_49,
|
||||
};
|
||||
|
||||
/* Parse user defined rings. Format is like :
|
||||
* "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1"
|
||||
* Network interfaces can be optionally specified by N prefix.
|
||||
* Rings with a non-matching number of gpus are ignored so we can provide
|
||||
* rings for multiple cases.
|
||||
*/
|
||||
ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map) {
|
||||
int gpus[MAX_ROME_GPUS];
|
||||
int nChannels = 0;
|
||||
int gpu = 0;
|
||||
int offset = 0;
|
||||
int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET
|
||||
int nets[2];
|
||||
int net = 0;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int nnets = system->nodes[NET].count;
|
||||
do {
|
||||
if (str[offset] == 'N') {
|
||||
if (status == 0) {
|
||||
status = 2;
|
||||
}
|
||||
} else {
|
||||
int digit = str[offset] - '0';
|
||||
if (digit >= 0 && digit <= 9) {
|
||||
if (status == 0) {
|
||||
gpus[gpu] = digit;
|
||||
status = 1;
|
||||
} else if (status == 2) {
|
||||
nets[net] = digit;
|
||||
}
|
||||
else{
|
||||
gpus[gpu] = gpus[gpu]*10+digit;
|
||||
}
|
||||
} else {
|
||||
if (status == 1) {
|
||||
gpu++;
|
||||
if (gpu > MAX_ROME_GPUS) goto end;
|
||||
} else if (status == 2) {
|
||||
net++;
|
||||
if (net > 2) goto end;
|
||||
}
|
||||
status = 0;
|
||||
if (str[offset] == '|' || str[offset] == '\0') {
|
||||
// Ignore if ngpus doesn't match
|
||||
if (gpu != ngpus) goto newchannel;
|
||||
// Ignore if nnets are not 0 or 2
|
||||
if (net && net != 2) goto newchannel;
|
||||
|
||||
for (int r=0; r<ngpus; r++) {
|
||||
int g = gpus[r];
|
||||
// Ignore if gpus are out of bounds
|
||||
if (g < 0 || g >= ngpus) goto newchannel;
|
||||
// Ignore if gpus are duplicate
|
||||
for (int i=0; i<r; i++)
|
||||
if (gpus[i] == g) goto newchannel;
|
||||
// remap if needed
|
||||
if (gpu_map) g = gpu_map[g];
|
||||
// Translate gpu numbers into ranks
|
||||
int j = 0;
|
||||
for (j = 0; j < ngpus; j++)
|
||||
if (g == system->nodes[GPU].nodes[j].gpu.dev)
|
||||
break;
|
||||
if (j < ngpus)
|
||||
graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank;
|
||||
else
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
if (net) {
|
||||
if (nets[0] >= nnets || nets[1] >= nnets) goto newchannel;
|
||||
graph->inter[nChannels*2] = system->nodes[NET].nodes[nets[0]].id;
|
||||
graph->inter[nChannels*2+1] = system->nodes[NET].nodes[nets[1]].id;
|
||||
} else if (nnets) {
|
||||
graph->inter[nChannels*2] = system->nodes[NET].nodes[nChannels%nnets].id;
|
||||
graph->inter[nChannels*2+1] = system->nodes[NET].nodes[(nChannels+1)%nnets].id;
|
||||
}
|
||||
nChannels++;
|
||||
newchannel:
|
||||
gpu = 0;
|
||||
net = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (str[offset++] != 0);
|
||||
end:
|
||||
graph->nChannels = nChannels;
|
||||
graph->speedIntra = graph->speedInter = system->maxWidth;
|
||||
#if 0
|
||||
for (int i=0; i<graph->nChannels; i++) {
|
||||
printf("%d: ", i);
|
||||
printf ("NET/%d ", graph->inter[i*2]);
|
||||
for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
|
||||
printf ("NET/%d ", graph->inter[i*2+1]);
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Detect an 8-GPU chordal ring and, when found, build the ring channels from
 * a fixed reference layout.
 *
 * A system qualifies when it has exactly 8 GPUs and every GPU has LINK_NVL
 * links to exactly ngpus-2 peers. For each GPU the value
 *   ngpus*(ngpus-1)/2 - own gpu.dev - sum of gpu.dev of the linked peers
 * is recorded in dist[]; it must fall in [0, ngpus-1]. dist[] is then used to
 * derive the id[] remapping passed to parseGraph() together with the
 * reference rings.
 *
 * When the system has NET nodes and the GPU count differs from
 * system->nRanks, one channel per NET node is kept and each ring is rotated
 * so that it starts at a not-yet-used GPU with the smallest path count to
 * that NET node.
 *
 * Returns ncclSuccess without touching graph when the system is not a chordal
 * ring; errors from parseGraph() or from allocation are propagated.
 */
ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  static const char *ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
  int id[8], dist[8];
  int i;

  int ngpus = system->nodes[GPU].count;
  if (ngpus != 8)
    return ncclSuccess;
  // validate chordal ring and calculate distance
  for (i=0; i<ngpus; i++) {
    struct ncclTopoNode* node = system->nodes[GPU].nodes+i;
    // A GPU without GPU paths cannot be validated. The previous code skipped
    // it and later read an uninitialised dist[i]; treat it as "no match".
    if (node->paths[GPU] == NULL) return ncclSuccess;
    int sum = ngpus*(ngpus-1)/2 - node->gpu.dev;
    int count = 0;
    for (int n = 0; n<ngpus; n++) {
      struct ncclTopoLink* link;
      for (link = node->links; link->remNode; link++) {
        if (link->remNode->gpu.dev == n) break;
      }
      if (!link->remNode) continue;
      if (link->type != LINK_NVL) continue;
      sum -= system->nodes[GPU].nodes[n].gpu.dev;
      count ++;
    }
    if(count != ngpus-2 || sum < 0 || sum > ngpus-1) {
      return ncclSuccess;
    }
    dist[i] = sum;
  }
  // remap GPU ids
  for (i = 0; i<ngpus; i++) id[i] = i;
  for (i = 0; i<ngpus; i++) {
    if (dist[i] == ngpus-1-i) continue;
    int j, m, n, temp;
    for (j=i+1; j < ngpus; j++)
      if(dist[j] == ngpus-1-i) break;
    // The wanted value is absent: dist[] is not the expected arrangement, so
    // this is not a chordal ring. The previous code went on to use dist[ngpus].
    if (j >= ngpus) return ncclSuccess;
    m = dist[i]; n = dist[j]; dist[i] = n; dist[j] = m;
    temp = id[m]; id[m] = id[n]; id[n] = temp; temp =dist[m];
    dist[m] = dist[n]; dist[n] = temp;
  }
  // create chordal ring based on reference and remapped ids
  system->type |= RCCL_TOPO_CR8G;
  NCCLCHECK(parseGraph(ringBase, system, graph, id));
  if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
    int *intra = NULL, *used = NULL;
    int nnets = system->nodes[NET].count;
    // Only rings that parseGraph() actually produced can be rotated; never
    // advertise more channels than were filled in.
    if (nnets > graph->nChannels) nnets = graph->nChannels;
    graph->nChannels = nnets;
    NCCLCHECK(ncclCalloc(&intra, ngpus));
    ncclResult_t res = ncclCalloc(&used, system->nodes[NET].count);
    if (res != ncclSuccess) {
      // Do not leak the first buffer when the second allocation fails.
      free(intra);
      return res;
    }
    for (int n = 0; n < nnets; n++) {
      graph->inter[n*2] = graph->inter[n*2+1] = n;
      struct ncclTopoNode* net = system->nodes[NET].nodes+n;
      struct ncclTopoLinkList* paths = net->paths[GPU];
      // find the first unused GPU that is closest to NIC
      int f, m;
      for (f = 0; f < ngpus; f++) {
        int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
        if(j >= n) break;
      }
      for (int k = 0; k < ngpus; k++) {
        int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[k].gpu.rank) break;
        if (j < n) continue;
        if (paths[k].count < paths[f].count) f = k;
      }
      int rank = system->nodes[GPU].nodes[f].gpu.rank;
      for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == rank) break;
      // Record the chosen rank directly instead of reading it back from
      // graph->intra, which ran past the row when the rank was not found.
      used[n] = rank;
      if (m < ngpus) {
        // Rotate the ring so that it starts at the chosen GPU.
        for (int k = 0; k < ngpus; k++) intra[k] = graph->intra[n*ngpus+((k+m)%ngpus)];
        for (int k = 0; k < ngpus; k++) graph->intra[n*ngpus+k] = intra[k];
      }
    }
    free(used);
    free(intra);
  }
  return ncclSuccess;
}
|
||||
|
||||
/* Pairs a GPU's index in the topology GPU node array (g) with its HIP device
 * id (dev) so that GPUs can be sorted by device id. */
struct ncclGpuIdHIP {
  int g;
  int dev;
};

/* qsort() comparator ordering ncclGpuIdHIP entries by ascending dev.
 * Returns a negative, zero or positive value. The comparison form is used
 * instead of a subtraction so that the result cannot overflow for extreme
 * dev values. */
static int cmpIds(const void * g1, const void * g2) {
  const struct ncclGpuIdHIP *s1 = (const struct ncclGpuIdHIP*)g1;
  const struct ncclGpuIdHIP *s2 = (const struct ncclGpuIdHIP*)g2;
  return (s1->dev > s2->dev) - (s1->dev < s2->dev);
}
|
||||
|
||||
/* Summarise the detected topology into a rcclRomeModel and a screening
 * pattern string.
 *
 *   system   : topology to describe.
 *   romeTopo : output; counts, ids, NUMA affinity and the LINK_NVL
 *              connectivity matrix are written. ringBase and pattern members
 *              are not touched.
 *   pattern  : output buffer; receives two characters per CPU node
 *              ('0' + GPU count, '0' + NIC count) plus a terminating NUL.
 *              The caller must provide at least 2*nCpus+1 bytes.
 *
 * GPUs are stored in ascending gpu.dev order. When the system has more GPUs
 * or NICs than the fixed-size arrays can hold, or has no CPU node, the model
 * is reported as empty (all counts 0) and pattern is left as an empty string,
 * so no reference model can match.
 *
 * If RCCL_DUMP_ROME_MODEL_FILE is set, the model is also written to that file
 * as a C initialiser. Always returns ncclSuccess.
 */
static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRomeModel* romeTopo, char *pattern) {
  pattern[0] = 0; // pattern stays an empty string for unsupported topologies
  romeTopo->nGpus = system->nodes[GPU].count;
  romeTopo->nCpus = system->nodes[CPU].count;
  romeTopo->nNics = system->nodes[NET].count;
  romeTopo->nLinks = 0;
  // Everything below indexes arrays sized by MAX_ROME_GPUS / MAX_ROME_NICS and
  // reads paths[CPU][0]; refuse systems that do not fit instead of overflowing.
  if (romeTopo->nGpus > MAX_ROME_GPUS || romeTopo->nNics > MAX_ROME_NICS || romeTopo->nCpus < 1) {
    romeTopo->nGpus = romeTopo->nCpus = romeTopo->nNics = 0;
    return ncclSuccess;
  }
  const int nGpus = romeTopo->nGpus;
  const int nCpus = romeTopo->nCpus;
  const int nNics = romeTopo->nNics;

  // sort GPU devices by HIP device ID
  struct ncclGpuIdHIP scores[MAX_ROME_GPUS];
  for (int i = 0; i < nGpus; i ++) {
    scores[i].g = i;
    scores[i].dev = system->nodes[GPU].nodes[i].gpu.dev;
  }
  qsort(scores, nGpus, sizeof(struct ncclGpuIdHIP), cmpIds);

  for (int i = 0; i < nGpus; i ++) {
    struct ncclTopoNode* node = system->nodes[GPU].nodes+scores[i].g;
    romeTopo->gpuIds[i] = node->id;

    // NUMA affinity: the CPU node with the smallest path count (first wins ties).
    int m = 0;
    int distance = node->paths[CPU][0].count;
    for (int n = 1; n < nCpus; n++) {
      if (node->paths[CPU][n].count < distance) {
        distance = node->paths[CPU][n].count;
        m = n;
      }
    }
    romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;

    // Clear the whole row first so it is defined even when the GPU has no
    // GPU paths (the row used to be left uninitialised in that case).
    for (int n = 0; n < nGpus; n++) romeTopo->connMatrix[i*nGpus+n] = 0;
    if (node->paths[GPU] == NULL) continue;

    int count = 0;
    for (int n = 0; n < nGpus; n++) {
      // Look for a direct link whose remote node reports gpu.dev == n.
      struct ncclTopoLink* link;
      for (link = node->links; link->remNode; link++) {
        if (link->remNode->gpu.dev == n) break;
      }
      if (!link->remNode) continue;
      if (link->type != LINK_NVL) continue;
      romeTopo->connMatrix[i*nGpus+n] = 1;
      count ++;
    }
    if (romeTopo->nLinks < count) romeTopo->nLinks = count;
  }

  for (int net = 0; net < nNics; net++) {
    struct ncclTopoNode* nic = system->nodes[NET].nodes+net;
    // nicIds was never filled before, yet it is printed by the dump below.
    romeTopo->nicIds[net] = nic->id;
    int m = 0;
    int distance = nic->paths[CPU][0].count;
    for (int n = 1; n < nCpus; n++) {
      if (nic->paths[CPU][n].count < distance) {
        distance = nic->paths[CPU][n].count;
        m = n;
      }
    }
    romeTopo->nicNuma[net] = system->nodes[CPU].nodes[m].id;
  }

  // number of GPUs and NICs on each numa node is used as first screening pattern
  // NOTE(review): NUMA ids are compared against the loop index, so this
  // presumably expects CPU node ids in 0..nCpus-1 - confirm.
  for (int i = 0; i < nCpus; i++) {
    int g = 0, n = 0;
    for (int j = 0; j < nGpus; j++)
      if (romeTopo->gpuNuma[j] == i) g++;
    for (int j = 0; j < nNics; j++)
      if (romeTopo->nicNuma[j] == i) n++;
    pattern[i*2] = '0' + g;
    pattern[i*2+1] = '0' + n;
  }
  pattern[nCpus*2] = 0;

  const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
  if (romeModelFile) {
    INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
    FILE* file = fopen(romeModelFile, "w");
    if (file == NULL) {
      // Dumping is best effort; the detected model is still returned.
      WARN("Unable to open %s, not dumping Rome model.", romeModelFile);
      return ncclSuccess;
    }
    fprintf(file, "static struct rcclRomeModel rome_model_ = {\n");
    fprintf(file, "  .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks);
    fprintf(file, "  .gpuIds = { ");
    for (int i = 0; i < nGpus; i ++) fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
    fprintf(file, "},\n");
    fprintf(file, "  .nicIds = { ");
    for (int i = 0; i < nNics; i ++) fprintf(file, "0x%lx, ", romeTopo->nicIds[i]);
    fprintf(file, "},\n");
    fprintf(file, "  .gpuNuma = { ");
    for (int i = 0; i < nGpus; i ++) fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
    fprintf(file, "},\n");
    fprintf(file, "  .nicNuma = { ");
    for (int i = 0; i < nNics; i ++) fprintf(file, "%ld, ", romeTopo->nicNuma[i]);
    fprintf(file, "},\n");
    fprintf(file, "  .connMatrix = { ");
    for (int i = 0; i < nGpus; i ++)
      for (int n = 0; n < nGpus; n++) fprintf(file, "%d, ", romeTopo->connMatrix[i*nGpus+n]);
    fprintf(file, "},\n");
    fprintf(file, "  .pattern = \"%s\",\n", pattern);
    fprintf(file, "  .ringBase = \"\",\n");
    fprintf(file, "};\n");
    fclose(file);
  }
  return ncclSuccess;
}
|
||||
|
||||
static bool permuteGpuIds(int *g, int n, int last, struct rcclRomeModel* ref, struct rcclRomeModel* topo, int* time, bool nbio) {
|
||||
(*time) ++;
|
||||
if (n == last) {
|
||||
int i, j;
|
||||
// match GPU numa
|
||||
for (i = 0; i < ref->nGpus; i++)
|
||||
if (ref->gpuNuma[i] != topo->gpuNuma[g[i]]) break;
|
||||
if (i < ref->nGpus) return false;
|
||||
// match XGMI connection
|
||||
for (i = 0; i < ref->nGpus; i++) {
|
||||
for (j = 0; j < ref->nGpus; j++) {
|
||||
if (ref->connMatrix[i*ref->nGpus+j] != topo->connMatrix[g[i]*ref->nGpus+g[j]]) break;
|
||||
if ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0) break;
|
||||
}
|
||||
if (j < ref->nGpus) break;
|
||||
}
|
||||
if (i < ref->nGpus) return false;
|
||||
// match NBIO
|
||||
if (nbio) {
|
||||
for (i = 0; i < ref->nGpus; i++) {
|
||||
for (j = 0; j < ref->nGpus; j++) {
|
||||
if (i == j) continue;
|
||||
bool nbio_ref = (ref->gpuIds[i]&0xf0000) == (ref->gpuIds[j]&0xf0000);
|
||||
bool nbio_topo = (topo->gpuIds[g[i]]&0xf0000) == (topo->gpuIds[g[j]]&0xf0000);
|
||||
if (nbio_ref != nbio_topo) break;
|
||||
if (nbio_ref && ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0)) break;
|
||||
}
|
||||
if (j < ref->nGpus) break;
|
||||
}
|
||||
if (i < ref->nGpus) return false;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
for (int i = n; i <= last; i++) {
|
||||
std::swap(g[n], g[i]);
|
||||
if (permuteGpuIds(g, n+1, last, ref, topo, time, nbio)) return true;
|
||||
std::swap(g[n], g[i]);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Try to recognise the system as one of the known Rome topologies and, on a
 * match, build the ring channels from that model's reference rings.
 *
 * Steps:
 *   1. Return immediately unless the CPU is an x86 AMD Rome part.
 *   2. Summarise the system with parseRomeSystem().
 *   3. Flag the system as RCCL_TOPO_4P2H_ROME when it has more than 4 GPUs and
 *      at least one LINK_NVL GPU link, even if no model matches.
 *   4. For every reference model with identical counts and pattern, search a
 *      GPU permutation that makes the two descriptions agree.
 *   5. On success, log the mapping and hand the model's ringBase to
 *      parseGraph().
 *
 * Returns ncclSuccess when the system is not Rome, exceeds the fixed model
 * capacity, or matches no model (graph is left untouched in those cases);
 * errors from the helpers are propagated.
 */
ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;

  // only valid on Rome
  int arch, vendor, model;
  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
  if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
    return ncclSuccess;

  // number of GPUs and NICs on each numa node is used as first screening pattern
  struct rcclRomeModel romeTopo;
  char pattern[256];

  // The model arrays, g[] and pattern[] have fixed sizes; a system that does
  // not fit cannot match any reference model, so skip it instead of
  // overflowing those buffers.
  if (ngpus > MAX_ROME_GPUS || system->nodes[NET].count > MAX_ROME_NICS ||
      system->nodes[CPU].count*2 >= (int)sizeof(pattern))
    return ncclSuccess;

  NCCLCHECK(parseRomeSystem(system, &romeTopo, pattern));

  // recognize system as Rome 4P2H even if no matching model
  if (ngpus > 4 && romeTopo.nLinks) system->type |= RCCL_TOPO_4P2H_ROME;

  // check if GPUs are directly connected to CPU
  bool match_nbio = true;
  for (int k = 0; k < romeTopo.nGpus; k++) {
    int cpu, gpu;
    NCCLCHECK(ncclTopoIdToIndex(system, CPU, romeTopo.gpuNuma[k], &cpu));
    NCCLCHECK(ncclTopoIdToIndex(system, GPU, romeTopo.gpuIds[k], &gpu));
    if (system->nodes[GPU].nodes[gpu].paths[CPU][cpu].count > 2) {
      match_nbio = false;
      break;
    }
  }

  const int nModels = (int)(sizeof(romeTopoModels)/sizeof(romeTopoModels[0]));
  int g[MAX_ROME_GPUS];
  int time = 0; // number of permuteGpuIds() calls, kept for debugging
  int i;
  for (i = 0; i < nModels; i++) {
    if (romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus ||
      romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue;
    if (strcmp(romeTopoModels[i].pattern, pattern)) continue;
    // Starting permutation for the search (identity rotated by two).
    for (int j = 0; j < ngpus; j++) g[j] = (j+2)%ngpus;
    if (permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time, match_nbio)) break;
  }
  if (i >= nModels) return ncclSuccess;

  // Bounded formatting: never write past the end of line[].
  char line[1024];
  int offset = snprintf(line, sizeof(line), "Found matching Rome model index %d with GPU mapping: ", i);
  for (int k = 0; k < ngpus && offset >= 0 && offset < (int)sizeof(line); k++) {
    int len = snprintf(line+offset, sizeof(line)-offset, "%d ", g[k]);
    if (len < 0) break;
    offset += len;
  }
  INFO(NCCL_GRAPH, "%s", line);

  // create 4P2H based on reference and remapped ids
  NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g));
  return ncclSuccess;
}
|
||||
@@ -19,263 +19,11 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
#ifndef RCCL_ROME_MODELS_H_
|
||||
#define RCCL_ROME_MODELS_H_
|
||||
|
||||
#define MAX_ROME_GPUS 16
|
||||
#define MAX_ROME_NICS 8
|
||||
ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map);
|
||||
ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
|
||||
ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
|
||||
|
||||
struct rcclRomeModel {
|
||||
int nGpus;
|
||||
int nCpus;
|
||||
int nNics;
|
||||
int nLinks;
|
||||
int64_t gpuIds[MAX_ROME_GPUS];
|
||||
int64_t nicIds[MAX_ROME_NICS];
|
||||
int64_t gpuNuma[MAX_ROME_GPUS];
|
||||
int64_t nicNuma[MAX_ROME_NICS];
|
||||
int connMatrix[MAX_ROME_GPUS*MAX_ROME_GPUS];
|
||||
const char *pattern;
|
||||
const char *ringBase;
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_22 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2,
|
||||
.gpuIds = { 0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000, },
|
||||
.nicIds = { 0xe1000, },
|
||||
.gpuNuma = { 1, 0, 1, 2, 3, 1, 2, 3, },
|
||||
.nicNuma = { 2, },
|
||||
.connMatrix = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, },
|
||||
.pattern = "10302120",
|
||||
.ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_25 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { 0x61000, 0xa1000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
|
||||
.nicNuma = { 0, 3, },
|
||||
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
|
||||
.pattern = "11303011",
|
||||
.ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_27 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { 0x61000, 0xa1000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
|
||||
.nicNuma = { 0, 3, },
|
||||
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
|
||||
.pattern = "11303011",
|
||||
.ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_29 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { 0xe1000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { 2, },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "10302120",
|
||||
.ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_31 = {
|
||||
.nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { 0x61000, 0xa1000, },
|
||||
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
|
||||
.nicNuma = { 0, 6, },
|
||||
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
|
||||
.pattern = "0110201010200110",
|
||||
.ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_33 = {
|
||||
.nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { 0x61000, 0xa1000, },
|
||||
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
|
||||
.nicNuma = { 0, 6, },
|
||||
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
|
||||
.pattern = "0110201010200110",
|
||||
.ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_30 = {
|
||||
.nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
|
||||
.pattern = "0010201010200010",
|
||||
.ringBase = "3 0 1 2 6 7 5 4|2 1 0 3 7 6 4 5",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_32 = {
|
||||
.nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
|
||||
.pattern = "0010201010200010",
|
||||
.ringBase = "0 6 2 3 4 5 7 1|3 2 6 0 1 7 5 4",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_24 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
|
||||
.pattern = "10303010",
|
||||
.ringBase = "0 1 2 3 5 7 6 4|1 0 3 2 7 5 4 6",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_26 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
|
||||
.pattern = "10303010",
|
||||
.ringBase = "4 5 7 1 0 3 2 6|3 0 6 2 1 7 5 4",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_23 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, },
|
||||
.pattern = "10302020",
|
||||
.ringBase = "1 7 6 4 5 2 0 3|2 5 3 0 4 6 7 1",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_38 = {
|
||||
.nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, },
|
||||
.pattern = "00102010002010",
|
||||
.ringBase = "6 7 1 4 3 5 2 0|0 2 5 3 4 1 7 6",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_28 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "10302020",
|
||||
.ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_40 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { 0xe1000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { 2, },
|
||||
.connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, },
|
||||
.pattern = "10302120",
|
||||
.ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_42 = {
|
||||
.nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { 0xe1000, },
|
||||
.gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
|
||||
.nicNuma = { 4, },
|
||||
.connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, },
|
||||
.pattern = "00102010012010",
|
||||
.ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_44 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x63000, 0x43000, 0x27000, 0x3000, 0xe3000, 0xc3000, 0xa3000, 0x83000, },
|
||||
.nicIds = { 0xc4000, },
|
||||
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { 2, },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "20202120",
|
||||
.ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_45 = {
|
||||
.nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "00102010002010",
|
||||
.ringBase = "0 1 2 3 4 5 6 7|0 2 5 7 4 6 1 3|0 3 1 6 4 7 5 2|0 7 6 5 4 3 2 1",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_46 = {
|
||||
.nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.nicIds = { 0xe1000, },
|
||||
.gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
|
||||
.nicNuma = { 4, },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "00102010012010",
|
||||
.ringBase = "6 5 7 4 1 2 3 0|7 4 6 5 1 0 3 2",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_48 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
|
||||
.gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
|
||||
.nicIds = { },
|
||||
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "20202020",
|
||||
.ringBase = "0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel rome_model_49 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3,
|
||||
.gpuIds = { 0x4a000, 0x50000, 0xa000, 0xf000, 0xcb000, 0xd1000, 0x8a000, 0x90000, },
|
||||
.nicIds = { 0x45000, 0x13000, 0xc6000, 0x85000, },
|
||||
.gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { 0, 1, 2, 3, },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "21212121",
|
||||
.ringBase = "N0 0 1 2 3 4 5 6 7 N3|N3 7 6 5 4 3 2 1 0 N0|N1 2 3 0 1 6 7 4 5 N2|N2 5 4 7 6 1 0 3 2 N1",
|
||||
};
|
||||
|
||||
static struct rcclRomeModel romeTopoModels[] = {
|
||||
rome_model_22,
|
||||
rome_model_25,
|
||||
rome_model_27,
|
||||
rome_model_29,
|
||||
rome_model_31,
|
||||
rome_model_33,
|
||||
rome_model_30,
|
||||
rome_model_32,
|
||||
rome_model_24,
|
||||
rome_model_26,
|
||||
rome_model_23,
|
||||
rome_model_38,
|
||||
rome_model_28,
|
||||
rome_model_40,
|
||||
rome_model_42,
|
||||
rome_model_44,
|
||||
rome_model_45,
|
||||
rome_model_46,
|
||||
rome_model_48,
|
||||
rome_model_49,
|
||||
};
|
||||
#endif
|
||||
+1
-417
@@ -689,422 +689,6 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Parse user defined rings. Format is like :
|
||||
* "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1"
|
||||
* Network interfaces can be optionally specified by N prefix.
|
||||
* Rings with a non-matching number of gpus are ignored so we can provide
|
||||
* rings for multiple cases.
|
||||
*/
|
||||
static ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int nnets, int* net_map ) {
|
||||
int gpus[MAX_ROME_GPUS];
|
||||
int nChannels = 0;
|
||||
int gpu = 0;
|
||||
int offset = 0;
|
||||
int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET
|
||||
int nets[2];
|
||||
int net = 0;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
do {
|
||||
if (str[offset] == 'N') {
|
||||
if (status == 0) {
|
||||
status = 2;
|
||||
}
|
||||
} else {
|
||||
int digit = str[offset] - '0';
|
||||
if (digit >= 0 && digit <= 9) {
|
||||
if (status == 0) {
|
||||
gpus[gpu] = digit;
|
||||
status = 1;
|
||||
} else if (status == 2) {
|
||||
nets[net] = digit;
|
||||
}
|
||||
else{
|
||||
gpus[gpu] = gpus[gpu]*10+digit;
|
||||
}
|
||||
} else {
|
||||
if (status == 1) {
|
||||
gpu++;
|
||||
if (gpu > MAX_ROME_GPUS) goto end;
|
||||
} else if (status == 2) {
|
||||
net++;
|
||||
if (net > 2) goto end;
|
||||
}
|
||||
status = 0;
|
||||
if (str[offset] == '|' || str[offset] == '\0') {
|
||||
// Ignore if ngpus doesn't match
|
||||
if (gpu != ngpus) goto newchannel;
|
||||
// Ignore if nnets are not 0 or 2
|
||||
if (net && net != 2) goto newchannel;
|
||||
|
||||
for (int r=0; r<ngpus; r++) {
|
||||
int g = gpus[r];
|
||||
// Ignore if gpus are out of bounds
|
||||
if (g < 0 || g >= ngpus) goto newchannel;
|
||||
// Ignore if gpus are duplicate
|
||||
for (int i=0; i<r; i++)
|
||||
if (gpus[i] == g) goto newchannel;
|
||||
// remap if needed
|
||||
if (gpu_map) g = gpu_map[g];
|
||||
// Translate gpu numbers into ranks
|
||||
int j = 0;
|
||||
for (j = 0; j < ngpus; j++)
|
||||
if (g == system->nodes[GPU].nodes[j].gpu.dev)
|
||||
break;
|
||||
if (j < ngpus)
|
||||
graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank;
|
||||
else
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
if (net) {
|
||||
if (nets[0] >= nnets || nets[1] >= nnets) goto newchannel;
|
||||
graph->inter[nChannels*2] = nets[0];
|
||||
graph->inter[nChannels*2+1] = nets[1];
|
||||
} else if (net_map && nnets) {
|
||||
graph->inter[nChannels*2] = net_map[nChannels%nnets];
|
||||
graph->inter[nChannels*2+1] = net_map[(nChannels+1)%nnets];
|
||||
} else if (nnets) {
|
||||
graph->inter[nChannels*2] = nChannels%nnets;
|
||||
graph->inter[nChannels*2+1] = (nChannels+1)%nnets;
|
||||
}
|
||||
nChannels++;
|
||||
newchannel:
|
||||
gpu = 0;
|
||||
net = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (str[offset++] != 0);
|
||||
end:
|
||||
graph->nChannels = nChannels;
|
||||
graph->speedIntra = graph->speedInter = system->maxWidth;
|
||||
#if 0
|
||||
for (int i=0; i<graph->nChannels; i++) {
|
||||
printf("%d: ", i);
|
||||
printf ("NET/%d ", graph->inter[i*2]);
|
||||
for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
|
||||
printf ("NET/%d ", graph->inter[i*2+1]);
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Detect an 8-GPU chordal ring and, when found, load the reference rings.
 *
 * Every GPU must have exactly ngpus-2 LINK_NVL links. dist[i] is then the
 * remainder left after subtracting the GPU's own gpu.dev and the gpu.dev of
 * every linked GPU from 0+1+...+(ngpus-1). GPU ids are remapped so that
 * dist[i] == ngpus-1-i before the reference rings are parsed.
 *
 * When NET nodes exist and the GPU count differs from system->nRanks, one
 * channel is produced per NET node and each ring is rotated so that it starts
 * at a not-yet-used GPU with the shortest path to that NET node.
 *
 * Returns ncclSuccess without touching graph or system->type when the system
 * is not recognized as a chordal ring; otherwise propagates errors from
 * parseGraph / ncclCalloc.
 */
static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  static const char *ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
  int id[8], dist[8];
  int i;

  int ngpus = system->nodes[GPU].count;
  if (ngpus != 8)
    return ncclSuccess;
  // validate chordal ring and calculate distance
  for (i=0; i<ngpus; i++) {
    struct ncclTopoNode* node = system->nodes[GPU].nodes+i;
    // Without GPU paths dist[i] would stay uninitialized and later be used as
    // an array index; treat the system as "not a chordal ring" instead.
    if (node->paths[GPU] == NULL) return ncclSuccess;
    int sum = ngpus*(ngpus-1)/2 - node->gpu.dev;
    int count = 0;
    for (int n = 0; n<ngpus; n++) {
      struct ncclTopoLink* link;
      for (link = node->links; link->remNode; link++) {
        if (link->remNode->gpu.dev == n) break;
      }
      if (!link->remNode) continue;
      if (link->type != LINK_NVL) continue;
      sum -= system->nodes[GPU].nodes[n].gpu.dev;
      count ++;
    }
    if(count != ngpus-2 || sum < 0 || sum > ngpus-1) {
      return ncclSuccess;
    }
    dist[i] = sum;
  }
  // remap GPU ids
  for (i = 0; i<ngpus; i++) id[i] = i;
  for (i = 0; i<ngpus; i++) {
    if (dist[i] == ngpus-1-i) continue;
    int j, m, n, temp;
    for (j=i+1; j < ngpus; j++)
      if(dist[j] == ngpus-1-i) break;
    // No later GPU carries the required distance: reading dist[j] would run
    // past the array, so give up on the chordal-ring layout.
    if (j >= ngpus) return ncclSuccess;
    m = dist[i]; n = dist[j]; dist[i] = n; dist[j] = m;
    temp = id[m]; id[m] = id[n]; id[n] = temp; temp =dist[m];
    dist[m] = dist[n]; dist[n] = temp;
  }
  // create chordal ring based on reference and remapped ids
  system->type |= RCCL_TOPO_CR8G;
  NCCLCHECK(parseGraph(ringBase, system, graph, id, 0, NULL));
  if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
    int *intra, *used;
    graph->nChannels = system->nodes[NET].count;
    NCCLCHECK(ncclCalloc(&intra, ngpus));
    // Release the first buffer if the second allocation fails.
    ncclResult_t res = ncclCalloc(&used,system->nodes[NET].count);
    if (res != ncclSuccess) {
      free(intra);
      return res;
    }
    for (int n = 0; n < system->nodes[NET].count; n++) {
      graph->inter[n*2] = graph->inter[n*2+1] = n;
      struct ncclTopoNode* net = system->nodes[NET].nodes+n;
      struct ncclTopoLinkList* paths = net->paths[GPU];
      // find the first unused GPU that is closest to NIC
      int f, m;
      for (f = 0; f < ngpus; f++) {
        int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
        if(j >= n) break;
      }
      for (int i = 0; i < ngpus; i++) {
        int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
        if (j < n) continue;
        if (paths[i].count < paths[f].count) f = i;
      }
      // rotate ring n so that it starts at the chosen GPU's rank
      for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
      used[n] = graph->intra[n*ngpus+m];
      for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
      for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
    }
    free(used);
    free(intra);
  }
  return ncclSuccess;
}
|
||||
|
||||
/* Sort record pairing a GPU's position in the topology with its HIP device id. */
struct ncclGpuIdHIP {
  int g;    // index of the GPU in system->nodes[GPU].nodes
  int dev;  // HIP device id (gpu.dev), used as the sort key
};

/* qsort comparator ordering ncclGpuIdHIP records by ascending dev.
 * Returns -1, 0 or 1. Uses comparisons rather than subtraction so the result
 * cannot overflow for any pair of int values. */
static int cmpIds(const void * g1, const void * g2) {
  const struct ncclGpuIdHIP *s1 = (const struct ncclGpuIdHIP*)g1;
  const struct ncclGpuIdHIP *s2 = (const struct ncclGpuIdHIP*)g2;
  return (s1->dev > s2->dev) - (s1->dev < s2->dev);
}
|
||||
|
||||
/* Describe the running system as a rcclRomeModel and build its screening pattern.
 *
 * system    topology to inspect
 * romeTopo  output: GPU/CPU/NIC counts, ids, closest CPU node per GPU/NIC,
 *           LINK_NVL adjacency matrix and nLinks (largest per-GPU link count)
 * pattern   output string with two digits per CPU node: number of GPUs, then
 *           number of NICs, closest to that node. The caller must provide at
 *           least 2*nCpus+1 bytes. Left empty for a topology that cannot be
 *           described.
 * net_map   output: for each kept NIC, the index of the chosen NET node. The
 *           caller must provide MAX_ROME_NICS entries.
 *
 * NET nodes sharing the same net.asic are collapsed to the one with the
 * largest net.width. When RCCL_DUMP_ROME_MODEL_FILE is set, the detected model
 * is also written to that file as a C initializer.
 *
 * A system with more than MAX_ROME_GPUS GPUs cannot be stored in the model;
 * it is reported with nGpus == 0 and an empty pattern so that it matches no
 * reference model.
 */
static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRomeModel* romeTopo, char *pattern, int *net_map) {
  pattern[0] = 0; // pattern will be NULL for invalid topology
  romeTopo->nGpus = system->nodes[GPU].count;
  romeTopo->nCpus = system->nodes[CPU].count;
  romeTopo->nNics = 0;
  romeTopo->nLinks = 0;

  // scores[] and the model arrays hold MAX_ROME_GPUS entries at most
  if (romeTopo->nGpus > MAX_ROME_GPUS) {
    romeTopo->nGpus = 0;
    return ncclSuccess;
  }

  // sort GPU devices by HIP device ID
  struct ncclGpuIdHIP scores[MAX_ROME_GPUS];
  for (int i = 0; i < romeTopo->nGpus; i ++) {
    scores[i].g = i;
    scores[i].dev = system->nodes[GPU].nodes[i].gpu.dev;
  }
  qsort(scores, romeTopo->nGpus, sizeof(struct ncclGpuIdHIP), cmpIds);

  for (int i = 0; i < romeTopo->nGpus; i ++) {
    int gpu, n, m, distance;
    gpu = scores[i].g;
    romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;

    // closest CPU node = shortest GPU->CPU path
    m = 0;
    distance = system->nodes[GPU].nodes[gpu].paths[CPU][m].count;
    for (n = 1; n < romeTopo->nCpus; n++) {
      if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count < distance) {
        distance = system->nodes[GPU].nodes[gpu].paths[CPU][n].count;
        m = n;
      }
    }
    if (m < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[m].id;

    // clear the whole row first so it is defined even when the GPU is skipped
    for (n = 0; n < romeTopo->nGpus; n++)
      romeTopo->connMatrix[i*romeTopo->nGpus+n] = 0;

    struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu;
    if (node->paths[GPU] == NULL) continue;

    // mark every GPU reachable through a LINK_NVL link
    int count = 0;
    for (n = 0; n < romeTopo->nGpus; n++) {
      struct ncclTopoLink* link;
      for (link = node->links; link->remNode; link++) {
        if (link->remNode->gpu.dev == n) break;
      }
      if (!link->remNode) continue;
      if (link->type != LINK_NVL) continue;
      romeTopo->connMatrix[i*romeTopo->nGpus+n] = 1;
      count ++;
    }
    if (romeTopo->nLinks < count) romeTopo->nLinks = count;
  }

  // trim ports and create NET map
  for (int i = 0; i < system->nodes[NET].count; i ++) {
    int j;
    for (j = 0; j < romeTopo->nNics; j++) {
      if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[net_map[j]].net.asic) {
        // same NIC: keep the wider port
        if (system->nodes[NET].nodes[i].net.width > system->nodes[NET].nodes[net_map[j]].net.width)
          net_map[j] = i;
        break;
      }
    }
    if (j >= romeTopo->nNics) {
      // first port seen on this NIC
      net_map[j] = i;
      romeTopo->nicIds[romeTopo->nNics] = system->nodes[NET].nodes[i].net.busId;
      (romeTopo->nNics)++;
      if (romeTopo->nNics >= MAX_ROME_NICS) break;
    }
  }

  for (int i = 0; i < romeTopo->nNics; i ++) {
    int net, n, m, distance;
    // NOTE(review): net_map[] entries are used as NET node indices above but
    // are looked up as node ids here — confirm that ids and indices coincide.
    NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net));
    m = 0;
    distance = system->nodes[NET].nodes[net].paths[CPU][m].count;
    for (n = 0; n < romeTopo->nCpus; n++)
      if (system->nodes[NET].nodes[net].paths[CPU][n].count < distance) {
        distance = system->nodes[NET].nodes[net].paths[CPU][n].count;
        m = n;
      }
    if (m < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[m].id;
    else return ncclSuccess;
  }

  // number of GPUs and NICs on each numa node is used as first screening pattern
  for (int i = 0; i < romeTopo->nCpus; i++) {
    int g = 0, n = 0;
    for (int j = 0; j < romeTopo->nGpus; j++)
      if (romeTopo->gpuNuma[j] == i) g++;
    for (int j = 0; j < romeTopo->nNics; j++)
      if (romeTopo->nicNuma[j] == i) n++;
    pattern[i*2] = '0' + g;
    pattern[i*2+1] = '0' + n;
  }
  pattern[romeTopo->nCpus*2] = 0;

  const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
  if (romeModelFile) {
    INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
    FILE* file = fopen(romeModelFile, "w");
    if (file == NULL) {
      WARN("Unable to open %s, not dumping Rome model.", romeModelFile);
      return ncclSuccess;
    }
    fprintf(file, "static struct rcclRomeModel rome_model_ = {\n");
    fprintf(file, " .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks);
    fprintf(file, " .gpuIds = { ");
    for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
    fprintf(file, "},\n");
    fprintf(file, " .nicIds = { ");
    for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "0x%lx, ", romeTopo->nicIds[i]);
    fprintf(file, "},\n");
    fprintf(file, " .gpuNuma = { ");
    for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
    fprintf(file, "},\n");
    fprintf(file, " .nicNuma = { ");
    for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "%ld, ", romeTopo->nicNuma[i]);
    fprintf(file, "},\n");
    fprintf(file, " .connMatrix = { ");
    for (int i = 0; i < romeTopo->nGpus; i ++)
      for (int n = 0; n < romeTopo->nGpus; n++) fprintf(file, "%d, ", romeTopo->connMatrix[i*romeTopo->nGpus+n]);
    fprintf(file, "},\n");
    fprintf(file, " .pattern = \"%s\",\n", pattern);
    fprintf(file, " .ringBase = \"\",\n");
    fprintf(file, "};\n");
    fclose(file);
  }
  return ncclSuccess;
}
|
||||
|
||||
static bool permuteGpuIds(int *g, int n, int last, struct rcclRomeModel* ref, struct rcclRomeModel* topo, int* time, bool nbio) {
|
||||
(*time) ++;
|
||||
if (n == last) {
|
||||
int i, j;
|
||||
// match GPU numa
|
||||
for (i = 0; i < ref->nGpus; i++)
|
||||
if (ref->gpuNuma[i] != topo->gpuNuma[g[i]]) break;
|
||||
if (i < ref->nGpus) return false;
|
||||
// match XGMI connection
|
||||
for (i = 0; i < ref->nGpus; i++) {
|
||||
for (j = 0; j < ref->nGpus; j++) {
|
||||
if (ref->connMatrix[i*ref->nGpus+j] != topo->connMatrix[g[i]*ref->nGpus+g[j]]) break;
|
||||
if ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0) break;
|
||||
}
|
||||
if (j < ref->nGpus) break;
|
||||
}
|
||||
if (i < ref->nGpus) return false;
|
||||
// match NBIO
|
||||
if (nbio) {
|
||||
for (i = 0; i < ref->nGpus; i++) {
|
||||
for (j = 0; j < ref->nGpus; j++) {
|
||||
if (i == j) continue;
|
||||
bool nbio_ref = (ref->gpuIds[i]&0xf0000) == (ref->gpuIds[j]&0xf0000);
|
||||
bool nbio_topo = (topo->gpuIds[g[i]]&0xf0000) == (topo->gpuIds[g[j]]&0xf0000);
|
||||
if (nbio_ref != nbio_topo) break;
|
||||
if (nbio_ref && ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0)) break;
|
||||
}
|
||||
if (j < ref->nGpus) break;
|
||||
}
|
||||
if (i < ref->nGpus) return false;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
for (int i = n; i <= last; i++) {
|
||||
std::swap(g[n], g[i]);
|
||||
if (permuteGpuIds(g, n+1, last, ref, topo, time, nbio)) return true;
|
||||
std::swap(g[n], g[i]);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Recognize a Rome 4P2H system and load the rings of the matching reference model.
 *
 * Does nothing unless the CPU is an x86 AMD Rome part. The system is described
 * with parseRomeSystem and flagged RCCL_TOPO_4P2H_ROME when it has more than
 * four GPUs and at least one LINK_NVL link, even if no model matches. Each
 * reference model with the same counts and pattern is then tried with
 * permuteGpuIds; on the first match its ringBase is parsed into `graph` using
 * the GPU permutation that was found.
 *
 * Returns ncclSuccess when the system is not Rome, is too large for the model
 * tables, or matches no model; otherwise the result of parseGraph. Errors from
 * the topology helpers are propagated.
 */
static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
  int ngpus = system->nodes[GPU].count;

  // only valid on Rome
  int arch, vendor, model;
  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
  if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
    return ncclSuccess;

  // the model structures and g[] below hold MAX_ROME_GPUS entries at most
  if (ngpus > MAX_ROME_GPUS)
    return ncclSuccess;

  // number of GPUs and NICs on each numa node is used as first screening pattern
  struct rcclRomeModel romeTopo;
  char pattern[256];
  int net_map[MAX_ROME_NICS];
  NCCLCHECK(parseRomeSystem(system, &romeTopo, pattern, net_map));

  // recognize system as Rome 4P2H even if no matching model
  if (ngpus > 4 && romeTopo.nLinks) system->type |= RCCL_TOPO_4P2H_ROME;

  // check if GPUs are directly connected to CPU
  bool match_nbio = true;
  for (int i = 0; i < romeTopo.nGpus; i++) {
    int cpu, gpu;
    NCCLCHECK(ncclTopoIdToIndex(system, CPU, romeTopo.gpuNuma[i], &cpu));
    NCCLCHECK(ncclTopoIdToIndex(system, GPU, romeTopo.gpuIds[i], &gpu));
    if (system->nodes[GPU].nodes[gpu].paths[CPU][cpu].count > 2) {
      match_nbio = false;
      break;
    }
  }

  // find the first reference model the detected system can be permuted onto
  const int nModels = (int)(sizeof(romeTopoModels)/sizeof(romeTopoModels[0]));
  int g[MAX_ROME_GPUS];
  int time = 0;  // iteration counter required by permuteGpuIds
  int found;
  for (found = 0; found < nModels; found++) {
    if (romeTopo.nCpus != romeTopoModels[found].nCpus || romeTopo.nGpus != romeTopoModels[found].nGpus ||
        romeTopo.nNics != romeTopoModels[found].nNics || romeTopo.nLinks != romeTopoModels[found].nLinks) continue;
    if (strcmp(romeTopoModels[found].pattern, pattern)) continue;
    // starting permutation for the search
    for (int j = 0; j < ngpus; j++) g[j] = (j+2)%ngpus;
    if (permuteGpuIds(g, 0, ngpus-1, romeTopoModels+found, &romeTopo, &time, match_nbio)) break;
  }
  if (found >= nModels)
    return ncclSuccess;

  // log the match; every write is bounded by the size of line[]
  char line[1024];
  int offset = snprintf(line, sizeof(line), "Found matching Rome model index %d with GPU mapping: ", found);
  for (int k = 0; k < ngpus && offset < (int)sizeof(line); k++)
    offset += snprintf(line+offset, sizeof(line)-offset, "%d ", g[k]);
  INFO(NCCL_GRAPH, "%s", line);

  // create 4P2H based on reference and remapped ids
  NCCLCHECK(parseGraph(romeTopoModels[found].ringBase, system, graph, g, romeTopo.nNics, net_map));
  return ncclSuccess;
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
float speedArray[] = { 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
|
||||
#else
|
||||
@@ -1141,7 +725,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
str = getenv("NCCL_RINGS");
|
||||
if (str) {
|
||||
// user supplied topo
|
||||
NCCLCHECK(parseGraph(str, system, graph, NULL, nnets, NULL));
|
||||
NCCLCHECK(parseGraph(str, system, graph, NULL));
|
||||
if (graph->nChannels) {
|
||||
system->type |= RCCL_TOPO_4P2H_ROME;
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ EXE = topo_expl
|
||||
CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE -lnuma
|
||||
|
||||
files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc \
|
||||
../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc
|
||||
../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc ../../src/graph/rome_models.cc
|
||||
|
||||
all: $(EXE)
|
||||
|
||||
|
||||
Αναφορά σε νέο ζήτημα
Block a user