Add NPS4 support on some models (#256)
* Add NPS4 support on some models
* Add XML models
Este cometimento está contido em:
cometido por
GitHub
ascendente
ec9af40fcd
cometimento
391bbf3f1e
+26
-4
@@ -882,6 +882,8 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
|
||||
static const char *ringBase_10302120_2 = "6 4 7 5 0 1 3 2|6 5 7 4 2 3 1 0";
|
||||
static const char *ringBase_11303011_1 = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0";
|
||||
static const char *ringBase_11303011_2 = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2";
|
||||
static const char *ringBase_0110201010200110_1 = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3";
|
||||
static const char *ringBase_0110201010200110_2 = "3 0 6 2 1 4 5 7|4 1 0 3 2 6 7 5";
|
||||
static const char *ringBase;
|
||||
static char ringRemap[64];
|
||||
int id[8], dist[8];
|
||||
@@ -891,7 +893,7 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int ncpus = system->nodes[CPU].count;
|
||||
// 8 GPUs and either 4 or 8 numa nodes only
|
||||
if (ngpus != 8 || ncpus != 4)
|
||||
if (ngpus != 8 || (ncpus != 4 && ncpus != 8))
|
||||
return ncclSuccess;
|
||||
// only valid on Rome
|
||||
int arch, vendor, model;
|
||||
@@ -899,14 +901,14 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
|
||||
if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
|
||||
return ncclSuccess;
|
||||
// number of GPUs and NICs on each numa node is used as first screening pattern
|
||||
char pattern[9];
|
||||
for (int i = 0; i < ncpus; i++) {
|
||||
char pattern[256];
|
||||
for (i = 0; i < ncpus; i++) {
|
||||
int g, n;
|
||||
if (!getGpuNetCount(system, i, &g, &n)) return ncclSuccess;
|
||||
pattern[i*2] = '0' + g;
|
||||
pattern[i*2+1] = '0' + n;
|
||||
}
|
||||
pattern[8] = 0;
|
||||
pattern[i*2] = 0;
|
||||
int g[8], h1[4], h2[4];
|
||||
for (int i = 0; i <8; i++) g[i] = -1;
|
||||
if (strcmp(pattern, "10302120") == 0) {
|
||||
@@ -961,6 +963,26 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
|
||||
ringBase = ringBase_11303011_1;
|
||||
}
|
||||
}
|
||||
else if (strcmp(pattern, "0110201010200110") == 0) {
|
||||
if (findGpuByXGMI(system, 2, 5, &g[2], &g[6], 1, -1, -1)) {
|
||||
if (!findGpuByXGMI(system, 4, 2, &g[4], &g[1], 1, g[6], g[2])) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], 1, -1, -1)) return ncclSuccess;
|
||||
h1[0] = g[0]; h1[1] = g[3]; h1[2] = g[2]; h1[3] = g[6];
|
||||
h2[0] = g[1]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[7];
|
||||
ringBase = ringBase_0110201010200110_2;
|
||||
} else {
|
||||
if (!findGpuByXGMI(system, 1, 2, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 2, 2, &g[1], &g[2], -1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], -1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[6], -1, -1, g[5])) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 4, 5, &g[4], &g[5], -1, -1, -1)) return ncclSuccess;
|
||||
h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
|
||||
h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
|
||||
ringBase = ringBase_0110201010200110_1;
|
||||
}
|
||||
}
|
||||
else
|
||||
return ncclSuccess;
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
for i in {0..29}
|
||||
for i in {0..33}
|
||||
do
|
||||
$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
|
||||
$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
|
||||
|
||||
@@ -0,0 +1,92 @@
|
||||
<system version="2">
|
||||
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="0" sm="96" gcn="906" arch="38911" rank="0" gdr="1">
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="1" sm="96" gcn="906" arch="38911" rank="1" gdr="1">
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="2" sm="96" gcn="906" arch="38911" rank="2" gdr="1">
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="3" sm="96" gcn="906" arch="38911" rank="3" gdr="1">
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="4" sm="96" gcn="906" arch="38911" rank="4" gdr="1">
|
||||
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="5" sm="96" gcn="906" arch="38911" rank="5" gdr="1">
|
||||
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="6" sm="96" gcn="906" arch="38911" rank="6" gdr="1">
|
||||
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="7" sm="96" gcn="906" arch="38911" rank="7" gdr="1">
|
||||
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:61:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
|
||||
<nic>
|
||||
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x18815600039f59b8" maxconn="262144" gdr="1"/>
|
||||
</nic>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="6" affinity="0000ffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:a1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
|
||||
<nic>
|
||||
<net name="mlx5_2" dev="1" speed="200000" port="1" guid="0x70815600039f59b8" maxconn="262144" gdr="1"/>
|
||||
</nic>
|
||||
</pci>
|
||||
</cpu>
|
||||
</system>
|
||||
@@ -0,0 +1,92 @@
|
||||
<system version="2">
|
||||
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="0" sm="96" gcn="906" arch="38911" rank="0" gdr="1">
|
||||
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="1" sm="96" gcn="906" arch="38911" rank="1" gdr="1">
|
||||
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="2" sm="96" gcn="906" arch="38911" rank="2" gdr="1">
|
||||
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="3" sm="96" gcn="906" arch="38911" rank="3" gdr="1">
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="4" sm="96" gcn="906" arch="38911" rank="4" gdr="1">
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="5" sm="96" gcn="906" arch="38911" rank="5" gdr="1">
|
||||
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="6" sm="96" gcn="906" arch="38911" rank="6" gdr="1">
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="7" sm="96" gcn="906" arch="38911" rank="7" gdr="1">
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:61:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
|
||||
<nic>
|
||||
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0xa8134300039f59b8" maxconn="262144" gdr="1"/>
|
||||
</nic>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="6" affinity="0000ffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:a1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
|
||||
<nic>
|
||||
<net name="mlx5_2" dev="1" speed="200000" port="1" guid="0x38815600039f59b8" maxconn="262144" gdr="1"/>
|
||||
</nic>
|
||||
</pci>
|
||||
</cpu>
|
||||
</system>
|
||||
@@ -99,6 +99,10 @@ NodeModelDesc model_descs[] = {
|
||||
{4, "topo_8p_ts1_1.xml", "4 nodes 8 VEGA20 TS1 Alt. Model"},
|
||||
{1, "topo_4p3l_2h.xml", "single node 8 gfx908 Rome"},
|
||||
{4, "topo_4p3l_2h.xml", "4 nodes 8 gfx908 Rome"},
|
||||
{1, "topo_8p_ts1_n4.xml", "single node 8 VEGA20 TS1 NPS=4"},
|
||||
{4, "topo_8p_ts1_n4.xml", "4 nodes 8 VEGA20 TS1 NPS=4"},
|
||||
{1, "topo_8p_ts1_n4_1.xml", "single node 8 VEGA20 TS1 NPS=4 Alt. Model"},
|
||||
{4, "topo_8p_ts1_n4_1.xml", "4 nodes 8 VEGA20 TS1 NPS=4 Alt. Model"},
|
||||
};
|
||||
|
||||
int main(int argc,char* argv[])
|
||||
|
||||
Criar uma nova questão referindo esta
Bloquear um utilizador