Fix Rome PCIe 2 node topology generation (#310)
Этот коммит содержится в:
@@ -426,7 +426,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
for (int i = 0; i<system->nodes[GPU].count; i++)
|
||||
if (paths[i].count < paths[f].count) f = i;
|
||||
int t = 1 << 10;
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, f));
|
||||
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, (f == 0) ? FORCED_ORDER_PCI : 0, &t, NET, n, f));
|
||||
// [RCCL] Even if forced order PCI is found, continue the search instead of ending early
|
||||
// if (t == -1) *time = -1;
|
||||
// [/RCCL]
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
for i in {0..46}
|
||||
for i in {0..47}
|
||||
do
|
||||
$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
|
||||
$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
|
||||
|
||||
@@ -6,7 +6,7 @@ endif
|
||||
HIPCC = $(HIP_PATH)/bin/hipcc
|
||||
|
||||
EXE = topo_expl
|
||||
CXXFLAGS = -g -O3 -Iinclude -I../../src/include -I../../src/graph/ -DTOPO_EXPL -DENABLE_TRACE -lnuma
|
||||
CXXFLAGS = -g -O3 -Iinclude -I../../src/include -I../../src/graph -I../../src -DTOPO_EXPL -DENABLE_TRACE -lnuma
|
||||
|
||||
files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc \
|
||||
../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
<system version="2">
|
||||
<cpu numaid="0" affinity="00000000,00000000,ffffffff,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="0" sm="90" gcn="906" arch="38911" rank="0" gdr="1"/>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="1" sm="90" gcn="906" arch="38911" rank="1" gdr="1"/>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="2" sm="90" gcn="906" arch="38911" rank="2" gdr="1"/>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="3" sm="90" gcn="906" arch="38911" rank="3" gdr="1"/>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="1" affinity="ffffffff,ffffffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="4" sm="90" gcn="906" arch="38911" rank="4" gdr="1"/>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="5" sm="90" gcn="906" arch="38911" rank="5" gdr="1"/>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="6" sm="90" gcn="906" arch="38911" rank="6" gdr="1"/>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="7" sm="90" gcn="906" arch="38911" rank="7" gdr="1"/>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:a1:00.0" class="0x020000" link_speed="8 GT/s" link_width="4">
|
||||
<nic>
|
||||
<net name="mlx5_0" dev="0" speed="100000" port="1" guid="0x7893510003a1420c" maxconn="262144" gdr="1"/>
|
||||
<net name="mlx5_1" dev="1" speed="100000" port="2" guid="0x7893510003a1420c" maxconn="262144" gdr="1"/>
|
||||
</nic>
|
||||
</pci>
|
||||
</cpu>
|
||||
</system>
|
||||
@@ -116,6 +116,7 @@ NodeModelDesc model_descs[] = {
|
||||
{4, "topo_4p3l_n2_1.xml", "4 nodes 8 gfx908 Rome"},
|
||||
{1, "topo_8p_rome_n4_1.xml", "single node 8 gfx908 Rome NPS=4"},
|
||||
{4, "topo_8p_rome_n4_1.xml", "4 nodes node 8 gfx908 Rome NPS=4"},
|
||||
{2, "topo_8p_rome_pcie.xml", "2 nodes node 8 VEGA20 PCIe"},
|
||||
};
|
||||
|
||||
int main(int argc,char* argv[])
|
||||
|
||||
Ссылка в новой задаче
Block a user