* Add more Rome models

* Update models and tuning

* Update tuning

[ROCm/rccl commit: 2249a1d9d3]
Этот коммит содержится в:
Wenkai Du
2021-10-12 08:23:20 -07:00
коммит произвёл GitHub
родитель 227848b70f
Коммит b587b55c2e
10 изменённых файлов: 246 добавлений и 18 удалений
+39
Просмотреть файл
@@ -354,6 +354,42 @@ static struct rcclRomeModel rome_model_58 = {
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
};
// Rome topology model 59: a 16-GPU, 4-CPU, 8-NIC system with nLinks = 4.
// gpuNuma puts four GPUs on each of NUMA nodes 0-3; nicNuma puts two NICs on each.
// NOTE(review): the authoritative meaning of every field is given by struct rcclRomeModel
// and the code that consumes it, neither of which is visible here; the per-field notes
// below are assumptions to confirm.
static struct rcclRomeModel rome_model_59 = {
.nGpus = 16, .nCpus = 4, .nNics = 8, .nLinks = 4,
// Presumably PCI bus ids of the GPUs and NICs, in model index order -- TODO confirm.
.gpuIds = { 0x4e000, 0x51000, 0x56000, 0x59000, 0xe000, 0x11000, 0x16000, 0x19000, 0xcf000, 0xd2000, 0xd7000, 0xda000, 0x8f000, 0x92000, 0x97000, 0x9a000, },
.nicIds = { 0x4b000, 0x5a000, 0xb000, 0x1a000, 0xcc000, 0xdb000, 0x8c000, 0x9b000, },
.gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, },
.nicNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
// Flattened table of 0/1 entries; presumably an nGpus x nGpus GPU-to-GPU link
// adjacency matrix stored row by row -- TODO confirm.
.connMatrix = { 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, },
// Only the values 4 and 6 occur; presumably one entry per (NIC, GPU) pair with the
// lower value for same-NUMA pairs -- TODO confirm.
.gdrLevel = { 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, },
// Presumably per-NUMA-node "<GPU count><NIC count>" digit pairs (4 GPUs and 2 NICs on
// each of the four nodes here) -- TODO confirm.
.pattern = "42424242",
// '|'-separated sequences; bare numbers look like GPU indices and N<k> tokens like NIC
// indices -- confirm against the ring parser.
// NOTE(review): this string ends with a trailing '|', unlike the ringBase strings of the
// neighbouring models; verify that the parser tolerates an empty final segment.
.ringBase = "N0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 N0|N1 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 N1|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N3 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 N3|N4 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 N4|N5 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 N5|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N7 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 N7|N3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 2 3 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N2 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 N2|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6|N5 10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N7 15 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 N5|N0 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 N0|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|",
};
// Rome topology model 62: an 8-GPU, 4-CPU system with no NICs (nNics = 0) and nLinks = 3.
// gpuNuma puts two GPUs on each of four NUMA nodes, listed in the order 3, 1, 0, 2.
// The NIC-related tables (nicIds, nicNuma, gdrLevel) are left empty.
// NOTE(review): field semantics come from struct rcclRomeModel, which is not visible here.
static struct rcclRomeModel rome_model_62 = {
.nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
.gpuIds = { 0xc1000, 0xc6000, 0xc9000, 0xce000, 0xd1000, 0xd6000, 0xd9000, 0xde000, },
.nicIds = { },
.gpuNuma = { 3, 3, 1, 1, 0, 0, 2, 2, },
.nicNuma = { },
// Flattened table of 0/1 entries; presumably an nGpus x nGpus GPU-to-GPU link
// adjacency matrix stored row by row -- TODO confirm.
.connMatrix = { 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, },
.gdrLevel = { },
// Presumably per-NUMA-node "<GPU count><NIC count>" digit pairs (2 GPUs, 0 NICs) -- TODO confirm.
.pattern = "20202020",
// Four '|'-separated sequences of the indices 0-7; there are no N<k> tokens in this
// NIC-less model.
.ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0",
};
// Rome topology model 63: an 8-GPU, 4-CPU, 4-NIC system with nLinks = 3.
// gpuIds, gpuNuma and connMatrix are identical to rome_model_62; this variant adds one NIC
// on each NUMA node (nicNuma = 3, 1, 0, 2) together with the NIC-dependent tables.
// NOTE(review): field semantics come from struct rcclRomeModel, which is not visible here.
static struct rcclRomeModel rome_model_63 = {
.nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3,
.gpuIds = { 0xc1000, 0xc6000, 0xc9000, 0xce000, 0xd1000, 0xd6000, 0xd9000, 0xde000, },
.nicIds = { 0xc5000, 0xcd000, 0xd5000, 0xdd000, },
.gpuNuma = { 3, 3, 1, 1, 0, 0, 2, 2, },
.nicNuma = { 3, 1, 0, 2, },
// Flattened table of 0/1 entries; presumably an nGpus x nGpus GPU-to-GPU link
// adjacency matrix stored row by row -- TODO confirm.
.connMatrix = { 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, },
// Only the values 5 and 6 occur; presumably one entry per (NIC, GPU) pair with the
// lower value for same-NUMA pairs -- TODO confirm.
.gdrLevel = { 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, },
// Presumably per-NUMA-node "<GPU count><NIC count>" digit pairs (2 GPUs, 1 NIC) -- TODO confirm.
.pattern = "21212121",
// Eight '|'-separated sequences, each bracketed by N<k> tokens; the last four sequences
// repeat the first four verbatim.
.ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3",
};
static struct rcclRomeModel romeTopoModels[] = {
rome_model_22,
rome_model_25,
@@ -381,6 +417,9 @@ static struct rcclRomeModel romeTopoModels[] = {
rome_model_55,
rome_model_56,
rome_model_58,
rome_model_59,
rome_model_62,
rome_model_63,
};
/* Parse user defined rings. Format is like :
+17 -10
Просмотреть файл
@@ -138,6 +138,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE) (nNodes == 2) ? busBw *= 0.33 : busBw *= 0.11;
if (a == NCCL_ALGO_TREE && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.04;
if (gcn == 910 && p == NCCL_PROTO_LL && nNodes == 1 && nRanks == 16) busBw *= 5.9;
if (gcn == 910 && a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && nNodes == 2 && nRanks == 32) busBw *= 3.2;
#else
if (compCap80) busBw = std::min(busBw, 235.0f);
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); }
@@ -281,16 +282,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
// factor is not ideal but works quite well. Powers of two, 64 B to 128MB.
static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
{ 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, },
{ 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, },
{ 0.4, 0.4, 0.3, 0.3, 0.2, 0.5, 0.5, 0.7, 0.2, 0.2, 0.3, 0.6, 0.7, 1.0, 1.3, 1.0, 1.2, 1.2, 1.1, 1.1, 1.2, 1.2, },
static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][25] = {
{ 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, },
{ 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, },
{ 0.4, 0.4, 0.3, 0.3, 0.2, 0.5, 0.5, 0.7, 0.2, 0.2, 0.3, 0.6, 0.7, 1.0, 1.3, 1.0, 1.2, 1.2, 1.1, 1.1, 1.2, 1.2, 1.5, 1.7, 2.4, },
};
static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
{ 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, },
{ 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, },
{ 0.6, 0.4, 0.4, 0.4, 0.2, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.6, 0.8, 0.9, },
static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][25] = {
{ 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, },
{ 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, },
{ 0.6, 0.4, 0.4, 0.4, 0.2, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.6, 0.8, 0.9, 1.1, 2.0, 2.9, },
};
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
@@ -302,8 +303,14 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
int logSize = log2i(info->nBytes>>6);
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
else if (algorithm == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[protocol][logSize];
if (algorithm == NCCL_ALGO_TREE) {
if (logSize < 25) bw *= treeCorrectionFactor[protocol][logSize];
else bw *= treeCorrectionFactor[protocol][24];
}
else if (algorithm == NCCL_ALGO_RING) {
if(logSize < 25) bw *= ringCorrectionFactor[protocol][logSize];
else bw *= ringCorrectionFactor[protocol][24];
}
#else
if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
+2
Просмотреть файл
@@ -931,6 +931,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
allGather3Data[rank].tree.pattern = treeGraph.pattern;
allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@
DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
for i in {0..58}
for i in {0..63}
do
if [[ $i -eq 50 ]] || [[ $i -eq 51 ]]
then
+2 -7
Просмотреть файл
@@ -191,7 +191,7 @@
</pci>
<pci busid="0000:8c:00.0" class="0x020700" vendor="0x15b3" device="0x101b" subsystem_vendor="0x15b3" subsystem_device="0x0007" link_speed="16.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_7" dev="7" speed="200000" port="1" guid="0x8c40bb0003f6ceb8" maxconn="262144" gdr="1"/>
<net name="mlx5_7" dev="6" speed="200000" port="1" guid="0x8c40bb0003f6ceb8" maxconn="262144" gdr="1"/>
</nic>
</pci>
</pci>
@@ -217,15 +217,10 @@
</pci>
<pci busid="0000:9b:00.0" class="0x020700" vendor="0x15b3" device="0x101b" subsystem_vendor="0x15b3" subsystem_device="0x0007" link_speed="16.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" guid="0x6c40bb0003f6ceb8" maxconn="262144" gdr="1"/>
<net name="mlx5_8" dev="7" speed="200000" port="1" guid="0x6c40bb0003f6ceb8" maxconn="262144" gdr="1"/>
</nic>
</pci>
</pci>
</pci>
<pci busid="0000:b1:00.0" class="0x020000" vendor="0x15b3" device="0x1015" subsystem_vendor="0x15b3" subsystem_device="0x0190" link_speed="8.0 GT/s PCIe" link_width="8">
<nic>
<net name="mlx5_6" dev="6" speed="40000" port="1" guid="0x100fe0b7312aea4" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
+26
Просмотреть файл
@@ -0,0 +1,26 @@
<!--
  Topology description with two NUMA nodes and three GPUs (all gcn="910", ranks 0-2).
  NUMA 0 holds GPU dev 0 behind one PCI bridge, plus one NIC (mlx5_0, net dev 0).
  NUMA 1 holds GPU dev 1 and GPU dev 2, each behind its own PCI bridge, and no NIC.
  No <xgmi> links are declared between the GPUs.
-->
<system version="2">
<cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="175" modelid="1">
<pci busid="0000:21:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="90" gcn="910" arch="38911" rank="0" gdr="1"/>
</pci>
</pci>
<pci busid="0000:41:00.0" class="0x020700" vendor="0x15b3" device="0x101b" subsystem_vendor="0x15b3" subsystem_device="0x0007" link_speed="16.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0xad9c300039f59b8" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
<cpu numaid="1" affinity="ffff0000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="175" modelid="1">
<pci busid="0000:81:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="90" gcn="910" arch="38911" rank="1" gdr="1"/>
</pci>
</pci>
<pci busid="0000:e2:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
<pci busid="0000:e4:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="90" gcn="910" arch="38911" rank="2" gdr="1"/>
</pci>
</pci>
</cpu>
</system>
+26
Просмотреть файл
@@ -0,0 +1,26 @@
<!--
  Topology description with two NUMA nodes and three GPUs (all gcn="910", ranks 0-2).
  NUMA 0 holds GPU dev 0 behind one PCI bridge and no NIC.
  NUMA 1 holds GPU dev 1 and GPU dev 2, each behind its own PCI bridge, plus the only
  NIC (mlx5_1, net dev 0).
  No <xgmi> links are declared between the GPUs.
-->
<system version="2">
<cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="175" modelid="1">
<pci busid="0000:21:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="90" gcn="910" arch="38911" rank="0" gdr="1"/>
</pci>
</pci>
</cpu>
<cpu numaid="1" affinity="ffff0000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="175" modelid="1">
<pci busid="0000:81:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="90" gcn="910" arch="38911" rank="1" gdr="1"/>
</pci>
</pci>
<pci busid="0000:e2:00.0" class="0x060400" vendor="0x1022" device="0x14c7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="16.0 GT/s PCIe" link_width="16">
<pci busid="0000:e4:00.0" class="0x038000" vendor="0x1002" device="0x740f" subsystem_vendor="0x1002" subsystem_device="0x0c34" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="90" gcn="910" arch="38911" rank="2" gdr="1"/>
</pci>
</pci>
<pci busid="0000:a1:00.0" class="0x020700" vendor="0x15b3" device="0x101b" subsystem_vendor="0x15b3" subsystem_device="0x0007" link_speed="16.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_1" dev="0" speed="200000" port="1" guid="0x7657900003f6ceb8" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
+126
Просмотреть файл
@@ -0,0 +1,126 @@
<!--
  Topology description with four NUMA nodes, eight GPUs (all gcn="908", ranks 0-7) and
  four NICs (net dev 0-3).
  Each NUMA node sits under a single top-level PCI bridge that holds two GPUs (each behind
  two further bridge levels) and one NIC.
  Every GPU declares three <xgmi> targets: the other GPU on its own NUMA node and both GPUs
  of one partner node. NUMA 0 pairs with NUMA 1 and NUMA 2 pairs with NUMA 3, so the
  declared links form two separate groups of four fully connected GPUs.
-->
<system version="2">
<cpu numaid="0" affinity="00000000,00000000,00000000,ffffffff,00000000,00000000,00000000,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:43:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:46:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:48:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:4a:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
<xgmi target="0000:50:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:0a:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:0f:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:4c:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:4e:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:50:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
<xgmi target="0000:4a:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:0a:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:0f:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:45:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_1" dev="0" speed="200000" port="1" guid="0x48b9170003a1420c" maxconn="262144" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="1" affinity="00000000,00000000,ffffffff,00000000,00000000,00000000,ffffffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:03:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:05:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:08:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:0a:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
<xgmi target="0000:4a:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:50:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:0f:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:0b:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:0d:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:0f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
<xgmi target="0000:4a:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:50:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:0a:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:13:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_3" dev="1" speed="200000" port="1" guid="0x18604a0003a1420c" maxconn="262144" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,ffffffff,00000000,00000000,00000000,ffffffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c7:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c9:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:cb:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
<xgmi target="0000:d1:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:8a:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:90:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:cd:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:cf:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:d1:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
<xgmi target="0000:cb:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:8a:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:90:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:c6:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_5" dev="2" speed="200000" port="1" guid="0xd0b9170003a1420c" maxconn="262144" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="3" affinity="ffffffff,00000000,00000000,00000000,ffffffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:83:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:86:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:88:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:8a:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
<xgmi target="0000:cb:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:d1:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:90:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:8c:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:8e:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:90:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
<xgmi target="0000:cb:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:d1:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:8a:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:85:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_7" dev="3" speed="200000" port="1" guid="0xd0bd170003a1420c" maxconn="262144" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
</system>
+5
Просмотреть файл
@@ -128,6 +128,11 @@ NodeModelDesc model_descs[] = {
{1, "topo_16p1h.xml", "single node 16P1H"},
{4, "topo_8p_rome_4n_2.xml", "4 nodes 8 gfx908 Rome 4 NICs NPS=4 Alt. Model"},
{1, "topo_8p_90a_1.xml", "single node gfx90a Alt. Model"},
{4, "topo_16p1h.xml", "4 nodes 16P1H"},
{4, "topo_3p_pcie.xml", "4 nodes 3P"},
{4, "topo_3p_pcie_1.xml", "4 nodes 3P Alt. Model"},
{1, "topo_8p_4nics.xml", "single nodes 8P 4 NICs"},
{4, "topo_8p_4nics.xml", "4 nodes 8P 4 NICs"},
};
int main(int argc,char* argv[])
+2
Просмотреть файл
@@ -672,6 +672,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910)
allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
allGather3Data[rank].tree.pattern = treeGraph.pattern;
allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;