Increase minimal channels for gfx908 (#259)

[ROCm/rccl commit: c5cbece6d0]
Этот коммит содержится в:
Wenkai Du
2020-08-26 11:40:11 -07:00
коммит произвёл GitHub
родитель 0898fea746
Коммит 03bb6bcb54
8 изменённых файлов: 107 добавлений и 7 удалений
+2 -1
Просмотреть файл
@@ -255,7 +255,7 @@ int ncclMaxNchannels() {
return maxNchannels;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) {
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
int nranks = comm->nRanks;
@@ -290,6 +290,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
int nc = nChannels*2;
if (gcn == 908) nc = std::max(nc, 4);
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*3;
if (comm->topo->nodes[NET].count && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*comm->topo->nodes[NET].count;
int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels()));
+1 -1
Просмотреть файл
@@ -95,7 +95,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
struct ncclTopoRanks** allTopoRanks, int* rings);
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn);
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
+8 -2
Просмотреть файл
@@ -759,6 +759,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int cudaCompCap;
int fullCudaCompCap;
int nChannels;
int gcn;
struct ncclGraphInfo tree;
struct ncclGraphInfo ring;
struct ncclGraphInfo collNet;
@@ -766,7 +767,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
} *allGather3Data;
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
int idx;
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
std::min(treeGraph.nChannels, ringGraph.nChannels);
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
@@ -813,8 +817,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int nChannelsOrig = comm->nChannels;
struct ncclTopoRanks** allTopoRanks;
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
int gcn = allGather3Data[0].gcn;
for (int i=0; i<nranks; i++) {
allTopoRanks[i] = &allGather3Data[i].topoRanks;
gcn = std::min(allGather3Data[i].gcn, gcn);
// Make sure we align all ranks so that the tuning is consistent across ranks
treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
@@ -840,7 +846,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
collNetSupport() && collNetGraph.nChannels) {
+1 -1
Просмотреть файл
@@ -21,7 +21,7 @@
DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
for i in {0..33}
for i in {0..35}
do
$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
+1
Просмотреть файл
@@ -25,6 +25,7 @@ struct allGather3Data_t{
int cudaCompCap;
int fullCudaCompCap;
int nChannels;
int gcn;
struct ncclGraphInfo tree;
struct ncclGraphInfo ring;
struct ncclGraphInfo collNet;
+85
Просмотреть файл
@@ -0,0 +1,85 @@
<!--
  Topology description: one x86_64 GenuineIntel CPU (NUMA id 0) with two
  top-level PCI bridges (0000:18:00.0 and 0000:3b:00.0).
  Each top-level bridge carries four GPUs with gcn="908", one GPU per
  downstream bridge: dev 0-3 under 0000:18:00.0, dev 4-7 under 0000:3b:00.0.
  Every GPU lists three xgmi links (count="1" each) to the other three GPUs
  under the same top-level bridge; no xgmi link crosses between the two groups.
  A single NIC (mlx5_0) hangs off the first top-level bridge.
  NOTE(review): class 0x060400 / 0x038000 / 0x020000 look like the standard
  PCI class codes for bridge / display controller / Ethernet; confirm against
  the PCI class code list.
-->
<system version="2">
<cpu numaid="0" affinity="0003ff,f0003fff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
<!-- First top-level bridge: GPUs dev 0-3 and the NIC. -->
<pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
<!-- GPU dev 0 is the only GPU in this file that sits behind an extra bridge level (0000:1b:00.0) in its group. -->
<pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
<pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="98" gcn="908" arch="38911" rank="0" gdr="0">
<xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
<pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="98" gcn="908" arch="38911" rank="1" gdr="0">
<xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="98" gcn="908" arch="38911" rank="2" gdr="0">
<xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="98" gcn="908" arch="38911" rank="3" gdr="0">
<xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<!-- The only network device in this topology; attached directly under the first top-level bridge. -->
<pci busid="0000:1a:00.0" class="0x020000" link_speed="8 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="100000" port="1" guid="0xf2bb2700034b6b50" maxconn="262144" gdr="1"/>
</nic>
</pci>
</pci>
<!-- Second top-level bridge: GPUs dev 4-7, no NIC. -->
<pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
<pci busid="0000:3d:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
<pci busid="0000:3f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="98" gcn="908" arch="38911" rank="4" gdr="0">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:41:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="98" gcn="908" arch="38911" rank="5" gdr="0">
<xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:44:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
<pci busid="0000:46:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="98" gcn="908" arch="38911" rank="6" gdr="0">
<xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:47:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
<pci busid="0000:49:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="98" gcn="908" arch="38911" rank="7" gdr="0">
<xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</pci>
</cpu>
</system>
+2
Просмотреть файл
@@ -103,6 +103,8 @@ NodeModelDesc model_descs[] = {
{4, "topo_8p_ts1_n4.xml", "4 nodes 8 VEGA20 TS1 NPS=4"},
{1, "topo_8p_ts1_n4_1.xml", "single node 8 VEGA20 TS1 NPS=4 Alt. Model"},
{4, "topo_8p_ts1_n4_1.xml", "4 nodes 8 VEGA20 TS1 NPS=4 Alt. Model"},
{1, "topo_4p3l_ia.xml", "single node 8 gfx908"},
{4, "topo_4p3l_ia.xml", "4 nodes 8 gfx908"},
};
int main(int argc,char* argv[])
+7 -2
Просмотреть файл
@@ -216,7 +216,10 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
}
// AllGather3 - begin
allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
int idx;
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
@@ -397,8 +400,10 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
int nChannelsOrig = comm->nChannels;
struct ncclTopoRanks** allTopoRanks;
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
int gcn = allGather3Data[0].gcn;
for (int i=0; i<nranks; i++) {
allTopoRanks[i] = &allGather3Data[i].topoRanks;
gcn = std::min(allGather3Data[i].gcn, gcn);
// Make sure we align all ranks so that the tuning is consistent across ranks
treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
@@ -424,7 +429,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
collNetSupport()) {