Increase minimal channels for gfx908 (#259)
[ROCm/rccl commit: c5cbece6d0]
This commit is contained in:
@@ -255,7 +255,7 @@ int ncclMaxNchannels() {
|
||||
return maxNchannels;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) {
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn) {
|
||||
// Gather data from all ranks
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
|
||||
int nranks = comm->nRanks;
|
||||
@@ -290,6 +290,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
|
||||
memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
|
||||
|
||||
int nc = nChannels*2;
|
||||
if (gcn == 908) nc = std::max(nc, 4);
|
||||
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*3;
|
||||
if (comm->topo->nodes[NET].count && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*comm->topo->nodes[NET].count;
|
||||
int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels()));
|
||||
|
||||
@@ -95,7 +95,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
struct ncclTopoRanks* topoRanks);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings);
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn);
|
||||
|
||||
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
|
||||
|
||||
|
||||
@@ -759,6 +759,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
int cudaCompCap;
|
||||
int fullCudaCompCap;
|
||||
int nChannels;
|
||||
int gcn;
|
||||
struct ncclGraphInfo tree;
|
||||
struct ncclGraphInfo ring;
|
||||
struct ncclGraphInfo collNet;
|
||||
@@ -766,7 +767,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
} *allGather3Data;
|
||||
|
||||
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
|
||||
allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
|
||||
int idx;
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
|
||||
allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
|
||||
allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
|
||||
allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
|
||||
std::min(treeGraph.nChannels, ringGraph.nChannels);
|
||||
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
|
||||
@@ -813,8 +817,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
int nChannelsOrig = comm->nChannels;
|
||||
struct ncclTopoRanks** allTopoRanks;
|
||||
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
|
||||
int gcn = allGather3Data[0].gcn;
|
||||
for (int i=0; i<nranks; i++) {
|
||||
allTopoRanks[i] = &allGather3Data[i].topoRanks;
|
||||
gcn = std::min(allGather3Data[i].gcn, gcn);
|
||||
// Make sure we align all ranks so that the tuning is consistent across ranks
|
||||
treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
|
||||
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
|
||||
@@ -840,7 +846,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
int *rings;
|
||||
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
|
||||
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
|
||||
if (comm->nNodes > 1 &&
|
||||
ncclParamCollNetEnable() == 1 &&
|
||||
collNetSupport() && collNetGraph.nChannels) {
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
for i in {0..33}
|
||||
for i in {0..35}
|
||||
do
|
||||
$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
|
||||
$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
|
||||
|
||||
@@ -25,6 +25,7 @@ struct allGather3Data_t{
|
||||
int cudaCompCap;
|
||||
int fullCudaCompCap;
|
||||
int nChannels;
|
||||
int gcn;
|
||||
struct ncclGraphInfo tree;
|
||||
struct ncclGraphInfo ring;
|
||||
struct ncclGraphInfo collNet;
|
||||
|
||||
@@ -0,0 +1,85 @@
|
||||
<system version="2">
|
||||
<cpu numaid="0" affinity="0003ff,f0003fff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
|
||||
<pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
|
||||
<pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
|
||||
<pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="0" sm="98" gcn="908" arch="38911" rank="0" gdr="0">
|
||||
<xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
|
||||
<pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="1" sm="98" gcn="908" arch="38911" rank="1" gdr="0">
|
||||
<xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
|
||||
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="2" sm="98" gcn="908" arch="38911" rank="2" gdr="0">
|
||||
<xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
|
||||
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="3" sm="98" gcn="908" arch="38911" rank="3" gdr="0">
|
||||
<xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:1a:00.0" class="0x020000" link_speed="8 GT/s" link_width="16">
|
||||
<nic>
|
||||
<net name="mlx5_0" dev="0" speed="100000" port="1" guid="0xf2bb2700034b6b50" maxconn="262144" gdr="1"/>
|
||||
</nic>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
|
||||
<pci busid="0000:3d:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
|
||||
<pci busid="0000:3f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="4" sm="98" gcn="908" arch="38911" rank="4" gdr="0">
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:41:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
|
||||
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="5" sm="98" gcn="908" arch="38911" rank="5" gdr="0">
|
||||
<xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:44:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
|
||||
<pci busid="0000:46:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="6" sm="98" gcn="908" arch="38911" rank="6" gdr="0">
|
||||
<xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:47:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
|
||||
<pci busid="0000:49:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="7" sm="98" gcn="908" arch="38911" rank="7" gdr="0">
|
||||
<xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
</system>
|
||||
@@ -103,6 +103,8 @@ NodeModelDesc model_descs[] = {
|
||||
{4, "topo_8p_ts1_n4.xml", "4 nodes 8 VEGA20 TS1 NPS=4"},
|
||||
{1, "topo_8p_ts1_n4_1.xml", "single node 8 VEGA20 TS1 NPS=4 Alt. Model"},
|
||||
{4, "topo_8p_ts1_n4_1.xml", "4 nodes 8 VEGA20 TS1 NPS=4 Alt. Model"},
|
||||
{1, "topo_4p3l_ia.xml", "single node 8 gfx908"},
|
||||
{4, "topo_4p3l_ia.xml", "4 nodes 8 gfx908"},
|
||||
};
|
||||
|
||||
int main(int argc,char* argv[])
|
||||
|
||||
@@ -216,7 +216,10 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
|
||||
}
|
||||
|
||||
// AllGather3 - begin
|
||||
allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
|
||||
int idx;
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
|
||||
allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
|
||||
allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
|
||||
allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
|
||||
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
|
||||
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
|
||||
@@ -397,8 +400,10 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
int nChannelsOrig = comm->nChannels;
|
||||
struct ncclTopoRanks** allTopoRanks;
|
||||
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
|
||||
int gcn = allGather3Data[0].gcn;
|
||||
for (int i=0; i<nranks; i++) {
|
||||
allTopoRanks[i] = &allGather3Data[i].topoRanks;
|
||||
gcn = std::min(allGather3Data[i].gcn, gcn);
|
||||
// Make sure we align all ranks so that the tuning is consistent across ranks
|
||||
treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
|
||||
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
|
||||
@@ -424,7 +429,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
int *rings;
|
||||
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
|
||||
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
|
||||
if (comm->nNodes > 1 &&
|
||||
ncclParamCollNetEnable() == 1 &&
|
||||
collNetSupport()) {
|
||||
|
||||
Reference in new issue
Block a user