Increase minimum number of channels for gfx908 (#259)

[ROCm/rccl commit: c5cbece6d0]
2020-08-26 11:40:11 -07:00
@@ -255,7 +255,7 @@ int ncclMaxNchannels() {
  return maxNchannels;
 }

-ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) {
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn) {
  // Gather data from all ranks
  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
  int nranks = comm->nRanks;
@@ -290,6 +290,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
  memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));

  int nc = nChannels*2;
+  if (gcn == 908) nc = std::max(nc, 4);
  if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*3;
  if (comm->topo->nodes[NET].count && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*comm->topo->nodes[NET].count;
  int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels()));
@@ -95,7 +95,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
    struct ncclTopoRanks* topoRanks);

 ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
-    struct ncclTopoRanks** allTopoRanks, int* rings);
+    struct ncclTopoRanks** allTopoRanks, int* rings, int gcn);

 ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);

@@ -759,6 +759,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
    int cudaCompCap;
    int fullCudaCompCap;
    int nChannels;
+    int gcn;
    struct ncclGraphInfo tree;
    struct ncclGraphInfo ring;
    struct ncclGraphInfo collNet;
@@ -766,7 +767,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  } *allGather3Data;

  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
-  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
+  int idx;
+  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
+  allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
+  allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
  allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
    std::min(treeGraph.nChannels, ringGraph.nChannels);
  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
@@ -813,8 +817,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  int nChannelsOrig = comm->nChannels;
  struct ncclTopoRanks** allTopoRanks;
  NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
+  int gcn = allGather3Data[0].gcn;
  for (int i=0; i<nranks; i++) {
    allTopoRanks[i] = &allGather3Data[i].topoRanks;
+    gcn = std::min(allGather3Data[i].gcn, gcn);
    // Make sure we align all ranks so that the tuning is consistent across ranks
    treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
    treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
@@ -840,7 +846,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  int *rings;
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));

-  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
+  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
  if (comm->nNodes > 1 &&
      ncclParamCollNetEnable() == 1 &&
      collNetSupport() && collNetGraph.nChannels) {
@@ -21,7 +21,7 @@

 DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

-for i in {0..33}
+for i in {0..35}
 do
 	$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
 	$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
@@ -25,6 +25,7 @@ struct allGather3Data_t{
  int cudaCompCap;
  int fullCudaCompCap;
  int nChannels;
+  int gcn;
  struct ncclGraphInfo tree;
  struct ncclGraphInfo ring;
  struct ncclGraphInfo collNet;
@@ -0,0 +1,85 @@
+<system version="2">
+  <cpu numaid="0" affinity="0003ff,f0003fff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
+    <pci busid="0000:18:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:1b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:1d:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="0" sm="98" gcn="908" arch="38911" rank="0" gdr="0">
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:1e:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:20:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="1" sm="98" gcn="908" arch="38911" rank="1" gdr="0">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:21:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="2" sm="98" gcn="908" arch="38911" rank="2" gdr="0">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:24:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="3" sm="98" gcn="908" arch="38911" rank="3" gdr="0">
+            <xgmi target="0000:1d:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:20:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:1a:00.0" class="0x020000" link_speed="8 GT/s" link_width="16">
+        <nic>
+          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0xf2bb2700034b6b50" maxconn="262144" gdr="1"/>
+        </nic>
+      </pci>
+   </pci>
+    <pci busid="0000:3b:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+      <pci busid="0000:3d:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:3f:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="4" sm="98" gcn="908" arch="38911" rank="4" gdr="0">
+            <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:41:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="5" sm="98" gcn="908" arch="38911" rank="5" gdr="0">
+            <xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:44:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:46:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="6" sm="98" gcn="908" arch="38911" rank="6" gdr="0">
+            <xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:49:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+      <pci busid="0000:47:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
+        <pci busid="0000:49:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+          <gpu dev="7" sm="98" gcn="908" arch="38911" rank="7" gdr="0">
+            <xgmi target="0000:3f:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+            <xgmi target="0000:46:00.0" count="1" tclass="0x038000"/>
+          </gpu>
+        </pci>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -103,6 +103,8 @@ NodeModelDesc model_descs[] = {
  {4, "topo_8p_ts1_n4.xml",     "4 nodes 8 VEGA20 TS1 NPS=4"},
  {1, "topo_8p_ts1_n4_1.xml",   "single node 8 VEGA20 TS1 NPS=4 Alt. Model"},
  {4, "topo_8p_ts1_n4_1.xml",   "4 nodes 8 VEGA20 TS1 NPS=4 Alt. Model"},
+  {1, "topo_4p3l_ia.xml",       "single node 8 gfx908"},
+  {4, "topo_4p3l_ia.xml",       "4 nodes 8 gfx908"},
 };

 int main(int argc,char* argv[])
@@ -216,7 +216,10 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
  }

  // AllGather3 - begin
-  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
+  int idx;
+  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
+  allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
+  allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
  allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
  allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
@@ -397,8 +400,10 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
  int nChannelsOrig = comm->nChannels;
  struct ncclTopoRanks** allTopoRanks;
  NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
+  int gcn = allGather3Data[0].gcn;
  for (int i=0; i<nranks; i++) {
    allTopoRanks[i] = &allGather3Data[i].topoRanks;
+    gcn = std::min(allGather3Data[i].gcn, gcn);
    // Make sure we align all ranks so that the tuning is consistent across ranks
    treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
    treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
@@ -424,7 +429,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
  int *rings;
  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));

-  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
+  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
  if (comm->nNodes > 1 &&
      ncclParamCollNetEnable() == 1 &&
      collNetSupport()) {