Query XGMI links from xml and adjust gfx906 channel usage (#410)

[ROCm/rccl commit: 818cdb16a8]
2021-07-27 17:32:41 -07:00
commit faea6ead5c
@@ -1036,3 +1036,38 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str
  }
  return ncclSuccess;
 }
+
+ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, bool direct_only) {
+  // Sets *isXGMI when cudaDev1 reaches cudaDev2 over a one-link LINK_NVL path or, unless
+  // direct_only is set, when some third GPU has such a path to both. Always returns ncclSuccess.
+  struct ncclTopoNode* gpus = system->nodes[GPU].nodes;
+  const int gpuCount = system->nodes[GPU].count;
+  *isXGMI = false;
+  // Pass 1: a one-link path from cudaDev1 that lands on cudaDev2 decides the answer by its link type
+  for (int src=0; src<gpuCount; src++) {
+    if (gpus[src].gpu.dev != cudaDev1) continue;
+    for (int dst=0; dst<gpuCount; dst++) {
+      if (gpus[src].paths[GPU][dst].count != 1) continue;
+      struct ncclTopoLink* hop = gpus[src].paths[GPU][dst].list[0];
+      if (hop->remNode->gpu.dev != cudaDev2) continue;
+      *isXGMI = (hop->type == LINK_NVL);
+      return ncclSuccess;
+    }
+  }
+  // Pass 2 (skipped when direct_only): look for a GPU with a direct XGMI link to both devices
+  if (!direct_only) {
+    for (int mid=0; mid<gpuCount; mid++) {
+      const int midDev = gpus[mid].gpu.dev;
+      if (midDev != cudaDev1 && midDev != cudaDev2) {
+        bool toDev1, toDev2;
+        ncclTopoGetLinkType(system, midDev, cudaDev1, &toDev1, true);
+        ncclTopoGetLinkType(system, midDev, cudaDev2, &toDev2, true);
+        if (toDev1 && toDev2) {
+          *isXGMI = true;
+          break;
+        }
+      }
+    }
+  }
+  return ncclSuccess;
+}
@@ -34,6 +34,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct n
 ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
 ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
+ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, bool direct_only=false); // *isXGMI: devices joined by LINK_NVL link(s); direct_only: skip the intermediate-GPU check

 // Set CPU affinity
 ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
@@ -28,7 +28,6 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include "graph/topo.h"
-#include "rocm_smi_wrap.h"

 // [RCCL]
 #include "clique/CliqueManager.h"
@@ -775,12 +774,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));

+  bool allXgmi = true;
  { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
    CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
    if (comm->localRanks == comm->nRanks)
    {
      // Check that all the GPUs have peer access to one another and are XGMI connected
-      bool allXgmi = true;
      bool hasPeerAccess = true;
      for (int i = 0; i < nranks && hasPeerAccess; i++)
      {
@@ -796,10 +795,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
            break;
          }

-          RSMI_IO_LINK_TYPE linkType;
-          int hopCount, bw;
-          NCCLCHECK(rocm_smi_getLinkInfo(i, j, &linkType, &hopCount, &bw));
-          allXgmi &= (linkType == RSMI_IOLINK_TYPE_XGMI);
+          bool isXGMI;
+          NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI));
+          allXgmi &= isXGMI;
        }
      }
      if (hasPeerAccess)
@@ -865,6 +863,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  int idx;
  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
  allGather3Data[rank].nc = 2;
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
+    allGather3Data[rank].nc = 4;
  if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
    allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
  if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))
@@ -9,7 +9,6 @@
 #include "graph.h"
 #include "utils.h"
 #include "bootstrap.h"
-#include "rocm_smi_wrap.h"

 struct p2pConnectInfo {
  int rank;
@@ -170,13 +169,12 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
  NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));

  resources->next_hdp_reg = 0;
-  RSMI_IO_LINK_TYPE linktype;
-  int hops, bw;
-  if (rocm_smi_getLinkInfo(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops, &bw) != ncclSuccess) {
+  bool isXGMI;
+  if (ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &isXGMI) != ncclSuccess) {
    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
    return ncclInternalError;
  }
-  if (linktype != RSMI_IOLINK_TYPE_XGMI) {
+  if (!isXGMI) {
    CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
  }
@@ -391,6 +391,51 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
  NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));

+  bool allXgmi = true;
+  { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
+    //CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
+    if (comm->localRanks == comm->nRanks)
+    {
+      // Check that all the GPUs have peer access to one another and are XGMI connected
+      bool hasPeerAccess = true;
+      for (int i = 0; i < nranks && hasPeerAccess; i++)
+      {
+        int cudaDev1 = allGather1Data[i].peerInfo.cudaDev;
+        for (int j = 0; j < nranks; j++)
+        {
+          if (i == j) continue;
+          int cudaDev2 = allGather1Data[j].peerInfo.cudaDev;
+          //int p2p;
+          //if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
+          //{
+          //  hasPeerAccess = false;
+          //  break;
+          //}
+
+          bool isXGMI;
+          NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI));
+          allXgmi &= isXGMI;
+        }
+      }
+      //if (hasPeerAccess)
+      //{
+      //  if (intraRanks == nranks)
+      //    cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
+      //  else
+      //    cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
+      //}
+
+      // For now, only enable clique-based kernels on nodes where all GPUs are XGMI connected
+      //if (!allXgmi && !rcclParamCliqueIgnoreTopo())
+      //{
+      //  INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)");
+      //  cliqueMode = CliqueManager::CLIQUE_DISABLED;
+      //}
+    }
+    //comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
+    //NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
+  } // [/RCCL]
+
  if (comm->rank == ncclParamGraphDumpFileRank()) {
    struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
    NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
@@ -437,6 +482,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
  int idx;
  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
  allGather3Data[rank].nc = 2;
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
+    allGather3Data[rank].nc = 4;
  if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
    allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
  if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))