From faea6ead5c867c086ba41402df00983c9463dab3 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Tue, 27 Jul 2021 17:32:41 -0700
Subject: [PATCH] Query XGMI links from xml and adjust gfx906 channel usage (#410)

[ROCm/rccl commit: 818cdb16a87b4dc9031e379a1789b9a091c7b055]
---
NOTE(review): this patch text was recovered from a copy whose line breaks and
indentation had been collapsed and whose `<...>` spans had been stripped.
  * Line structure is rebuilt so that every hunk matches the old/new line
    counts in its hunk header and the per-file totals in the diffstat below.
  * The three `for` headers in ncclTopoGetLinkType are restored to
    `i<ngpus; i++` / `k<system->nodes[GPU].count; k++`, together with the
    `if (system->` test that the stripped span had swallowed.
  * Leading whitespace and blank-line positions are a best-effort
    reconstruction (2-space steps); apply with
    `git apply --ignore-whitespace` or confirm against upstream.
  * The two bare `#include` context lines in the first init.cc hunk lost
    their header names and could not be recovered from this text.

 projects/rccl/src/graph/search.cc       | 35 ++++++++++++++++++
 projects/rccl/src/include/graph.h       |  1 +
 projects/rccl/src/init.cc               | 12 +++----
 projects/rccl/src/transport/p2p.cc      |  8 ++---
 projects/rccl/tools/topo_expl/utils.cpp | 47 +++++++++++++++++++++++++
 5 files changed, 92 insertions(+), 11 deletions(-)

diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc
index df34ac3554..15726f4177 100644
--- a/projects/rccl/src/graph/search.cc
+++ b/projects/rccl/src/graph/search.cc
@@ -1036,3 +1036,38 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str
   }
   return ncclSuccess;
 }
+
+ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, bool direct_only) {
+  int ngpus = system->nodes[GPU].count;
+  *isXGMI = false;
+  // check for direct XGMI connection
+  for (int i=0; i<ngpus; i++) {
+    if (system->nodes[GPU].nodes[i].gpu.dev == cudaDev1) {
+      struct ncclTopoNode *node = system->nodes[GPU].nodes+i;
+      for (int k = 0; k<system->nodes[GPU].count; k++) {
+        if (node->paths[GPU][k].count == 1) {
+          struct ncclTopoLink* link = node->paths[GPU][k].list[0];
+          struct ncclTopoNode* remNode = link->remNode;
+          if (remNode->gpu.dev == cudaDev2) {
+            *isXGMI = (link->type == LINK_NVL);
+            return ncclSuccess;
+          }
+        }
+      }
+    }
+  }
+  if (direct_only) return ncclSuccess;
+  // check if there is intermediate GPU that is connected to both
+  for (int i=0; i<ngpus; i++) {
+    if (system->nodes[GPU].nodes[i].gpu.dev == cudaDev1 || system->nodes[GPU].nodes[i].gpu.dev == cudaDev2)
+      continue;
+    bool res1, res2;
+    ncclTopoGetLinkType(system, system->nodes[GPU].nodes[i].gpu.dev, cudaDev1, &res1, true);
+    ncclTopoGetLinkType(system, system->nodes[GPU].nodes[i].gpu.dev, cudaDev2, &res2, true);
+    if (res1 && res2) {
+      *isXGMI = true;
+      return ncclSuccess;
+    }
+  }
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/include/graph.h b/projects/rccl/src/include/graph.h
index 4fa452e7d2..5072ba1249 100644
--- a/projects/rccl/src/include/graph.h
+++ b/projects/rccl/src/include/graph.h
@@ -34,6 +34,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct n
 ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
 ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
+ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, bool direct_only=false);
 
 // Set CPU affinity
 ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index 5579cfa361..4021abbfae 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -28,7 +28,6 @@
 #include
 #include
 #include "graph/topo.h"
-#include "rocm_smi_wrap.h"
 
 // [RCCL]
 #include "clique/CliqueManager.h"
@@ -775,12 +774,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
   NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
 
+  bool allXgmi = true;
   { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
     CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
     if (comm->localRanks == comm->nRanks)
     {
       // Check that all the GPUs have peer access to one another and are XGMI connected
-      bool allXgmi = true;
       bool hasPeerAccess = true;
       for (int i = 0; i < nranks && hasPeerAccess; i++)
       {
@@ -796,10 +795,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
             break;
           }
 
-          RSMI_IO_LINK_TYPE linkType;
-          int hopCount, bw;
-          NCCLCHECK(rocm_smi_getLinkInfo(i, j, &linkType, &hopCount, &bw));
-          allXgmi &= (linkType == RSMI_IOLINK_TYPE_XGMI);
+          bool isXGMI;
+          NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI));
+          allXgmi &= isXGMI;
         }
       }
       if (hasPeerAccess)
@@ -865,6 +863,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   int idx;
   NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
   allGather3Data[rank].nc = 2;
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
+    allGather3Data[rank].nc = 4;
   if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
     allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
   if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))
diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc
index d2f98ab0b1..50e6279437 100644
--- a/projects/rccl/src/transport/p2p.cc
+++ b/projects/rccl/src/transport/p2p.cc
@@ -9,7 +9,6 @@
 #include "graph.h"
 #include "utils.h"
 #include "bootstrap.h"
-#include "rocm_smi_wrap.h"
 
 struct p2pConnectInfo {
   int rank;
@@ -170,13 +169,12 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
   NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
 
   resources->next_hdp_reg = 0;
-  RSMI_IO_LINK_TYPE linktype;
-  int hops, bw;
-  if (rocm_smi_getLinkInfo(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops, &bw) != ncclSuccess) {
+  bool isXGMI;
+  if (ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &isXGMI) != ncclSuccess) {
     INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
     return ncclInternalError;
   }
-  if (linktype != RSMI_IOLINK_TYPE_XGMI) {
+  if (!isXGMI) {
     CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
     TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
   }
diff --git a/projects/rccl/tools/topo_expl/utils.cpp b/projects/rccl/tools/topo_expl/utils.cpp
index 8ecebbe690..9360df4eb4 100644
--- a/projects/rccl/tools/topo_expl/utils.cpp
+++ b/projects/rccl/tools/topo_expl/utils.cpp
@@ -391,6 +391,51 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
   NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
   NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
 
+  bool allXgmi = true;
+  { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
+    //CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
+    if (comm->localRanks == comm->nRanks)
+    {
+      // Check that all the GPUs have peer access to one another and are XGMI connected
+      bool hasPeerAccess = true;
+      for (int i = 0; i < nranks && hasPeerAccess; i++)
+      {
+        int cudaDev1 = allGather1Data[i].peerInfo.cudaDev;
+        for (int j = 0; j < nranks; j++)
+        {
+          if (i == j) continue;
+          int cudaDev2 = allGather1Data[j].peerInfo.cudaDev;
+          //int p2p;
+          //if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
+          //{
+          //  hasPeerAccess = false;
+          //  break;
+          //}
+
+          bool isXGMI;
+          NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI));
+          allXgmi &= isXGMI;
+        }
+      }
+      //if (hasPeerAccess)
+      //{
+      //  if (intraRanks == nranks)
+      //    cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
+      //  else
+      //    cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
+      //}
+
+      // For now, only enable clique-based kernels on nodes where all GPUs are XGMI connected
+      //if (!allXgmi && !rcclParamCliqueIgnoreTopo())
+      //{
+      //  INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)");
+      //  cliqueMode = CliqueManager::CLIQUE_DISABLED;
+      //}
+    }
+    //comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
+    //NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
+  } // [/RCCL]
+
   if (comm->rank == ncclParamGraphDumpFileRank()) {
     struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
     NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
@@ -437,6 +482,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
   int idx;
   NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
   allGather3Data[rank].nc = 2;
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
+    allGather3Data[rank].nc = 4;
   if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
     allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
   if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))