Query XGMI links from xml and adjust gfx906 channel usage (#410)

[ROCm/rccl commit: 818cdb16a8]
Este commit está contenido en:
Wenkai Du
2021-07-27 17:32:41 -07:00
cometido por GitHub
padre 765c46dd89
commit faea6ead5c
Se han modificado 5 ficheros con 92 adiciones y 11 borrados
+35
Ver fichero
@@ -1036,3 +1036,38 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str
}
return ncclSuccess;
}
/*
 * Report whether two GPUs of the topology are XGMI-connected.
 *
 * system      : topology whose GPU nodes and GPU-to-GPU paths are inspected.
 * cudaDev1    : device id (compared against gpu.dev) of the first GPU.
 * cudaDev2    : device id (compared against gpu.dev) of the second GPU.
 * isXGMI      : output; always written. Set to true when a single-link path
 *               of type LINK_NVL joins the two GPUs, or (unless direct_only)
 *               when some third GPU has such a link to both of them.
 * direct_only : when true, only the direct single-link check is performed.
 *
 * Returns ncclSuccess, or the error reported by a nested lookup.
 *
 * NOTE(review): assumes node->paths[GPU] has already been populated for every
 * GPU node before this is called -- confirm against the call sites.
 */
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, bool direct_only) {
  int ngpus = system->nodes[GPU].count;
  *isXGMI = false;

  // Check for a direct connection: a path from cudaDev1 made of exactly one
  // link whose remote end is cudaDev2.
  for (int i = 0; i < ngpus; i++) {
    struct ncclTopoNode* node = system->nodes[GPU].nodes + i;
    if (node->gpu.dev != cudaDev1) continue;
    for (int k = 0; k < ngpus; k++) {
      if (node->paths[GPU][k].count != 1) continue;
      struct ncclTopoLink* link = node->paths[GPU][k].list[0];
      if (link->remNode->gpu.dev == cudaDev2) {
        // A direct link exists; its type alone decides the answer.
        *isXGMI = (link->type == LINK_NVL);
        return ncclSuccess;
      }
    }
  }
  if (direct_only) return ncclSuccess;

  // Check whether an intermediate GPU is directly XGMI-connected to both.
  for (int i = 0; i < ngpus; i++) {
    int midDev = system->nodes[GPU].nodes[i].gpu.dev;
    if (midDev == cudaDev1 || midDev == cudaDev2) continue;

    bool res1 = false;
    bool res2 = false;
    // Propagate failures from the nested lookups instead of dropping them.
    ncclResult_t rc = ncclTopoGetLinkType(system, midDev, cudaDev1, &res1, true);
    if (rc != ncclSuccess) return rc;
    rc = ncclTopoGetLinkType(system, midDev, cudaDev2, &res2, true);
    if (rc != ncclSuccess) return rc;

    if (res1 && res2) {
      *isXGMI = true;
      return ncclSuccess;
    }
  }
  return ncclSuccess;
}
+1
Ver fichero
@@ -34,6 +34,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct n
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, bool direct_only=false);
// Set CPU affinity
ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
+6 -6
Ver fichero
@@ -28,7 +28,6 @@
#include <sys/stat.h>
#include <unistd.h>
#include "graph/topo.h"
#include "rocm_smi_wrap.h"
// [RCCL]
#include "clique/CliqueManager.h"
@@ -775,12 +774,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
bool allXgmi = true;
{ // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
if (comm->localRanks == comm->nRanks)
{
// Check that all the GPUs have peer access to one another and are XGMI connected
bool allXgmi = true;
bool hasPeerAccess = true;
for (int i = 0; i < nranks && hasPeerAccess; i++)
{
@@ -796,10 +795,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
break;
}
RSMI_IO_LINK_TYPE linkType;
int hopCount, bw;
NCCLCHECK(rocm_smi_getLinkInfo(i, j, &linkType, &hopCount, &bw));
allXgmi &= (linkType == RSMI_IOLINK_TYPE_XGMI);
bool isXGMI;
NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI));
allXgmi &= isXGMI;
}
}
if (hasPeerAccess)
@@ -865,6 +863,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int idx;
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
allGather3Data[rank].nc = 2;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))
+3 -5
Ver fichero
@@ -9,7 +9,6 @@
#include "graph.h"
#include "utils.h"
#include "bootstrap.h"
#include "rocm_smi_wrap.h"
struct p2pConnectInfo {
int rank;
@@ -170,13 +169,12 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
resources->next_hdp_reg = 0;
RSMI_IO_LINK_TYPE linktype;
int hops, bw;
if (rocm_smi_getLinkInfo(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops, &bw) != ncclSuccess) {
bool isXGMI;
if (ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &isXGMI) != ncclSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
return ncclInternalError;
}
if (linktype != RSMI_IOLINK_TYPE_XGMI) {
if (!isXGMI) {
CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
}
+47
Ver fichero
@@ -391,6 +391,51 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
bool allXgmi = true;
{ // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
//CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
if (comm->localRanks == comm->nRanks)
{
// Check that all the GPUs have peer access to one another and are XGMI connected
bool hasPeerAccess = true;
for (int i = 0; i < nranks && hasPeerAccess; i++)
{
int cudaDev1 = allGather1Data[i].peerInfo.cudaDev;
for (int j = 0; j < nranks; j++)
{
if (i == j) continue;
int cudaDev2 = allGather1Data[j].peerInfo.cudaDev;
//int p2p;
//if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
//{
// hasPeerAccess = false;
// break;
//}
bool isXGMI;
NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI));
allXgmi &= isXGMI;
}
}
//if (hasPeerAccess)
//{
// if (intraRanks == nranks)
// cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
// else
// cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
//}
// For now, only enable clique-based kernels on nodes where all GPUs are XGMI connected
//if (!allXgmi && !rcclParamCliqueIgnoreTopo())
//{
// INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)");
// cliqueMode = CliqueManager::CLIQUE_DISABLED;
//}
}
//comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
//NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
} // [/RCCL]
if (comm->rank == ncclParamGraphDumpFileRank()) {
struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
@@ -437,6 +482,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
int idx;
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
allGather3Data[rank].nc = 2;
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
allGather3Data[rank].nc = 4;
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))