Query XGMI links from xml and adjust gfx906 channel usage (#410)
[ROCm/rccl commit: 818cdb16a8]
Este commit está contenido en:
@@ -1036,3 +1036,38 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/**
 * Report whether two GPUs are linked over XGMI according to the topology graph.
 *
 * The lookup runs in two passes:
 *   1. Direct: find the GPU node whose gpu.dev equals cudaDev1 and scan its
 *      GPU paths that consist of exactly one link. If such a path ends at the
 *      GPU whose gpu.dev equals cudaDev2, *isXGMI is set to whether that link
 *      has type LINK_NVL (the link type this function treats as XGMI), and
 *      the function returns immediately -- the second pass is not attempted
 *      even when the direct link is of another type.
 *   2. Indirect (skipped when direct_only is true): look for a third GPU that
 *      has a direct LINK_NVL link to both cudaDev1 and cudaDev2.
 *
 * @param system      Topology to query; its GPU paths are read, so they are
 *                    presumably expected to be computed already (TODO confirm
 *                    against callers).
 * @param cudaDev1    Device number matched against gpu.dev of the first GPU.
 * @param cudaDev2    Device number matched against gpu.dev of the second GPU.
 * @param isXGMI      Output flag; reset to false on entry and set to true only
 *                    when one of the two passes succeeds.
 * @param direct_only When true, only pass 1 is performed.
 * @return ncclSuccess, or the first non-success status reported by a nested
 *         lookup.
 */
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, bool direct_only) {
  const int ngpus = system->nodes[GPU].count;
  struct ncclTopoNode* gpus = system->nodes[GPU].nodes;

  *isXGMI = false;

  // Pass 1: look for a single-link path from cudaDev1 to cudaDev2.
  for (int i = 0; i < ngpus; i++) {
    struct ncclTopoNode* node = gpus + i;
    if (node->gpu.dev != cudaDev1) continue;

    for (int k = 0; k < ngpus; k++) {
      // Only paths made of exactly one link count as a direct connection.
      if (node->paths[GPU][k].count != 1) continue;

      struct ncclTopoLink* link = node->paths[GPU][k].list[0];
      if (link->remNode->gpu.dev != cudaDev2) continue;

      *isXGMI = (link->type == LINK_NVL);
      return ncclSuccess;
    }
  }

  if (direct_only) return ncclSuccess;

  // Pass 2: look for an intermediate GPU that is directly linked to both ends.
  for (int i = 0; i < ngpus; i++) {
    const int midDev = gpus[i].gpu.dev;
    if (midDev == cudaDev1 || midDev == cudaDev2) continue;

    bool linkedToDev1 = false;
    bool linkedToDev2 = false;
    ncclResult_t rc;

    // Propagate failures instead of silently reading a flag that may not
    // have been written by the nested call.
    rc = ncclTopoGetLinkType(system, midDev, cudaDev1, &linkedToDev1, true);
    if (rc != ncclSuccess) return rc;
    rc = ncclTopoGetLinkType(system, midDev, cudaDev2, &linkedToDev2, true);
    if (rc != ncclSuccess) return rc;

    if (linkedToDev1 && linkedToDev2) {
      *isXGMI = true;
      return ncclSuccess;
    }
  }

  return ncclSuccess;
}
|
||||
|
||||
@@ -34,6 +34,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct n
|
||||
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
|
||||
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
|
||||
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
|
||||
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, bool direct_only=false);
|
||||
|
||||
// Set CPU affinity
|
||||
ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
|
||||
|
||||
@@ -28,7 +28,6 @@
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include "graph/topo.h"
|
||||
#include "rocm_smi_wrap.h"
|
||||
|
||||
// [RCCL]
|
||||
#include "clique/CliqueManager.h"
|
||||
@@ -775,12 +774,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
|
||||
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
|
||||
|
||||
bool allXgmi = true;
|
||||
{ // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
|
||||
CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
|
||||
if (comm->localRanks == comm->nRanks)
|
||||
{
|
||||
// Check that all the GPUs have peer access to one another and are XGMI connected
|
||||
bool allXgmi = true;
|
||||
bool hasPeerAccess = true;
|
||||
for (int i = 0; i < nranks && hasPeerAccess; i++)
|
||||
{
|
||||
@@ -796,10 +795,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
break;
|
||||
}
|
||||
|
||||
RSMI_IO_LINK_TYPE linkType;
|
||||
int hopCount, bw;
|
||||
NCCLCHECK(rocm_smi_getLinkInfo(i, j, &linkType, &hopCount, &bw));
|
||||
allXgmi &= (linkType == RSMI_IOLINK_TYPE_XGMI);
|
||||
bool isXGMI;
|
||||
NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI));
|
||||
allXgmi &= isXGMI;
|
||||
}
|
||||
}
|
||||
if (hasPeerAccess)
|
||||
@@ -865,6 +863,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
int idx;
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
|
||||
allGather3Data[rank].nc = 2;
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
|
||||
allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#include "graph.h"
|
||||
#include "utils.h"
|
||||
#include "bootstrap.h"
|
||||
#include "rocm_smi_wrap.h"
|
||||
|
||||
struct p2pConnectInfo {
|
||||
int rank;
|
||||
@@ -170,13 +169,12 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
|
||||
|
||||
resources->next_hdp_reg = 0;
|
||||
RSMI_IO_LINK_TYPE linktype;
|
||||
int hops, bw;
|
||||
if (rocm_smi_getLinkInfo(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops, &bw) != ncclSuccess) {
|
||||
bool isXGMI;
|
||||
if (ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &isXGMI) != ncclSuccess) {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank);
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (linktype != RSMI_IOLINK_TYPE_XGMI) {
|
||||
if (!isXGMI) {
|
||||
CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev));
|
||||
TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg);
|
||||
}
|
||||
|
||||
@@ -391,6 +391,51 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
|
||||
NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
|
||||
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
|
||||
|
||||
bool allXgmi = true;
|
||||
{ // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
|
||||
//CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
|
||||
if (comm->localRanks == comm->nRanks)
|
||||
{
|
||||
// Check that all the GPUs have peer access to one another and are XGMI connected
|
||||
bool hasPeerAccess = true;
|
||||
for (int i = 0; i < nranks && hasPeerAccess; i++)
|
||||
{
|
||||
int cudaDev1 = allGather1Data[i].peerInfo.cudaDev;
|
||||
for (int j = 0; j < nranks; j++)
|
||||
{
|
||||
if (i == j) continue;
|
||||
int cudaDev2 = allGather1Data[j].peerInfo.cudaDev;
|
||||
//int p2p;
|
||||
//if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
|
||||
//{
|
||||
// hasPeerAccess = false;
|
||||
// break;
|
||||
//}
|
||||
|
||||
bool isXGMI;
|
||||
NCCLCHECK(ncclTopoGetLinkType(comm->topo, i, j, &isXGMI));
|
||||
allXgmi &= isXGMI;
|
||||
}
|
||||
}
|
||||
//if (hasPeerAccess)
|
||||
//{
|
||||
// if (intraRanks == nranks)
|
||||
// cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
|
||||
// else
|
||||
// cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
|
||||
//}
|
||||
|
||||
// For now, only enable clique-based kernels on nodes where all GPUs are XGMI connected
|
||||
//if (!allXgmi && !rcclParamCliqueIgnoreTopo())
|
||||
//{
|
||||
// INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)");
|
||||
// cliqueMode = CliqueManager::CLIQUE_DISABLED;
|
||||
//}
|
||||
}
|
||||
//comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
|
||||
//NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
|
||||
} // [/RCCL]
|
||||
|
||||
if (comm->rank == ncclParamGraphDumpFileRank()) {
|
||||
struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
|
||||
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
|
||||
@@ -437,6 +482,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
|
||||
int idx;
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
|
||||
allGather3Data[rank].nc = 2;
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 906 && allXgmi)
|
||||
allGather3Data[rank].nc = 4;
|
||||
if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908)
|
||||
allGather3Data[rank].nc = std::max(4/ringGraph.nChannels, 2);
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G))
|
||||
|
||||
Referencia en una nueva incidencia
Block a user