diff --git a/src/transport/net.cc b/src/transport/net.cc index 2867e35452..421dd8b881 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -17,6 +17,8 @@ #if defined(ENABLE_NPKIT) #include "npkit/npkit.h" #endif +#include "graph.h" +#include "../graph/topo.h" static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); @@ -165,6 +167,7 @@ struct setupReq { int useGdr; int channelId; int connIndex; + uint32_t* curr_hdp_reg; }; /* Determine if we will use this transport for this peer and return connect @@ -176,12 +179,17 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.channelId = channelId; req.connIndex = connIndex; req.netDev = -1; + req.curr_hdp_reg = 0; int proxyRank = myInfo->rank; if (connIndex == NCCL_CONN_IDX_P2P_NET) NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, myInfo->rank, graph, channelId, 1, &req.netDev)); if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; + if (req.useGdr && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) { + CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev)); + send->conn.curr_hdp_reg = req.curr_hdp_reg; + } NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn)); req.rank = myInfo->rank; @@ -457,6 +465,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->useGdr = req->useGdr; resources->channelId = req->channelId; resources->connIndex = req->connIndex; + resources->curr_hdp_reg = req->curr_hdp_reg; ncclNetProperties_t props; NCCLCHECK(ncclNetGetProperties(req->netDev, &props)); resources->maxRecvs = props.maxRecvs; diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 3a756369f4..abbfe8e416 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -8,6 +8,8 @@ #include "comm.h" #include "graph.h" #include "utils.h" +#include "graph.h" +#include "../graph/topo.h" struct ncclP2pBuff { void* directPtr; @@ -193,7 +195,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", channelId, myInfo->rank, peerInfo->rank); return ncclInternalError; } - if (!isXGMI) { + if (!isXGMI && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910) { CUDACHECK(hipDeviceGetAttribute((int*)&resources->next_hdp_reg, hipDeviceAttributeHdpMemFlushCntl,peerInfo->cudaDev)); TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", channelId, myInfo->rank, peerInfo->rank, resources->next_hdp_reg); }