Enable local sendrecv over network if GDR is available on all GPUs (#324)
[ROCm/rccl commit: c018edf0f2]
This commit is contained in:
@@ -23,6 +23,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
|
||||
for (size_t i=0; i<comm->nRanks+1; ++i) {
|
||||
channel->peers[i].send.comm = comm;
|
||||
channel->peers[i].recv.comm = comm;
|
||||
channel->peers[i].p2pSend.comm = comm;
|
||||
channel->peers[i].p2pRecv.comm = comm;
|
||||
}
|
||||
|
||||
// Per-channel operation list.
|
||||
@@ -46,10 +48,16 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
|
||||
for (int r=0; r<nRanks+1; r++) {
|
||||
struct ncclPeer* peer = channel->peers+r;
|
||||
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
|
||||
if (peer->send.transportResources == peer->p2pSend.transportResources) peer->p2pSend.transportResources = NULL;
|
||||
peer->send.transportResources = NULL;
|
||||
if (peer->p2pSend.transportResources) NCCLCHECK(peer->p2pSend.transportComm->free(peer->p2pSend.transportResources));
|
||||
}
|
||||
for (int r=0; r<nRanks+1; r++) {
|
||||
struct ncclPeer* peer = channel->peers+r;
|
||||
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
|
||||
if (peer->recv.transportResources == peer->p2pRecv.transportResources) peer->p2pRecv.transportResources = NULL;
|
||||
peer->recv.transportResources = NULL;
|
||||
if (peer->p2pRecv.transportResources) NCCLCHECK(peer->p2pRecv.transportComm->free(peer->p2pRecv.transportResources));
|
||||
}
|
||||
|
||||
// Free the peer structures.
|
||||
|
||||
@@ -71,6 +71,7 @@ class ncclPrimitives {
|
||||
int peer = -1;
|
||||
int role = 0;
|
||||
int group;
|
||||
const int p2p;
|
||||
uint64_t step;
|
||||
T* direct = NULL;
|
||||
T* buff;
|
||||
@@ -198,7 +199,7 @@ class ncclPrimitives {
|
||||
|
||||
__device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
|
||||
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
|
||||
conn = &channel->devPeers[peer].recv.conn;
|
||||
conn = (LOAD(comm->p2pNet) && p2p) ? &channel->devPeers[peer].p2pRecv.conn : &channel->devPeers[peer].recv.conn;
|
||||
step = conn->step;
|
||||
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
|
||||
if (role & ROLE_POST_RECV) {
|
||||
@@ -221,7 +222,7 @@ class ncclPrimitives {
|
||||
|
||||
__device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
|
||||
if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) {
|
||||
conn = &channel->devPeers[peer].send.conn;
|
||||
conn = (LOAD(comm->p2pNet) && p2p) ? &channel->devPeers[peer].p2pSend.conn : &channel->devPeers[peer].send.conn;
|
||||
step = conn->step;
|
||||
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
|
||||
if (role & ROLE_POST_SEND) {
|
||||
@@ -251,8 +252,8 @@ class ncclPrimitives {
|
||||
|
||||
public:
|
||||
__device__ __forceinline__
|
||||
ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
|
||||
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next) {
|
||||
ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group, int p2p = 0)
|
||||
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next), p2p(p2p) {
|
||||
nthreads = nworkers;
|
||||
// For send operations, we need an extra warp to overlap the threadfence and the copy
|
||||
// int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
|
||||
|
||||
@@ -56,7 +56,7 @@ class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T,
|
||||
int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
|
||||
int nt = nThreadsSplit;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
|
||||
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
|
||||
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv, 1);
|
||||
|
||||
if (recvCount == 0) {
|
||||
prims.recv(recvbuff, 0);
|
||||
@@ -71,7 +71,7 @@ class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T,
|
||||
int peer = (comm->rank+delta)%comm->nRanks;
|
||||
int nt = nThreads-nThreadsSplit;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
|
||||
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
|
||||
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend, 1);
|
||||
|
||||
if (sendCount == 0) {
|
||||
prims.send(sendbuff, 0);
|
||||
|
||||
@@ -391,8 +391,8 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
|
||||
NCCLCHECK(getLoopInfo(info));
|
||||
|
||||
if ((info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv)
|
||||
&& info->comm->topo->nodes[NET].count == 0 && info->comm->topo->type == RCCL_TOPO_4P2H_ROME)
|
||||
info->nChannels =info->comm->p2pnChannels;
|
||||
&& info->comm->topo->nodes[GPU].count == info->comm->topo->nRanks && (info->comm->topo->type & RCCL_TOPO_4P2H_ROME))
|
||||
info->nChannels = info->comm->p2pnChannels;
|
||||
|
||||
work->opCount = info->comm->opCount;
|
||||
work->sendbuff = info->sendbuff;
|
||||
@@ -617,7 +617,8 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].send.connected == 0) {
|
||||
if ((LOAD(comm->p2pNet) ? comm->channels[channelId].peers[peer].p2pSend.connected :
|
||||
comm->channels[channelId].peers[peer].send.connected) == 0) {
|
||||
comm->connectSend[peer] |= (1<<channelId);
|
||||
comm->connect = 1;
|
||||
}
|
||||
@@ -630,7 +631,8 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].recv.connected == 0) {
|
||||
if ((LOAD(comm->p2pNet) ? comm->channels[channelId].peers[peer].p2pRecv.connected :
|
||||
comm->channels[channelId].peers[peer].recv.connected ) == 0) {
|
||||
comm->connectRecv[peer] |= (1<<channelId);
|
||||
comm->connect = 1;
|
||||
}
|
||||
@@ -643,7 +645,7 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
}
|
||||
|
||||
static int getSegment(struct ncclInfo* info, struct ncclWork* work) {
|
||||
const int e = (info->comm->topo->nodes[NET].count == 0 && info->comm->topo->type == RCCL_TOPO_4P2H_ROME)
|
||||
const int e = (info->comm->topo->nodes[GPU].count == info->comm->topo->nRanks && (info->comm->topo->type & RCCL_TOPO_4P2H_ROME))
|
||||
? 1 : NCCL_MAX_WORK_ELEMENTS;
|
||||
for (int s=0; s<e && work->elems[s].p2p.delta != info->delta; s++) {
|
||||
if (work->elems[s].p2p.nThreads == 0) return s;
|
||||
|
||||
@@ -267,9 +267,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
|
||||
int nc = nChannels*2;
|
||||
if (gcn == 908) nc = std::max(nc, 4);
|
||||
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*4;
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G)) nc = nChannels*4;
|
||||
if (!nnets) nnets = comm->topo->nodes[NET].count;
|
||||
if (nnets && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = (nnets > 3 ? 2 : 4)*nnets;
|
||||
if (nnets && (comm->topo->type & RCCL_TOPO_4P2H_ROME)) nc = (nnets > 3 ? 2 : 4)*nnets;
|
||||
int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels()));
|
||||
|
||||
// Duplication should be complete now
|
||||
|
||||
@@ -468,8 +468,29 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
|
||||
}
|
||||
|
||||
int remove = 1;
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD
|
||||
&& model == NCCL_TOPO_CPU_TYPE_ROME) {
|
||||
int gdr, ret = 1;
|
||||
int64_t net;
|
||||
for (int g = 0; g < system->nodes[GPU].count; g++) {
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, &net, 0));
|
||||
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
|
||||
if (!gdr) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ret) {
|
||||
system->type |= RCCL_TOPO_GDR_ALL;
|
||||
remove = 0;
|
||||
INFO(NCCL_GRAPH, "GDR is available on all GPUs");
|
||||
}
|
||||
}
|
||||
comm->localRanks = system->nodes[GPU].count;
|
||||
if (system->nodes[GPU].count == comm->nRanks) {
|
||||
if (system->nodes[GPU].count == comm->nRanks && remove) {
|
||||
for (int n=system->nodes[NET].count-1; n>=0; n--)
|
||||
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
|
||||
}
|
||||
@@ -529,7 +550,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
}
|
||||
}
|
||||
|
||||
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) {
|
||||
if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL)) {
|
||||
// Adjust P2P channels on Rome
|
||||
comm->p2pnChannelsPerPeer = 2;
|
||||
comm->p2pnChannels = 2;
|
||||
|
||||
@@ -525,7 +525,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
* `--> NET n (or m if crossNic)
|
||||
*/
|
||||
ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
|
||||
if (system->nodes[NET].count) {
|
||||
if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
|
||||
if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1;
|
||||
else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1;
|
||||
else *backToNet = 0;
|
||||
@@ -541,7 +541,7 @@ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, in
|
||||
ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) {
|
||||
int backToNet, backToFirstRank;
|
||||
NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
|
||||
if (system->nodes[NET].count) {
|
||||
if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
|
||||
// Start from NET
|
||||
ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
|
||||
} else {
|
||||
@@ -831,9 +831,9 @@ static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclT
|
||||
dist[m] = dist[n]; dist[n] = temp;
|
||||
}
|
||||
// create chordal ring based on reference and remapped ids
|
||||
system->type = RCCL_TOPO_CR8G;
|
||||
system->type |= RCCL_TOPO_CR8G;
|
||||
NCCLCHECK(parseGraph(ringBase, system, graph, id, 0, NULL));
|
||||
if (system->nodes[NET].count) {
|
||||
if (system->nodes[NET].count && system->nodes[GPU].count != system->nRanks) {
|
||||
int *intra, *used;
|
||||
graph->nChannels = system->nodes[NET].count;
|
||||
NCCLCHECK(ncclCalloc(&intra, ngpus));
|
||||
@@ -909,8 +909,7 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
|
||||
romeTopo->connMatrix[i*romeTopo->nGpus+n] = 1;
|
||||
count ++;
|
||||
}
|
||||
if (!romeTopo->nLinks) romeTopo->nLinks = count;
|
||||
else if (romeTopo->nLinks != count) return ncclSuccess;
|
||||
if (romeTopo->nLinks < count) romeTopo->nLinks = count;
|
||||
}
|
||||
|
||||
// trim ports and create NET map
|
||||
@@ -1050,10 +1049,10 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopo
|
||||
struct rcclRomeModel romeTopo;
|
||||
char pattern[256];
|
||||
int net_map[MAX_ROME_NICS];
|
||||
parseRomeSystem(system, &romeTopo, pattern, net_map);
|
||||
NCCLCHECK(parseRomeSystem(system, &romeTopo, pattern, net_map));
|
||||
|
||||
// recognize system as Rome 4P2H even if no matching model
|
||||
if (ngpus == 8 && romeTopo.nLinks) system->type = RCCL_TOPO_4P2H_ROME;
|
||||
if (ngpus > 4 && romeTopo.nLinks) system->type |= RCCL_TOPO_4P2H_ROME;
|
||||
|
||||
int g[MAX_ROME_GPUS];
|
||||
int time = 0;
|
||||
@@ -1135,8 +1134,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
// user supplied topo
|
||||
NCCLCHECK(parseGraph(str, system, graph, NULL, nnets, NULL));
|
||||
if (graph->nChannels) {
|
||||
system->type = RCCL_TOPO_4P2H_ROME;
|
||||
return ncclSuccess;
|
||||
system->type |= RCCL_TOPO_4P2H_ROME;
|
||||
}
|
||||
} else {
|
||||
// try to match 8P6L
|
||||
@@ -1144,8 +1142,8 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
if (graph->nChannels) return ncclSuccess;
|
||||
// try to match Rome 4P2H
|
||||
NCCLCHECK(parseRome4P2H(system, graph));
|
||||
if (graph->nChannels) return ncclSuccess;
|
||||
}
|
||||
if (graph->nChannels) return ncclSuccess;
|
||||
|
||||
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
|
||||
|
||||
@@ -1293,7 +1291,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
|
||||
for (int c=0; c<graph->nChannels; c++) {
|
||||
sprintf(line, "%2d :", c);
|
||||
int offset = strlen(line);
|
||||
if (system->nodes[NET].count > 0) {
|
||||
if (system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks) {
|
||||
sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c]);
|
||||
offset = strlen(line);
|
||||
}
|
||||
@@ -1301,7 +1299,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
|
||||
sprintf(line+offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus*c+i]);
|
||||
offset = strlen(line);
|
||||
}
|
||||
if (system->nodes[NET].count > 0) {
|
||||
if (system->nodes[NET].count > 0 && system->nodes[GPU].count != system->nRanks) {
|
||||
sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c+1]);
|
||||
offset = strlen(line);
|
||||
}
|
||||
|
||||
@@ -82,8 +82,9 @@ struct ncclTopoLinkList {
|
||||
|
||||
#define NCCL_TOPO_UNDEF (-1)
|
||||
|
||||
#define RCCL_TOPO_CR8G 1
|
||||
#define RCCL_TOPO_CR8G 1
|
||||
#define RCCL_TOPO_4P2H_ROME 2
|
||||
#define RCCL_TOPO_GDR_ALL 4
|
||||
|
||||
struct ncclTopoNode {
|
||||
int type;
|
||||
@@ -132,6 +133,7 @@ struct ncclTopoSystem {
|
||||
float maxWidth;
|
||||
float totalWidth;
|
||||
int type;
|
||||
int nRanks;
|
||||
};
|
||||
|
||||
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
|
||||
|
||||
@@ -121,6 +121,9 @@ struct ncclComm {
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
|
||||
// Flags for enable P2P NET
|
||||
uint32_t *p2pNet;
|
||||
|
||||
// Device side of the communicator
|
||||
struct ncclDevComm *devComm;
|
||||
// Host copy of the devComm (to free CUDA allocs)
|
||||
|
||||
@@ -150,6 +150,8 @@ struct ncclTree {
|
||||
struct ncclPeer {
|
||||
struct ncclConnector send;
|
||||
struct ncclConnector recv;
|
||||
struct ncclConnector p2pSend;
|
||||
struct ncclConnector p2pRecv;
|
||||
};
|
||||
|
||||
struct ncclDevComm;
|
||||
@@ -332,6 +334,9 @@ struct ncclDevComm {
|
||||
// Channels, device side
|
||||
struct ncclChannel* channels;
|
||||
|
||||
// Flags for enable P2P NET
|
||||
uint32_t *p2pNet;
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
// Profiling counters
|
||||
struct ncclProf* devProf;
|
||||
|
||||
@@ -352,6 +352,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
free(comm->intraCC);
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
|
||||
NCCLCHECK(ncclCudaHostFree((void *)comm->p2pNet));
|
||||
|
||||
// Poison comm to try and catch a double free
|
||||
commPoison(comm);
|
||||
@@ -362,6 +363,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
|
||||
RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 1);
|
||||
RCCL_PARAM(ForceEnableClique, "FORCE_ENABLE_CLIQUE", 0);
|
||||
RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);
|
||||
|
||||
static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
if (ndev < 1) {
|
||||
@@ -401,6 +403,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
comm->hostDevComm.abortFlag = comm->abortFlag;
|
||||
STORE(comm->abortFlag, 0);
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->p2pNet, 1));
|
||||
comm->hostDevComm.p2pNet = comm->p2pNet;
|
||||
STORE(comm->p2pNet, 0);
|
||||
|
||||
comm->argsptr = &comm->args;
|
||||
#ifdef ENABLE_PROFILING
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.devProf, 1));
|
||||
@@ -808,6 +814,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
|
||||
// Topo detection / System graph creation
|
||||
NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
|
||||
// save nRanks to ncclTopoSystem as indicator of multi-node
|
||||
comm->topo->nRanks = comm->nRanks;
|
||||
// Compute paths between GPUs and NICs
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
|
||||
// Remove inaccessible GPUs and unused NICs
|
||||
@@ -852,7 +860,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
if (!rcclParamForceEnableClique())
|
||||
{
|
||||
// Disable clique-kernel support if not on CR8 topology
|
||||
if (!(comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G))
|
||||
if (!(comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G)))
|
||||
{
|
||||
INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (force enable with RCCL_FORCE_ENABLE_CLIQUE)");
|
||||
cliqueMode = CliqueManager::CLIQUE_DISABLED;
|
||||
@@ -898,6 +906,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
|
||||
}
|
||||
|
||||
if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
|
||||
if (rcclParamP2pNetDisable() == 0) {
|
||||
STORE(comm->p2pNet, 1);
|
||||
INFO(NCCL_INIT, "RCCL enabled same node P2P over network");
|
||||
}
|
||||
else
|
||||
INFO(NCCL_INIT, "RCCL force disabled same node P2P over network");
|
||||
}
|
||||
// AllGather3 - begin
|
||||
struct ncclGraphInfo {
|
||||
int pattern;
|
||||
@@ -925,7 +941,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
|
||||
allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
|
||||
allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
|
||||
allGather3Data[rank].alltoallDisable = comm->topo->nodes[NET].count? 1 : comm->alltoallDisable;
|
||||
allGather3Data[rank].alltoallDisable = comm->topo->nodes[GPU].count == comm->topo->nRanks ? 1 : comm->alltoallDisable;
|
||||
|
||||
allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
|
||||
std::min(treeGraph.nChannels, ringGraph.nChannels);
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#include "graph.h"
|
||||
#include "collectives.h"
|
||||
|
||||
enum { proxyRecv=0, proxySend=1 };
|
||||
enum { proxyRecv=0, proxySend=1, p2pProxyRecv=2, p2pProxySend=3 };
|
||||
|
||||
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
|
||||
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
|
||||
@@ -162,10 +162,12 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
|
||||
if (peer < 0) return ncclSuccess;
|
||||
|
||||
struct ncclPeer* peerComm = args->channel->peers+peer;
|
||||
struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
|
||||
struct ncclConnector* connector = type < p2pProxyRecv ? (type == proxyRecv ? &peerComm->recv : &peerComm->send)
|
||||
: (type == p2pProxyRecv ? &peerComm->p2pRecv : &peerComm->p2pSend);
|
||||
if (connector->transportComm == NULL) {
|
||||
WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank,
|
||||
type == proxyRecv ? "recv" : "send", peer, args->channel->id);
|
||||
type < p2pProxyRecv ? (type == proxyRecv ? "recv" : "send") : (type == p2pProxyRecv ? "p2pRecv" : "p2pSend"),
|
||||
peer, args->channel->id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (connector->transportComm->proxy == NULL) return ncclSuccess;
|
||||
@@ -238,7 +240,7 @@ ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel
|
||||
if (args.nsteps == 0) args.nsteps = 1;
|
||||
args.recvbytes = info->recvbytes;
|
||||
args.sendbytes = 0;
|
||||
NCCLCHECK(SaveProxy(proxyRecv, peerrecv, &args));
|
||||
NCCLCHECK(SaveProxy(LOAD(info->comm->p2pNet) ? p2pProxyRecv : proxyRecv, peerrecv, &args));
|
||||
}
|
||||
if (info->delta > 0 && info->sendbytes >= 0) {
|
||||
int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
|
||||
@@ -246,7 +248,7 @@ ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel
|
||||
if (args.nsteps == 0) args.nsteps = 1;
|
||||
args.sendbytes = info->sendbytes;
|
||||
args.recvbytes = 0;
|
||||
NCCLCHECK(SaveProxy(proxySend, peersend, &args));
|
||||
NCCLCHECK(SaveProxy(LOAD(info->comm->p2pNet) ? p2pProxySend : proxySend, peersend, &args));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "comm.h"
|
||||
#include "info.h"
|
||||
#include "bootstrap.h"
|
||||
#include "../graph/topo.h"
|
||||
|
||||
extern struct ncclTransport p2pTransport;
|
||||
extern struct ncclTransport shmTransport;
|
||||
@@ -19,6 +20,34 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
|
||||
netTransport,
|
||||
};
|
||||
|
||||
static ncclResult_t connectedByXGMI(int* ret, struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 0;
|
||||
if (info1->hostHash != info2->hostHash) return ncclSuccess;
|
||||
int g1, g2;
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, info1->rank, &g1));
|
||||
NCCLCHECK(ncclTopoRankToIndex(system, info2->rank, &g2));
|
||||
if (system->nodes[GPU].nodes[g1].paths[GPU][g2].type == PATH_NVL) *ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <int type>
|
||||
static ncclResult_t selectTransportN(struct ncclComm* comm, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId, int n) {
|
||||
for (int t=n; t<NTRANSPORTS; t++) {
|
||||
if (t == TRANSPORT_SHM) continue;
|
||||
struct ncclTransport *transport = ncclTransports+t;
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
int ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, comm->topo, NULL, myInfo, peerInfo));
|
||||
if (ret) {
|
||||
connector->transportComm = transportComm;
|
||||
NCCLCHECK(transportComm->setup(comm, NULL, myInfo, peerInfo, connect, connector, channelId));
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("No transport found !");
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
template <int type>
|
||||
static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
|
||||
for (int t=0; t<NTRANSPORTS; t++) {
|
||||
@@ -63,6 +92,7 @@ void dumpData(struct ncclConnect* data, int ndata) {
|
||||
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) {
|
||||
struct ncclConnect data[2*MAXCHANNELS];
|
||||
uint32_t p2pNet = LOAD(comm->p2pNet);
|
||||
for (int i=1; i<comm->nRanks; i++) {
|
||||
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
|
||||
int sendPeer = (comm->rank + i) % comm->nRanks;
|
||||
@@ -73,15 +103,41 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
int sendChannels = 0, recvChannels = 0;
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
if (recvMask & (1<<c)) {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
|
||||
NCCLCHECK(selectTransport<0>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c));
|
||||
int xgmi = 0;
|
||||
if (p2pNet && graph == NULL) {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].p2pRecv;
|
||||
NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer));
|
||||
if (xgmi) {
|
||||
NCCLCHECK(selectTransportN<0>(comm, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c, TRANSPORT_P2P));
|
||||
}
|
||||
else {
|
||||
NCCLCHECK(selectTransportN<0>(comm, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c, TRANSPORT_NET));
|
||||
}
|
||||
}
|
||||
else {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
|
||||
NCCLCHECK(selectTransport<0>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c));
|
||||
}
|
||||
}
|
||||
}
|
||||
struct ncclConnect* sendData = recvData+recvChannels;
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
if (sendMask & (1<<c)) {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
|
||||
NCCLCHECK(selectTransport<1>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c));
|
||||
int xgmi = 0;
|
||||
if (p2pNet && graph == NULL) {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].p2pSend;
|
||||
NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer));
|
||||
if (xgmi) {
|
||||
NCCLCHECK(selectTransportN<1>(comm, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c, TRANSPORT_P2P));
|
||||
}
|
||||
else {
|
||||
NCCLCHECK(selectTransportN<1>(comm, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c, TRANSPORT_NET));
|
||||
}
|
||||
}
|
||||
else {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
|
||||
NCCLCHECK(selectTransport<1>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,18 +157,22 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
if (sendMask & (1<<c)) {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send;
|
||||
struct ncclConnector* conn = (p2pNet && graph == NULL) ? &comm->channels[c].peers[sendPeer].p2pSend
|
||||
: &comm->channels[c].peers[sendPeer].send;
|
||||
NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
if (p2pNet && graph == NULL) CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].p2pSend, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
else CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
}
|
||||
}
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
if (recvMask & (1<<c)) {
|
||||
struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv;
|
||||
struct ncclConnector* conn = (p2pNet && graph == NULL) ? &comm->channels[c].peers[recvPeer].p2pRecv
|
||||
: &comm->channels[c].peers[recvPeer].recv;
|
||||
NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
if (p2pNet && graph == NULL) CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].p2pRecv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
else CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice));
|
||||
}
|
||||
}
|
||||
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
|
||||
|
||||
@@ -207,6 +207,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
|
||||
|
||||
// Topo detection / System graph creation
|
||||
//NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
|
||||
comm->topo->nRanks = comm->nRanks;
|
||||
// Compute paths between GPUs and NICs
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
|
||||
// Remove inaccessible GPUs and unused NICs
|
||||
|
||||
مرجع در شماره جدید
Block a user