Select sendrecv path based on collective data size (#391)

* Select sendrecv path based on collective data size

* Add comments on packing and unpacking group field

* Toggling RCCL_P2P_NET_DISABLE in combined calls unit tests
Этот коммит содержится в:
Wenkai Du
2021-06-10 17:51:04 -07:00
коммит произвёл GitHub
родитель f6f5e16fe6
Коммит 6dcae8a459
17 изменённых файлов: 102 добавлений и 45 удалений
+1 -1
Просмотреть файл
@@ -31,7 +31,7 @@ class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+1 -1
Просмотреть файл
@@ -37,7 +37,7 @@ class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
+1 -1
Просмотреть файл
@@ -33,7 +33,7 @@ class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+11 -6
Просмотреть файл
@@ -49,6 +49,13 @@
#define ROLE_POST_SEND 0x10
#define ROLE_POST_RECV 0x20
// Connection index is used to select P2P and NET and needs to be passed into ncclPrimitives constructor.
// To avoid adding another parameter which requires changes to every places ncclPrimitives are constructed,
// we pack group (max 7) and connection index (max 2) to original group which is 32-bit.
#define PACK_GROUP(gr, idx) (gr | (idx<<16))
#define TO_GR(group) (group&0xffff)
#define TO_IDX(group) (group>>16)
// Implementation of primitive types
template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
class ncclPrimitives {
@@ -75,7 +82,7 @@ class ncclPrimitives {
T* direct = NULL;
T* buff;
struct ncclDevComm* comm;
const int p2pNet;
const int connIndex;
const T** srcs;
T** dsts;
@@ -247,8 +254,6 @@ class ncclPrimitives {
__device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
// For oneshot: groups 0,1 use conn 0, groups 2,3 use conn 1
const int connIndex = (NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? group/2 :
((p2pNet && (NSEND+NRECV) == 1 ? NCCL_CONN_IDX_P2P_NET : ((NSEND+NRECV) == 1 ? 0 : group)));
conn = &channel->devPeers[peer].recv[connIndex].conn;
step = conn->step;
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
@@ -273,8 +278,6 @@ class ncclPrimitives {
__device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) {
// For oneshot: groups 0,1 use conn 0, groups 2,3 use conn 1
const int connIndex = (NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? group/2 :
((p2pNet && (NSEND+NRECV) == 1 ? NCCL_CONN_IDX_P2P_NET : ((NSEND+NRECV) == 1 ? 0 : group)));
conn = &channel->devPeers[peer].send[connIndex].conn;
step = conn->step;
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
@@ -308,7 +311,9 @@ class ncclPrimitives {
public:
__device__ __forceinline__
ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next), p2pNet(*comm->p2pNet) {
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[TO_GR(group)].srcs), dsts((T**)ptrs[TO_GR(group)].dsts),
group(TO_GR(group)), barriers(&ptrs[TO_GR(group)].barrier), barrier_next(ptrs[TO_GR(group)].barrier_next),
connIndex((NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? TO_GR(group)/2 : TO_IDX(group)) {
nthreads = nworkers;
// For send operations, we need an extra warp to overlap the threadfence and the copy
// int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
+1 -1
Просмотреть файл
@@ -34,7 +34,7 @@ class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, U
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+1 -1
Просмотреть файл
@@ -31,7 +31,7 @@ class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUN
T * __restrict__ thisOutput = (T*)args->recvbuff;
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+2 -2
Просмотреть файл
@@ -57,7 +57,7 @@ class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T,
int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
int nt = nThreadsSplit;
ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(groupRecv, args->p2p.recvIdx));
if (recvCount == 0) {
prims.recv(recvbuff, 0);
@@ -73,7 +73,7 @@ class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T,
int peer = (comm->rank+delta)%comm->nRanks;
int nt = nThreads-nThreadsSplit;
ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(groupSend, args->p2p.sendIdx));
if (sendCount == 0) {
prims.send(sendbuff, 0);
+18 -4
Просмотреть файл
@@ -701,9 +701,13 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P].connected == 0) { // P2P uses only 1 connector
if (comm->channels[channelId].peers[peer].send[0].connected == 0) {
comm->connectSend[peer] |= (1<<channelId);
comm->connect = 1;
comm->connect[0] = 1;
}
if (comm->p2pNet && comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1<<channelId);
comm->connect[NCCL_CONN_IDX_P2P_NET] = 1;
}
}
}
@@ -714,9 +718,13 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P].connected == 0) { // P2P uses only 1 connector
if (comm->channels[channelId].peers[peer].recv[0].connected == 0) {
comm->connectRecv[peer] |= (1<<channelId);
comm->connect = 1;
comm->connect[0] = 1;
}
if (comm->p2pNet && comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1<<channelId);
comm->connect[NCCL_CONN_IDX_P2P_NET] = 1;
}
}
}
@@ -792,10 +800,16 @@ ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
// Compute cuda kernel arg and proxy arg templates
struct ncclQueueElem* eqElem;
NCCLCHECK(ncclAddQueueElem(comm->enqueueInfo, &eqElem));
// The proxy code will set and tune the send/recv chunk size, make sure to run it first.
NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyArgs));
NCCLCHECK(computeP2pWorkElem(info, &eqElem->work));
eqElem->proxyArgs.sendIdx = info->sendIdx;
eqElem->proxyArgs.recvIdx = info->recvIdx;
eqElem->work.p2p.sendIdx = info->sendIdx;
eqElem->work.p2p.recvIdx = info->recvIdx;
int channelId = info->channelId;
hipLaunchParams* params = comm->myParams;
params->gridDim.x = std::max<unsigned>(params->gridDim.x, channelId+1);
+41 -6
Просмотреть файл
@@ -36,6 +36,7 @@ struct ncclInitArgs {
};
struct ncclCollArgs {
ncclComm_t comm;
uint16_t connIndex;
};
enum ncclAsyncFuncType {
@@ -118,7 +119,8 @@ ncclResult_t ncclGroupStart() {
return ncclSuccess;
}
static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes,
void* recvbuff, ssize_t sendbytes, const void* sendbuff, uint16_t sendIdx, uint16_t recvIdx) {
struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
1, 1 };
@@ -126,6 +128,8 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann
info.channelId = channelId;
info.sendbytes = sendbytes;
info.recvbytes = recvbytes;
info.sendIdx = sendIdx;
info.recvIdx = recvIdx;
if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
NCCLCHECK(ncclSetupP2pKernel(&info));
return ncclSuccess;
@@ -135,7 +139,7 @@ void* ncclAsyncThreadPreconnect(void* args_) {
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
struct ncclComm* comm = args->coll.comm;
CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, NCCL_CONN_IDX_P2P));
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, args->coll.connIndex));
return args;
}
@@ -150,6 +154,8 @@ static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels
return size;
}
RCCL_PARAM(P2pNetThreshold, "RCCL_P2P_NET_THRESHOLD", 131072);
NCCL_API(ncclResult_t, ncclGroupEnd);
ncclResult_t ncclGroupEnd() {
NVTX3_FUNC_RANGE_IN(nccl_domain);
@@ -195,14 +201,15 @@ ncclResult_t ncclGroupEnd() {
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[0]) {
args->coll.connIndex = 0;
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[0]) {
int err = pthread_join(ncclGroupThreads[i], NULL);
if (err != 0) {
WARN("Error waiting for pthread_join : %s", strerror(errno));
@@ -210,7 +217,29 @@ ncclResult_t ncclGroupEnd() {
}
INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
NCCLCHECKGOTO(args->ret, ret, end);
args->coll.comm->connect = 0;
args->coll.comm->connect[0] = 0;
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
args->coll.connIndex = NCCL_CONN_IDX_P2P_NET;
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
int err = pthread_join(ncclGroupThreads[i], NULL);
if (err != 0) {
WARN("Error waiting for pthread_join : %s", strerror(errno));
return ncclSystemError;
}
INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P NET preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
NCCLCHECKGOTO(args->ret, ret, end);
args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET] = 0;
}
}
@@ -253,6 +282,12 @@ sched_delta:
ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
uint16_t sendIdx = 0, recvIdx = 0;
if(comm->p2pNet && totSendBytes > rcclParamP2pNetThreshold())
sendIdx = NCCL_CONN_IDX_P2P_NET;
if(comm->p2pNet && totRecvBytes > rcclParamP2pNetThreshold())
recvIdx = NCCL_CONN_IDX_P2P_NET;
ssize_t sendOffset = 0;
ssize_t recvOffset = 0;
int sendRemaining = 1, recvRemaining = 1;
@@ -270,7 +305,7 @@ sched_delta:
if (sendbytes >= 0 || recvbytes >= 0) {
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL, sendIdx, recvIdx), ret, group_cleanup);
}
recvOffset += recvChunkSize;
sendOffset += sendChunkSize;
+2 -2
Просмотреть файл
@@ -74,7 +74,7 @@ struct ncclComm {
void* bootstrap;
// Bitmasks for ncclTransportP2pSetup
int connect;
int connect[NCCL_MAX_CONNS];
uint32_t* connectSend;
uint32_t* connectRecv;
@@ -128,7 +128,7 @@ struct ncclComm {
volatile uint32_t *abortFlag;
// Flags for enable P2P NET
uint32_t *p2pNet;
uint32_t p2pNet;
uint32_t useIntraNet;
// Device side of the communicator
+8 -5
Просмотреть файл
@@ -163,7 +163,6 @@ struct ncclDirect {
int down[NCCL_MAX_DIRECT_ARITY];
};
#define NCCL_CONN_IDX_P2P (*(comm->p2pNet)*2)
#define NCCL_CONN_IDX_P2P_NET 2
#define NCCL_MAX_CONNS 3
struct ncclPeer {
@@ -208,7 +207,14 @@ struct ncclWorkElem {
int sendChunkSize;
int recvChunkSize;
int32_t delta;
uint16_t nThreads;
union {
struct {
uint16_t nThreads:12;
uint16_t sendIdx:2;
uint16_t recvIdx:2;
};
uint16_t padding;
};
} p2p;
struct {
uint16_t padding[15];
@@ -357,9 +363,6 @@ struct ncclDevComm {
// Channels, device side
struct ncclChannel* channels;
// Flags for enable P2P NET
uint32_t *p2pNet;
#ifdef ENABLE_PROFILING
// Profiling counters
struct ncclProf* devProf;
+2
Просмотреть файл
@@ -53,6 +53,8 @@ struct ncclInfo {
int sendChunkSize;
uint32_t delta;
int channelId;
uint16_t sendIdx;
uint16_t recvIdx;
};
#endif
+2
Просмотреть файл
@@ -61,6 +61,8 @@ struct ncclProxyArgs {
int idle;
uint64_t hdp_flushed;
uint8_t connIndex;
uint8_t sendIdx;
uint8_t recvIdx;
// Element linking
pthread_mutex_t mutex;
+3 -7
Просмотреть файл
@@ -377,7 +377,6 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->intraCC);
}
NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
NCCLCHECK(ncclCudaHostFree((void *)comm->p2pNet));
// Poison comm to try and catch a double free
commPoison(comm);
@@ -430,9 +429,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
comm->hostDevComm.abortFlag = comm->abortFlag;
STORE(comm->abortFlag, 0);
NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->p2pNet, 1));
comm->hostDevComm.p2pNet = comm->p2pNet;
STORE(comm->p2pNet, 0);
comm->collOpCount = 0;
comm->p2pOpCount = 0x8000;
@@ -466,8 +462,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks*NCCL_MAX_CONNS));
NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks*NCCL_MAX_CONNS));
comm->p2pSendCount = comm->p2pRecvCount = 0;
NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
@@ -837,7 +833,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
if (rcclParamP2pNetDisable() == 0) {
STORE(comm->p2pNet, 1);
comm->p2pNet = 1;
INFO(NCCL_INIT, "RCCL enabled same node P2P over network");
}
else
+2 -2
Просмотреть файл
@@ -287,7 +287,7 @@ ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args)
sub->sendbytes = 0;
sub->nsteps = DIVUP(sub->recvbytes, sub->recvChunkSize);
if (sub->nsteps == 0) sub->nsteps = 1;
NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, NCCL_CONN_IDX_P2P));
NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, args->recvIdx));
}
if (sub->delta > 0 && sendbytesOrig >= ssize_t(0)) {
int peersend = (comm->rank+sub->delta)%comm->nRanks;
@@ -295,7 +295,7 @@ ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args)
sub->recvbytes = 0;
sub->nsteps = DIVUP(sub->sendbytes, sub->sendChunkSize);
if (sub->nsteps == 0) sub->nsteps = 1;
NCCLCHECK(SaveProxy(proxySend, peersend, args, NCCL_CONN_IDX_P2P));
NCCLCHECK(SaveProxy(proxySend, peersend, args, args->sendIdx));
}
// Reset proxy args for potentially multiple cuda graph launches
// It is safe as long as SaveProxy copies contents of args to op
+5 -5
Просмотреть файл
@@ -69,12 +69,12 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel*
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
comm->connectRecv[peer] |= mask;
comm->connectRecv[peer+comm->nRanks*connIndex] |= mask;
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
comm->connectSend[peer] |= mask;
comm->connectSend[peer+comm->nRanks*connIndex] |= mask;
}
return ncclSuccess;
}
@@ -98,8 +98,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint32_t recvMask = comm->connectRecv[recvPeer];
uint32_t sendMask = comm->connectSend[sendPeer];
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*connIndex];
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*connIndex];
struct ncclConnect* recvData = data;
int sendChannels = 0, recvChannels = 0;
@@ -145,7 +145,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
}
}
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
}
CUDACHECK(hipStreamSynchronize(transportSetupStream));
CUDACHECK(hipStreamDestroy(transportSetupStream));
+1 -1
Просмотреть файл
@@ -122,6 +122,6 @@ namespace CorrectnessTests
testing::Values(2,3,4,5,6,7,8),
// In-place or not
testing::Values(false),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")),
CorrectnessTest::PrintToStringParamName());
} // namespace