Select sendrecv path based on collective data size (#391)
* Select sendrecv path based on collective data size * Add comments on packing and unpacking group field * Toggling RCCL_P2P_NET_DISABLE in combined calls unit tests
Этот коммит содержится в:
@@ -31,7 +31,7 @@ class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
|
||||
@@ -37,7 +37,7 @@ class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
|
||||
ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
|
||||
|
||||
@@ -33,7 +33,7 @@ class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
|
||||
@@ -49,6 +49,13 @@
|
||||
#define ROLE_POST_SEND 0x10
|
||||
#define ROLE_POST_RECV 0x20
|
||||
|
||||
// Connection index is used to select P2P and NET and needs to be passed into ncclPrimitives constructor.
|
||||
// To avoid adding another parameter which requires changes to every places ncclPrimitives are constructed,
|
||||
// we pack group (max 7) and connection index (max 2) to original group which is 32-bit.
|
||||
#define PACK_GROUP(gr, idx) (gr | (idx<<16))
|
||||
#define TO_GR(group) (group&0xffff)
|
||||
#define TO_IDX(group) (group>>16)
|
||||
|
||||
// Implementation of primitive types
|
||||
template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
|
||||
class ncclPrimitives {
|
||||
@@ -75,7 +82,7 @@ class ncclPrimitives {
|
||||
T* direct = NULL;
|
||||
T* buff;
|
||||
struct ncclDevComm* comm;
|
||||
const int p2pNet;
|
||||
const int connIndex;
|
||||
|
||||
const T** srcs;
|
||||
T** dsts;
|
||||
@@ -247,8 +254,6 @@ class ncclPrimitives {
|
||||
__device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
|
||||
if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
|
||||
// For oneshot: groups 0,1 use conn 0, groups 2,3 use conn 1
|
||||
const int connIndex = (NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? group/2 :
|
||||
((p2pNet && (NSEND+NRECV) == 1 ? NCCL_CONN_IDX_P2P_NET : ((NSEND+NRECV) == 1 ? 0 : group)));
|
||||
conn = &channel->devPeers[peer].recv[connIndex].conn;
|
||||
step = conn->step;
|
||||
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
|
||||
@@ -273,8 +278,6 @@ class ncclPrimitives {
|
||||
__device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
|
||||
if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) {
|
||||
// For oneshot: groups 0,1 use conn 0, groups 2,3 use conn 1
|
||||
const int connIndex = (NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? group/2 :
|
||||
((p2pNet && (NSEND+NRECV) == 1 ? NCCL_CONN_IDX_P2P_NET : ((NSEND+NRECV) == 1 ? 0 : group)));
|
||||
conn = &channel->devPeers[peer].send[connIndex].conn;
|
||||
step = conn->step;
|
||||
step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
|
||||
@@ -308,7 +311,9 @@ class ncclPrimitives {
|
||||
public:
|
||||
__device__ __forceinline__
|
||||
ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
|
||||
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next), p2pNet(*comm->p2pNet) {
|
||||
: comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[TO_GR(group)].srcs), dsts((T**)ptrs[TO_GR(group)].dsts),
|
||||
group(TO_GR(group)), barriers(&ptrs[TO_GR(group)].barrier), barrier_next(ptrs[TO_GR(group)].barrier_next),
|
||||
connIndex((NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? TO_GR(group)/2 : TO_IDX(group)) {
|
||||
nthreads = nworkers;
|
||||
// For send operations, we need an extra warp to overlap the threadfence and the copy
|
||||
// int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
|
||||
|
||||
@@ -34,7 +34,7 @@ class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, U
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
|
||||
@@ -31,7 +31,7 @@ class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUN
|
||||
T * __restrict__ thisOutput = (T*)args->recvbuff;
|
||||
|
||||
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
|
||||
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));
|
||||
|
||||
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
|
||||
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
|
||||
|
||||
@@ -57,7 +57,7 @@ class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T,
|
||||
int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
|
||||
int nt = nThreadsSplit;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
|
||||
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
|
||||
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(groupRecv, args->p2p.recvIdx));
|
||||
|
||||
if (recvCount == 0) {
|
||||
prims.recv(recvbuff, 0);
|
||||
@@ -73,7 +73,7 @@ class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T,
|
||||
int peer = (comm->rank+delta)%comm->nRanks;
|
||||
int nt = nThreads-nThreadsSplit;
|
||||
ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
|
||||
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
|
||||
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(groupSend, args->p2p.sendIdx));
|
||||
|
||||
if (sendCount == 0) {
|
||||
prims.send(sendbuff, 0);
|
||||
|
||||
+18
-4
@@ -701,9 +701,13 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P].connected == 0) { // P2P uses only 1 connector
|
||||
if (comm->channels[channelId].peers[peer].send[0].connected == 0) {
|
||||
comm->connectSend[peer] |= (1<<channelId);
|
||||
comm->connect = 1;
|
||||
comm->connect[0] = 1;
|
||||
}
|
||||
if (comm->p2pNet && comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
|
||||
comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1<<channelId);
|
||||
comm->connect[NCCL_CONN_IDX_P2P_NET] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -714,9 +718,13 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
||||
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P].connected == 0) { // P2P uses only 1 connector
|
||||
if (comm->channels[channelId].peers[peer].recv[0].connected == 0) {
|
||||
comm->connectRecv[peer] |= (1<<channelId);
|
||||
comm->connect = 1;
|
||||
comm->connect[0] = 1;
|
||||
}
|
||||
if (comm->p2pNet && comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
|
||||
comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1<<channelId);
|
||||
comm->connect[NCCL_CONN_IDX_P2P_NET] = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -792,10 +800,16 @@ ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
|
||||
// Compute cuda kernel arg and proxy arg templates
|
||||
struct ncclQueueElem* eqElem;
|
||||
NCCLCHECK(ncclAddQueueElem(comm->enqueueInfo, &eqElem));
|
||||
|
||||
// The proxy code will set and tune the send/recv chunk size, make sure to run it first.
|
||||
NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyArgs));
|
||||
NCCLCHECK(computeP2pWorkElem(info, &eqElem->work));
|
||||
|
||||
eqElem->proxyArgs.sendIdx = info->sendIdx;
|
||||
eqElem->proxyArgs.recvIdx = info->recvIdx;
|
||||
eqElem->work.p2p.sendIdx = info->sendIdx;
|
||||
eqElem->work.p2p.recvIdx = info->recvIdx;
|
||||
|
||||
int channelId = info->channelId;
|
||||
hipLaunchParams* params = comm->myParams;
|
||||
params->gridDim.x = std::max<unsigned>(params->gridDim.x, channelId+1);
|
||||
|
||||
+41
-6
@@ -36,6 +36,7 @@ struct ncclInitArgs {
|
||||
};
|
||||
struct ncclCollArgs {
|
||||
ncclComm_t comm;
|
||||
uint16_t connIndex;
|
||||
};
|
||||
|
||||
enum ncclAsyncFuncType {
|
||||
@@ -118,7 +119,8 @@ ncclResult_t ncclGroupStart() {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
|
||||
static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes,
|
||||
void* recvbuff, ssize_t sendbytes, const void* sendbuff, uint16_t sendIdx, uint16_t recvIdx) {
|
||||
struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
|
||||
sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
|
||||
1, 1 };
|
||||
@@ -126,6 +128,8 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann
|
||||
info.channelId = channelId;
|
||||
info.sendbytes = sendbytes;
|
||||
info.recvbytes = recvbytes;
|
||||
info.sendIdx = sendIdx;
|
||||
info.recvIdx = recvIdx;
|
||||
if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
|
||||
NCCLCHECK(ncclSetupP2pKernel(&info));
|
||||
return ncclSuccess;
|
||||
@@ -135,7 +139,7 @@ void* ncclAsyncThreadPreconnect(void* args_) {
|
||||
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
|
||||
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, NCCL_CONN_IDX_P2P));
|
||||
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, args->coll.connIndex));
|
||||
return args;
|
||||
}
|
||||
|
||||
@@ -150,6 +154,8 @@ static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels
|
||||
return size;
|
||||
}
|
||||
|
||||
RCCL_PARAM(P2pNetThreshold, "RCCL_P2P_NET_THRESHOLD", 131072);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupEnd);
|
||||
ncclResult_t ncclGroupEnd() {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
@@ -195,14 +201,15 @@ ncclResult_t ncclGroupEnd() {
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[0]) {
|
||||
args->coll.connIndex = 0;
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[0]) {
|
||||
int err = pthread_join(ncclGroupThreads[i], NULL);
|
||||
if (err != 0) {
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
@@ -210,7 +217,29 @@ ncclResult_t ncclGroupEnd() {
|
||||
}
|
||||
INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
|
||||
NCCLCHECKGOTO(args->ret, ret, end);
|
||||
args->coll.comm->connect = 0;
|
||||
args->coll.comm->connect[0] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
|
||||
args->coll.connIndex = NCCL_CONN_IDX_P2P_NET;
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
|
||||
int err = pthread_join(ncclGroupThreads[i], NULL);
|
||||
if (err != 0) {
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P NET preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
|
||||
NCCLCHECKGOTO(args->ret, ret, end);
|
||||
args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -253,6 +282,12 @@ sched_delta:
|
||||
ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
|
||||
ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
|
||||
|
||||
uint16_t sendIdx = 0, recvIdx = 0;
|
||||
if(comm->p2pNet && totSendBytes > rcclParamP2pNetThreshold())
|
||||
sendIdx = NCCL_CONN_IDX_P2P_NET;
|
||||
if(comm->p2pNet && totRecvBytes > rcclParamP2pNetThreshold())
|
||||
recvIdx = NCCL_CONN_IDX_P2P_NET;
|
||||
|
||||
ssize_t sendOffset = 0;
|
||||
ssize_t recvOffset = 0;
|
||||
int sendRemaining = 1, recvRemaining = 1;
|
||||
@@ -270,7 +305,7 @@ sched_delta:
|
||||
if (sendbytes >= 0 || recvbytes >= 0) {
|
||||
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
|
||||
recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
|
||||
sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
|
||||
sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL, sendIdx, recvIdx), ret, group_cleanup);
|
||||
}
|
||||
recvOffset += recvChunkSize;
|
||||
sendOffset += sendChunkSize;
|
||||
|
||||
@@ -74,7 +74,7 @@ struct ncclComm {
|
||||
|
||||
void* bootstrap;
|
||||
// Bitmasks for ncclTransportP2pSetup
|
||||
int connect;
|
||||
int connect[NCCL_MAX_CONNS];
|
||||
uint32_t* connectSend;
|
||||
uint32_t* connectRecv;
|
||||
|
||||
@@ -128,7 +128,7 @@ struct ncclComm {
|
||||
volatile uint32_t *abortFlag;
|
||||
|
||||
// Flags for enable P2P NET
|
||||
uint32_t *p2pNet;
|
||||
uint32_t p2pNet;
|
||||
uint32_t useIntraNet;
|
||||
|
||||
// Device side of the communicator
|
||||
|
||||
@@ -163,7 +163,6 @@ struct ncclDirect {
|
||||
int down[NCCL_MAX_DIRECT_ARITY];
|
||||
};
|
||||
|
||||
#define NCCL_CONN_IDX_P2P (*(comm->p2pNet)*2)
|
||||
#define NCCL_CONN_IDX_P2P_NET 2
|
||||
#define NCCL_MAX_CONNS 3
|
||||
struct ncclPeer {
|
||||
@@ -208,7 +207,14 @@ struct ncclWorkElem {
|
||||
int sendChunkSize;
|
||||
int recvChunkSize;
|
||||
int32_t delta;
|
||||
uint16_t nThreads;
|
||||
union {
|
||||
struct {
|
||||
uint16_t nThreads:12;
|
||||
uint16_t sendIdx:2;
|
||||
uint16_t recvIdx:2;
|
||||
};
|
||||
uint16_t padding;
|
||||
};
|
||||
} p2p;
|
||||
struct {
|
||||
uint16_t padding[15];
|
||||
@@ -357,9 +363,6 @@ struct ncclDevComm {
|
||||
// Channels, device side
|
||||
struct ncclChannel* channels;
|
||||
|
||||
// Flags for enable P2P NET
|
||||
uint32_t *p2pNet;
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
// Profiling counters
|
||||
struct ncclProf* devProf;
|
||||
|
||||
@@ -53,6 +53,8 @@ struct ncclInfo {
|
||||
int sendChunkSize;
|
||||
uint32_t delta;
|
||||
int channelId;
|
||||
uint16_t sendIdx;
|
||||
uint16_t recvIdx;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -61,6 +61,8 @@ struct ncclProxyArgs {
|
||||
int idle;
|
||||
uint64_t hdp_flushed;
|
||||
uint8_t connIndex;
|
||||
uint8_t sendIdx;
|
||||
uint8_t recvIdx;
|
||||
|
||||
// Element linking
|
||||
pthread_mutex_t mutex;
|
||||
|
||||
+3
-7
@@ -377,7 +377,6 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
free(comm->intraCC);
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
|
||||
NCCLCHECK(ncclCudaHostFree((void *)comm->p2pNet));
|
||||
|
||||
// Poison comm to try and catch a double free
|
||||
commPoison(comm);
|
||||
@@ -430,9 +429,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
comm->hostDevComm.abortFlag = comm->abortFlag;
|
||||
STORE(comm->abortFlag, 0);
|
||||
|
||||
NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->p2pNet, 1));
|
||||
comm->hostDevComm.p2pNet = comm->p2pNet;
|
||||
STORE(comm->p2pNet, 0);
|
||||
comm->collOpCount = 0;
|
||||
comm->p2pOpCount = 0x8000;
|
||||
|
||||
@@ -466,8 +462,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
|
||||
static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
|
||||
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks*NCCL_MAX_CONNS));
|
||||
NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks*NCCL_MAX_CONNS));
|
||||
|
||||
comm->p2pSendCount = comm->p2pRecvCount = 0;
|
||||
NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
|
||||
@@ -837,7 +833,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
|
||||
if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
|
||||
if (rcclParamP2pNetDisable() == 0) {
|
||||
STORE(comm->p2pNet, 1);
|
||||
comm->p2pNet = 1;
|
||||
INFO(NCCL_INIT, "RCCL enabled same node P2P over network");
|
||||
}
|
||||
else
|
||||
|
||||
+2
-2
@@ -287,7 +287,7 @@ ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args)
|
||||
sub->sendbytes = 0;
|
||||
sub->nsteps = DIVUP(sub->recvbytes, sub->recvChunkSize);
|
||||
if (sub->nsteps == 0) sub->nsteps = 1;
|
||||
NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, NCCL_CONN_IDX_P2P));
|
||||
NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, args->recvIdx));
|
||||
}
|
||||
if (sub->delta > 0 && sendbytesOrig >= ssize_t(0)) {
|
||||
int peersend = (comm->rank+sub->delta)%comm->nRanks;
|
||||
@@ -295,7 +295,7 @@ ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args)
|
||||
sub->recvbytes = 0;
|
||||
sub->nsteps = DIVUP(sub->sendbytes, sub->sendChunkSize);
|
||||
if (sub->nsteps == 0) sub->nsteps = 1;
|
||||
NCCLCHECK(SaveProxy(proxySend, peersend, args, NCCL_CONN_IDX_P2P));
|
||||
NCCLCHECK(SaveProxy(proxySend, peersend, args, args->sendIdx));
|
||||
}
|
||||
// Reset proxy args for potentially multiple cuda graph launches
|
||||
// It is safe as long as SaveProxy copies contents of args to op
|
||||
|
||||
@@ -69,12 +69,12 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel*
|
||||
for (int i=0; i<nrecv; i++) {
|
||||
int peer = peerRecv[i];
|
||||
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
|
||||
comm->connectRecv[peer] |= mask;
|
||||
comm->connectRecv[peer+comm->nRanks*connIndex] |= mask;
|
||||
}
|
||||
for (int i=0; i<nsend; i++) {
|
||||
int peer = peerSend[i];
|
||||
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
|
||||
comm->connectSend[peer] |= mask;
|
||||
comm->connectSend[peer+comm->nRanks*connIndex] |= mask;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -98,8 +98,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
|
||||
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
|
||||
int sendPeer = (comm->rank + i) % comm->nRanks;
|
||||
uint32_t recvMask = comm->connectRecv[recvPeer];
|
||||
uint32_t sendMask = comm->connectSend[sendPeer];
|
||||
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*connIndex];
|
||||
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*connIndex];
|
||||
|
||||
struct ncclConnect* recvData = data;
|
||||
int sendChannels = 0, recvChannels = 0;
|
||||
@@ -145,7 +145,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
|
||||
}
|
||||
}
|
||||
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
|
||||
comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
|
||||
}
|
||||
CUDACHECK(hipStreamSynchronize(transportSetupStream));
|
||||
CUDACHECK(hipStreamDestroy(transportSetupStream));
|
||||
|
||||
@@ -122,6 +122,6 @@ namespace CorrectnessTests
|
||||
testing::Values(2,3,4,5,6,7,8),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
|
||||
testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")),
|
||||
CorrectnessTest::PrintToStringParamName());
|
||||
} // namespace
|
||||
|
||||
Ссылка в новой задаче
Block a user