diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a80d1a8af..7b24d0abcf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,8 +103,6 @@ set(CU_SOURCES src/collectives/device/broadcast.cu src/collectives/device/reduce_scatter.cu src/collectives/device/sendrecv.cu - src/collectives/device/all_to_all.cu - src/collectives/device/all_to_allv.cu src/collectives/device/functions.cu) set(CPP_SOURCES) diff --git a/src/channel.cc b/src/channel.cc index 635f07869b..12cc44238d 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -29,14 +29,12 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) { // Per-channel operation list. NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS)); - NCCLCHECK(ncclCudaHostCalloc(&channel->a2avParams, comm->nRanks*NCCL_MAX_OPS*4)); return ncclSuccess; } ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { if (channel->id == -1) return ncclSuccess; // Operation list - NCCLCHECK(ncclCudaHostFree(channel->a2avParams)); NCCLCHECK(ncclCudaHostFree(channel->workFifo)); // Free Ring index to rank tables diff --git a/src/collectives/all_to_all_api.cc b/src/collectives/all_to_all_api.cc index 9ecc1c47e9..a58f42e1eb 100644 --- a/src/collectives/all_to_all_api.cc +++ b/src/collectives/all_to_all_api.cc @@ -12,22 +12,15 @@ NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_ ncclComm_t comm, hipStream_t stream); ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) { - if (comm->alltoallDisable) { - int nRanks; - NCCLCHECK(ncclCommCount(comm, &nRanks)); - size_t rankOffset = count * ncclTypeSize(datatype); - if (count == 0) return ncclSuccess; - NCCLCHECK(ncclGroupStart()); - for (int r=0; ralltoallDisable) { - int nRanks; - NCCLCHECK(ncclCommCount(comm, &nRanks)); - NCCLCHECK(ncclGroupStart()); - for (int r=0; r -class ncclFunction { - public: - __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) { - const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int nChannels = args->coll.nChannels; - struct ncclDevComm* comm = args->comm; - struct ncclChannel* channel = comm->channels+blockIdx.x; - struct ncclRing* ring = &channel->ring; - const ssize_t size = args->coll.count; - const int nranks = comm->nRanks; - const int bid = args->coll.bid; - const int rank = ring->devUserRanks[0]; - const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS); - const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS; - const int peersPerChan = DIVUP(nranks, nChannels); - const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize); - - // Compute pointers - const T * __restrict__ thisInput = (const T*)args->sendbuff; - T * __restrict__ thisOutput = (T*)args->recvbuff; - - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - for (int i = 0; i < peersPerChan; i++) { - if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) || - (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks)) - continue; - int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1))); - ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0); - int nelem = min(realChunkSize, size-chunkOffset); - if ((blockIdx.x*peersPerChan+i)%nranks == 0) { - if (tid < nthreads && thisInput != thisOutput) { - const T* sendbuff = thisInput+chunkOffset+rank*size; - T* recvbuff = thisOutput+chunkOffset+rank*size; - // local copy - ReduceOrCopyMulti(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem); - } - } - } - } - - for (int i = 0; i < peersPerChan; i++) { - if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) || - (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks)) - continue; - if ((blockIdx.x*peersPerChan+i)%nranks != 0) { - int nthreadsSplit = nthreads/2; - if (tid < nthreadsSplit ) { - int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks; - ncclPrimitives - prims(tid, nthreadsSplit, NULL, &peerSend, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1))); - ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0); - int nelem = min(realChunkSize, size-chunkOffset); - ssize_t send_offset = chunkOffset + peerSend*size; - prims.send(thisInput+send_offset, nelem); - } - } else { - int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks; - ncclPrimitives - prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, NULL, NULL, stepSize, channel, comm, ncclShmem->ptrs, 1); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1))); - ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0); - int nelem = min(realChunkSize, size-chunkOffset); - ssize_t recv_offset = chunkOffset + peerRecv*size; - prims.recv(thisOutput+recv_offset, nelem); - } - } - } - } - } -}; \ No newline at end of file diff --git a/src/collectives/device/all_to_allv.cu b/src/collectives/device/all_to_allv.cu deleted file mode 100644 index 49d883c530..0000000000 --- a/src/collectives/device/all_to_allv.cu +++ /dev/null @@ -1,11 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "all_to_allv.h" -#include "common.h" -#include "collectives.h" - -IMPL_COLL_FUNC(AllToAllv, RING, SIMPLE, Sum, int8_t); diff --git a/src/collectives/device/all_to_allv.h b/src/collectives/device/all_to_allv.h deleted file mode 100644 index dffa2e1d60..0000000000 --- a/src/collectives/device/all_to_allv.h +++ /dev/null @@ -1,97 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "devcomm.h" -#include "primitives.h" -#include "collectives.h" - -template -class ncclFunction { - public: - __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) { - const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int nChannels = args->a2av.nChannels; - struct ncclDevComm* comm = args->comm; - struct ncclChannel* channel = comm->channels+blockIdx.x; - struct ncclRing* ring = &channel->ring; - const ssize_t typesize = args->a2av.count; - const int nranks = comm->nRanks; - const int bid = args->a2av.bid; - const int rank = ring->devUserRanks[0]; - const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS); - const int chunkSize = stepSize * ALLTOALLV_CHUNKSTEPS; - const int peersPerChan = DIVUP(nranks, nChannels); - const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize); - - // Compute pointers - const T * __restrict__ thisInput = (const T*)args->sendbuff; - T * __restrict__ thisOutput = (T*)args->recvbuff; - - size_t* params = channel->a2avParams + nranks*4*args->index; - size_t *sendcounts = params; - size_t *sdispls = params + nranks; - size_t *recvcounts = params + nranks*2; - size_t *rdispls = params + nranks*3; - ssize_t size = sendcounts[rank]*typesize; - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - for (int i = 0; i < peersPerChan; i++) { - if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) || - (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks)) - continue; - int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1))); - ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0); - int nelem = min(realChunkSize, size-chunkOffset); - if ((blockIdx.x*peersPerChan+i)%nranks == 0) { - if (tid < nthreads && thisInput != thisOutput) { - const T* sendbuff = thisInput+chunkOffset+sdispls[rank]*typesize; - T* recvbuff = thisOutput+chunkOffset+rdispls[rank]*typesize; - // local copy - ReduceOrCopyMulti(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem); - } - } - } - } - - for (int i = 0; i < peersPerChan; i++) { - if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) || - (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks)) - continue; - if ((blockIdx.x*peersPerChan+i)%nranks != 0) { - int nthreadsSplit = nthreads/2; - if (tid < nthreadsSplit ) { - int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks; - ncclPrimitives - prims(tid, nthreadsSplit, NULL, &peerSend, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0); - size = sendcounts[peerSend]*typesize; - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1))); - ALIGN_SIZE(realChunkSize, nthreadsSplit*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0); - int nelem = min(realChunkSize, size-chunkOffset); - ssize_t send_offset = chunkOffset + sdispls[peerSend]*typesize; - prims.send(thisInput+send_offset, nelem); - } - } else { - int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks; - ncclPrimitives - prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, NULL, NULL, stepSize, channel, comm, ncclShmem->ptrs, 1); - size = recvcounts[peerRecv]*typesize; - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1))); - ALIGN_SIZE(realChunkSize, (nthreads-nthreadsSplit)*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0); - int nelem = min(realChunkSize, size-chunkOffset); - ssize_t recv_offset = chunkOffset + rdispls[peerRecv]*typesize; - prims.recv(thisOutput+recv_offset, nelem); - } - } - } - } - } -}; \ No newline at end of file diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index 80fdca2b63..6fd5ea28ae 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -106,9 +106,7 @@ NCCL_FUNCS2B(AllGather), \ NCCL_FUNCS2A(ReduceScatter), \ NCCL_FUNCS2C(AllReduce), \ - NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), \ - NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t), \ - NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t) } + NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t) } // [/RCCL] // Must be consistent with the ncclFuncSet enum @@ -125,8 +123,6 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{ NCCL_FUNCS2A(ReduceScatter), NCCL_FUNCS2C(AllReduce), NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), - NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t), - NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t), #endif }; @@ -176,9 +172,7 @@ void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept { else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(c); } else if (c->funcIndex < 1800) Caller<1080, 1800>::call(c); - else if (c->funcIndex == 1800) ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c); - else if (c->funcIndex == 1801) ncclFunction_AllToAll_RING_SIMPLE_Sum_int8_t(c); - else if (c->funcIndex == 1802) ncclFunction_AllToAllv_RING_SIMPLE_Sum_int8_t(c); + else ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c); } static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) { @@ -213,10 +207,6 @@ class ncclFunction { if (fIdx == FUNC_INDEX_P2P) { \ comm->collTrace[pos].p2p.nThreads = w->p2p.nThreads; \ comm->collTrace[pos].p2p.delta = (uint16_t)(w->p2p.delta); \ - } else if (fIdx == FUNC_INDEX_A2AV) { \ - comm->collTrace[pos].coll.nThreads = w->nThreads; \ - comm->collTrace[pos].coll.bid = w->a2av.bid; \ - comm->collTrace[pos].coll.nChannels = w->a2av.nChannels; \ } else { \ comm->collTrace[pos].coll.nThreads = w->nThreads; \ comm->collTrace[pos].coll.bid = w->coll.bid; \ diff --git a/src/enqueue.cc b/src/enqueue.cc index e331a86b47..002fb20db8 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -280,13 +280,6 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) { struct ncclComm* comm = info->comm; float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete. // Find algorithm / protocol. - if (info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv) { - info->algorithm = NCCL_ALGO_RING; - info->protocol = NCCL_PROTO_SIMPLE; - info->nChannels = comm->nChannels; - info->nThreads = NCCL_MAX_NTHREADS; - return ncclSuccess; - } info->algorithm = -1; info->protocol = -1; int nAlgos = NCCL_NUM_ALGORITHMS; @@ -348,9 +341,6 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) { info->pattern = ncclPatternRing; break; case ncclFuncAllReduce: info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break; - case ncclFuncAllToAll: - case ncclFuncAllToAllv: - info->pattern = ncclPatternAll; break; default: WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm); return ncclInternalError; @@ -372,9 +362,6 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; case ncclPatternRingTwice: info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break; - case ncclPatternAll: - info->nstepsPerLoop = 1; - info->nchunksPerLoop = info->comm->nRanks; break; default: WARN("Unknown pattern %d", info->pattern); return ncclInternalError; @@ -390,21 +377,12 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); - if ((info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv) - && info->comm->topo->nodes[GPU].count == info->comm->topo->nRanks && (info->comm->topo->type & RCCL_TOPO_4P2H_ROME)) - info->nChannels = info->comm->p2pnChannels; - work->opCount = info->comm->opCount; work->sendbuff = info->sendbuff; work->recvbuff = info->recvbuff; - if (info->coll == ncclFuncAllToAllv) { - work->a2av.count = info->count; - work->a2av.nChannels = info->nChannels; - } else { - work->coll.root = info->root; - work->coll.count = info->count; - work->coll.nChannels = info->nChannels; - } + work->coll.root = info->root; + work->coll.count = info->count; + work->coll.nChannels = info->nChannels; work->nThreads = info->nThreads; work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol); @@ -522,7 +500,7 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) { info->comm->myParams->blockDim.x = std::max(info->comm->myParams->blockDim.x, info->nThreads); - int nChannels = (info->coll == ncclFuncAllToAllv) ? work.a2av.nChannels : work.coll.nChannels; + int nChannels = work.coll.nChannels; int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1; for (int bid=0; bidpattern, info->root, info->comm->nRanks)); info->comm->myParams->gridDim.x++; - if (info->coll == ncclFuncAllToAllv) { - work.a2av.bid = bid % work.a2av.nChannels; - } else { - work.coll.bid = bid % nChannels; - } // [RCCL] Setup pointers to where all the input/output pointers will be if (info->protocol == NCCL_PROTO_CLIQUE) { @@ -551,16 +524,8 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) { } // [/RCCL] - struct ncclWork* w; - NCCLCHECK(getNextOp(channel, &w, &work)); - if (info->coll == ncclFuncAllToAllv) { - struct ncclWorkElem* e = w->elems; - size_t* params = channel->a2avParams + info->comm->nRanks*4*e->index; - memcpy(params, info->sendcounts, sizeof(size_t*)*(info->comm->nRanks)); - memcpy(params+info->comm->nRanks, info->sdispls, sizeof(size_t*)*(info->comm->nRanks)); - memcpy(params+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks)); - memcpy(params+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks)); - } + work.coll.bid = bid % nChannels; + NCCLCHECK(getNextOp(channel, NULL, &work)); } info->comm->opCount++; return ncclSuccess; @@ -714,12 +679,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end); NCCLCHECKGOTO(checkSetStream(info), ret, end); - if (info->coll == ncclFuncAllToAllv) - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p", - info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls, - info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream); - else if (info->coll != ncclFuncSendRecv) - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); @@ -741,12 +701,7 @@ end: NCCLCHECK(ArgsCheck(info)); NCCLCHECK(checkSetStream(info)); - if (info->coll == ncclFuncAllToAllv) - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p", - info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls, - info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream); - else - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); diff --git a/src/include/collectives.h b/src/include/collectives.h index eef7a3cd16..ee4cc9b53c 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -9,12 +9,7 @@ #define NCCL_COLLECTIVES_H_ #define FUNC_INDEX_P2P (NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) -#define FUNC_INDEX_A2A (FUNC_INDEX_P2P+1) -#define FUNC_INDEX_A2AV (FUNC_INDEX_P2P+2) - -#define FUNC_INDEX(func, redop, ncclType, al, pr) ((func >= NCCL_NUM_FUNCTIONS) \ - ? (func-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \ - : ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))) +#define FUNC_INDEX(func, redop, ncclType, al, pr) ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) #define NCCL_FUNC_NAME(func, algo, proto, redop, type) \ ncclFunction_##func##_##algo##_##proto##_##redop##_##type @@ -65,8 +60,6 @@ DECL(ReduceScatter) \ DECL(AllReduce) \ DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \ - DECL5(AllToAll, RING, SIMPLE, Sum, int8_t) \ - DECL5(AllToAllv, RING, SIMPLE, Sum, int8_t) \ DECL_ALL @@ -88,12 +81,4 @@ DECL_ALL #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 #define SENDRECV_SLICEFACTOR 1 -#define GATHER_SLICESTEPS 4 -#define GATHER_CHUNKSTEPS 4 -#define SCATTER_SLICESTEPS 4 -#define SCATTER_CHUNKSTEPS 4 -#define ALLTOALL_SLICESTEPS 4 -#define ALLTOALL_CHUNKSTEPS 4 -#define ALLTOALLV_SLICESTEPS 4 -#define ALLTOALLV_CHUNKSTEPS 4 #endif diff --git a/src/include/comm.h b/src/include/comm.h index a507a5616f..cbdb66a9ea 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -163,7 +163,6 @@ struct ncclComm { int p2pRecvCount; // [RCCL] - bool alltoallDisable; // RCCL AllToAll/Scatter/Gather API CliqueManager* cliqueManager; // CliqueManager handles pointer collection / distribution for clique-based kernels int rootPid; // Process ID of root // [/RCCL] diff --git a/src/include/devcomm.h b/src/include/devcomm.h index 06a2742acf..9997de7f7c 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -27,8 +27,8 @@ #define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now -typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncAllToAll, ncclFuncAllToAllv } ncclFunc_t; -extern const char* ncclFuncStr[]; +typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv} ncclFunc_t; +extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+1]; #define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet #define NCCL_ALGO_TREE 0 @@ -191,11 +191,6 @@ struct ncclWorkElem { int32_t delta; uint16_t nThreads; } p2p; - struct { - size_t count; - uint8_t bid; - uint8_t nChannels; - } a2av; // [RCCL] Clique-based arguments // NOTE: Follows same field structure as coll // because nChannels is accessed from "coll" struct. @@ -232,7 +227,6 @@ struct ncclChannel { struct ncclWork* workFifo; int workCount; uint64_t workFifoTail; // Only used by CPU - size_t* a2avParams; #ifdef ENABLE_PROFILING struct timeval tvs; diff --git a/src/include/info.h b/src/include/info.h index fa9402f350..6b71492bd1 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -20,8 +20,7 @@ typedef enum { ncclPatternTreeDown, ncclPatternTreeUpDown, ncclPatternCollTreeUp, - ncclPatternCollTreeDown, - ncclPatternAll + ncclPatternCollTreeDown } ncclPattern_t; // Used to pass NCCL call information between functions @@ -40,11 +39,6 @@ struct ncclInfo { // Algorithm details int chunkSteps; int sliceSteps; - // For alltoallv - const size_t *sendcounts; - const size_t *sdispls; - const size_t *recvcounts; - const size_t *rdispls; // Computed later int algorithm; int protocol; diff --git a/src/init.cc b/src/init.cc index 1c7c81f7b8..3ebe870465 100644 --- a/src/init.cc +++ b/src/init.cc @@ -45,7 +45,7 @@ std::chrono::high_resolution_clock::time_point ncclEpoch; #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream #endif -const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+3] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAll", "AllToAllv" }; +const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+1] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv" }; const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; const char* ncclRedOpStr[ncclNumOps] = { "Sum", "Prod", "Max", "Min" }; @@ -168,7 +168,7 @@ void *ncclCommThreadMain(void *arg) { ncclComm_t comm = (ncclComm_t)arg; int head = comm->hostDevComm.collTraceHead; #define MAX_NAME_LENGTH 32 - char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_A2AV+1)); + char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+1)); for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) { for (int al = 0; al < NCCL_NUM_ALGORITHMS; al++) { for (int type = 0; type < ncclNumTypes; type++) { @@ -182,10 +182,8 @@ void *ncclCommThreadMain(void *arg) { } } } - for (int func = NCCL_NUM_FUNCTIONS; func < NCCL_NUM_FUNCTIONS+3; func++) { - char* line = func_names+MAX_NAME_LENGTH*(FUNC_INDEX_P2P+func-NCCL_NUM_FUNCTIONS); - sprintf(line, "%s", ncclFuncStr[func]); - } + char* line = func_names+MAX_NAME_LENGTH*FUNC_INDEX_P2P; + sprintf(line, "%s", ncclFuncStr[NCCL_NUM_FUNCTIONS]); do { int tail = LOAD(comm->hostDevComm.collTraceTail)%COLLTRACE_NUM_ITEMS; int count; @@ -223,7 +221,7 @@ void *ncclCommThreadMain(void *arg) { sprintf(line+offset, " KL HWID %8x %s ", td->data_0, func_names+MAX_NAME_LENGTH*fIdx); offset = strlen(line); - if (fIdx > FUNC_INDEX_A2AV) + if (fIdx > FUNC_INDEX_P2P) sprintf(line+offset, "ERROR bad function index %d", fIdx); else if (fIdx == FUNC_INDEX_P2P) sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta); @@ -234,7 +232,7 @@ void *ncclCommThreadMain(void *arg) { if (fIdx != 0xffff) { sprintf(line+offset, " CE %s ", func_names+MAX_NAME_LENGTH*fIdx); offset = strlen(line); - if (fIdx > FUNC_INDEX_A2AV) + if (fIdx > FUNC_INDEX_P2P) sprintf(line+offset, "ERROR bad function index %d", fIdx); else if (fIdx == FUNC_INDEX_P2P) sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta); @@ -318,8 +316,8 @@ static ncclResult_t commFree(ncclComm_t comm) { #ifdef ENABLE_COLLTRACE STORE(&comm->hostDevComm.collTraceExit, 1); if (comm->hostDevComm.collTraceThread) pthread_join(comm->hostDevComm.collTraceThread, NULL); - CUDACHECK(hipHostFree((void *)comm->hostDevComm.collTrace)); - CUDACHECK(hipHostFree((void *)comm->hostDevComm.collTraceTail)); + NCCLCHECK(ncclCudaHostFree((void *)comm->hostDevComm.collTrace)); + NCCLCHECK(ncclCudaHostFree((void *)comm->hostDevComm.collTraceTail)); #endif free(comm->peerInfo); @@ -361,7 +359,6 @@ static ncclResult_t commFree(ncclComm_t comm) { return ncclSuccess; } -RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 1); RCCL_PARAM(ForceEnableClique, "FORCE_ENABLE_CLIQUE", 0); RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0); @@ -413,8 +410,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { #endif #ifdef ENABLE_COLLTRACE - CUDACHECK(hipHostMalloc((void**) &comm->hostDevComm.collTraceTail, sizeof(uint32_t), hipHostMallocMapped)); - CUDACHECK(hipHostMalloc((void**) &comm->hostDevComm.collTrace, sizeof(struct ncclCollTrace) * COLLTRACE_NUM_ITEMS, hipHostMallocMapped)); + NCCLCHECK(ncclCudaHostCalloc(&comm->hostDevComm.collTraceTail, 1)); + NCCLCHECK(ncclCudaHostCalloc(&comm->hostDevComm.collTrace, COLLTRACE_NUM_ITEMS)); memset(comm->hostDevComm.collTrace, 0, sizeof(struct ncclCollTrace) * COLLTRACE_NUM_ITEMS); comm->hostDevComm.collTraceExit = comm->hostDevComm.collTraceHead = *comm->hostDevComm.collTraceTail = 0; if ((ncclDebugLevel >= NCCL_LOG_INFO) && (ncclDebugMask & NCCL_COLL)) @@ -440,9 +437,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { // Mark channels as non initialized. for (int c=0; cchannels[c].id = -1; - comm->alltoallDisable = true; - if (rcclParamAllToAllDisable() == 0) comm->alltoallDisable = false; - *comret = comm; return ncclSuccess; } @@ -929,7 +923,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int fullCudaCompCap; int nChannels; int gcn; - int alltoallDisable; struct ncclGraphInfo tree; struct ncclGraphInfo ring; struct ncclGraphInfo collNet; @@ -941,7 +934,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx)); allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap; allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn; - allGather3Data[rank].alltoallDisable = comm->topo->nodes[GPU].count == comm->topo->nRanks ? 1 : comm->alltoallDisable; allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); @@ -991,11 +983,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm struct ncclTopoRanks** allTopoRanks; NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); int gcn = allGather3Data[0].gcn; - int alltoallDisable = 0; for (int i=0; inChannels = std::min(allGather3Data[i].nChannels, comm->nChannels); treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); @@ -1015,11 +1005,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); } - if (comm->alltoallDisable != alltoallDisable) { - comm->alltoallDisable = alltoallDisable; - } - INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled"); - // count NETs used by ring int nNets = 0; int nets[MAXCHANNELS*2]; @@ -1134,29 +1119,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Compute nChannels per peer for p2p NCCLCHECK(ncclTopoComputeP2pChannels(comm)); - if (!alltoallDisable) { - int nc = comm->p2pnChannels; - for (int c=0; cchannels[c].peers[peerSend].send.connected == 0) { - comm->connectSend[peerSend] |= (1<connect = 1; - } - if (comm->channels[c].peers[peerRecv].recv.connected == 0) { - comm->connectRecv[peerRecv] |= (1<connect = 1; - } - } - } - NCCLCHECK(ncclTransportP2pSetup(comm, NULL)); - } - NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm)); if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm)); diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index 741a3d6141..716cd53408 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -46,16 +46,11 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { } // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars. info->nBytes = info->count * ncclTypeSize(info->datatype); - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAll) { + if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { info->count = info->nBytes; info->datatype = ncclInt8; } - if (info->coll == ncclFuncAllToAllv) { - // Use count to store data type size for alltoallv - info->count = ncclTypeSize(info->datatype); - info->datatype = ncclInt8; - } - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter || info->coll == ncclFuncAllToAll) info->nBytes *= info->comm->nRanks; // count is per rank + if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank if (info->op < 0 || info->op >= ncclNumOps) { WARN("%s : invalid reduction operation %d", info->opName, info->op); diff --git a/test/test_AllToAll.cpp b/test/test_AllToAll.cpp index a1b0c00510..17ab932978 100644 --- a/test/test_AllToAll.cpp +++ b/test/test_AllToAll.cpp @@ -62,6 +62,6 @@ namespace CorrectnessTests testing::Values(2,3,4,5,6,7,8), // In-place or not testing::Values(false), - testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")), + testing::Values("")), CorrectnessTest::PrintToStringParamName()); } // namespace diff --git a/test/test_AllToAllMultiProcess.cpp b/test/test_AllToAllMultiProcess.cpp index 842c1ee9e7..8b4ae6d417 100644 --- a/test/test_AllToAllMultiProcess.cpp +++ b/test/test_AllToAllMultiProcess.cpp @@ -55,6 +55,6 @@ namespace CorrectnessTests testing::Values(2,3,4,8), // In-place or not testing::Values(false), - testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")), + testing::Values("")), CorrectnessTest::PrintToStringParamName()); } // namespace diff --git a/test/test_AllToAllv.cpp b/test/test_AllToAllv.cpp index fe78b0f352..1d204fb30e 100644 --- a/test/test_AllToAllv.cpp +++ b/test/test_AllToAllv.cpp @@ -70,6 +70,6 @@ namespace CorrectnessTests testing::Values(2,3,4,5,6,7,8), // In-place or not testing::Values(false), - testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")), + testing::Values("")), CorrectnessTest::PrintToStringParamName()); } // namespace diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp index 9e71e75b8a..ac980b2bef 100644 --- a/test/test_CombinedCalls.cpp +++ b/test/test_CombinedCalls.cpp @@ -115,6 +115,6 @@ namespace CorrectnessTests testing::Values(2,3,4,5,6,7,8), // In-place or not testing::Values(false), - testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")), + testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")), CorrectnessTest::PrintToStringParamName()); } // namespace diff --git a/test/test_Gather.cpp b/test/test_Gather.cpp index bf917c739d..837ec30ea7 100644 --- a/test/test_Gather.cpp +++ b/test/test_Gather.cpp @@ -66,6 +66,6 @@ namespace CorrectnessTests testing::Values(2,3,4,5,6,7,8), // In-place or not testing::Values(false), - testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=1")), + testing::Values("")), CorrectnessTest::PrintToStringParamName()); } // namespace diff --git a/test/test_GatherMultiProcess.cpp b/test/test_GatherMultiProcess.cpp index 2ba477e168..c95f0ee21d 100644 --- a/test/test_GatherMultiProcess.cpp +++ b/test/test_GatherMultiProcess.cpp @@ -55,6 +55,6 @@ namespace CorrectnessTests testing::Values(2,3,4,8), // In-place or not testing::Values(false), - testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")), + testing::Values("")), CorrectnessTest::PrintToStringParamName()); } // namespace diff --git a/test/test_Scatter.cpp b/test/test_Scatter.cpp index aaca748217..82f4f9088b 100644 --- a/test/test_Scatter.cpp +++ b/test/test_Scatter.cpp @@ -66,6 +66,6 @@ namespace CorrectnessTests testing::Values(2,3,4,5,6,7,8), // In-place or not testing::Values(false), - testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=1")), + testing::Values("")), CorrectnessTest::PrintToStringParamName()); } // namespace diff --git a/test/test_ScatterMultiProcess.cpp b/test/test_ScatterMultiProcess.cpp index 0f12c84308..c732e4e91d 100644 --- a/test/test_ScatterMultiProcess.cpp +++ b/test/test_ScatterMultiProcess.cpp @@ -55,6 +55,6 @@ namespace CorrectnessTests testing::Values(2,3,4,8), // In-place or not testing::Values(false), - testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")), + testing::Values("")), CorrectnessTest::PrintToStringParamName()); } // namespace diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp index 0a82949eaf..d8a438bea6 100644 --- a/tools/topo_expl/utils.cpp +++ b/tools/topo_expl/utils.cpp @@ -30,7 +30,7 @@ #include "model.h" #include "utils.h" -const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+3] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAll", "AllToAllv" }; +const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+1] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv" }; const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; @@ -270,7 +270,6 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t int fullCudaCompCap; int nChannels; int gcn; - int alltoallDisable; struct ncclGraphInfo tree; struct ncclGraphInfo ring; struct ncclGraphInfo collNet; @@ -283,7 +282,6 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx)); allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap; allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn; - allGather3Data[rank].alltoallDisable = comm->topo->nodes[NET].count? 1 : comm->alltoallDisable; allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); @@ -450,8 +448,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* } -RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 0); - ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) { int rank = comm->rank; @@ -481,11 +477,9 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t struct ncclTopoRanks** allTopoRanks; NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); int gcn = allGather3Data[0].gcn; - int alltoallDisable = 0; for (int i=0; inChannels = std::min(allGather3Data[i].nChannels, comm->nChannels); treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); @@ -505,11 +499,6 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); } - if (comm->alltoallDisable != alltoallDisable) { - comm->alltoallDisable = alltoallDisable; - } - INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled"); - // count NETs used by ring int nNets = 0; int nets[MAXCHANNELS*2]; @@ -625,29 +614,6 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t // Compute nChannels per peer for p2p NCCLCHECK(ncclTopoComputeP2pChannels(comm)); - if (!alltoallDisable) { - int nc = comm->nChannels; - for (int c=0; cchannels[c].peers[peerSend].send.connected == 0) { - comm->connectSend[peerSend] |= (1<connect = 1; - } - if (comm->channels[c].peers[peerRecv].recv.connected == 0) { - comm->connectRecv[peerRecv] |= (1<connect = 1; - } - } - } - NCCLCHECK(ncclTransportP2pSetup(comm, NULL)); - } - //NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm)); //if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));