Add support for CUDA graphs.
Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439.
Fix bootstrap issue caused by connection reordering.
Fix CPU locking block.
Improve CollNet algorithm.
Improve performance on DGX A100 for communicators with only one GPU per node.
Tento commit je obsažen v:
Sylvain Jeaugey
2021-04-12 16:00:11 -07:00
rodič 911d61f214
revize a46ea10583
43 změnil soubory, kde provedl 2687 přidání a 1244 odebrání
+355 -107
Zobrazit soubor
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,6 +7,7 @@
#include "enqueue.h"
#include "argcheck.h"
#include "coll_net.h"
#include "gdrwrap.h"
// Only generate inline kernels for LL
#define NCCL_FUNC5(func, algo, redop, dtype) \
@@ -63,6 +64,21 @@ static void* const ncclKerns[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_N
NCCL_FUNCS2A(AllReduce)
};
// Determine the maximum kernel stack size of all CUDA kernels
size_t ncclKernMaxLocalSize() {
ncclResult_t res = ncclSuccess;
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
cudaFuncAttributes attr = {0};
size_t max = 0;
for (int i = 0; i < numNcclKerns; i++) {
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i]), res, error);
if (attr.localSizeBytes > max) max = attr.localSizeBytes;
}
error:
return (res != ncclSuccess) ? 0 : max;
}
/*****************************************************************************/
/* Launch system : synchronization and CUDA kernel launch */
/*****************************************************************************/
@@ -108,14 +124,23 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor
return ncclSuccess;
}
static ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) {
ncclComm_t comm = eqInfo->comm;
struct cudaLaunchParams* params = comm->myParams;
// Only launch blocks where we have work to do.
for (int c=0; c<comm->p2pnChannels; c++) {
if (comm->channels[c].workCount) params->gridDim.x = c+1;
// This is not supported when we are in cudaGraph mode.
// Because in cudaGraph mode the launch param needs to be determined
// at capture time instead of launch time.
if (!usingCudaGraph) {
for (int c=0; c<comm->p2pnChannels; c++) {
if (comm->channels[c].workCount) params->gridDim.x = c+1;
}
eqInfo->maxChannels = params->gridDim.x;
}
// Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
for (int c=0; c<params->gridDim.x; c++) {
for (int c=0; c<eqInfo->maxChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (channel->workCount == 0) {
struct ncclWork* w;
@@ -126,18 +151,35 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams*
e->p2p.nThreads = 0;
}
channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active = 2;
if (c == 0) {
// Find the first operation, choose the kernel accordingly and pass it as the first argument.
// Note that changing cuda launch argument after capture is not supported by cudaGraph
struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS);
struct ncclWorkElem* elem = work->elems;
if (!usingCudaGraph) {
params->func = ncclKerns[elem->funcIndex];
memcpy(&comm->args, elem, sizeof(struct ncclWorkElem));
}
// As we inline the first coll directly, we can free it immediately.
if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0;
}
if (channel->gdrMemDesc) {
// GDRCOPY support
uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS;
uint64_t nelems = channel->workCount;
TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld last %ld nelems %zi",
channel->workFifo, channel->workFifoGdr, first, last, nelems);
for (int i = 0; i < nelems; i++) {
int elem = (first+i) % NCCL_MAX_OPS;
// Copy Host workFifo to CUDA workFifo via the GDRCOPY mapping
NCCLCHECK(ncclGdrCudaCopy(channel->gdrMemDesc, channel->workFifoGdr+elem, channel->workFifo+elem, 1));
}
}
}
// Find the first operation, choose the kernel accordingly and pass it
// as the first argument.
struct ncclChannel* c0 = comm->channels;
struct ncclWork* work = c0->workFifo+((c0->workFifoTail-c0->workCount)%NCCL_MAX_OPS);
struct ncclWorkElem* elem = work->elems;
memcpy(&comm->args, elem, sizeof(struct ncclWorkElem));
// As we inline the first coll directly, we can free it immediately.
if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0;
params->func = ncclKerns[elem->funcIndex];
return ncclSuccess;
}
@@ -180,21 +222,23 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
return ncclSuccess;
}
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) {
struct cudaLaunchParams* params = comm->myParams;
if (params->gridDim.x == 0) return ncclSuccess;
NCCLCHECK(setupLaunch(comm, params));
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
if (comm->launchMode == ncclComm::GROUP &&
(comm->groupCudaStream ||
comm->userStream == cudaStreamDefault ||
comm->userStream == cudaStreamLegacy ||
comm->userStream == cudaStreamPerThread)) {
// Enqueue event in user stream
CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
CUDACHECK(cudaEventRecord(comm->intDoneEvent, comm->userStream));
// Create dependency between user stream and internal NCCL stream
CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->intDoneEvent, 0));
params->stream = comm->groupStream;
} else {
if (comm->userStream != params->stream) {
if (comm->userStream != params->stream && !comm->usingCudaGraph) {
// Stream changed from last call, create dependency against last NCCL kernel launch
CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
}
@@ -213,7 +257,7 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
return ncclSuccess;
}
ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
struct cudaLaunchParams *params = comm->myParams;
if (params->gridDim.x == 0) return ncclSuccess;
@@ -226,44 +270,73 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
(comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
}
if (comm->launchMode == ncclComm::PARALLEL) {
CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
} else {
if (comm->launchMode == ncclComm::GROUP) {
NCCLCHECK(ncclCpuBarrierOut(comm));
} else {
CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
}
return ncclSuccess;
}
static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) {
// Start the network proxies as soon as the kernel has been launched. We can't
// perform any CUDA call between the two or having a cudaFree between the CUDA
// launch and the ncclProxyStart call could cause a deadlock.
// Also, starting the proxies after the CUDA launch seems to be better for
// performance (latency).
uint64_t max = 0ULL;
for (int r=0; r<params->gridDim.x; r++) {
ncclComm_t comm = eqInfo->comm;
if (eqInfo->maxChannels == 0) return ncclSuccess;
for (int r=0; r<eqInfo->maxChannels; r++) {
struct ncclChannel* channel = comm->channels+r;
max = std::max(max, channel->workFifoTail);
channel->workCount = 0;
}
for (int r=0; r<comm->p2pnChannels; r++) {
struct ncclChannel* channel = comm->channels+r;
channel->workFifoTail = max;
}
params->gridDim.x = params->blockDim.x = 0;
comm->lastOpCount = max;
comm->lastChannel = 0;
NCCLCHECK(ncclProxyStart(comm));
return ncclSuccess;
}
ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
ncclResult_t ncclRecordEvents(ncclComm_t comm) {
struct cudaLaunchParams *params = comm->myParams;
// Enqueue event after NCCL kernel
CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
// Enqueue event after NCCL kernel (only in non-graph mode)
if (!comm->usingCudaGraph) CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
if (comm->launchMode == ncclComm::GROUP &&
(comm->groupCudaStream ||
comm->userStream == cudaStreamDefault ||
comm->userStream == cudaStreamLegacy ||
comm->userStream == cudaStreamPerThread)) {
CUDACHECK(cudaEventRecord(comm->intDoneEvent, params->stream));
// Create dependency between NCCL internal stream and user stream
CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->intDoneEvent, 0));
}
return ncclSuccess;
}
ncclResult_t ncclLaunchReset(ncclComm_t comm) {
comm->userStreamSet = false;
// We are finishing capture of the current launch
// But we need to keep the current enqueue info for CUDA graph
// Thus we need to creating a new enqueue info for the next run
if (comm->usingCudaGraph) {
NCCLCHECK(ncclCalloc(&comm->enqueueInfo, 1));
comm->enqueueInfo->comm = comm;
} else {
// If not in CUDA graph mode, we reuse the same info space
NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo));
}
struct cudaLaunchParams *params = comm->myParams;
params->gridDim.x = params->blockDim.x = 0;
params->func = NULL;
// Reset launch mode to GROUP if changed
if (comm->launchMode == ncclComm::GROUP_GRAPH) comm->launchMode = ncclComm::GROUP;
comm->usingCudaGraph = 0;
return ncclSuccess;
}
@@ -280,10 +353,10 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
int nAlgos = NCCL_NUM_ALGORITHMS;
// Check collNet support
int collNetTypeSupport = 0;
if (info->comm->collNetSupport)
if (info->comm->collNetSupport > 0)
NCCLCHECK(collNetReduceSupport(info->datatype, info->op, &collNetTypeSupport));
if (collNetTypeSupport != 1) nAlgos--;
for (int a=0; a<nAlgos; a++) {
if (a == NCCL_ALGO_COLLNET && collNetTypeSupport != 1) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float time;
NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, &time));
@@ -301,17 +374,31 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
//if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
int nc = (info->nChannels > 0) ? info->nChannels :
(info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
int nc = (info->nChannels > 0) ? info->nChannels : comm->nChannels;
int nt = comm->maxThreads[info->algorithm][info->protocol];
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
while (info->nBytes < nc*nt*threadThreshold) {
if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--;
else if ((nt % 128) == 0) nt/=2;
else break;
if (info->algorithm == NCCL_ALGO_COLLNET) {
int ncSwitch = 16;
bool flag = true;
while (ncSwitch >= 1 && flag) {
while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collTree.nHeads*threadThreshold) && nc > ncSwitch) {
if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2;
nc--;
}
ncSwitch /= 2;
}
} else {
while (info->nBytes < nc*nt*threadThreshold) {
if (nc >= 2) nc--;
else if ((nt % 128) == 0) nt/=2;
else break;
}
}
if (info->protocol == NCCL_PROTO_SIMPLE) {
nt += WARP_SIZE; // Extra warp for sync
if (info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE;
}
if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE;
info->nChannels = nc;
info->nThreads = nt;
return ncclSuccess;
@@ -327,7 +414,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
case ncclFuncAllGather:
info->pattern = ncclPatternRing; break;
case ncclFuncAllReduce:
info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUpDown : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
default:
WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
return ncclInternalError;
@@ -342,9 +429,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternTreeUpDown:
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo:
case ncclPatternCollTreeUp:
case ncclPatternCollTreeDown:
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
case ncclPatternCollTreeUpDown:
info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collTree.nHeads; break;
case ncclPatternRing:
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
case ncclPatternRingTwice:
@@ -390,9 +477,10 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
// Optimize chunkSize / nSteps
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth && chunkSize > 32768) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*32 && chunkSize > 262144) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth/2 && chunkSize > 16384) chunkSize /= 2;
// Use lastChunkSize as chunkSize
work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) {
@@ -417,20 +505,23 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
proxyArgs->subs[0].nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
proxyArgs->sliceSteps = sliceSteps;
proxyArgs->chunkSteps = chunkSteps;
proxyArgs->chunkSize = chunkSize;
proxyArgs->protocol = info->protocol;
proxyArgs->dtype = info->datatype;
proxyArgs->redOp = info->op;
proxyArgs->redOp = (info->algorithm == NCCL_ALGO_COLLNET) ? info->op : ncclNumOps; // Only set redOp when using CollNet
proxyArgs->pattern = info->pattern;
proxyArgs->root = info->root;
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
// because some protocols need to transmit more than the total size, plus they sometimes
// round up
proxyArgs->recvbytes = stepSize*proxyArgs->sliceSteps;
proxyArgs->subs[0].recvbytes = stepSize*proxyArgs->sliceSteps;
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
proxyArgs->opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, proxyArgs->nsteps, info->comm);
TRACE(NCCL_COLL,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d chunksize %d comm %p",
proxyArgs->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, proxyArgs->subs[0].nsteps, chunkSize, info->comm);
return ncclSuccess;
}
@@ -445,64 +536,95 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
return ncclSuccess;
}
ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
if (info->comm->nRanks == 1) {
// Compute enqueue element, save it in list
// Compute CUDA launch parameters
// Capture time code in view of CUDA graph
static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
ncclComm_t comm = info->comm;
if (comm->nRanks == 1) {
if (info->sendbuff != info->recvbuff)
CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream));
return ncclSuccess;
}
struct ncclWorkElem work;
struct ncclProxyArgs proxyArgs;
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
NCCLCHECK(computeColl(info, &work, &proxyArgs));
// Compute cuda kernel arg and proxy arg templates
struct ncclQueueElem* eqElem;
NCCLCHECK(ncclAddQueueElem(comm->enqueueInfo, &eqElem));
struct ncclWorkElem* work = &eqElem->work;
eqElem->proxyArgs.nsubs = 1;
NCCLCHECK(computeColl(info, work, &eqElem->proxyArgs));
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
// Determine grid size
struct cudaLaunchParams* params = comm->myParams;
params->gridDim.x += info->nChannels;
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
params->blockDim.x = std::max<unsigned>(params->blockDim.x, info->nThreads);
comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here
int nChannels = work.coll.nChannels;
int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
// Inline the first kernel
if (params->func == NULL) {
params->func = ncclKerns[work->funcIndex];
memcpy(&comm->args, work, sizeof(struct ncclWorkElem));
comm->args.coll.bid = 0; // Only inline for channel 0
comm->args.active = 2; // I am so far the last element; may be changed later in aggregation mode
}
for (int bid=0; bid<nChannels*nSubChannels; bid++) {
int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
struct ncclChannel* channel = info->comm->channels+channelId;
return ncclSuccess;
}
// Dynamic enqueue code
static ncclResult_t ncclEnqueueCollKernel(ncclComm_t comm, struct ncclQueueElem* eqElem) {
struct ncclWorkElem* work = &eqElem->work;
struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs;
int nChannels = work->coll.nChannels;
for (int bid=0; bid<nChannels; bid++) {
int channelId = comm->lastChannel % comm->nChannels;
struct ncclChannel* channel = comm->channels+channelId;
// Proxy
proxyArgs.channel = channel;
// Adjust pattern for CollNet based on channel index
if (nSubChannels == 2) {
info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
}
proxyArgs->subs[0].channel = channel;
proxyArgs->opCount = comm->collOpCount;
proxyArgs->commOpCount = comm->opCount;
if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
if (proxyArgs->subs[0].nsteps) NCCLCHECK(ncclProxySaveColl(proxyArgs, comm->nRanks));
info->comm->myParams->gridDim.x++;
work.coll.bid = bid % nChannels;
NCCLCHECK(getNextOp(channel, NULL, &work));
comm->lastChannel++;
work->coll.bid = bid % nChannels;
NCCLCHECK(getNextOp(channel, NULL, work));
//INFO(NCCL_COLL, "Host enqueue: bid %d channel %d index %ld nThreads %d funcIndex %d count %ld nChannels %d",
// work->coll.bid, channelId, channel->workFifoTail, work->nThreads, work->funcIndex, work->coll.count, work->coll.nChannels);
}
comm->collOpCount++;
return ncclSuccess;
}
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
ncclResult_t ncclSaveCommKernels(ncclComm_t comm) {
ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
if (comm->asyncOpCount == 0) {
return ncclSuccess;
} else if (comm->asyncOpCount == 1) {
// No aggregation
struct ncclInfo* info = comm->asyncOps;
info->nChannels = 0;
NCCLCHECK(ncclSaveKernel(info));
NCCLCHECK(ncclSetupCollKernel(info));
} else {
// Aggregation
size_t channelSize = NCCL_AGG_CHANNEL_SIZE * comm->nRanks; // scale channel size based on nranks as latency increases
// Reduce the per-channel size if we cannot fully utilize the channels
while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2;
int channelUsed = 0;
for (int c = 0; c < comm->asyncOpCount; c++) {
struct ncclInfo* info = comm->asyncOps+c;
info->nChannels = std::min((int)DIVUP(info->nBytes, channelSize), comm->nChannels); // assign number of channels
NCCLCHECK(ncclSaveKernel(info));
channelUsed += info->nChannels;
NCCLCHECK(ncclSetupCollKernel(info));
}
// If we wrap around on channels, then the inlined op on channel 0 is not the last one on this channel
// Then we need to change active from 2 to 1
if (channelUsed > comm->nChannels) comm->args.active = 1;
}
// Reset counters
comm->asyncOpCount = 0;
@@ -533,7 +655,7 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].send.connected == 0) {
if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer] |= (1<<channelId);
comm->connect = 1;
}
@@ -546,7 +668,7 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].recv.connected == 0) {
if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer] |= (1<<channelId);
comm->connect = 1;
}
@@ -558,56 +680,165 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
return ncclSuccess;
}
static int getSegment(struct ncclInfo* info, struct ncclWork* work) {
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != info->delta; s++) {
static int getSegment(int delta, struct ncclWork* work) {
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != delta; s++) {
if (work->elems[s].p2p.nThreads == 0) return s;
}
return -1;
}
static ncclResult_t saveP2pOp(struct ncclInfo* info /* input */, struct ncclWork* work, int s) {
struct ncclWorkElem* elem = work->elems+s;
static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElem* elem /* output */) {
elem->comm = info->comm->devComm;
elem->funcIndex = FUNC_INDEX_P2P;
elem->nThreads = info->nThreads = NCCL_MAX_NTHREADS;
elem->nThreads = NCCL_MAX_NTHREADS;
elem->sendbuff = info->sendbuff;
elem->recvbuff = info->recvbuff;
elem->p2p.sendCount = info->sendbytes;
elem->p2p.recvCount = info->recvbytes;
elem->p2p.sendChunkSize = info->sendChunkSize;
elem->p2p.recvChunkSize = info->recvChunkSize;
elem->p2p.delta = info->delta;
return ncclSuccess;
}
static ncclResult_t enqueueP2pOp(struct ncclWorkElem* elem /* input */, struct ncclWork* work, int s) {
// Copy element into corresponding segment of ncclWork
memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem));
// Determine nThreads at dynamic time
const int nsegments = s+1;
int nThreads = 512;
while (nsegments*nThreads > 512) nThreads /= 2;
if (nThreads >= 128) nThreads += WARP_SIZE;
for (int i=0; i<nsegments; i++) work->elems[i].p2p.nThreads = nThreads;
return ncclSuccess;
}
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) {
int channelId = info->channelId;
struct ncclChannel* channel = info->comm->channels+channelId;
ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) {
struct ncclWorkElem* workElem = &eqElem->work;
struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs;
// Try to reuse last p2p operation if not full yet
struct ncclChannel* channel = proxyArgs->subs[0].channel;
int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
struct ncclWork* w = channel->workFifo+opIndex;
int segment = -1;
if (channel->workCount && w->elems[0].funcIndex == FUNC_INDEX_P2P && w->elems[NCCL_MAX_WORK_ELEMENTS-1].p2p.nThreads == 0) {
// Try to pack more segments into a single operation
segment = getSegment(info, w);
segment = getSegment(workElem->p2p.delta, w);
}
if (segment == -1) {
NCCLCHECK(getNextOp(channel, &w, NULL));
segment = 0;
}
NCCLCHECK(ncclProxySaveP2p(info, channel, segment));
NCCLCHECK(saveP2pOp(info, w, segment));
info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
// store work element into FIFO
NCCLCHECK(ncclProxySaveP2p(comm, proxyArgs));
NCCLCHECK(enqueueP2pOp(workElem, w, segment));
return ncclSuccess;
}
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
ncclComm* comm = info->comm;
// Compute cuda kernel arg and proxy arg templates
struct ncclQueueElem* eqElem;
NCCLCHECK(ncclAddQueueElem(comm->enqueueInfo, &eqElem));
// The proxy code will set and tune the send/recv chunk size, make sure to run it first.
NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyArgs));
NCCLCHECK(computeP2pWorkElem(info, &eqElem->work));
int channelId = info->channelId;
struct cudaLaunchParams* params = comm->myParams;
params->gridDim.x = std::max<unsigned>(params->gridDim.x, channelId+1);
params->blockDim.x = std::max<unsigned>(params->blockDim.x, eqElem->work.nThreads);
comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here
// Record the first kernel to launch
// Just for CUDA kernel to know this is a P2P operation
// The CUDA kernel does not use the inlined first work element as fastpath argument
if (params->func == NULL) {
params->func = ncclKerns[eqElem->work.funcIndex];
memcpy(&comm->args, &eqElem->work, sizeof(struct ncclWorkElem));
}
return ncclSuccess;
}
template<int USING_CUDA_GRAPH>
void CUDART_CB ncclEnqueueHostSetup(void* arg) {
ncclResult_t ret;
struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg;
ncclComm_t comm = eqInfo->comm;
// Iterate through the element list
struct ncclQueueElem* eqElem = eqInfo->elemList.head;
while (eqElem != eqInfo->elemList.tail) { // The queue always has one extra element
if (eqElem->work.funcIndex == FUNC_INDEX_P2P) {
NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end);
} else {
NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem), ret, cb_end);
}
eqElem = eqElem->next;
}
NCCLCHECKGOTO(setupLaunch(eqInfo, USING_CUDA_GRAPH), ret, cb_end);
NCCLCHECKGOTO(ncclLaunchProxy(eqInfo), ret, cb_end);
cb_end:
if (ret != ncclSuccess) {
WARN("Failure in host setup : %s", ncclGetErrorString(ret));
}
eqInfo->ret = ret;
}
template void CUDART_CB ncclEnqueueHostSetup<0>(void*);
template void CUDART_CB ncclEnqueueHostSetup<1>(void*);
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
comm->usingCudaGraph = 0;
#if CUDART_VERSION >= 11030
cudaStreamCaptureStatus captureStatus;
unsigned long long cudaGraphId;
CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL));
if (captureStatus == cudaStreamCaptureStatusActive) {
if (cudaGraphId != comm->lastCudaGraphId) {
INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId);
// We are in a new graph, hence need to forget the last setup node so that
// the first setup node in the new graph will not have a dependency
comm->lastCudaGraphId = cudaGraphId;
comm->lastSetupNode = NULL;
}
if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH;
comm->usingCudaGraph = 1;
}
#endif
return ncclSuccess;
}
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) {
#if CUDART_VERSION >= 11030
struct ncclQueueInfo* eqInfo = comm->enqueueInfo;
// Create a CUDA object to wrap around the argument space
// which CUDA graph would manage lifetime of
cudaUserObject_t object;
CUDACHECK(cudaUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, cudaUserObjectNoDestructorSync));
CUDACHECK(cudaGraphRetainUserObject(graph, object, 1, cudaGraphUserObjectMove));
cudaHostFn_t fn = ncclEnqueueHostSetup<1>;
// Add a CPU node to the graph
cudaGraphNode_t setupNode;
cudaHostNodeParams setupNodeParams = {fn, eqInfo};
int numDependencies = comm->lastSetupNode == NULL ? 0 : 1;
CUDACHECK(cudaGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams));
CUDACHECK(cudaStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, cudaStreamAddCaptureDependencies));
comm->lastSetupNode = setupNode;
return ncclSuccess;
#else
WARN("NCCL does not support this CUDA version for CUDA graph feature");
return ncclInternalError;
#endif
}
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
// Launch asynchronously if needed
if (ncclAsyncMode()) {
@@ -647,10 +878,27 @@ end:
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
NCCLCHECK(ncclSaveKernel(info));
NCCLCHECK(ncclBarrierEnqueue(info->comm));
NCCLCHECK(ncclBarrierEnqueueWait(info->comm));
NCCLCHECK(ncclEnqueueEvents(info->comm));
// Check whether we are in cuda graph mode
cudaGraph_t graph;
ncclComm_t comm = info->comm;
NCCLCHECK(ncclGetCudaGraph(comm, &graph));
// Common part between graph mode and non-graph mode
NCCLCHECK(ncclSetupCollKernel(info));
// Host setup
if (comm->usingCudaGraph) {
NCCLCHECK(ncclCudaGraphHostSetup(comm, graph));
} else {
ncclEnqueueHostSetup<0>(comm->enqueueInfo);
NCCLCHECK(comm->enqueueInfo->ret);
}
// Common part between graph mode and non-graph mode
NCCLCHECK(ncclLaunchBarrier(comm));
NCCLCHECK(ncclLaunchKernel(comm));
NCCLCHECK(ncclRecordEvents(comm));
NCCLCHECK(ncclLaunchReset(comm));
return ncclSuccess;
}
}