2018-12-13 15:56:12 -08:00
|
|
|
/*************************************************************************
|
2022-01-07 06:39:55 -08:00
|
|
|
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
|
2018-12-13 15:56:12 -08:00
|
|
|
*
|
|
|
|
|
* See LICENSE.txt for license information
|
|
|
|
|
************************************************************************/
|
|
|
|
|
|
|
|
|
|
#include "enqueue.h"
|
2019-11-19 14:57:39 -08:00
|
|
|
#include "argcheck.h"
|
2020-01-16 16:02:42 -08:00
|
|
|
#include "coll_net.h"
|
2021-04-12 16:00:11 -07:00
|
|
|
#include "gdrwrap.h"
|
2021-09-08 13:56:25 -07:00
|
|
|
#include "bootstrap.h"
|
2018-12-13 15:56:12 -08:00
|
|
|
|
2021-09-23 09:52:42 -07:00
|
|
|
#include <cstring> // std::memcpy
|
|
|
|
|
|
2018-12-13 15:56:12 -08:00
|
|
|
// Only generate inline kernels for LL
// Expands to three comma-separated void* kernel pointers for one
// (func, algo, devredop, dtype) combination. All three entries name the
// same LL kernel.
// NOTE(review): presumably one entry per protocol, matching the
// NCCL_NUM_PROTOCOLS factor in the ncclKerns array size -- confirm.
#define NCCL_FUNC5(func, algo, devredop, dtype) \
  (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \
  (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype), \
  (void*)NCCL_KERN_NAME(func, algo, LL, devredop, dtype)
|
2018-12-13 15:56:12 -08:00
|
|
|
|
2021-09-08 13:56:25 -07:00
|
|
|
// One NCCL_FUNC5 group per algorithm, listed in TREE, RING, COLLNET order.
// Every entry produced by NCCL_FUNC5 is already cast to void*, so no
// additional cast is applied at this level.
#define NCCL_FUNC4(func, devredop, type) \
  NCCL_FUNC5(func, TREE,    devredop, type), \
  NCCL_FUNC5(func, RING,    devredop, type), \
  NCCL_FUNC5(func, COLLNET, devredop, type)
|
2018-12-13 15:56:12 -08:00
|
|
|
|
2021-07-08 14:12:04 -07:00
|
|
|
// Per-datatype expansion tables.
//  - NCCL_FUNCS3A lists one NCCL_FUNC4 group per data type (used for
//    collectives that reduce, where the element type matters).
//  - NCCL_FUNCS3B has the same number of groups but always uses int8_t.
// The bfloat16 slot only exists when the CUDA headers provide the type.
// Entries are already void* (cast inside NCCL_FUNC5), so none is added here.
#if defined(__CUDA_BF16_TYPES_EXIST__)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(func, devredop) \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, uint8_t), \
  NCCL_FUNC4(func, devredop, int32_t), \
  NCCL_FUNC4(func, devredop, uint32_t), \
  NCCL_FUNC4(func, devredop, int64_t), \
  NCCL_FUNC4(func, devredop, uint64_t), \
  NCCL_FUNC4(func, devredop, half), \
  NCCL_FUNC4(func, devredop, float), \
  NCCL_FUNC4(func, devredop, double), \
  NCCL_FUNC4(func, devredop, __nv_bfloat16)
#define NCCL_FUNCS3B(func, devredop) \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t)
#else
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(func, devredop) \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, uint8_t), \
  NCCL_FUNC4(func, devredop, int32_t), \
  NCCL_FUNC4(func, devredop, uint32_t), \
  NCCL_FUNC4(func, devredop, int64_t), \
  NCCL_FUNC4(func, devredop, uint64_t), \
  NCCL_FUNC4(func, devredop, half), \
  NCCL_FUNC4(func, devredop, float), \
  NCCL_FUNC4(func, devredop, double)
#define NCCL_FUNCS3B(func, devredop) \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t), \
  NCCL_FUNC4(func, devredop, int8_t)
#endif
|
2018-12-13 15:56:12 -08:00
|
|
|
|
2021-09-08 13:56:25 -07:00
|
|
|
// Must be consistent with ncclDevRedOp_t -- but we only generate kernel for sums.
// Expands to six NCCL_FUNCS3A groups, one per reduction-op slot named in the
// trailing comments. Every slot is instantiated with Sum, so all six groups
// reference the same kernels.
#define NCCL_FUNCS2A(func) \
  NCCL_FUNCS3A(func, Sum), /*Sum*/ \
  NCCL_FUNCS3A(func, Sum), /*Prod*/ \
  NCCL_FUNCS3A(func, Sum), /*Max*/ \
  NCCL_FUNCS3A(func, Sum), /*Min*/ \
  NCCL_FUNCS3A(func, Sum), /*PreMulSum*/ \
  NCCL_FUNCS3A(func, Sum)  /*SumPostDiv*/
|
2020-09-04 14:35:05 -07:00
|
|
|
// Same six reduction-op slots as NCCL_FUNCS2A, but built from NCCL_FUNCS3B
// (the variant whose per-type entries all use int8_t). Every slot is
// instantiated with Sum.
#define NCCL_FUNCS2B(func) \
  NCCL_FUNCS3B(func, Sum), /*Sum*/ \
  NCCL_FUNCS3B(func, Sum), /*Prod*/ \
  NCCL_FUNCS3B(func, Sum), /*Max*/ \
  NCCL_FUNCS3B(func, Sum), /*Min*/ \
  NCCL_FUNCS3B(func, Sum), /*PreMulSum*/ \
  NCCL_FUNCS3B(func, Sum)  /*SumPostDiv*/
|
2018-12-13 15:56:12 -08:00
|
|
|
|
|
|
|
|
// Must be consistent with the ncclFuncSet enum
// Host-side table of kernel entry points. Layout, in order:
//   [0]                  the SendRecv kernel
//   [1 .. ncclNumTypes]  one slot per data type for one-rank reductions;
//                        every slot aliases the SendRecv kernel
//   remaining entries    the Broadcast, Reduce, AllGather, ReduceScatter and
//                        AllReduce groups produced by NCCL_FUNCS2A/NCCL_FUNCS2B
// The declared size must match the number of initializers exactly.
static void* const ncclKerns[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
  (void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  // We don't bake special kernels for the one-rank reductions
  /*int8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  /*uint8*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  /*int32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  /*uint32*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  /*int64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  /*uint64*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  /*half*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  /*float*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  /*double*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  // The bfloat16 slot is only present when the CUDA headers define the type
  #if defined(__CUDA_BF16_TYPES_EXIST__)
  /*bfloat16*/(void*)NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
  #endif
  NCCL_FUNCS2B(Broadcast),
  NCCL_FUNCS2A(Reduce),
  NCCL_FUNCS2B(AllGather),
  NCCL_FUNCS2A(ReduceScatter),
  NCCL_FUNCS2A(AllReduce)
};
|
|
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
// Determine the maximum kernel stack size of all CUDA kernels
|
|
|
|
|
size_t ncclKernMaxLocalSize() {
|
|
|
|
|
ncclResult_t res = ncclSuccess;
|
|
|
|
|
int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
|
|
|
|
|
cudaFuncAttributes attr = {0};
|
|
|
|
|
size_t max = 0;
|
|
|
|
|
for (int i = 0; i < numNcclKerns; i++) {
|
|
|
|
|
CUDACHECKGOTO(cudaFuncGetAttributes(&attr, ncclKerns[i]), res, error);
|
|
|
|
|
if (attr.localSizeBytes > max) max = attr.localSizeBytes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
error:
|
|
|
|
|
return (res != ncclSuccess) ? 0 : max;
|
|
|
|
|
}
|
|
|
|
|
|
2018-12-13 15:56:12 -08:00
|
|
|
/*****************************************************************************/
|
|
|
|
|
/* Launch system : synchronization and CUDA kernel launch */
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
|
|
|
|
// Launch one kernel per device described in paramsList.
//   paramsList : numDevices launch descriptors, one per device
//   cudaDevs   : CUDA device index for each descriptor (same ordering)
//   cgMode     : when bit 0 is set (and the runtime is >= 9.0) a single
//                cudaLaunchCooperativeKernelMultiDevice call is used;
//                otherwise each kernel is launched individually after
//                switching to its device.
// Returns ncclSuccess, or the error produced by the failing CUDA call.
// In the per-device path the caller's current device is restored on exit,
// including when a device switch or a launch fails part-way through.
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
#if CUDART_VERSION >= 9000
  if (cgMode & 0x01) {
    CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
            // These flags are to reduce the latency of using this API
            cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
    return ncclSuccess;
  }
#endif
  ncclResult_t res = ncclSuccess;
  int savedDev;
  CUDACHECK(cudaGetDevice(&savedDev));
  for (int i = 0; i < numDevices; i++) {
    struct cudaLaunchParams* params = paramsList+i;
    CUDACHECKGOTO(cudaSetDevice(cudaDevs[i]), res, restore);
    CUDACHECKGOTO(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream), res, restore);
  }

restore:
  if (res != ncclSuccess) {
    // Best effort: do not leave the calling thread on another rank's device.
    // The original failure is what gets reported, so the result of this
    // call is intentionally not checked.
    cudaSetDevice(savedDev);
    return res;
  }
  CUDACHECK(cudaSetDevice(savedDev));
  return ncclSuccess;
}
|
|
|
|
|
|
2020-09-04 14:35:05 -07:00
|
|
|
// Reserve the next slot of a channel's work FIFO.
//   channel : channel whose FIFO is extended
//   work    : if non-NULL, receives a pointer to the reserved slot
//   base    : if non-NULL, copied into the slot's first work element
// Fails with ncclInvalidUsage when the channel already holds NCCL_MAX_OPS
// pending operations. Otherwise waits (yielding the CPU) until the slot's
// header type reads ncclWorkTypeUnused, zeroes the slot, and advances the
// FIFO tail and the channel's work count.
static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** work, struct ncclWorkElem* base) {
  if (channel->workCount == NCCL_MAX_OPS) {
    WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
    return ncclInvalidUsage;
  }

  const int slotIndex = channel->workFifoTail%NCCL_MAX_OPS;
  struct ncclWork* slot = &channel->workFifo[slotIndex];

  // The slot may still be in use; poll its type through a volatile pointer
  // so that every iteration re-reads memory.
  volatile uint8_t* slotType = (volatile uint8_t*)&slot->header.type;
  while (*slotType != ncclWorkTypeUnused) {
    sched_yield();
  }

  memset(slot, 0, sizeof(*slot));
  // Initialize with work elem if provided
  if (base != NULL) {
    memcpy(slot->elems, base, sizeof(*base));
  }

  channel->workFifoTail += 1;
  channel->workCount += 1;
  if (work != NULL) {
    *work = slot;
  }
  return ncclSuccess;
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Finalize channel work FIFO states before launch
|
|
|
|
|
// Called during dynamic enqueue
|
2021-04-12 16:00:11 -07:00
|
|
|
static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) {
|
|
|
|
|
ncclComm_t comm = eqInfo->comm;
|
2022-01-07 06:39:55 -08:00
|
|
|
// Do not use comm->myParams in this function unless in non-graph mode
|
|
|
|
|
// In graph mode, enqueue is async to capture, myParams can have been changed
|
2021-04-12 16:00:11 -07:00
|
|
|
struct cudaLaunchParams* params = comm->myParams;
|
|
|
|
|
|
2020-05-12 14:40:18 -07:00
|
|
|
// Only launch blocks where we have work to do.
|
2021-04-12 16:00:11 -07:00
|
|
|
// This is not supported when we are in cudaGraph mode.
|
|
|
|
|
// Because in cudaGraph mode the launch param needs to be determined
|
|
|
|
|
// at capture time instead of launch time.
|
|
|
|
|
if (!usingCudaGraph) {
|
2021-05-11 18:16:30 -07:00
|
|
|
int nChannels = std::max(comm->nChannels, comm->p2pnChannels);
|
|
|
|
|
for (int c=0; c<nChannels; c++) {
|
2021-04-12 16:00:11 -07:00
|
|
|
if (comm->channels[c].workCount) params->gridDim.x = c+1;
|
|
|
|
|
}
|
|
|
|
|
eqInfo->maxChannels = params->gridDim.x;
|
2020-05-12 14:40:18 -07:00
|
|
|
}
|
2018-12-13 15:56:12 -08:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Set isLast = 1 for the last operation and add a no-op on empty channels (p2p case).
|
2021-04-12 16:00:11 -07:00
|
|
|
for (int c=0; c<eqInfo->maxChannels; c++) {
|
2020-05-12 14:40:18 -07:00
|
|
|
struct ncclChannel* channel = comm->channels+c;
|
2020-09-04 14:35:05 -07:00
|
|
|
if (channel->workCount == 0) {
|
|
|
|
|
struct ncclWork* w;
|
|
|
|
|
NCCLCHECK(getNextOp(channel, &w, NULL));
|
2022-01-07 06:39:55 -08:00
|
|
|
w->header.funcIndex = FUNC_INDEX_P2P;
|
|
|
|
|
w->header.type = ncclWorkTypeP2p;
|
|
|
|
|
w->header.nWarps = 0;
|
2020-05-12 14:40:18 -07:00
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].header.isLast = 1;
|
2018-12-13 15:56:12 -08:00
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
if (c == 0) {
|
2021-07-08 14:12:04 -07:00
|
|
|
// As we inline the first coll directly, we can free it immediately.
|
2021-09-08 13:56:25 -07:00
|
|
|
// Except P2P or aggregation or registration cases
|
2021-04-12 16:00:11 -07:00
|
|
|
struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS);
|
2022-01-07 06:39:55 -08:00
|
|
|
if (work->header.type == ncclWorkTypeColl && eqInfo->elemList->count() == 1)
|
|
|
|
|
work->header.type = ncclWorkTypeUnused;
|
2021-04-12 16:00:11 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (channel->gdrMemDesc) {
|
|
|
|
|
// GDRCOPY support
|
|
|
|
|
uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS;
|
|
|
|
|
uint64_t nelems = channel->workCount;
|
2021-05-11 18:16:30 -07:00
|
|
|
TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld nelems %zi",
|
|
|
|
|
channel->workFifo, channel->workFifoGdr, first, nelems);
|
2021-04-12 16:00:11 -07:00
|
|
|
|
|
|
|
|
for (int i = 0; i < nelems; i++) {
|
|
|
|
|
int elem = (first+i) % NCCL_MAX_OPS;
|
|
|
|
|
// Copy Host workFifo to CUDA workFifo via the GDRCOPY mapping
|
|
|
|
|
NCCLCHECK(ncclGdrCudaCopy(channel->gdrMemDesc, channel->workFifoGdr+elem, channel->workFifo+elem, 1));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2020-09-04 14:35:05 -07:00
|
|
|
|
2018-12-13 15:56:12 -08:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Enter the intra-process CPU barrier for the current phase.
//   comm   : communicator; uses intraBarrier[intraPhase] as the arrival counter
//   isLast : set to 1 when the caller is the final rank to arrive, else 0
// Non-final ranks atomically increment the counter. The final rank does NOT
// increment it here; it clears the counter of the other phase and returns
// with *isLast = 1. Returns ncclInvalidUsage if the counter already shows
// intraRanks (or more) arrivals.
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
  volatile int* counter = (volatile int*)(comm->intraBarrier+comm->intraPhase);
  int seen = *counter;

  for (;;) {
    if (seen >= comm->intraRanks) {
      WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
      return ncclInvalidUsage;
    }
    if (seen+1 == comm->intraRanks) {
      // Reset the barrier.
      comm->intraBarrier[comm->intraPhase^1] = 0;
      *isLast = 1;
      return ncclSuccess;
    }
    if (__sync_bool_compare_and_swap(counter, seen, seen+1)) break;
    // Lost the race: assume exactly one other arrival and retry with the next value.
    seen++;
  }

  *isLast = 0;
  return ncclSuccess;
}
|
|
|
|
|
|
|
|
|
|
// Called by the rank that ncclCpuBarrierIn reported as last: performs the
// final increment of the current phase's arrival counter.
// Returns ncclInternalError if the counter changed between the read and the
// compare-and-swap.
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
  volatile int* counter = (volatile int*)(comm->intraBarrier+comm->intraPhase);
  const int seen = *counter;

  if (__sync_bool_compare_and_swap(counter, seen, seen+1)) {
    return ncclSuccess;
  }

  WARN("Trying to launch too many work elements, max is %d", NCCL_MAX_OPS);
  return ncclInternalError;
}
|
|
|
|
|
|
|
|
|
|
// Leave the intra-process CPU barrier.
// Waits until the arrival counter of the current phase reaches
// comm->intraRanks, then flips comm->intraPhase so the next barrier uses the
// other counter. Always returns ncclSuccess.
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
  // Yield the CPU while waiting. sched_yield() is the POSIX call; it replaces
  // the non-standard, deprecated pthread_yield() and matches the yield used
  // by getNextOp() in this file.
  while (*ptr < comm->intraRanks) sched_yield();
  comm->intraPhase ^= 1;
  return ncclSuccess;
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Check dependency wrt outside streams or previous launches
|
|
|
|
|
// Launch kernel in GROUP mode
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) {
|
2018-12-13 15:56:12 -08:00
|
|
|
struct cudaLaunchParams* params = comm->myParams;
|
2020-05-12 14:40:18 -07:00
|
|
|
if (params->gridDim.x == 0) return ncclSuccess;
|
2018-12-13 15:56:12 -08:00
|
|
|
|
|
|
|
|
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
|
2021-04-12 16:00:11 -07:00
|
|
|
if (comm->launchMode == ncclComm::GROUP &&
|
|
|
|
|
(comm->groupCudaStream ||
|
|
|
|
|
comm->userStream == cudaStreamDefault ||
|
|
|
|
|
comm->userStream == cudaStreamLegacy ||
|
|
|
|
|
comm->userStream == cudaStreamPerThread)) {
|
2018-12-13 15:56:12 -08:00
|
|
|
// Enqueue event in user stream
|
2021-04-12 16:00:11 -07:00
|
|
|
CUDACHECK(cudaEventRecord(comm->intDoneEvent, comm->userStream));
|
2018-12-13 15:56:12 -08:00
|
|
|
// Create dependency between user stream and internal NCCL stream
|
2021-04-12 16:00:11 -07:00
|
|
|
CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->intDoneEvent, 0));
|
2018-12-13 15:56:12 -08:00
|
|
|
params->stream = comm->groupStream;
|
|
|
|
|
} else {
|
2021-04-12 16:00:11 -07:00
|
|
|
if (comm->userStream != params->stream && !comm->usingCudaGraph) {
|
2018-12-13 15:56:12 -08:00
|
|
|
// Stream changed from last call, create dependency against last NCCL kernel launch
|
|
|
|
|
CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
|
|
|
|
|
}
|
|
|
|
|
params->stream = comm->userStream;
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-12 14:40:18 -07:00
|
|
|
if (comm->launchMode == ncclComm::GROUP) {
|
|
|
|
|
int isLast = 0;
|
|
|
|
|
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
|
|
|
|
|
if (isLast) {
|
2018-12-13 15:56:12 -08:00
|
|
|
// I'm the last. Launch all operations.
|
|
|
|
|
NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
|
2020-05-12 14:40:18 -07:00
|
|
|
NCCLCHECK(ncclCpuBarrierLast(comm));
|
2018-12-13 15:56:12 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Launch kernel in PARALLEL mode
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
|
2020-05-12 14:40:18 -07:00
|
|
|
struct cudaLaunchParams *params = comm->myParams;
|
|
|
|
|
if (params->gridDim.x == 0) return ncclSuccess;
|
|
|
|
|
|
2018-12-13 15:56:12 -08:00
|
|
|
// We can't print the CG mode before the first barrier happened.
|
|
|
|
|
if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
|
|
|
|
|
*comm->intraCGMode ^= 0x10;
|
|
|
|
|
INFO(NCCL_INIT,"Launch mode %s%s%s",
|
|
|
|
|
comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
|
|
|
|
|
*comm->intraCGMode ? "/CGMD" : "",
|
|
|
|
|
(comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
|
|
|
|
|
}
|
|
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
if (comm->launchMode == ncclComm::GROUP) {
|
2020-05-12 14:40:18 -07:00
|
|
|
NCCLCHECK(ncclCpuBarrierOut(comm));
|
2021-04-12 16:00:11 -07:00
|
|
|
} else {
|
|
|
|
|
CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
|
2018-12-13 15:56:12 -08:00
|
|
|
}
|
2020-05-12 14:40:18 -07:00
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Launch network proxy
|
2021-04-12 16:00:11 -07:00
|
|
|
static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) {
|
2018-12-13 15:56:12 -08:00
|
|
|
// Start the network proxies as soon as the kernel has been launched. We can't
|
|
|
|
|
// perform any CUDA call between the two or having a cudaFree between the CUDA
|
2020-05-12 14:40:18 -07:00
|
|
|
// launch and the ncclProxyStart call could cause a deadlock.
|
2018-12-13 15:56:12 -08:00
|
|
|
// Also, starting the proxies after the CUDA launch seems to be better for
|
|
|
|
|
// performance (latency).
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclComm_t comm = eqInfo->comm;
|
|
|
|
|
if (eqInfo->maxChannels == 0) return ncclSuccess;
|
|
|
|
|
|
|
|
|
|
for (int r=0; r<eqInfo->maxChannels; r++) {
|
2018-12-13 15:56:12 -08:00
|
|
|
struct ncclChannel* channel = comm->channels+r;
|
2020-09-04 14:35:05 -07:00
|
|
|
channel->workCount = 0;
|
2021-07-08 14:12:04 -07:00
|
|
|
channel->totalSize = 0;
|
2020-09-04 14:35:05 -07:00
|
|
|
}
|
2021-04-12 16:00:11 -07:00
|
|
|
comm->lastChannel = 0;
|
2020-05-12 14:40:18 -07:00
|
|
|
NCCLCHECK(ncclProxyStart(comm));
|
2018-12-13 15:56:12 -08:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Record done event for current launch
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclResult_t ncclRecordEvents(ncclComm_t comm) {
|
2018-12-13 15:56:12 -08:00
|
|
|
struct cudaLaunchParams *params = comm->myParams;
|
2021-04-12 16:00:11 -07:00
|
|
|
|
|
|
|
|
// Enqueue event after NCCL kernel (only in non-graph mode)
|
|
|
|
|
if (!comm->usingCudaGraph) CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
|
2018-12-13 15:56:12 -08:00
|
|
|
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
|
2021-04-12 16:00:11 -07:00
|
|
|
if (comm->launchMode == ncclComm::GROUP &&
|
|
|
|
|
(comm->groupCudaStream ||
|
|
|
|
|
comm->userStream == cudaStreamDefault ||
|
|
|
|
|
comm->userStream == cudaStreamLegacy ||
|
|
|
|
|
comm->userStream == cudaStreamPerThread)) {
|
|
|
|
|
CUDACHECK(cudaEventRecord(comm->intDoneEvent, params->stream));
|
2018-12-13 15:56:12 -08:00
|
|
|
// Create dependency between NCCL internal stream and user stream
|
2021-04-12 16:00:11 -07:00
|
|
|
CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->intDoneEvent, 0));
|
2018-12-13 15:56:12 -08:00
|
|
|
}
|
2021-04-12 16:00:11 -07:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Reset parameter space for launch
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclResult_t ncclLaunchReset(ncclComm_t comm) {
|
2018-12-13 15:56:12 -08:00
|
|
|
comm->userStreamSet = false;
|
2021-04-12 16:00:11 -07:00
|
|
|
|
|
|
|
|
// We are finishing capture of the current launch
|
|
|
|
|
// But we need to keep the current enqueue info for CUDA graph
|
|
|
|
|
// Thus we need to creating a new enqueue info for the next run
|
|
|
|
|
if (comm->usingCudaGraph) {
|
2021-07-08 14:12:04 -07:00
|
|
|
NCCLCHECK(ncclCreateQueueInfo(&comm->enqueueInfo, comm));
|
2021-04-12 16:00:11 -07:00
|
|
|
} else {
|
|
|
|
|
// If not in CUDA graph mode, we reuse the same info space
|
|
|
|
|
NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo));
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// After capturing an op in graph mode or launching the op in non-graph mode
|
|
|
|
|
// we can reset myParams for use in next op
|
2021-04-12 16:00:11 -07:00
|
|
|
struct cudaLaunchParams *params = comm->myParams;
|
|
|
|
|
params->gridDim.x = params->blockDim.x = 0;
|
|
|
|
|
params->func = NULL;
|
|
|
|
|
|
|
|
|
|
// Reset launch mode to GROUP if changed
|
|
|
|
|
if (comm->launchMode == ncclComm::GROUP_GRAPH) comm->launchMode = ncclComm::GROUP;
|
|
|
|
|
comm->usingCudaGraph = 0;
|
|
|
|
|
|
2018-12-13 15:56:12 -08:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
/* Enqueueing system : computation of kernel and proxy operations parameters */
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
2021-07-08 14:12:04 -07:00
|
|
|
// Report whether CollNet can handle this operation's data type / reduction.
//   info               : operation description (comm, op, datatype are read)
//   collNetTypeSupport : receives 0 when the communicator has no CollNet
//                        support, otherwise the answer of collNetReduceSupport
static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) {
  if (!(info->comm->collNetSupport > 0)) {
    *collNetTypeSupport = 0;
    return ncclSuccess;
  }

  // Translate ncclAvg and PreMulSum: ncclAvg and any op value at or beyond
  // ncclNumOps are queried as ncclSum.
  ncclRedOp_t netOp = (info->op == ncclAvg || info->op >= ncclNumOps) ? ncclSum : info->op;
  NCCLCHECK(collNetReduceSupport(info->datatype, netOp, collNetTypeSupport));
  return ncclSuccess;
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency.
|
2021-07-08 14:12:04 -07:00
|
|
|
static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) {
|
2019-11-19 14:57:39 -08:00
|
|
|
struct ncclComm* comm = info->comm;
|
2021-09-08 13:56:25 -07:00
|
|
|
if (comm->nRanks == 1) {
|
|
|
|
|
info->algorithm = NCCL_ALGO_RING;
|
|
|
|
|
info->protocol = NCCL_PROTO_SIMPLE;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
|
|
|
|
|
// Find algorithm / protocol.
|
|
|
|
|
info->algorithm = -1;
|
|
|
|
|
info->protocol = -1;
|
|
|
|
|
int nAlgos = NCCL_NUM_ALGORITHMS;
|
|
|
|
|
for (int a=0; a<nAlgos; a++) {
|
|
|
|
|
if (a == NCCL_ALGO_COLLNET && collNetTypeSupport != 1) continue;
|
|
|
|
|
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
|
|
|
|
float time;
|
|
|
|
|
NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, numPipeOps, &time));
|
|
|
|
|
if (time >= 0 && time < minTime) {
|
|
|
|
|
info->algorithm = a;
|
|
|
|
|
info->protocol = p;
|
|
|
|
|
minTime = time;
|
|
|
|
|
}
|
2019-11-19 14:57:39 -08:00
|
|
|
}
|
|
|
|
|
}
|
2021-09-08 13:56:25 -07:00
|
|
|
if (info->algorithm == -1 || info->protocol == -1) {
|
|
|
|
|
WARN("Error : no algorithm/protocol available");
|
|
|
|
|
return ncclInternalError;
|
|
|
|
|
}
|
|
|
|
|
//if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
|
|
|
|
|
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
|
2018-12-13 15:56:12 -08:00
|
|
|
}
|
2019-11-19 14:57:39 -08:00
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
int nc = (info->nChannels > 0) ? info->nChannels : comm->nChannels;
|
2020-01-16 16:02:42 -08:00
|
|
|
int nt = comm->maxThreads[info->algorithm][info->protocol];
|
2019-11-19 14:57:39 -08:00
|
|
|
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
|
2021-04-12 16:00:11 -07:00
|
|
|
if (info->algorithm == NCCL_ALGO_COLLNET) {
|
2022-01-07 06:39:55 -08:00
|
|
|
// CollNet channel tuning
|
2021-04-12 16:00:11 -07:00
|
|
|
int ncSwitch = 16;
|
|
|
|
|
bool flag = true;
|
|
|
|
|
while (ncSwitch >= 1 && flag) {
|
|
|
|
|
while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collTree.nHeads*threadThreshold) && nc > ncSwitch) {
|
|
|
|
|
if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2;
|
|
|
|
|
nc--;
|
|
|
|
|
}
|
|
|
|
|
ncSwitch /= 2;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
2022-01-07 06:39:55 -08:00
|
|
|
// Ring/Tree channel tuning
|
2021-04-12 16:00:11 -07:00
|
|
|
while (info->nBytes < nc*nt*threadThreshold) {
|
|
|
|
|
if (nc >= 2) nc--;
|
|
|
|
|
else if ((nt % 128) == 0) nt/=2;
|
|
|
|
|
else break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (info->protocol == NCCL_PROTO_SIMPLE) {
|
|
|
|
|
nt += WARP_SIZE; // Extra warp for sync
|
2022-01-07 06:39:55 -08:00
|
|
|
// More threads or sync warps needed due to split thread model
|
2021-07-08 14:12:04 -07:00
|
|
|
if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
|
2021-04-12 16:00:11 -07:00
|
|
|
if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE;
|
2019-11-19 14:57:39 -08:00
|
|
|
}
|
|
|
|
|
info->nChannels = nc;
|
|
|
|
|
info->nThreads = nt;
|
|
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Derive the communication pattern (info->pattern) from the collective and
// the algorithm already chosen in info->algorithm.
// Returns ncclInternalError for a collective with no known pattern.
static ncclResult_t getPatternInfo(struct ncclInfo* info) {
  const bool useTree = (info->algorithm == NCCL_ALGO_TREE);

  switch (info->coll) {
    case ncclFuncBroadcast:
      if (useTree) info->pattern = ncclPatternTreeDown;
      else         info->pattern = ncclPatternPipelineFrom;
      break;
    case ncclFuncReduce:
      if (useTree) info->pattern = ncclPatternTreeUp;
      else         info->pattern = ncclPatternPipelineTo;
      break;
    case ncclFuncReduceScatter:
    case ncclFuncAllGather:
      // Both collectives use the ring pattern regardless of algorithm.
      info->pattern = ncclPatternRing;
      break;
    case ncclFuncAllReduce:
      if (info->algorithm == NCCL_ALGO_COLLNET) info->pattern = ncclPatternCollTreeUpDown;
      else if (useTree)                         info->pattern = ncclPatternTreeUpDown;
      else                                      info->pattern = ncclPatternRingTwice;
      break;
    default:
      WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
      return ncclInternalError;
  }
  return ncclSuccess;
}
|
|
|
|
|
|
|
|
|
|
// Fill in info->nstepsPerLoop and info->nchunksPerLoop for the pattern
// previously stored in info->pattern.
// Returns ncclInternalError for an unrecognized pattern.
static ncclResult_t getLoopInfo(struct ncclInfo* info) {
  switch (info->pattern) {
    // Tree and pipeline patterns: one step and one chunk per loop.
    case ncclPatternTreeUp:
    case ncclPatternTreeDown:
    case ncclPatternTreeUpDown:
    case ncclPatternPipelineFrom:
    case ncclPatternPipelineTo:
      info->nchunksPerLoop = 1;
      info->nstepsPerLoop = 1;
      break;
    // CollNet: one step, one chunk per head of channel 0's collTree.
    case ncclPatternCollTreeUpDown:
      info->nstepsPerLoop = 1;
      info->nchunksPerLoop = info->comm->channels[0].collTree.nHeads;
      break;
    // Ring: nRanks chunks, nRanks-1 steps for a single pass.
    case ncclPatternRing:
      info->nstepsPerLoop = info->comm->nRanks-1;
      info->nchunksPerLoop = info->comm->nRanks;
      break;
    // Ring traversed twice: twice the steps, same chunk count.
    case ncclPatternRingTwice:
      info->nstepsPerLoop = 2*(info->comm->nRanks-1);
      info->nchunksPerLoop = info->comm->nRanks;
      break;
    default:
      WARN("Unknown pattern %d", info->pattern);
      return ncclInternalError;
  }
  return ncclSuccess;
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) {
|
2021-07-08 14:12:04 -07:00
|
|
|
int collNetTypeSupport = 0;
|
2022-01-07 06:39:55 -08:00
|
|
|
// Check whether algo and proto have been preset (as in aggregation case)
|
|
|
|
|
// If so, skip the calculation
|
2021-07-08 14:12:04 -07:00
|
|
|
if (info->nChannels > 0 && info->nThreads > 0) goto comp_next;
|
|
|
|
|
NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport));
|
|
|
|
|
NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1));
|
|
|
|
|
|
|
|
|
|
comp_next:
|
2018-12-13 15:56:12 -08:00
|
|
|
// Set nstepsPerLoop and nchunksPerLoop
|
|
|
|
|
NCCLCHECK(getPatternInfo(info));
|
|
|
|
|
NCCLCHECK(getLoopInfo(info));
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
work->header.type = ncclWorkTypeColl;
|
2020-09-04 14:35:05 -07:00
|
|
|
work->sendbuff = info->sendbuff;
|
|
|
|
|
work->recvbuff = info->recvbuff;
|
2022-01-07 06:39:55 -08:00
|
|
|
work->root = info->root;
|
|
|
|
|
work->count = info->count;
|
|
|
|
|
work->nChannels = info->nChannels;
|
|
|
|
|
work->header.nWarps = info->nThreads / WARP_SIZE;
|
|
|
|
|
work->redOpArg = info->opFull.scalarArg;
|
2021-09-08 13:56:25 -07:00
|
|
|
work->redOpArgIsPtr = info->opFull.scalarArgIsPtr;
|
|
|
|
|
|
|
|
|
|
if (info->comm->nRanks == 1) {
|
|
|
|
|
// one-rank reduce index
|
2022-01-07 06:39:55 -08:00
|
|
|
work->header.funcIndex = 1 + int(info->datatype);
|
2021-09-08 13:56:25 -07:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
2018-12-13 15:56:12 -08:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
work->header.funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
|
2018-12-13 15:56:12 -08:00
|
|
|
|
2020-05-12 14:40:18 -07:00
|
|
|
int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
|
2019-11-19 14:57:39 -08:00
|
|
|
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
|
|
|
|
|
int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
|
2018-12-13 15:56:12 -08:00
|
|
|
int chunkSize = stepSize*chunkSteps;
|
|
|
|
|
|
|
|
|
|
// Compute lastChunkSize
|
2019-11-19 14:57:39 -08:00
|
|
|
if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
|
2018-12-13 15:56:12 -08:00
|
|
|
if (info->pattern == ncclPatternTreeUpDown) {
|
|
|
|
|
// Optimize chunkSize / nSteps
|
2020-09-04 14:35:05 -07:00
|
|
|
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
|
|
|
|
|
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
|
|
|
|
|
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
|
2018-12-13 15:56:12 -08:00
|
|
|
}
|
|
|
|
|
// Use lastChunkSize as chunkSize
|
2022-01-07 06:39:55 -08:00
|
|
|
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
2020-01-16 16:02:42 -08:00
|
|
|
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
|
|
|
|
|
// Optimize chunkSize / nSteps
|
2021-07-08 14:12:04 -07:00
|
|
|
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*64 && chunkSize > 131072) chunkSize /= 2;
|
|
|
|
|
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 65536) chunkSize /= 2;
|
2021-04-12 16:00:11 -07:00
|
|
|
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2;
|
2020-01-16 16:02:42 -08:00
|
|
|
// Use lastChunkSize as chunkSize
|
2022-01-07 06:39:55 -08:00
|
|
|
work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
|
2021-09-08 13:56:25 -07:00
|
|
|
// Set direct direction for broadcast-gather (read or write)
|
|
|
|
|
work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ;
|
2019-11-19 14:57:39 -08:00
|
|
|
} else if (info->protocol == NCCL_PROTO_LL) {
|
2020-05-12 14:40:18 -07:00
|
|
|
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
|
2019-11-19 14:57:39 -08:00
|
|
|
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
|
2022-01-07 06:39:55 -08:00
|
|
|
work->lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
|
|
|
|
|
ALIGN_SIZE(work->lastChunkSize, info->nThreads*sizeof(uint64_t));
|
|
|
|
|
work->lastChunkSize /= ncclTypeSize(info->datatype);
|
2019-11-19 14:57:39 -08:00
|
|
|
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
|
2020-05-12 14:40:18 -07:00
|
|
|
int nNodes = info->comm->nNodes;
|
|
|
|
|
float ppn = info->comm->nRanks / (float)nNodes;
|
|
|
|
|
float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn;
|
|
|
|
|
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
|
|
|
|
|
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
|
2019-11-19 14:57:39 -08:00
|
|
|
// Use lastChunkSize as chunkSize
|
2022-01-07 06:39:55 -08:00
|
|
|
work->lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
|
2018-12-13 15:56:12 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Compute nSteps for proxies
|
2019-11-19 14:57:39 -08:00
|
|
|
int chunkEffectiveSize = chunkSize;
|
|
|
|
|
if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
|
|
|
|
|
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
|
|
|
|
|
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
|
|
|
|
|
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
|
2022-01-07 06:39:55 -08:00
|
|
|
proxyOp->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
|
|
|
|
|
proxyOp->sliceSteps = sliceSteps;
|
|
|
|
|
proxyOp->chunkSteps = chunkSteps;
|
|
|
|
|
proxyOp->chunkSize = chunkSize;
|
|
|
|
|
proxyOp->protocol = info->protocol;
|
|
|
|
|
proxyOp->dtype = info->datatype;
|
|
|
|
|
proxyOp->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet
|
2021-09-08 13:56:25 -07:00
|
|
|
info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
|
2021-07-08 14:12:04 -07:00
|
|
|
info->op;
|
2022-01-07 06:39:55 -08:00
|
|
|
proxyOp->pattern = info->pattern;
|
|
|
|
|
proxyOp->root = info->root;
|
2020-09-04 14:35:05 -07:00
|
|
|
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
|
|
|
|
|
// because some protocols need to transmit more than the total size, plus they sometimes
|
|
|
|
|
// round up
|
2022-01-07 06:39:55 -08:00
|
|
|
proxyOp->nbytes = stepSize*proxyOp->sliceSteps;
|
2020-09-04 14:35:05 -07:00
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
TRACE(NCCL_COLL,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d chunksize %d comm %p",
|
2022-01-07 06:39:55 -08:00
|
|
|
proxyOp->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
|
|
|
|
|
nLoops, proxyOp->nsteps, chunkSize, info->comm);
|
2018-12-13 15:56:12 -08:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2020-05-12 14:40:18 -07:00
|
|
|
// Records the stream of the first operation seen on the communicator and
// checks that every later operation uses that same stream.
//
// Returns ncclInvalidUsage (after a warning) when info->stream differs from
// the stream already stored in info->comm->userStream, ncclSuccess otherwise.
static ncclResult_t checkSetStream(struct ncclInfo* info) {
  if (!info->comm->userStreamSet) {
    // No stream recorded yet: adopt the one carried by this operation.
    info->comm->userStream = info->stream;
    info->comm->userStreamSet = true;
    return ncclSuccess;
  }

  if (info->stream == info->comm->userStream) return ncclSuccess;

  WARN("Error : mixing different streams within a group call is not supported.");
  return ncclInvalidUsage;
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Handle structure for user buffer registration (IPC) exchange
|
2021-09-08 13:56:25 -07:00
|
|
|
struct ncclBuffRegHandle {
|
|
|
|
|
cudaIpcMemHandle_t sendBuffIpc;
|
|
|
|
|
cudaIpcMemHandle_t recvBuffIpc;
|
|
|
|
|
ssize_t sendBuffOffset;
|
|
|
|
|
ssize_t recvBuffOffset;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Register input and output buffers
|
|
|
|
|
// Exchange with ranks on the same host
|
|
|
|
|
static ncclResult_t ncclRegBuffAndExchange(struct ncclInfo* info, struct ncclBuffRegInfo* regInfo) {
|
|
|
|
|
ncclComm_t comm = info->comm;
|
|
|
|
|
if (comm->localRanks == 1) return ncclSuccess;
|
|
|
|
|
if (comm->pfnCuMemGetAddressRange == NULL) return ncclSuccess; // CUDA toolkit or driver version too old
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
ncclResult_t ret = ncclSuccess;
|
|
|
|
|
struct ncclBuffRegHandle regHandles[NCCL_MAX_LOCAL_RANKS];
|
2021-09-08 13:56:25 -07:00
|
|
|
// Get IPC handles
|
|
|
|
|
// Note: the handle only corresponds to the base address of the allocation
|
2022-01-07 06:39:55 -08:00
|
|
|
CUDACHECKGOTO(cudaIpcGetMemHandle(®Handles[comm->localRank].sendBuffIpc, (void*)info->sendbuff), ret, reg_fallback);
|
|
|
|
|
CUDACHECKGOTO(cudaIpcGetMemHandle(®Handles[comm->localRank].recvBuffIpc, (void*)info->recvbuff), ret, reg_fallback);
|
2021-09-08 13:56:25 -07:00
|
|
|
// Get offset of user buffer within allocation
|
|
|
|
|
void* baseAddr;
|
|
|
|
|
size_t size;
|
2022-01-07 06:39:55 -08:00
|
|
|
// Get base address
|
2021-09-08 13:56:25 -07:00
|
|
|
CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->sendbuff));
|
2022-01-07 06:39:55 -08:00
|
|
|
regHandles[comm->localRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr;
|
2021-09-08 13:56:25 -07:00
|
|
|
CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->recvbuff));
|
2022-01-07 06:39:55 -08:00
|
|
|
regHandles[comm->localRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr;
|
|
|
|
|
TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->localRank].recvBuffOffset);
|
2021-09-08 13:56:25 -07:00
|
|
|
|
|
|
|
|
// Exchange handles within node
|
2022-01-07 06:39:55 -08:00
|
|
|
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle)));
|
2021-09-08 13:56:25 -07:00
|
|
|
// Open handles at local process
|
|
|
|
|
for (int i=0; i<comm->localRanks; i++) {
|
2022-01-07 06:39:55 -08:00
|
|
|
// Skip myself
|
|
|
|
|
if (i == comm->localRank) {
|
2021-09-08 13:56:25 -07:00
|
|
|
regInfo->sendbuffsBase[i] = regInfo->recvbuffsBase[i] = NULL;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
// Get base address of mapping
|
2021-09-08 13:56:25 -07:00
|
|
|
CUDACHECK(cudaIpcOpenMemHandle(regInfo->sendbuffsBase+i, regHandles[i].sendBuffIpc, cudaIpcMemLazyEnablePeerAccess));
|
|
|
|
|
CUDACHECK(cudaIpcOpenMemHandle(regInfo->recvbuffsBase+i, regHandles[i].recvBuffIpc, cudaIpcMemLazyEnablePeerAccess));
|
2022-01-07 06:39:55 -08:00
|
|
|
// Get real buffer address by adding offset in the mapping
|
2021-09-08 13:56:25 -07:00
|
|
|
regInfo->sendbuffs[i] = (char*)regInfo->sendbuffsBase[i] + regHandles[i].sendBuffOffset;
|
|
|
|
|
regInfo->recvbuffs[i] = (char*)regInfo->recvbuffsBase[i] + regHandles[i].recvBuffOffset;
|
|
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
// Marks the operation as being buffer registered
|
2021-09-08 13:56:25 -07:00
|
|
|
regInfo->nBuffs = comm->localRanks;
|
|
|
|
|
TRACE(NCCL_COLL, "Rank %d exchanged %d buffers", comm->rank, regInfo->nBuffs);
|
|
|
|
|
return ncclSuccess;
|
2022-01-07 06:39:55 -08:00
|
|
|
|
|
|
|
|
reg_fallback:
|
|
|
|
|
// If we cannot register specific buffer types, we just bypass this stage, and continue without failing
|
|
|
|
|
(void)ret;
|
|
|
|
|
WARN("Unable to register user buffers");
|
|
|
|
|
return ncclSuccess;
|
2021-09-08 13:56:25 -07:00
|
|
|
}
|
|
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
// Compute enqueue element, save it in list
|
|
|
|
|
// Compute CUDA launch parameters
|
|
|
|
|
// Capture time code in view of CUDA graph
|
|
|
|
|
static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
|
|
|
|
|
ncclComm_t comm = info->comm;
|
2021-09-08 13:56:25 -07:00
|
|
|
if (comm->nRanks == 1 &&
|
|
|
|
|
// User-defined reduction ops may need alter the data even for unitary reductions
|
|
|
|
|
info->op < ncclNumOps) {
|
2018-12-13 15:56:12 -08:00
|
|
|
if (info->sendbuff != info->recvbuff)
|
|
|
|
|
CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream));
|
|
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
// Compute cuda kernel arg and proxy arg templates
|
|
|
|
|
struct ncclQueueElem* eqElem;
|
2021-07-08 14:12:04 -07:00
|
|
|
NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem));
|
2022-01-07 06:39:55 -08:00
|
|
|
struct ncclWork* work = &eqElem->work;
|
|
|
|
|
NCCLCHECK(computeColl(info, work->elems, &eqElem->proxyOp));
|
2018-12-13 15:56:12 -08:00
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
// Determine grid size
|
|
|
|
|
struct cudaLaunchParams* params = comm->myParams;
|
|
|
|
|
params->gridDim.x += info->nChannels;
|
|
|
|
|
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
|
|
|
|
|
params->blockDim.x = std::max<unsigned>(params->blockDim.x, info->nThreads);
|
|
|
|
|
comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here
|
|
|
|
|
|
2021-09-08 13:56:25 -07:00
|
|
|
// Register and exchange input and output buffers
|
|
|
|
|
if (comm->usingCudaGraph && // only in CUDA graph mode
|
|
|
|
|
comm->graphRegister == 1 && // when registration is enabled
|
|
|
|
|
info->algorithm == NCCL_ALGO_COLLNET && // limited to CollNet for now
|
|
|
|
|
comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
|
|
|
|
|
comm->intraRanks == 1) { // only in multi-process mode
|
|
|
|
|
NCCLCHECK(ncclRegBuffAndExchange(info, &eqElem->buffRegInfo));
|
|
|
|
|
comm->enqueueInfo->nRegBuffs += eqElem->buffRegInfo.nBuffs;
|
2022-01-07 06:39:55 -08:00
|
|
|
work->header.type = ncclWorkTypeRegColl;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Inline the first kernel
|
|
|
|
|
if (params->func == NULL) {
|
|
|
|
|
params->func = ncclKerns[work->header.funcIndex];
|
|
|
|
|
if (work->header.type == ncclWorkTypeColl) {
|
|
|
|
|
// Copy the first operation to the inline argument. Type may be set later to
|
|
|
|
|
// ncclWorkTypeUnused if we have more than one coll element.
|
|
|
|
|
memcpy(&comm->args, work->elems, sizeof(struct ncclWorkElem));
|
|
|
|
|
comm->args.bid = 0; // Only inline for channel 0
|
|
|
|
|
comm->args.header.isLast = 1; // I am so far the last element
|
|
|
|
|
}
|
2021-09-08 13:56:25 -07:00
|
|
|
}
|
|
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
2020-05-12 14:40:18 -07:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Find the channel with the least enqueued work (counted in bytes)
|
2021-07-08 14:12:04 -07:00
|
|
|
static inline int findShortestChannel(ncclComm_t comm) {
|
|
|
|
|
size_t minSize = SIZE_MAX;
|
|
|
|
|
int minC = 0;
|
|
|
|
|
for (int c=0; c<comm->nChannels; c++) {
|
|
|
|
|
struct ncclChannel* channel = comm->channels+c;
|
|
|
|
|
if (channel->totalSize < minSize) {
|
|
|
|
|
minSize = channel->totalSize;
|
|
|
|
|
minC = c;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return minC;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Get next channel based on shortest-queue mode or round-robin mode
|
2021-09-08 13:56:25 -07:00
|
|
|
static inline int getNextChannel(ncclComm_t comm, int aggMode) {
|
|
|
|
|
int nextChannel = 0;
|
|
|
|
|
if (aggMode && comm->asyncAllocMode == ncclComm::SHORTEST_QUEUE) {
|
|
|
|
|
nextChannel = findShortestChannel(comm);
|
2021-07-08 14:12:04 -07:00
|
|
|
} else {
|
2021-09-08 13:56:25 -07:00
|
|
|
nextChannel = comm->lastChannel % comm->nChannels;
|
2021-07-08 14:12:04 -07:00
|
|
|
comm->lastChannel++;
|
|
|
|
|
}
|
2021-09-08 13:56:25 -07:00
|
|
|
return nextChannel;
|
2020-09-04 14:35:05 -07:00
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Setup aggregated kernels
|
|
|
|
|
// Op info has been previously saved in comm->asyncOps
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
|
2020-09-04 14:35:05 -07:00
|
|
|
if (comm->asyncOpCount == 0) {
|
|
|
|
|
return ncclSuccess;
|
|
|
|
|
} else if (comm->asyncOpCount == 1) {
|
|
|
|
|
// No aggregation
|
|
|
|
|
struct ncclInfo* info = comm->asyncOps;
|
|
|
|
|
info->nChannels = 0;
|
2021-04-12 16:00:11 -07:00
|
|
|
NCCLCHECK(ncclSetupCollKernel(info));
|
2020-09-04 14:35:05 -07:00
|
|
|
} else {
|
|
|
|
|
// Aggregation
|
2022-01-07 06:39:55 -08:00
|
|
|
// Determine a per-channel chunk size used to divide an operation into multiple channels
|
2021-07-08 14:12:04 -07:00
|
|
|
size_t channelSize;
|
|
|
|
|
if (comm->channelSize > 0) {
|
2022-01-07 06:39:55 -08:00
|
|
|
// Set by user
|
2021-07-08 14:12:04 -07:00
|
|
|
channelSize = comm->channelSize;
|
|
|
|
|
} else if (comm->collNetSupport && comm->asyncOps[0].coll == ncclFuncAllReduce) {
|
2022-01-07 06:39:55 -08:00
|
|
|
// CollNet specific size (tuned based on experiments)
|
2021-07-08 14:12:04 -07:00
|
|
|
channelSize = 256 * 1024;
|
|
|
|
|
} else {
|
2022-01-07 06:39:55 -08:00
|
|
|
// Latency increases as scale increases
|
|
|
|
|
// We would thus want to increase the chunk size to compensate for the lost efficiency
|
|
|
|
|
channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks);
|
2021-07-08 14:12:04 -07:00
|
|
|
}
|
2020-09-04 14:35:05 -07:00
|
|
|
// Reduce the per-channel size if we cannot fully utilize the channels
|
|
|
|
|
while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2;
|
2022-01-07 06:39:55 -08:00
|
|
|
// Check whether the ops have same reduce and data types (and hence can be packed in same ncclWork)
|
2021-04-12 16:00:11 -07:00
|
|
|
int channelUsed = 0;
|
2021-08-31 14:33:48 -07:00
|
|
|
int homogeneous = 1;
|
2021-07-08 14:12:04 -07:00
|
|
|
int allCollNetSupport = comm->collNetSupport;
|
2020-09-04 14:35:05 -07:00
|
|
|
for (int c = 0; c < comm->asyncOpCount; c++) {
|
|
|
|
|
struct ncclInfo* info = comm->asyncOps+c;
|
2021-07-08 14:12:04 -07:00
|
|
|
info->nChannels = std::min(std::max(1, (int)DIVUP(info->nBytes, channelSize)), comm->nChannels); // assign number of channels
|
2021-04-12 16:00:11 -07:00
|
|
|
channelUsed += info->nChannels;
|
2021-07-08 14:12:04 -07:00
|
|
|
// We can use fast path if all collectives are the same
|
2021-08-31 14:33:48 -07:00
|
|
|
homogeneous &= info->coll == comm->asyncOps[0].coll &&
|
2021-09-08 13:56:25 -07:00
|
|
|
info->opFull.op == comm->asyncOps[0].opFull.op &&
|
2021-08-31 14:33:48 -07:00
|
|
|
info->datatype == comm->asyncOps[0].datatype;
|
|
|
|
|
if (allCollNetSupport > 0) NCCLCHECK(getCollNetSupport(info, &allCollNetSupport));
|
2021-07-08 14:12:04 -07:00
|
|
|
}
|
|
|
|
|
// Compute algo, proto, nthreads for the entire kernel
|
2022-01-07 06:39:55 -08:00
|
|
|
// Prepare a synthetic op info to calculate the collective algo
|
2021-07-08 14:12:04 -07:00
|
|
|
struct ncclInfo total;
|
|
|
|
|
total.comm = comm;
|
2021-08-31 14:33:48 -07:00
|
|
|
total.coll = comm->asyncOps[0].coll;
|
2021-07-08 14:12:04 -07:00
|
|
|
total.nBytes = comm->asyncTotalSize;
|
|
|
|
|
total.nChannels = std::min(channelUsed, comm->nChannels);
|
|
|
|
|
int perChannelOps = DIVUP(channelUsed, total.nChannels);
|
2021-08-31 14:33:48 -07:00
|
|
|
if (homogeneous) NCCLCHECK(getAlgoInfo(&total, allCollNetSupport, perChannelOps));
|
2022-01-07 06:39:55 -08:00
|
|
|
// Set for each op
|
2021-07-08 14:12:04 -07:00
|
|
|
for (int c = 0; c < comm->asyncOpCount; c++) {
|
|
|
|
|
struct ncclInfo* info = comm->asyncOps+c;
|
2021-08-31 14:33:48 -07:00
|
|
|
if (homogeneous) {
|
2022-01-07 06:39:55 -08:00
|
|
|
// Set fields to skip the individual computeColl in ncclSetupCollKernel
|
2021-07-08 14:12:04 -07:00
|
|
|
info->algorithm = total.algorithm;
|
|
|
|
|
info->protocol = total.protocol;
|
|
|
|
|
info->nThreads = total.nThreads;
|
|
|
|
|
}
|
2021-04-12 16:00:11 -07:00
|
|
|
NCCLCHECK(ncclSetupCollKernel(info));
|
2020-09-04 14:35:05 -07:00
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
comm->args.header.type = ncclWorkTypeUnused; // disable inline argument
|
2020-09-04 14:35:05 -07:00
|
|
|
}
|
|
|
|
|
// Reset counters
|
|
|
|
|
comm->asyncOpCount = 0;
|
|
|
|
|
comm->asyncTotalSize = 0;
|
|
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Store aggregated operations info
|
2020-09-04 14:35:05 -07:00
|
|
|
static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) {
|
|
|
|
|
ncclComm_t comm = info->comm;
|
|
|
|
|
if (comm->asyncOpCount >= NCCL_MAX_OPS) {
|
|
|
|
|
WARN("Too many async operations in progress, max is %d", NCCL_MAX_OPS);
|
|
|
|
|
return ncclInvalidUsage;
|
2018-12-13 15:56:12 -08:00
|
|
|
}
|
2020-09-04 14:35:05 -07:00
|
|
|
memcpy(comm->asyncOps+comm->asyncOpCount, info, sizeof(struct ncclInfo));
|
|
|
|
|
comm->asyncOpCount++;
|
|
|
|
|
comm->asyncTotalSize += info->nBytes;
|
2018-12-13 15:56:12 -08:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-04 14:35:05 -07:00
|
|
|
// Save p2p operations in comm->p2pSends and p2pRecvs. Operations will be posted to channels
|
2020-05-12 14:40:18 -07:00
|
|
|
// during ncclGroupEnd()
|
2020-09-04 14:35:05 -07:00
|
|
|
static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
|
2020-05-12 14:40:18 -07:00
|
|
|
struct ncclComm* comm = info->comm;
|
|
|
|
|
int peer = info->root;
|
|
|
|
|
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
|
2022-01-07 06:39:55 -08:00
|
|
|
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
|
|
|
|
|
int peerNode = comm->rankToNode[peer];
|
|
|
|
|
int peerIndex = comm->rankToLocalRank[peer];
|
|
|
|
|
int nsteps = comm->maxLocalRanks;
|
|
|
|
|
int rankIndex = comm->rankToLocalRank[comm->rank];
|
|
|
|
|
if (info->coll == ncclFuncSend) {
|
2020-05-12 14:40:18 -07:00
|
|
|
if (peer != comm->rank) {
|
2022-01-07 06:39:55 -08:00
|
|
|
int step = (nsteps + peerIndex - rankIndex)%nsteps;
|
|
|
|
|
int delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
|
|
|
|
|
if (comm->nNodes == 1) delta = (comm->nRanks + peer - comm->rank) % comm->nRanks;
|
|
|
|
|
// Mark channels that need pre-connect
|
2020-05-12 14:40:18 -07:00
|
|
|
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
2022-01-07 06:39:55 -08:00
|
|
|
int shuffle = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
|
|
|
|
|
int channelId = (shuffle+comm->p2pChannels[c]) % comm->p2pnChannels;
|
|
|
|
|
if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
|
2020-09-04 14:35:05 -07:00
|
|
|
comm->connectSend[peer] |= (1<<channelId);
|
|
|
|
|
comm->connect = 1;
|
2020-05-12 14:40:18 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], info->recvbuff, nBytes));
|
2020-09-04 14:35:05 -07:00
|
|
|
comm->p2pSendCount++;
|
2020-05-12 14:40:18 -07:00
|
|
|
} else {
|
|
|
|
|
if (peer != comm->rank) {
|
2022-01-07 06:39:55 -08:00
|
|
|
int step = (nsteps + rankIndex - peerIndex)%nsteps;
|
|
|
|
|
int delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
|
|
|
|
|
if (comm->nNodes == 1) delta = (comm->nRanks - peer + comm->rank) % comm->nRanks;
|
|
|
|
|
// Mark channels that need pre-connect
|
2020-05-12 14:40:18 -07:00
|
|
|
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
2022-01-07 06:39:55 -08:00
|
|
|
int shuffle = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
|
|
|
|
|
int channelId = (shuffle+comm->p2pChannels[c]) % comm->p2pnChannels;
|
|
|
|
|
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
|
2020-09-04 14:35:05 -07:00
|
|
|
comm->connectRecv[peer] |= (1<<channelId);
|
|
|
|
|
comm->connect = 1;
|
2020-05-12 14:40:18 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2021-07-08 14:12:04 -07:00
|
|
|
NCCLCHECK(ncclSaveP2pInfo(comm->p2pRecvs[info->root], info->recvbuff, nBytes));
|
2020-09-04 14:35:05 -07:00
|
|
|
comm->p2pRecvCount++;
|
2020-05-12 14:40:18 -07:00
|
|
|
}
|
|
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
2018-12-13 15:56:12 -08:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
static int getSegment(enum ncclWorkElemType type, enum ncclWorkElemSubType subType, int peer, struct ncclWork* work) {
|
|
|
|
|
if (work->header.type && (work->header.type != type)) return -1;
|
2021-09-08 13:56:25 -07:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
if (type == ncclWorkTypeP2p) { // P2P
|
|
|
|
|
int start = subType == ncclWorkSubTypeRecv ? 0 : 1;
|
|
|
|
|
for (int s=start; s<NCCL_MAX_WORK_ELEMENTS_P2P; s+=2) {
|
|
|
|
|
if (work->p2pElems[s].peer == -1) return s;
|
|
|
|
|
// Do not aggregate multiple sends to the same peer (or receives from the same peer)
|
|
|
|
|
if (work->p2pElems[s].peer == peer) return -1;
|
2021-07-08 14:12:04 -07:00
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
} else if (type == ncclWorkTypeRegColl) { // CollNet
|
|
|
|
|
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS_REG; s++) {
|
|
|
|
|
if (work->regElems[s].elem.header.type == ncclWorkTypeUnused) return s;
|
2021-09-08 13:56:25 -07:00
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
} else if (type == ncclWorkTypeColl) { // Ring or Tree
|
2021-07-08 14:12:04 -07:00
|
|
|
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
|
2022-01-07 06:39:55 -08:00
|
|
|
if (work->elems[s].header.type == ncclWorkTypeUnused) return s;
|
2021-07-08 14:12:04 -07:00
|
|
|
}
|
2020-09-04 14:35:05 -07:00
|
|
|
}
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Compute kernel arguments for P2P ops
|
|
|
|
|
static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElemP2p* elem /* output */) {
|
|
|
|
|
elem->header.type = ncclWorkTypeP2p;
|
|
|
|
|
elem->header.funcIndex = FUNC_INDEX_P2P;
|
|
|
|
|
elem->header.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
|
|
|
|
|
elem->buff = info->recvbuff;
|
|
|
|
|
elem->subType = info->coll == ncclFuncSend ? ncclWorkSubTypeSend : ncclWorkSubTypeRecv;
|
|
|
|
|
elem->count = info->count;
|
|
|
|
|
elem->chunkSize = info->chunkSize;
|
|
|
|
|
elem->peer = info->root;
|
2021-04-12 16:00:11 -07:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Equeue work elements into segment of ncclWork
|
|
|
|
|
// Supporting both collectives (aggregated or not) and P2P
|
|
|
|
|
static ncclResult_t enqueueSegOp(enum ncclWorkElemType type, struct ncclWork* elem /* input */, struct ncclWork* work, int s,
|
2021-09-08 13:56:25 -07:00
|
|
|
struct ncclBuffRegInfo* regInfo, struct ncclChannel* channel, struct ncclComm* comm) {
|
2022-01-07 06:39:55 -08:00
|
|
|
|
|
|
|
|
if (type == ncclWorkTypeP2p) {
|
|
|
|
|
memcpy(work->p2pElems+s, elem, sizeof(struct ncclWorkElemP2p));
|
|
|
|
|
int nelems = 0;
|
|
|
|
|
for (int i=0; i<NCCL_MAX_WORK_ELEMENTS_P2P; i++) {
|
|
|
|
|
if (work->p2pElems[i].header.type) nelems = i+1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int ngroups = 1;
|
|
|
|
|
while (ngroups < nelems) ngroups *= 2;
|
|
|
|
|
int nWarps = 1;
|
|
|
|
|
while (nWarps*ngroups <= elem->header.nWarps/2) nWarps *= 2;
|
|
|
|
|
|
|
|
|
|
for (int i=0; i<ngroups; i++) {
|
|
|
|
|
work->p2pElems[i].ngroups = ngroups;
|
|
|
|
|
work->p2pElems[i].warpStart =
|
|
|
|
|
i*(NCCL_MAX_NTHREADS/WARP_SIZE)/ngroups;
|
|
|
|
|
int extraWarp = nWarps >= 2 ? i%2 : 0;
|
|
|
|
|
work->p2pElems[i].nWarps = nWarps + extraWarp;
|
|
|
|
|
}
|
|
|
|
|
return ncclSuccess;
|
2021-07-08 14:12:04 -07:00
|
|
|
}
|
2021-04-12 16:00:11 -07:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem));
|
|
|
|
|
|
|
|
|
|
if (regInfo->nBuffs == 0) return ncclSuccess;
|
|
|
|
|
|
2021-09-08 13:56:25 -07:00
|
|
|
// Copy registered buffer addresses into ncclWork
|
2022-01-07 06:39:55 -08:00
|
|
|
struct ncclWorkElemReg* regElem = (struct ncclWorkElemReg*)(work->elems+s);
|
|
|
|
|
// For CollNet
|
|
|
|
|
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
|
|
|
|
|
int peer = channel->collTree.down[i];
|
|
|
|
|
if (peer == -1) break;
|
|
|
|
|
// Get intra-node slot
|
|
|
|
|
int j = comm->rankToLocalRank[peer];
|
|
|
|
|
if (j < 0) {
|
|
|
|
|
WARN("Invalid intra-node rank %d for peer %d", j, peer);
|
|
|
|
|
return ncclInternalError;
|
2021-09-08 13:56:25 -07:00
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
// Input buffer of leaf peer
|
|
|
|
|
regElem->dnInputs[i] = regInfo->sendbuffs[j];
|
|
|
|
|
// Output buffer of leaf peer
|
|
|
|
|
regElem->dnOutputs[i] = regInfo->recvbuffs[j];
|
|
|
|
|
}
|
|
|
|
|
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
|
|
|
|
|
int peer = channel->collTree.up[i];
|
|
|
|
|
if (peer == -1) break;
|
|
|
|
|
int j = comm->rankToLocalRank[peer];
|
|
|
|
|
if (j < 0) {
|
|
|
|
|
WARN("Invalid intra-node rank %d for peer %d", j, peer);
|
|
|
|
|
return ncclInternalError;
|
2021-09-08 13:56:25 -07:00
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
// Output buffer of root peer
|
|
|
|
|
regElem->upOutputs[i] = regInfo->recvbuffs[j];
|
2021-09-08 13:56:25 -07:00
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
work->elems[s].regUsed = 1;
|
2020-09-04 14:35:05 -07:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Enqueue P2P op
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) {
|
2022-01-07 06:39:55 -08:00
|
|
|
struct ncclWorkElemP2p* workElem = eqElem->work.p2pElems;
|
|
|
|
|
struct ncclProxyOp* proxyOp = &eqElem->proxyOp;
|
2020-09-04 14:35:05 -07:00
|
|
|
|
|
|
|
|
// Try to reuse last p2p operation if not full yet
|
2022-01-07 06:39:55 -08:00
|
|
|
struct ncclChannel* channel = comm->channels+proxyOp->channelId;
|
2020-09-04 14:35:05 -07:00
|
|
|
int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
|
|
|
|
|
struct ncclWork* w = channel->workFifo+opIndex;
|
|
|
|
|
int segment = -1;
|
2021-09-08 13:56:25 -07:00
|
|
|
if (channel->workCount) {
|
2020-09-04 14:35:05 -07:00
|
|
|
// Try to pack more segments into a single operation
|
2022-01-07 06:39:55 -08:00
|
|
|
segment = getSegment(ncclWorkTypeP2p, workElem->subType, workElem->peer, w);
|
2020-09-04 14:35:05 -07:00
|
|
|
}
|
|
|
|
|
if (segment == -1) {
|
|
|
|
|
NCCLCHECK(getNextOp(channel, &w, NULL));
|
2022-01-07 06:39:55 -08:00
|
|
|
segment = workElem->subType == ncclWorkSubTypeRecv ? 0 : 1;
|
|
|
|
|
// Initialize work as P2P, set peer=-1 to designate the p2p elem is not used.
|
|
|
|
|
w->header.type = ncclWorkTypeP2p;
|
|
|
|
|
for (int i=0; i<NCCL_MAX_WORK_ELEMENTS_P2P; i++) w->p2pElems[i].peer = -1;
|
2020-09-04 14:35:05 -07:00
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
//printf("%s to %d -> Channel %d OpCount %ld Segment %d\n", workElem->subType == ncclWorkSubTypeRecv ? "Recv" : "Send", proxyOp->root, channel->id, channel->workFifoTail-1, segment);
|
2020-09-04 14:35:05 -07:00
|
|
|
|
2021-04-12 16:00:11 -07:00
|
|
|
// store work element into FIFO
|
2022-01-07 06:39:55 -08:00
|
|
|
NCCLCHECK(ncclProxySaveP2p(comm, proxyOp));
|
|
|
|
|
NCCLCHECK(enqueueSegOp(ncclWorkTypeP2p, &eqElem->work, w, segment, &eqElem->buffRegInfo, channel, comm));
|
2021-04-12 16:00:11 -07:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Setup P2P op
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
|
|
|
|
|
ncclComm* comm = info->comm;
|
|
|
|
|
// Compute cuda kernel arg and proxy arg templates
|
|
|
|
|
struct ncclQueueElem* eqElem;
|
2021-07-08 14:12:04 -07:00
|
|
|
NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem));
|
2021-04-12 16:00:11 -07:00
|
|
|
// The proxy code will set and tune the send/recv chunk size, make sure to run it first.
|
2022-01-07 06:39:55 -08:00
|
|
|
NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyOp));
|
|
|
|
|
NCCLCHECK(computeP2pWorkElem(info, eqElem->work.p2pElems));
|
|
|
|
|
// Compute grid size
|
2021-04-12 16:00:11 -07:00
|
|
|
int channelId = info->channelId;
|
|
|
|
|
struct cudaLaunchParams* params = comm->myParams;
|
|
|
|
|
params->gridDim.x = std::max<unsigned>(params->gridDim.x, channelId+1);
|
2022-01-07 06:39:55 -08:00
|
|
|
params->blockDim.x = std::max<unsigned>(params->blockDim.x, eqElem->work.header.nWarps*WARP_SIZE);
|
2021-04-12 16:00:11 -07:00
|
|
|
comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here
|
|
|
|
|
|
|
|
|
|
// Record the first kernel to launch
|
|
|
|
|
// Just for CUDA kernel to know this is a P2P operation
|
|
|
|
|
// The CUDA kernel does not use the inlined first work element as fastpath argument
|
|
|
|
|
if (params->func == NULL) {
|
2022-01-07 06:39:55 -08:00
|
|
|
params->func = ncclKerns[eqElem->work.header.funcIndex];
|
|
|
|
|
comm->args.header.type = ncclWorkTypeUnused;
|
2021-04-12 16:00:11 -07:00
|
|
|
}
|
2020-09-04 14:35:05 -07:00
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-08 13:56:25 -07:00
|
|
|
// Dynamic enqueue function for collective kernels
|
|
|
|
|
// Supports both aggregated and non-aggregated modes
|
|
|
|
|
ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem, int aggMode) {
|
2022-01-07 06:39:55 -08:00
|
|
|
struct ncclWork* work = &eqElem->work;
|
|
|
|
|
struct ncclWorkElem* elem = work->elems;
|
|
|
|
|
struct ncclProxyOp* proxyOp = &eqElem->proxyOp;
|
2021-07-08 14:12:04 -07:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
int nChannels = elem->nChannels;
|
|
|
|
|
size_t channelSize = elem->count*ncclTypeSize(proxyOp->dtype)/elem->nChannels;
|
|
|
|
|
enum ncclWorkElemType workElemType = proxyOp->redOp == ncclNumOps ? ncclWorkTypeColl : ncclWorkTypeRegColl; // redOp is only set when using CollNet
|
2021-09-08 13:56:25 -07:00
|
|
|
|
2021-07-08 14:12:04 -07:00
|
|
|
for (int bid=0; bid<nChannels; bid++) {
|
2021-09-08 13:56:25 -07:00
|
|
|
int channelId = getNextChannel(comm, aggMode);
|
2021-07-08 14:12:04 -07:00
|
|
|
struct ncclChannel* channel = comm->channels+channelId;
|
|
|
|
|
|
|
|
|
|
// Proxy
|
2022-01-07 06:39:55 -08:00
|
|
|
proxyOp->channelId = channelId;
|
|
|
|
|
proxyOp->opCount = comm->collOpCount;
|
|
|
|
|
if (proxyOp->nsteps) NCCLCHECK(ncclProxySaveColl(comm, proxyOp, comm->nRanks));
|
2021-07-08 14:12:04 -07:00
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
elem->bid = bid % nChannels;
|
2021-09-08 13:56:25 -07:00
|
|
|
struct ncclWork* w = NULL;
|
2021-07-08 14:12:04 -07:00
|
|
|
int segment = -1;
|
2021-09-08 13:56:25 -07:00
|
|
|
if (aggMode && channel->workCount) {
|
2021-07-08 14:12:04 -07:00
|
|
|
// Try to pack more segments into a single operation
|
2021-09-08 13:56:25 -07:00
|
|
|
int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
|
|
|
|
|
w = channel->workFifo+opIndex;
|
|
|
|
|
// All elems in work must have same (funcIndex,nThreads),
|
|
|
|
|
// see "src/collectives/device/common.h"
|
2022-01-07 06:39:55 -08:00
|
|
|
if (w->header.funcIndex == work->header.funcIndex &&
|
|
|
|
|
w->header.nWarps == work->header.nWarps) {
|
|
|
|
|
segment = getSegment(workElemType, ncclWorkSubTypeUnused, 0, w);
|
2021-09-08 13:56:25 -07:00
|
|
|
}
|
2021-07-08 14:12:04 -07:00
|
|
|
}
|
|
|
|
|
if (segment == -1) {
|
|
|
|
|
NCCLCHECK(getNextOp(channel, &w, NULL));
|
|
|
|
|
segment = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// store work element into FIFO
|
2022-01-07 06:39:55 -08:00
|
|
|
NCCLCHECK(enqueueSegOp(workElemType, work, w, segment, &eqElem->buffRegInfo, channel, comm));
|
2021-07-08 14:12:04 -07:00
|
|
|
channel->totalSize += channelSize;
|
|
|
|
|
}
|
|
|
|
|
comm->collOpCount++;
|
|
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Host setup node for CUDA Graph
|
|
|
|
|
// Performs the enqueue job
|
2021-04-12 16:00:11 -07:00
|
|
|
template<int USING_CUDA_GRAPH>
|
|
|
|
|
void CUDART_CB ncclEnqueueHostSetup(void* arg) {
|
2022-01-07 06:39:55 -08:00
|
|
|
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclResult_t ret;
|
2022-01-07 06:39:55 -08:00
|
|
|
// All work for current launch has been captured in Queue Info
|
2021-04-12 16:00:11 -07:00
|
|
|
struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg;
|
|
|
|
|
ncclComm_t comm = eqInfo->comm;
|
2021-09-08 13:56:25 -07:00
|
|
|
int aggMode = eqInfo->elemList->count() > 1 ? 1 : 0;
|
2021-04-12 16:00:11 -07:00
|
|
|
|
|
|
|
|
// Iterate through the element list
|
2021-07-08 14:12:04 -07:00
|
|
|
struct ncclQueueElem* eqElem = eqInfo->elemList->begin();
|
|
|
|
|
while (eqElem != NULL) {
|
2022-01-07 06:39:55 -08:00
|
|
|
if (eqElem->work.header.funcIndex == FUNC_INDEX_P2P) {
|
2021-04-12 16:00:11 -07:00
|
|
|
NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end);
|
|
|
|
|
} else {
|
2021-09-08 13:56:25 -07:00
|
|
|
NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem, aggMode), ret, cb_end);
|
2021-04-12 16:00:11 -07:00
|
|
|
}
|
2021-07-08 14:12:04 -07:00
|
|
|
eqElem = eqInfo->elemList->getNext();
|
2021-04-12 16:00:11 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
NCCLCHECKGOTO(setupLaunch(eqInfo, USING_CUDA_GRAPH), ret, cb_end);
|
|
|
|
|
NCCLCHECKGOTO(ncclLaunchProxy(eqInfo), ret, cb_end);
|
|
|
|
|
|
|
|
|
|
cb_end:
|
|
|
|
|
if (ret != ncclSuccess) {
|
|
|
|
|
WARN("Failure in host setup : %s", ncclGetErrorString(ret));
|
|
|
|
|
}
|
|
|
|
|
eqInfo->ret = ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template void CUDART_CB ncclEnqueueHostSetup<0>(void*);
|
|
|
|
|
template void CUDART_CB ncclEnqueueHostSetup<1>(void*);
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// CUDA Graph helper thread
|
|
|
|
|
// for de-registering user buffers
|
2021-09-08 13:56:25 -07:00
|
|
|
void* graphHelperFunc(void *args) {
|
|
|
|
|
struct ncclGraphHelperResources* res = (struct ncclGraphHelperResources*)args;
|
|
|
|
|
if (res == NULL) {
|
|
|
|
|
WARN("CUDA Graph helper resource is null");
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
int dev = res->comm->cudaDev;
|
|
|
|
|
CUDACHECKIGNORE(cudaSetDevice(dev));
|
|
|
|
|
INFO(NCCL_COLL, "CUDA Graph helper thread created for device %d", dev);
|
|
|
|
|
|
|
|
|
|
volatile enum helperThreadState* state = &res->threadState;
|
|
|
|
|
volatile int* ipcTail = &res->ipcTail;
|
|
|
|
|
while (1) {
|
2022-01-07 06:39:55 -08:00
|
|
|
// Last IPC entry enqueue so far
|
2021-09-08 13:56:25 -07:00
|
|
|
int ipcTailMark = *ipcTail;
|
|
|
|
|
int ipcCount = 0;
|
2022-01-07 06:39:55 -08:00
|
|
|
// Close IPC till the last entry
|
2021-09-08 13:56:25 -07:00
|
|
|
while (res->ipcHead != ipcTailMark) {
|
|
|
|
|
if (res->ipcBases[res->ipcHead] != NULL)
|
|
|
|
|
CUDACHECKIGNORE(cudaIpcCloseMemHandle(res->ipcBases[res->ipcHead]));
|
|
|
|
|
res->ipcBases[res->ipcHead] = NULL;
|
|
|
|
|
res->ipcHead = (res->ipcHead+1)%NCCL_IPC_POOL_SIZE;
|
|
|
|
|
ipcCount++;
|
|
|
|
|
}
|
|
|
|
|
TRACE(NCCL_COLL, "CUDA Graph helper thread closed %d IPC handles", ipcCount);
|
|
|
|
|
pthread_mutex_lock(&res->threadLock);
|
2022-01-07 06:39:55 -08:00
|
|
|
// Check for exit signal
|
2021-09-08 13:56:25 -07:00
|
|
|
while (res->ipcHead == *ipcTail && *state != ThreadStop) {
|
|
|
|
|
pthread_cond_wait(&res->threadCond, &res->threadLock);
|
|
|
|
|
}
|
|
|
|
|
pthread_mutex_unlock(&res->threadLock);
|
|
|
|
|
if (*state == ThreadStop) {
|
|
|
|
|
INFO(NCCL_COLL, "CUDA Graph helper thread for device %d returning", dev);
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Check if we are in CUDA Graph capture mode
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
|
|
|
|
|
comm->usingCudaGraph = 0;
|
2022-01-07 06:39:55 -08:00
|
|
|
// Feature requires CUDA 11.3/R465 or above
|
2021-04-12 16:00:11 -07:00
|
|
|
#if CUDART_VERSION >= 11030
|
|
|
|
|
cudaStreamCaptureStatus captureStatus;
|
|
|
|
|
unsigned long long cudaGraphId;
|
2022-01-07 06:39:55 -08:00
|
|
|
ncclResult_t ret = ncclSuccess;
|
2021-05-11 18:16:30 -07:00
|
|
|
if (comm->driverVersion < 11030) {
|
2022-01-07 06:39:55 -08:00
|
|
|
// Runtime driver version older than compiler version
|
|
|
|
|
// Enhanced compat fallback
|
|
|
|
|
goto enh_compat_end;
|
2021-05-11 18:16:30 -07:00
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
// Get CUDA Graph handle
|
|
|
|
|
CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
|
2021-04-12 16:00:11 -07:00
|
|
|
if (captureStatus == cudaStreamCaptureStatusActive) {
|
|
|
|
|
if (cudaGraphId != comm->lastCudaGraphId) {
|
|
|
|
|
INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId);
|
|
|
|
|
// We are in a new graph, hence need to forget the last setup node so that
|
|
|
|
|
// the first setup node in the new graph will not have a dependency
|
|
|
|
|
comm->lastCudaGraphId = cudaGraphId;
|
|
|
|
|
comm->lastSetupNode = NULL;
|
|
|
|
|
}
|
|
|
|
|
if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH;
|
|
|
|
|
comm->usingCudaGraph = 1;
|
2021-09-08 13:56:25 -07:00
|
|
|
|
|
|
|
|
// Create helper thread that closes IPC handles during graph destruction
|
|
|
|
|
// Only create this thread when buffer registration is enabled
|
|
|
|
|
if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) {
|
|
|
|
|
pthread_mutex_init(&comm->graphHelperResources->threadLock, NULL);
|
2022-01-07 06:39:55 -08:00
|
|
|
// Init signaling method between Graph destroy function and helper thread
|
2021-09-08 13:56:25 -07:00
|
|
|
pthread_cond_init(&comm->graphHelperResources->threadCond, NULL);
|
2022-01-07 06:39:55 -08:00
|
|
|
// Set state
|
2021-09-08 13:56:25 -07:00
|
|
|
comm->graphHelperResources->threadState = ThreadStart;
|
2022-01-07 06:39:55 -08:00
|
|
|
// Create thread
|
2021-09-08 13:56:25 -07:00
|
|
|
pthread_create(&comm->graphHelperThread, NULL, graphHelperFunc, comm->graphHelperResources);
|
2022-01-07 06:39:55 -08:00
|
|
|
// Name thread
|
|
|
|
|
ncclSetThreadName(comm->graphHelperThread, "NCCL GrHelper%2d", comm->cudaDev);
|
2021-09-08 13:56:25 -07:00
|
|
|
}
|
2021-04-12 16:00:11 -07:00
|
|
|
}
|
2022-01-07 06:39:55 -08:00
|
|
|
return ncclSuccess;
|
|
|
|
|
|
|
|
|
|
enh_compat_end: // Enhanced compat fallback
|
|
|
|
|
(void)ret;
|
|
|
|
|
CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
|
|
|
|
|
if (captureStatus != cudaStreamCaptureStatusNone) {
|
|
|
|
|
WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
|
|
|
|
|
return ncclInvalidUsage;
|
|
|
|
|
}
|
|
|
|
|
// If we are not in capture mode, we can ignore the driver being lower
|
2021-04-12 16:00:11 -07:00
|
|
|
#endif
|
|
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
2022-01-07 06:39:55 -08:00
|
|
|
// Create host setup node in CUDA Graph
|
2021-04-12 16:00:11 -07:00
|
|
|
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) {
|
|
|
|
|
#if CUDART_VERSION >= 11030
|
|
|
|
|
struct ncclQueueInfo* eqInfo = comm->enqueueInfo;
|
|
|
|
|
// Create a CUDA object to wrap around the argument space
|
|
|
|
|
// which CUDA graph would manage lifetime of
|
|
|
|
|
cudaUserObject_t object;
|
|
|
|
|
CUDACHECK(cudaUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, cudaUserObjectNoDestructorSync));
|
2022-01-07 06:39:55 -08:00
|
|
|
// Hand over ownership to CUDA Graph
|
2021-04-12 16:00:11 -07:00
|
|
|
CUDACHECK(cudaGraphRetainUserObject(graph, object, 1, cudaGraphUserObjectMove));
|
|
|
|
|
|
|
|
|
|
cudaHostFn_t fn = ncclEnqueueHostSetup<1>;
|
|
|
|
|
// Add a CPU node to the graph
|
|
|
|
|
cudaGraphNode_t setupNode;
|
2022-01-07 06:39:55 -08:00
|
|
|
// Function + parameter space for that function (i.e. enqueue info)
|
2021-04-12 16:00:11 -07:00
|
|
|
cudaHostNodeParams setupNodeParams = {fn, eqInfo};
|
|
|
|
|
int numDependencies = comm->lastSetupNode == NULL ? 0 : 1;
|
|
|
|
|
CUDACHECK(cudaGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams));
|
2022-01-07 06:39:55 -08:00
|
|
|
// Create dependency from last setup node in the same graph
|
2021-04-12 16:00:11 -07:00
|
|
|
CUDACHECK(cudaStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, cudaStreamAddCaptureDependencies));
|
|
|
|
|
comm->lastSetupNode = setupNode;
|
|
|
|
|
return ncclSuccess;
|
|
|
|
|
#else
|
|
|
|
|
WARN("NCCL does not support this CUDA version for CUDA graph feature");
|
|
|
|
|
return ncclInternalError;
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-08 13:56:25 -07:00
|
|
|
// Translate a host-side reduction operator into its device-side description.
//
//  opFull   : output; receives the device op and, where applicable, its scalar.
//  op       : builtin operator, or a handle to a user-created one.
//  datatype : element type of the operation.
//  comm     : communicator (supplies nRanks and the user-op table).
//
// Builtin sum/prod/max/min map one-to-one. ncclAvg becomes "sum then divide by
// nRanks" for integer types and "pre-multiply by 1/nRanks, then sum" for
// floating-point types, with the scalar stored as the raw bits of `datatype`
// in the low bytes of scalarArg. Any other value is treated as a user-created
// operator and copied from comm->userRedOps.
//
// Returns ncclInvalidArgument when a user-created operator was built for a
// different datatype, ncclSuccess otherwise.
static ncclResult_t hostToDevRedOp(
    ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm
  ) {
  // Scratch storage used to obtain the bit pattern of the scalar as a uint64_t
  union {
    int8_t i8;
    uint8_t u8;
    int32_t i32;
    uint32_t u32;
    int64_t i64;
    uint64_t u64;
    half f16;
    #if defined(__CUDA_BF16_TYPES_EXIST__)
      __nv_bfloat16 bf16;
    #endif
    float f32;
    double f64;
    void *ptr;
  } scalar;
  scalar.u64 = 0;
  opFull->scalarArgIsPtr = false;

  const int opCode = int(op);

  // Builtins that need no scalar
  if (opCode == int(ncclSum))  { opFull->op = ncclDevSum;  return ncclSuccess; }
  if (opCode == int(ncclProd)) { opFull->op = ncclDevProd; return ncclSuccess; }
  if (opCode == int(ncclMax))  { opFull->op = ncclDevMax;  return ncclSuccess; }
  if (opCode == int(ncclMin))  { opFull->op = ncclDevMin;  return ncclSuccess; }

  if (opCode == int(ncclAvg)) {
    switch (int(datatype)) {
    case ncclInt8: case ncclUint8:
    case ncclInt32: case ncclUint32:
    case ncclInt64: case ncclUint64:
      // Integers: sum first, divide by the rank count afterwards
      opFull->op = ncclDevSumPostDiv;
      scalar.u64 = comm->nRanks;
      break;
    case ncclFloat16:
      opFull->op = ncclDevPreMulSum;
      scalar.f16 = __float2half(float(1.0/comm->nRanks)); // __double2half not supported pre CUDA 11.x
      break;
    #if defined(__CUDA_BF16_TYPES_EXIST__)
    case ncclBfloat16:
      opFull->op = ncclDevPreMulSum;
      scalar.bf16 = __float2bfloat16(float(1.0/comm->nRanks));
      break;
    #endif
    case ncclFloat32:
      opFull->op = ncclDevPreMulSum;
      scalar.f32 = float(1.0/comm->nRanks);
      break;
    case ncclFloat64:
      opFull->op = ncclDevPreMulSum;
      scalar.f64 = 1.0/comm->nRanks;
      break;
    }
    // The scalar travels by value, as raw bits
    opFull->scalarArgIsPtr = false;
    opFull->scalarArg = scalar.u64;
    return ncclSuccess;
  }

  // Anything else is a user-created operator: look it up in the communicator
  const int slot = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps);
  ncclUserRedOp *entry = &comm->userRedOps[slot];
  if (datatype != entry->datatype) {
    WARN("Data type supplied to user-created ncclRedOp_t does not match type "
         "given to reduction operation");
    return ncclInvalidArgument;
  }
  *opFull = entry->opFull;
  return ncclSuccess;
}
|
|
|
|
|
|
2018-12-13 15:56:12 -08:00
|
|
|
// Validate an operation described by `info`, then either record it (when
// ncclAsyncMode() reports async mode) or launch it immediately.
//
// Common steps: pointer/argument checks, conversion of the reduction operator
// into info->opFull, stream check and an INFO trace.
//  - Async mode: the comm is registered through ncclAsyncColl() and the
//    operation is saved through ncclSaveP2p() (send/recv) or
//    ncclSaveAsyncColl() (everything else). When comm->checkPointers is set,
//    the current CUDA device is switched to the comm's device for the checks
//    and restored on exit. The result is also reported to ncclAsyncErrCheck().
//  - Otherwise: CUDA Graph capture is detected, the kernel is set up, host
//    setup runs either as a graph host node or inline, and the launch
//    barrier / kernel / event recording / reset sequence is executed.
//
// Returns the first error encountered, or ncclSuccess.
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
  ncclResult_t ret = ncclSuccess;
  const bool isAsync = ncclAsyncMode();
  int savedDev = -1;

  // Check arguments
  NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
  if (isAsync && info->comm->checkPointers) {
    CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
    CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end);
  }
  NCCLCHECKGOTO(ArgsCheck(info), ret, end);

  // Copy reduction op state from op handle into info struct here since the
  // op handle may be destroyed before ncclGroupEnd().
  NCCLCHECKGOTO(hostToDevRedOp(&info->opFull, info->op, info->datatype, info->comm), ret, end);

  if (isAsync) {
    // Always register comm even in case of error to make sure ncclGroupEnd
    // cleans it up.
    NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
  }

  // Shared by both modes: stream check followed by the trace line
  NCCLCHECKGOTO(checkSetStream(info), ret, end);

  INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
      info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
      info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);

  if (isAsync) {
    // Record now, launch later; p2p operations are stored separately
    const bool isP2p = (info->coll == ncclFuncSend || info->coll == ncclFuncRecv);
    if (isP2p) {
      NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
    } else {
      NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
    }
  } else {
    // Check whether we are in cuda graph mode
    cudaGraph_t graph;
    ncclComm_t comm = info->comm;
    NCCLCHECKGOTO(ncclGetCudaGraph(comm, &graph), ret, end);

    // Common part between graph mode and non-graph mode
    NCCLCHECKGOTO(ncclSetupCollKernel(info), ret, end);

    // Host setup: inline when not capturing, as a graph host node otherwise
    if (!comm->usingCudaGraph) {
      ncclEnqueueHostSetup<0>(comm->enqueueInfo);
      // The inline callback reports its result through the queue info
      NCCLCHECKGOTO(comm->enqueueInfo->ret, ret, end);
    } else {
      NCCLCHECKGOTO(ncclCudaGraphHostSetup(comm, graph), ret, end);
    }

    // Common part between graph mode and non-graph mode
    NCCLCHECKGOTO(ncclLaunchBarrier(comm), ret, end);
    NCCLCHECKGOTO(ncclLaunchKernel(comm), ret, end);
    NCCLCHECKGOTO(ncclRecordEvents(comm), ret, end);
    NCCLCHECKGOTO(ncclLaunchReset(comm), ret, end);
  }

end:
  if (isAsync) {
    // Restore the caller's device if it was switched for the pointer checks
    if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
    ncclAsyncErrCheck(ret);
  }
  return ret;
}
|
|
|
|
|
|
|
|
|
|
NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
|
|
|
|
|
ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) {
|
|
|
|
|
if (comm->userRedOpFreeHead == comm->userRedOpCapacity) {
|
|
|
|
|
// double capacity and resize
|
|
|
|
|
int cap = 2*comm->userRedOpCapacity;
|
|
|
|
|
if (cap < 4) cap = 4;
|
|
|
|
|
ncclUserRedOp *ops = new ncclUserRedOp[cap];
|
|
|
|
|
std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
|
|
|
|
|
for(int ix=comm->userRedOpCapacity; ix < cap; ix++)
|
|
|
|
|
ops[ix].freeNext = ix + 1;
|
|
|
|
|
delete[] comm->userRedOps;
|
|
|
|
|
comm->userRedOps = ops;
|
|
|
|
|
comm->userRedOpCapacity = cap;
|
|
|
|
|
}
|
|
|
|
|
// pop from free list
|
|
|
|
|
int ix = comm->userRedOpFreeHead;
|
|
|
|
|
ncclUserRedOp *user = &comm->userRedOps[ix];
|
|
|
|
|
comm->userRedOpFreeHead = user->freeNext;
|
|
|
|
|
|
|
|
|
|
user->freeNext = -1; // allocated
|
|
|
|
|
user->datatype = datatype;
|
|
|
|
|
user->opFull.op = ncclDevPreMulSum;
|
|
|
|
|
if (residence == ncclScalarHostImmediate) {
|
|
|
|
|
user->opFull.scalarArgIsPtr = false;
|
|
|
|
|
std::memcpy(&user->opFull.scalarArg, scalar, ncclTypeSize(datatype));
|
|
|
|
|
} else {
|
|
|
|
|
user->opFull.scalarArgIsPtr = true;
|
|
|
|
|
user->opFull.scalarArg = reinterpret_cast<uint64_t>(scalar);
|
2018-12-13 15:56:12 -08:00
|
|
|
}
|
2021-09-08 13:56:25 -07:00
|
|
|
*op = ncclRedOp_t(int(ncclNumOps) + ix);
|
|
|
|
|
*op = ncclUserRedOpMangle(comm, *op);
|
|
|
|
|
return ncclSuccess;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm);
|
|
|
|
|
ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) {
|
|
|
|
|
if (0 <= int(op) && int(op) < int(ncclNumOps)) {
|
|
|
|
|
WARN("ncclRedOpDestroy : operator is a NCCL builtin.");
|
|
|
|
|
return ncclInvalidArgument;
|
|
|
|
|
}
|
|
|
|
|
if (int(op) < 0 || int(ncclMaxRedOp) < int(op)) {
|
|
|
|
|
WARN("ncclRedOpDestroy : operator is garbage.");
|
|
|
|
|
return ncclInvalidArgument;
|
|
|
|
|
}
|
|
|
|
|
int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps);
|
|
|
|
|
if (comm->userRedOpCapacity <= ix || comm->userRedOps[ix].freeNext != -1) {
|
|
|
|
|
WARN("ncclRedOpDestroy : operator unknown to this communicator.");
|
|
|
|
|
return ncclInvalidArgument;
|
|
|
|
|
}
|
|
|
|
|
// push to free list
|
|
|
|
|
comm->userRedOps[ix].freeNext = comm->userRedOpFreeHead;
|
|
|
|
|
comm->userRedOpFreeHead = ix;
|
|
|
|
|
return ncclSuccess;
|
2018-12-13 15:56:12 -08:00
|
|
|
}
|