RCCL 2.4 update
Этот коммит содержится в:
+56
-57
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -12,34 +13,33 @@
|
||||
|
||||
// Only generate inline kernels for LL
|
||||
#define NCCL_FUNC5(coll, op, dtype) \
|
||||
(void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
|
||||
(void*)NCCL_KERN_NAME(coll##LL, op, dtype)
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype), \
|
||||
NCCL_KERN_NAME(coll##LL, op, dtype)
|
||||
|
||||
#define NCCL_FUNC4(coll, op, dtype) \
|
||||
(void*)NCCL_FUNC5(coll##Ring, op, dtype), \
|
||||
(void*)NCCL_FUNC5(coll##Tree, op, dtype)
|
||||
NCCL_FUNC5(coll##Ring, op, dtype)
|
||||
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(coll, op) \
|
||||
(void*)NCCL_FUNC4(coll, op, i8), \
|
||||
(void*)NCCL_FUNC4(coll, op, u8), \
|
||||
(void*)NCCL_FUNC4(coll, op, i32), \
|
||||
(void*)NCCL_FUNC4(coll, op, u32), \
|
||||
(void*)NCCL_FUNC4(coll, op, i64), \
|
||||
(void*)NCCL_FUNC4(coll, op, u64), \
|
||||
(void*)NCCL_FUNC4(coll, op, f16), \
|
||||
(void*)NCCL_FUNC4(coll, op, f32), \
|
||||
(void*)NCCL_FUNC4(coll, op, f64)
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, u8), \
|
||||
NCCL_FUNC4(coll, op, i32), \
|
||||
NCCL_FUNC4(coll, op, u32), \
|
||||
NCCL_FUNC4(coll, op, i64), \
|
||||
NCCL_FUNC4(coll, op, u64), \
|
||||
NCCL_FUNC4(coll, op, f16), \
|
||||
NCCL_FUNC4(coll, op, f32), \
|
||||
NCCL_FUNC4(coll, op, f64)
|
||||
#define NCCL_FUNCS3B(coll, op) \
|
||||
(void*)NCCL_FUNC4(coll, op, i8), \
|
||||
(void*)NCCL_FUNC4(coll, op, i8), \
|
||||
(void*)NCCL_FUNC4(coll, op, i8), \
|
||||
(void*)NCCL_FUNC4(coll, op, i8), \
|
||||
(void*)NCCL_FUNC4(coll, op, i8), \
|
||||
(void*)NCCL_FUNC4(coll, op, i8), \
|
||||
(void*)NCCL_FUNC4(coll, op, i8), \
|
||||
(void*)NCCL_FUNC4(coll, op, i8), \
|
||||
(void*)NCCL_FUNC4(coll, op, i8)
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8), \
|
||||
NCCL_FUNC4(coll, op, i8)
|
||||
|
||||
// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums.
|
||||
#define NCCL_FUNCS2A(coll) \
|
||||
@@ -53,8 +53,9 @@
|
||||
NCCL_FUNCS3B(coll, copy), \
|
||||
NCCL_FUNCS3B(coll, copy)
|
||||
|
||||
typedef void(*ncclKern_t)(struct ncclColl);
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
|
||||
static ncclKern_t const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
|
||||
NCCL_FUNCS2B(ncclBroadcast),
|
||||
NCCL_FUNCS2A(ncclReduce),
|
||||
NCCL_FUNCS2B(ncclAllGather),
|
||||
@@ -66,33 +67,31 @@ static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
|
||||
/* Launch system : synchronization and CUDA kernel launch */
|
||||
/*****************************************************************************/
|
||||
|
||||
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
|
||||
#if CUDART_VERSION >= 9000
|
||||
ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
|
||||
if (cgMode & 0x01) {
|
||||
CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
|
||||
CUDACHECK(hipExtLaunchMultiKernelMultiDevice(paramsList, numDevices,
|
||||
// These flags are to reduce the latency of using this API
|
||||
cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
|
||||
0));
|
||||
return ncclSuccess;
|
||||
}
|
||||
#endif
|
||||
int savedDev;
|
||||
CUDACHECK(cudaGetDevice(&savedDev));
|
||||
CUDACHECK(hipGetDevice(&savedDev));
|
||||
for (int i = 0; i < numDevices; i++) {
|
||||
struct cudaLaunchParams* params = paramsList+i;
|
||||
CUDACHECK(cudaSetDevice(cudaDevs[i]));
|
||||
CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
|
||||
hipLaunchParams* params = paramsList+i;
|
||||
CUDACHECK(hipSetDevice(cudaDevs[i]));
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
|
||||
}
|
||||
CUDACHECK(cudaSetDevice(savedDev));
|
||||
CUDACHECK(hipSetDevice(savedDev));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
|
||||
ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
|
||||
params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
|
||||
|
||||
// Set active = 2 for the last operation
|
||||
for (int r=0; r<params->gridDim.x; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2;
|
||||
STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
|
||||
}
|
||||
|
||||
// Find the first operation, choose the kernel accordingly and pass it
|
||||
@@ -100,15 +99,15 @@ ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params)
|
||||
struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
|
||||
memcpy(&comm->args, coll, sizeof(struct ncclColl));
|
||||
// As we pass that coll directly, we can free it immediately.
|
||||
coll->active = 0;
|
||||
STORE(&coll->active, 0);
|
||||
|
||||
params->func = ncclKerns[coll->funcIndex];
|
||||
params->func = (void *)ncclKerns[coll->funcIndex];
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
|
||||
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
|
||||
int val = *ptr;
|
||||
int val = LOAD(ptr);
|
||||
bool done = false;
|
||||
while (done == false) {
|
||||
if (val >= comm->intraRanks) {
|
||||
@@ -130,7 +129,7 @@ ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
|
||||
|
||||
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
|
||||
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
|
||||
int val = *ptr;
|
||||
int val = LOAD(ptr);
|
||||
if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
|
||||
WARN("Trying to launch too many collectives");
|
||||
return ncclInternalError;
|
||||
@@ -140,28 +139,28 @@ ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
|
||||
|
||||
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
|
||||
volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
|
||||
while (*ptr < comm->intraRanks) pthread_yield();
|
||||
while (LOAD(ptr) < comm->intraRanks) pthread_yield();
|
||||
comm->intraPhase ^= 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
|
||||
if (comm->nRanks == 1) return ncclSuccess;
|
||||
struct cudaLaunchParams* params = comm->myParams;
|
||||
hipLaunchParams* params = comm->myParams;
|
||||
|
||||
NCCLCHECK(setupLaunch(comm, params));
|
||||
|
||||
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
|
||||
if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
|
||||
// Enqueue event in user stream
|
||||
CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
|
||||
CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream));
|
||||
// Create dependency between user stream and internal NCCL stream
|
||||
CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
|
||||
CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
|
||||
params->stream = comm->groupStream;
|
||||
} else {
|
||||
if (comm->userStream != params->stream) {
|
||||
// Stream changed from last call, create dependency against last NCCL kernel launch
|
||||
CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
|
||||
CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
|
||||
}
|
||||
params->stream = comm->userStream;
|
||||
}
|
||||
@@ -192,12 +191,12 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
||||
|
||||
NCCLCHECK(ncclCpuBarrierOut(comm));
|
||||
|
||||
struct cudaLaunchParams *params = comm->myParams;
|
||||
hipLaunchParams *params = comm->myParams;
|
||||
if (comm->launchMode == ncclComm::PARALLEL) {
|
||||
CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
|
||||
hipLaunchKernelGGL(((void (*)(struct ncclColl))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclColl **)(params->args)));
|
||||
}
|
||||
// Start the network proxies as soon as the kernel has been launched. We can't
|
||||
// perform any CUDA call between the two or having a cudaFree between the CUDA
|
||||
// perform any CUDA call between the two or having a hipFree between the CUDA
|
||||
// launch and the transportStartProxy call could cause a deadlock.
|
||||
// Also, starting the proxies after the CUDA launch seems to be better for
|
||||
// performance (latency).
|
||||
@@ -212,13 +211,13 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
|
||||
struct cudaLaunchParams *params = comm->myParams;
|
||||
hipLaunchParams *params = comm->myParams;
|
||||
// Enqueue event after NCCL kernel
|
||||
CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
|
||||
CUDACHECK(hipEventRecord(comm->doneEvent, params->stream));
|
||||
// Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
|
||||
if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
|
||||
// Create dependency between NCCL internal stream and user stream
|
||||
CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
|
||||
CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
|
||||
}
|
||||
comm->userStreamSet = false;
|
||||
return ncclSuccess;
|
||||
@@ -292,7 +291,7 @@ static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* n
|
||||
} else {
|
||||
*llMode = 0;
|
||||
*nChannels = info->comm->nChannels;
|
||||
*nThreads = info->comm->nThreads+1;
|
||||
*nThreads = info->comm->nThreads;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -356,7 +355,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
|
||||
static ncclResult_t saveKernel(struct ncclInfo* info) {
|
||||
if (info->comm->nRanks == 1) {
|
||||
if (info->sendbuff != info->recvbuff)
|
||||
CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream));
|
||||
CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -390,12 +389,12 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
|
||||
int opIndex = channel->collFifoTail;
|
||||
struct ncclColl* c = channel->collectives+opIndex;
|
||||
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
|
||||
while (activePtr[0] != 0) sched_yield();
|
||||
while (LOAD(activePtr) != 0) sched_yield();
|
||||
|
||||
memcpy(c, &coll, sizeof(struct ncclColl));
|
||||
|
||||
c->args.bid = bid;
|
||||
c->active = 1;
|
||||
STORE(&c->active, 1);
|
||||
opIndex = (opIndex+1)%NCCL_MAX_OPS;
|
||||
c->nextIndex = opIndex;
|
||||
channel->collFifoTail = opIndex;
|
||||
@@ -418,8 +417,8 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
|
||||
ncclResult_t ret = ncclSuccess;
|
||||
int savedDev = -1;
|
||||
if (info->comm->checkPointers) {
|
||||
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
|
||||
CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end);
|
||||
CUDACHECKGOTO(hipGetDevice(&savedDev), ret, end);
|
||||
CUDACHECKGOTO(hipSetDevice(info->comm->cudaDev), ret, end);
|
||||
}
|
||||
// Check arguments
|
||||
NCCLCHECKGOTO(ArgsCheck(info), ret, end);
|
||||
@@ -428,7 +427,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
|
||||
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
|
||||
NCCLCHECKGOTO(saveKernel(info), ret, end);
|
||||
end:
|
||||
if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
|
||||
if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
|
||||
ncclAsyncErrCheck(ret);
|
||||
return ret;
|
||||
} else {
|
||||
|
||||
Ссылка в новой задаче
Block a user