Add support for CUDA graphs.
Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439.
Fix bootstrap issue caused by connection reordering.
Fix CPU locking block.
Improve CollNet algorithm.
Improve performance on DGX A100 for communicators with only one GPU per node.
This commit is contained in:
Sylvain Jeaugey
2021-04-12 16:00:11 -07:00
parent 911d61f214
commit a46ea10583
43 changed files with 2687 additions and 1244 deletions
+74 -7
View File
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,15 +11,82 @@
#include "group.h"
#include "collectives.h"
// Prototypes for the enqueue / launch entry points. Only declarations appear
// here; the definitions live outside this header. Every function that can
// fail reports its status through the ncclResult_t return value.
size_t ncclKernMaxLocalSize();
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
ncclResult_t ncclSaveKernel(struct ncclInfo* info);
ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
ncclResult_t ncclLaunchKernel(ncclComm_t comm);
ncclResult_t ncclRecordEvents(struct ncclComm* comm);
ncclResult_t ncclLaunchReset(ncclComm_t comm);
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
// Host-side setup callback, compiled in two flavours selected by the
// USING_CUDA_GRAPH template parameter. It takes an opaque void* argument.
// NOTE(review): the argument is presumably a struct ncclQueueInfo* -- confirm
// against the definition.
template<int USING_CUDA_GRAPH>
void CUDART_CB ncclEnqueueHostSetup(void* arg);
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph);
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph);
// Enqueue information (for kernel and proxy) for each operation.
// Elements form a singly linked list through `next`; see
// struct ncclQueueElemList for the list anchors.
// ncclAddQueueElem zeroes `work` and `proxyArgs` when it recycles an element
// but deliberately leaves `next` untouched so the chain stays intact.
struct ncclQueueElem {
// Work element of the operation
struct ncclWorkElem work;
// Proxy arguments of the operation
struct ncclProxyArgs proxyArgs;
// Next element in the list, or NULL if none has been allocated yet
struct ncclQueueElem* next;
};
// Store enqueue elements in a list.
// The elements are kept across resets so they can be reused:
// ncclResetQueueInfo only rewinds `tail` back to `head`.
struct ncclQueueElemList {
// First allocated element; ncclDestroyQueueInfo frees the chain from here
struct ncclQueueElem* head;
// Next element ncclAddQueueElem will hand out (not the last one handed out);
// NULL until the first element has been allocated
struct ncclQueueElem* tail;
};
// Structure passed to CUDA graph.
// Released as a whole (struct plus all list elements) by ncclDestroyQueueInfo.
struct ncclQueueInfo {
// Communicator this queue belongs to
ncclComm_t comm;
// Set back to 0 by ncclResetQueueInfo
int maxChannels; // Dynamic version of gridDim
// Set back to ncclSuccess by ncclResetQueueInfo
ncclResult_t ret; // Return value of host setup call
// Reusable list of per-operation elements
struct ncclQueueElemList elemList;
};
// Get next element from enqueue list, growing the list on demand.
//
//   eqInfo  - queue info owning the element list; must not be NULL.
//   elemOut - receives the element the caller should fill in; must not be NULL.
//
// Returns ncclInternalError if either argument is NULL, propagates any
// allocation failure reported by ncclCalloc (through NCCLCHECK), and returns
// ncclSuccess otherwise.
//
// `tail` always designates the element to hand out next. After handing it
// out, the function makes sure a successor exists and advances `tail` to it,
// so elements left over from a previous round (see ncclResetQueueInfo) are
// reused instead of reallocated.
static ncclResult_t ncclAddQueueElem(struct ncclQueueInfo* eqInfo, struct ncclQueueElem** elemOut) {
  if (eqInfo == NULL || elemOut == NULL) return ncclInternalError;
  struct ncclQueueElemList* list = &eqInfo->elemList;
  if (list->tail != NULL) {
    // Recycled element: wipe the payload but keep `next`, which links to the
    // rest of the already-allocated chain. Each member is cleared with its
    // own size so that alignment padding between `work` and `proxyArgs`
    // cannot cause the end of `proxyArgs` to be left stale.
    struct ncclQueueElem* elem = list->tail;
    memset(&elem->work, 0, sizeof(elem->work));
    memset(&elem->proxyArgs, 0, sizeof(elem->proxyArgs));
    *elemOut = elem;
  } else {
    // Very first element of this queue: it becomes both head and tail.
    NCCLCHECK(ncclCalloc(&list->tail, 1));
    *elemOut = list->tail;
    list->head = list->tail;
  }
  // Make sure a successor exists before advancing, so `tail` never ends up
  // NULL once the list has been started.
  if (list->tail->next == NULL) {
    NCCLCHECK(ncclCalloc(&list->tail->next, 1));
  }
  list->tail = list->tail->next;
  return ncclSuccess;
}
// Reset element queue: rewind the list cursor to the first element and clear
// the per-round state (channel count and host-setup result). Allocated
// elements are kept for reuse.
// Returns ncclInternalError when eqInfo is NULL, ncclSuccess otherwise.
static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
  if (eqInfo != NULL) {
    struct ncclQueueElemList* queue = &eqInfo->elemList;
    queue->tail = queue->head;
    eqInfo->ret = ncclSuccess;
    eqInfo->maxChannels = 0;
    return ncclSuccess;
  }
  return ncclInternalError;
}
// Destroy enqueue info space: free every element of the list, then the
// queue info structure itself. A NULL pointer is accepted and ignored.
// Used by both CUDA graph and non CUDA graph; the argument is an untyped
// pointer to a struct ncclQueueInfo.
static void ncclDestroyQueueInfo(void* ptr) {
  struct ncclQueueInfo* info = (struct ncclQueueInfo*)ptr;
  if (info == NULL) return;
  for (struct ncclQueueElem* cur = info->elemList.head; cur != NULL; ) {
    // Read the link before releasing the node that holds it.
    struct ncclQueueElem* following = cur->next;
    free(cur);
    cur = following;
  }
  free(info);
}
#endif // End include guard