2.9.6-1

Add support for CUDA graphs. Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439. Fix bootstrap issue caused by connection reordering. Fix CPU locking block. Improve CollNet algorithm. Improve performance on DGX A100 for communicators with only one GPU per node.
2021-04-12 16:00:11 -07:00
parent 911d61f214
commit a46ea10583
43 changed files with 2687 additions and 1244 deletions
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -11,15 +11,82 @@
 #include "group.h"
 #include "collectives.h"

+size_t ncclKernMaxLocalSize();
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);  // isLast: non-const pointer, presumably an out-parameter — confirm in the definition
 ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
 ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
-ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
-ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
-ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
-ncclResult_t ncclSaveKernel(struct ncclInfo* info);
-ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
-ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
+ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
+ncclResult_t ncclLaunchKernel(ncclComm_t comm);
+ncclResult_t ncclRecordEvents(struct ncclComm* comm);
+ncclResult_t ncclLaunchReset(ncclComm_t comm);
+ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
+ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
+template<int USING_CUDA_GRAPH>  // compile-time int parameter; presumably selects CUDA-graph vs. regular behavior — confirm in the definition
+void CUDART_CB ncclEnqueueHostSetup(void* arg);  // callback-style signature; NOTE(review): arg is presumably a struct ncclQueueInfo* — confirm
+ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph);  // graph: non-const pointer, presumably an out-parameter — confirm
+ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph);

+// Enqueue information (for kernel and proxy) for each operation; one node of a singly linked list
+struct ncclQueueElem {
+  struct ncclWorkElem work;  // cleared by ncclAddQueueElem each time the node is handed out again
+  struct ncclProxyArgs proxyArgs;  // cleared together with work when the node is recycled
+  struct ncclQueueElem* next;  // following node; not cleared when the node is recycled, so the chain survives reuse
+};
+
+// Store enqueue elements in a list (singly linked through ncclQueueElem::next)
+struct ncclQueueElemList {
+  struct ncclQueueElem* head;  // first allocated node; starting point for ncclResetQueueInfo and ncclDestroyQueueInfo
+  struct ncclQueueElem* tail;  // next node to hand out (not the last node in use); NULL is treated as the empty state
+};
+
+// Structure passed to CUDA graph (also used without CUDA graphs, per the note on ncclDestroyQueueInfo)
+struct ncclQueueInfo {
+  ncclComm_t comm;  // communicator handle; NOTE(review): presumably the communicator this queue belongs to — confirm
+  int maxChannels;    // Dynamic version of gridDim; set back to 0 by ncclResetQueueInfo
+  ncclResult_t ret;   // Return value of host setup call; set back to ncclSuccess by ncclResetQueueInfo
+  struct ncclQueueElemList elemList;  // recyclable per-operation elements; its nodes are freed by ncclDestroyQueueInfo
+};
+
+// Get next element from enqueue list, growing the list when needed.
+//
+// eqInfo  : queue whose element list is consumed; must not be NULL.
+// elemOut : receives the element the caller should fill in; must not be NULL.
+// Returns ncclInternalError on a NULL argument, whatever NCCLCHECK propagates
+// when an allocation fails, and ncclSuccess otherwise.
+//
+// Nodes are recycled: once ncclResetQueueInfo() has rewound tail to head, the
+// nodes already linked in the list are cleared and handed out again instead
+// of being reallocated. A spare node is always linked behind the one that is
+// returned, so tail is never NULL after the first successful call.
+static ncclResult_t ncclAddQueueElem(struct ncclQueueInfo* eqInfo, struct ncclQueueElem** elemOut) {
+  if (eqInfo == NULL || elemOut == NULL) return ncclInternalError;
+  struct ncclQueueElemList* list = &eqInfo->elemList;
+  if (list->tail != NULL) {
+    struct ncclQueueElem* elem = list->tail;
+    // Clear the payload member by member and leave `next` alone. A single
+    // memset sized as sizeof(work) + sizeof(proxyArgs) would stop short of the
+    // end of proxyArgs if the compiler inserted padding between the members.
+    memset(&elem->work, 0, sizeof(elem->work));
+    memset(&elem->proxyArgs, 0, sizeof(elem->proxyArgs));
+    *elemOut = elem;
+  } else {
+    // Empty list: the first node becomes both the head and the element returned.
+    NCCLCHECK(ncclCalloc(&list->tail, 1));
+    *elemOut = list->tail;
+    list->head = list->tail;
+  }
+  // Make sure there is a node to advance to before moving tail.
+  if (list->tail->next == NULL) {
+    NCCLCHECK(ncclCalloc(&list->tail->next, 1));
+  }
+  list->tail = list->tail->next;
+  return ncclSuccess;
+}
+
+// Rewind the element queue so that its nodes get handed out again.
+// Returns ncclInternalError when eqInfo is NULL, ncclSuccess otherwise.
+// No node is freed here: only the tail cursor moves back to the head, and the
+// per-launch fields (maxChannels, ret) return to their initial values.
+static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
+  if (eqInfo != NULL) {
+    struct ncclQueueElemList* list = &eqInfo->elemList;
+    list->tail = list->head;
+    eqInfo->ret = ncclSuccess;
+    eqInfo->maxChannels = 0;
+    return ncclSuccess;
+  }
+  return ncclInternalError;
+}
+
+// Release an enqueue info object together with every node of its list.
+// Used by both the CUDA graph and the non CUDA graph paths.
+// ptr is a struct ncclQueueInfo* passed as void*; NULL is accepted and ignored.
+static void ncclDestroyQueueInfo(void* ptr) {
+  struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr;
+  if (eqInfo == NULL) return;
+  for (struct ncclQueueElem* cur = eqInfo->elemList.head; cur != NULL; ) {
+    // Read the link before the node that holds it is released.
+    struct ncclQueueElem* following = cur->next;
+    free(cur);
+    cur = following;
+  }
+  free(eqInfo);
+}
 #endif // End include guard