Merge remote-tracking branch 'nccl/master' into develop

2021-04-30 16:57:36 -07:00
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -52,11 +52,16 @@ extern struct allocationTracker allocTracker[];

 template <typename T>
 static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
+  // Need async stream for P2P pre-connect + CUDA Graph
+  hipStream_t stream;
+  CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
  if (isFineGrain)
    CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
  else
    CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
-  CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
+  CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
+  CUDACHECK(hipStreamSynchronize(stream));
+  CUDACHECK(hipStreamDestroy(stream));
  int dev;
  CUDACHECK(hipGetDevice(&dev));
  if (dev < MAX_ALLOC_TRACK_NGPU) {
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -14,8 +14,8 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
 ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState, int* rootPid); // [RCCL] Adding rootPid
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
-ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
-ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
+ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
+ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
 ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr);
 ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
 ncclResult_t bootstrapClose(void* commState);
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -28,6 +28,15 @@
    }                                                       \
 } while(false)

+// Report failure but clear error and continue
+#define CUDACHECKIGNORE(cmd) do {  \
+    hipError_t err = cmd;         \
+    if( err != hipSuccess ) {     \
+        INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, hipGetErrorString(err)); \
+        (void) hipGetLastError(); \
+    }                              \
+} while(false)
+
 #include <errno.h>
 // Check system calls
 #define SYSCHECK(call, name) do { \
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -81,4 +81,5 @@ DECL_ALL
 #define REDUCE_SLICESTEPS 1
 #define REDUCE_CHUNKSTEPS 1
 #define SENDRECV_SLICEFACTOR 1
+
 #endif
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -15,6 +15,9 @@
 // [/RCCL]

 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  typedef void *cudaGraph_t;
+  typedef void *cudaGraphNode_t;
+  #define HIPRT_CB
 #else
 #if CUDART_VERSION < 9000
 struct cudaLaunchParams {
@@ -84,16 +87,17 @@ struct ncclComm {
  int nNodes;
  int localRanks;

-  enum { GROUP, PARALLEL } launchMode;
+  enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
  hipStream_t userStream;
  bool userStreamSet;
  hipEvent_t doneEvent;
+  hipEvent_t intDoneEvent;
  bool checkPointers;

-  // Counter to make sure collectives match (needed for bcast/reduce
-  // where syncs are not symmetric).
+  // Counter for tracking CUDA launches (P2P and collectives included)
  uint64_t opCount;
-  uint64_t lastOpCount;
+  // Collective operation counter
+  uint64_t collOpCount;

  // Channels for collectives
  int nChannels;
@@ -101,8 +105,6 @@ struct ncclComm {
  int p2pnChannels;
  int p2pnChannelsPerPeer;
  int p2pChannels[MAXCHANNELS];
-  //Channels for collnet
-  int collNetnChannels;

  // Buffer sizes
  int buffSizes[NCCL_NUM_PROTOCOLS];
@@ -157,6 +159,7 @@ struct ncclComm {
  struct ncclInfo* asyncOps;
  int asyncOpCount;
  size_t asyncTotalSize;
+  int lastChannel;

  //list of async p2p operation queued in a group semantics
  struct ncclP2Plist* p2pSends;
@@ -169,6 +172,11 @@ struct ncclComm {
  int rootPid;                     // Process ID of root
  // [/RCCL]

+  // Store info for cudaGraph
+  int usingCudaGraph; // Only use it during capture time, not launch time
+  struct ncclQueueInfo* enqueueInfo;
+  cudaGraphNode_t lastSetupNode;
+  unsigned long long lastCudaGraphId;
 };

 #endif
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
@@ -125,8 +125,9 @@ struct ncclConnInfo {
 struct ncclConnector {
  int connected;
  struct ncclProxyArgs *proxyAppend;
+  struct ncclProxyArgs **proxyAppendPtr;
  struct ncclTransportComm* transportComm;
-  void* transportResources; // Host-side resources
+  void* transportResources;
  struct ncclConnInfo conn;
  struct ncclComm *comm;
 };
@@ -151,11 +152,23 @@ struct ncclTree {
  int down[NCCL_MAX_TREE_ARITY];
 };

+#define NCCL_MAX_DIRECT_ARITY 7
+struct ncclDirect {
+  int depth;
+  int out;
+  int nHeads;
+  int headRank;
+  int shift;
+  int up[NCCL_MAX_DIRECT_ARITY];
+  int down[NCCL_MAX_DIRECT_ARITY];
+};
+
+#define NCCL_CONN_IDX_P2P (*(comm->p2pNet)*2)
+#define NCCL_CONN_IDX_P2P_NET 2
+#define NCCL_MAX_CONNS 3
 struct ncclPeer {
-  struct ncclConnector send;
-  struct ncclConnector recv;
-  struct ncclConnector p2pSend;
-  struct ncclConnector p2pRecv;
+  struct ncclConnector send[NCCL_MAX_CONNS];
+  struct ncclConnector recv[NCCL_MAX_CONNS];
 };

 struct ncclDevComm;
@@ -179,7 +192,6 @@ struct ncclWorkElem {
  const void * sendbuff;
  void * recvbuff;

-  uint64_t opCount;
  // Op-specific fields.
  union {
    struct {
@@ -192,9 +204,15 @@ struct ncclWorkElem {
    struct {
      size_t sendCount;
      size_t recvCount;
+      int sendChunkSize;
+      int recvChunkSize;
      int32_t delta;
      uint16_t nThreads;
    } p2p;
+    struct {
+      uint16_t padding[15];
+      uint16_t opCount;
+    } op;
    // [RCCL] Clique-based arguments
    //        NOTE: Follows same field structure as coll
    //              because nChannels is accessed from "coll" struct.
@@ -206,7 +224,7 @@ struct ncclWorkElem {
      uint8_t nChannels;
    } clique;
    // [/RCCL]
-    uint64_t align[3];
+    uint64_t align[4];
  };
 };
 struct ncclWork {
@@ -219,7 +237,7 @@ struct ncclChannel {
    struct {
      struct ncclRing ring;
      struct ncclTree tree;
-      struct ncclTree collTree;
+      struct ncclDirect collTree;

      int id;

@@ -241,6 +259,12 @@ struct ncclChannel {
      float bw_cumulative;
      int bw_count;
 #endif
+      uint16_t index;        // Only used by GPU
+
+      // GDRCOPY support
+      struct ncclWork* workFifoGdr;
+      struct ncclWork* workFifoDev;
+      void* gdrMemDesc;
    };
    int data[0x80];
  };
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -11,15 +11,82 @@
 #include "group.h"
 #include "collectives.h"

+size_t ncclKernMaxLocalSize();
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
 ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
 ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
-ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
-ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
-ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
-ncclResult_t ncclSaveKernel(struct ncclInfo* info);
-ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info);
-ncclResult_t ncclSaveCommKernels(struct ncclComm* comm);
+ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
+ncclResult_t ncclLaunchKernel(ncclComm_t comm);
+ncclResult_t ncclRecordEvents(struct ncclComm* comm);
+ncclResult_t ncclLaunchReset(ncclComm_t comm);
+ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
+ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
+template<int USING_CUDA_GRAPH>
+void HIPRT_CB ncclEnqueueHostSetup(void* arg);
+ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph);
+ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph);

+// Enqueue information (for kernel and proxy) for each operation
+struct ncclQueueElem {
+  struct ncclWorkElem work;
+  struct ncclProxyArgs proxyArgs;
+  struct ncclQueueElem* next;
+};
+
+// Store enqueue elements in a list
+struct ncclQueueElemList {
+  struct ncclQueueElem* head;
+  struct ncclQueueElem* tail;
+};
+
+// Structure passed to CUDA graph
+struct ncclQueueInfo {
+  ncclComm_t comm;
+  int maxChannels;    // Dynamic version of gridDim
+  ncclResult_t ret;   // Return value of host setup call
+  struct ncclQueueElemList elemList;
+};
+
+// Get next element from enqueue list
+static ncclResult_t ncclAddQueueElem(struct ncclQueueInfo* eqInfo, struct ncclQueueElem** elemOut) {
+  if (eqInfo == NULL) return ncclInternalError;
+  struct ncclQueueElemList* list = &eqInfo->elemList;
+  if (list->tail != NULL) {
+    *elemOut = list->tail;
+    memset(*elemOut, 0, sizeof(struct ncclWorkElem) + sizeof(struct ncclProxyArgs));
+  } else {
+    NCCLCHECK(ncclCalloc(&list->tail, 1));
+    *elemOut = list->tail;
+    list->head = list->tail;
+  }
+  if (list->tail->next == NULL) {
+    NCCLCHECK(ncclCalloc(&list->tail->next, 1));
+  }
+  list->tail = list->tail->next;
+  return ncclSuccess;
+}
+
+// Reset element queue
+static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
+  if (eqInfo == NULL) return ncclInternalError;
+  eqInfo->maxChannels = 0;
+  eqInfo->ret = ncclSuccess;
+  eqInfo->elemList.tail = eqInfo->elemList.head;
+  return ncclSuccess;
+}
+
+// Destroy enqueue info space
+// used by both CUDA graph and non CUDA graph
+static void ncclDestroyQueueInfo(void* ptr) {
+  if (ptr == NULL) return;
+  struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr;
+  struct ncclQueueElem* head = eqInfo->elemList.head;
+  while (head != NULL) {
+    struct ncclQueueElem* temp = head;
+    head = head->next;
+    free(temp);
+  }
+  free(eqInfo);
+}
 #endif // End include guard
@@ -0,0 +1,272 @@
+/*************************************************************************
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_GDRWRAP_H_
+#define NCCL_GDRWRAP_H_
+
+#include "nccl.h"
+#include <stdint.h> // for standard [u]intX_t types
+#include <stdio.h>
+
+// These can be used if the GDR library isn't thread safe
+#include <pthread.h>
+extern pthread_mutex_t gdrLock;
+#define GDRLOCK() pthread_mutex_lock(&gdrLock)
+#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock)
+#define GDRLOCKCALL(cmd, ret) do {                      \
+    GDRLOCK();                                          \
+    ret = cmd;                                          \
+    GDRUNLOCK();                                        \
+} while(false)
+
+#define GDRCHECK(cmd) do {                              \
+    int e;                                              \
+    /* GDRLOCKCALL(cmd, e); */                          \
+    e = cmd;                                            \
+    if( e != 0 ) {                                      \
+      WARN("GDRCOPY failure %d", e);                    \
+      return ncclSystemError;                           \
+    }                                                   \
+} while(false)
+
+// This is required as the GDR memory is mapped WC
+#if !defined(__NVCC__)
+#if defined(__PPC__)
+static inline void wc_store_fence(void) { asm volatile("sync") ; }
+#elif defined(__x86_64__)
+#include <immintrin.h>
+static inline void wc_store_fence(void) { _mm_sfence(); }
+#elif defined(__aarch64__)
+#ifdef __cplusplus
+#include <atomic>
+static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); }
+#else
+#include <stdatomic.h>
+static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); }
+#endif
+#endif
+#endif
+
+//#define GDR_DIRECT 1
+#ifdef GDR_DIRECT
+// Call the GDR API library code directly rather than via
+// dlopen() wrappers
+#include <gdrapi.h>
+
+static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; }
+static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; }
+static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; }
+static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
+  GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
+  GDRCHECK(gdr_unpin_buffer(g, handle));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
+  GDRCHECK(gdr_get_info(g, handle, info));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
+  GDRCHECK(gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
+  GDRCHECK(gdr_unmap(gdr_t g, gdr_mh_t handle, void **va, size_t size));
+  return ncclSuccess;
+}
+static void wrap_gdr_runtime_get_version(int *major, int *minor) {
+  gdr_runtime_get_version(major, minor);
+  return ncclSuccess;
+}
+static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
+  gdr_driver_get_version(g, major, minor);
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
+  GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
+  GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
+  return ncclSuccess;
+}
+
+#else
+// Dynamically handle dependency the GDR API library
+
+/* Extracted from gdrapi.h (v2.1 Nov 2020) */
+
+#define GPU_PAGE_SHIFT   16
+#define GPU_PAGE_SIZE    (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_OFFSET  (GPU_PAGE_SIZE-1)
+#define GPU_PAGE_MASK    (~GPU_PAGE_OFFSET)
+
+struct gdr;
+typedef struct gdr *gdr_t;
+
+typedef struct gdr_mh_s {
+  unsigned long h;
+} gdr_mh_t;
+
+struct gdr_info {
+    uint64_t va;
+    uint64_t mapped_size;
+    uint32_t page_size;
+    uint64_t tm_cycles;
+    uint32_t cycles_per_ms;
+    unsigned mapped:1;
+    unsigned wc_mapping:1;
+};
+typedef struct gdr_info gdr_info_t;
+
+/* End of gdrapi.h */
+
+ncclResult_t wrap_gdr_symbols(void);
+
+gdr_t wrap_gdr_open(void);
+ncclResult_t wrap_gdr_close(gdr_t g);
+ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
+ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
+ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
+ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
+ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
+ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor);
+ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor);
+ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
+ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
+
+#endif // GDR_DIRECT
+
+// Global GDR driver handle
+extern gdr_t ncclGdrCopy;
+
+#include "alloc.h"
+
+typedef struct gdr_mem_desc {
+  void *gdrDevMem;
+  void *gdrMap;
+  size_t gdrOffset;
+  size_t gdrMapSize;
+  gdr_mh_t gdrMh;
+} gdr_mem_desc_t;
+
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+static gdr_t ncclGdrInit() {
+  return NULL;
+}
+
+template <typename T>
+static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
+  return ncclSuccess;
+}
+#else
+static gdr_t ncclGdrInit() {
+  int libMajor, libMinor, drvMajor, drvMinor;
+  gdr_t handle = NULL;
+  // Dynamically load the GDRAPI library symbols
+  if (wrap_gdr_symbols() == ncclSuccess) {
+    handle = wrap_gdr_open();
+
+    if (handle != NULL) {
+      ncclResult_t res;
+
+      // Query the version of libgdrapi
+      NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error);
+
+      // Query the version of gdrdrv driver
+      NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error);
+
+      // Only support GDRAPI 2.1 and later
+      if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) {
+        goto error;
+      }
+      else
+        INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
+    }
+  }
+  return handle;
+error:
+  if (handle != NULL) (void) wrap_gdr_close(handle);
+  return NULL;
+}
+
+template <typename T>
+static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
+  gdr_info_t info;
+  size_t mapSize;
+  gdr_mh_t mh;
+  char *devMem;
+  void *gdrMap;
+
+  mapSize = sizeof(T)*nelem;
+
+  // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
+  ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
+  // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
+  NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1));
+  uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
+  size_t align = alignedAddr - (uint64_t)devMem;
+
+  //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
+  NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh));
+
+  NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize));
+  //TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap);
+
+  NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info));
+
+  // Will offset ever be non zero ?
+  ssize_t off = info.va - alignedAddr;
+
+  gdr_mem_desc_t* md;
+  NCCLCHECK(ncclCalloc(&md, 1));
+  md->gdrDevMem = devMem;
+  md->gdrMap = gdrMap;
+  md->gdrMapSize = mapSize;
+  md->gdrOffset = off+align;
+  md->gdrMh = mh;
+  *gdrHandle = md;
+
+  *ptr = (T *)((char *)gdrMap+off);
+  if (devPtr) *devPtr = (T *)(devMem+off+align);
+
+  TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
+       md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
+
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
+  gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
+  NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T)));
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
+  gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
+  NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
+  NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
+  CUDACHECK(hipFree(md->gdrDevMem));
+  free(md);
+
+  return ncclSuccess;
+}
+#endif
+
+#endif // End include guard
@@ -1,5 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -28,7 +29,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
 ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);

 // Query topology
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
 ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
 ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);

@@ -93,13 +94,11 @@ struct ncclTopoRanks {
 };

 ncclResult_t ncclTopoPreset(struct ncclComm* comm,
-    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
+    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
    struct ncclTopoRanks* topoRanks);

 ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
-    struct ncclTopoRanks** allTopoRanks, int* rings, int nc);
-
-ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
+    struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc);

 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
 #include "info.h"
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -4,7 +4,7 @@
 * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
 *
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -1089,7 +1089,7 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc
 static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
  int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
  if (ret != IBV_SUCCESS) {
-    WARN("ibv_post_send() failed with error %s", strerror(ret));
+    WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr);
    return ncclSystemError;
  }
  return ncclSuccess;
@@ -1,6 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -19,8 +19,7 @@ typedef enum {
  ncclPatternTreeUp,
  ncclPatternTreeDown,
  ncclPatternTreeUpDown,
-  ncclPatternCollTreeUp,
-  ncclPatternCollTreeDown
+  ncclPatternCollTreeUpDown
 } ncclPattern_t;

 // Used to pass NCCL call information between functions
@@ -50,6 +49,8 @@ struct ncclInfo {
  int nchunksPerLoop;
  ssize_t sendbytes;
  ssize_t recvbytes;
+  int recvChunkSize;
+  int sendChunkSize;
  uint32_t delta;
  int channelId;
 };
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -53,11 +53,12 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
    ncclNetHandle_t handle;
    void* gpuPtr = NULL;
    void* mHandle = NULL;
-    NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
-    NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
-    NCCLCHECK(ncclNetAccept(lComm, &rComm));
-    CUDACHECK(hipExtMallocWithFlags(&gpuPtr, GPU_BUF_SIZE, hipDeviceMallocFinegrained));
+    ncclResult_t ret;
    ncclDebugNoWarn = NCCL_NET;
+    NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
+    NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
+    NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
+    CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
    if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
      NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
      NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
@@ -66,9 +67,13 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
    }
    ncclDebugNoWarn = 0;
    CUDACHECK(hipFree(gpuPtr));
+cleanup4:
    NCCLCHECK(ncclNetCloseRecv(rComm));
+cleanup3:
    NCCLCHECK(ncclNetCloseSend(sComm));
+cleanup2:
    NCCLCHECK(ncclNetCloseListen(lComm));
+cleanup1:
    break;
  }
  return ncclSuccess;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -1,5 +1,6 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -14,48 +15,67 @@ enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }
 struct ncclProxyArgs;
 typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);

-struct ncclProxyArgs {
-  proxyProgressFunc_t progress;
+#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
+static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
+
+struct ncclProxySubArgs {
  struct ncclChannel* channel;
  struct ncclConnector* connector;
-  size_t sendbytes;
-  size_t recvbytes;
-  int sliceSteps;
-  int chunkSteps;
  int nsteps;
-  uint64_t opCount;
-  int protocol;
-  int segment; // Only for profiling
-  ncclDataType_t dtype;
-  ncclRedOp_t redOp;
-  int state;   // add component before this line -- it is left out during initialization
+  ssize_t sendbytes;
+  ssize_t recvbytes;
+  int sendChunkSize;
+  int recvChunkSize;
+  int delta;

  // Internal state
+  uint64_t base;
  uint64_t posted;
-  uint64_t received; // Only used by recv proxy to wait for flush.
+  uint64_t received;
+  uint64_t flushed;
  uint64_t transmitted;
  uint64_t done;
  uint64_t end;
-  uint64_t hdp_flushed;
  void* requests[NCCL_STEPS];
+};
+
+struct ncclProxyArgs {
+  proxyProgressFunc_t progress;
+  struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
+  int nsubs;
+  int done;
+  int sliceSteps;
+  int chunkSteps;
+  int chunkSize;
+  uint64_t opCount;
+  uint64_t commOpCount;
+  int protocol;
+  ncclDataType_t dtype;
+  ncclRedOp_t redOp;
+  ncclPattern_t pattern;
+  int root;
+  int state;
+  char* sharedBuff[NCCL_STEPS];
+  int sharedSize[NCCL_STEPS];
+
  int idle;
+  uint64_t hdp_flushed;

  // Element linking
  pthread_mutex_t mutex;
  struct ncclProxyArgs* next;
  struct ncclProxyArgs* nextPeer;
-  struct ncclProxyArgs* nextGroup;
  struct ncclProxyArgs** proxyAppendPtr;
 };

 struct ncclProxySharedBuffers {
-  int nslots;
-  int slotSize;
-  char* cudaBuff[2*MAXCHANNELS];
-  int* cudaUsed[2*MAXCHANNELS];
-  char* hostBuff[2*MAXCHANNELS];
-  int* hostUsed[2*MAXCHANNELS];
+  int size;
+  char* cudaBuff;
+  char* hostBuff;
  struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
+  // Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS.
+  struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS];
+  void* collNetResources;
 };

 struct ncclProxyPool;
@@ -64,11 +84,16 @@ struct ncclProxyState {
  pthread_mutex_t opsMutex;
  pthread_mutex_t poolMutex;
  bool stop;
-  struct ncclProxySharedBuffers* sharedBuffs;
-  struct ncclProxyArgs* ops;
-  struct ncclProxyArgs* nextOps;
+  struct ncclProxySharedBuffers sharedBuffs;
+  struct ncclProxyArgs* ops;           // Running operations, used by proxy thread
+  struct ncclProxyArgs* postedOps;     // Posted operations, shared between proxy and main thread, locked with opsMutex
+  struct ncclProxyArgs* postedOpsEnd;
+  struct ncclProxyArgs* nextOps;       // Pending operations, used by main thread (could still be cancelled)
  struct ncclProxyArgs* nextOpsEnd;
-  struct ncclProxyArgs* pool;
+  struct ncclProxyArgs* pool;          // Free operations for main thread
+  struct ncclProxyArgs* poolFreed;     // Freed operations by the progress thread
+  struct ncclProxyArgs* poolReturned;  // Shared between main and progress thread, lock with poolMutex
+
  struct ncclProxyPool* pools;
 };

@@ -80,15 +105,16 @@ enum proxyMode {
  proxyTo = 2
 };

-ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
-ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment);
+ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks);
+ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args);
+ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args);
 ncclResult_t ncclProxyStart(struct ncclComm* comm);
 ncclResult_t ncclProxyCreate(struct ncclComm* comm);
 ncclResult_t ncclProxyDestroy(struct ncclComm* comm);

 ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
-ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr);
-ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr);
+ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr);
+ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr);
 ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);

 #include <unistd.h>
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -41,7 +41,7 @@ struct ncclConnect {
 };

 struct ncclTransportComm {
-  ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
+  ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
  ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
  ncclResult_t (*free)(void*);
  ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -54,7 +54,10 @@ struct ncclTransport {
  struct ncclTransportComm recv;
 };

-ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
-ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph);
+ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex);

+enum { collNetRecv=0, collNetSend=1 };
+int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);
+ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
 #endif
@@ -37,6 +37,4 @@ static long log2i(long n) {
 return l;
 }

-int busIdToCudaDev(int64_t busId);
-
 #endif