Merge pull request #782 from ROCmSoftwarePlatform/2.18.3

Sync up with NCCL 2.18.3
2023-06-29 15:04:16 -07:00
@@ -13,6 +13,9 @@
 #define ROUNDUP(x, y) \
    (DIVUP((x), (y))*(y))

+#define ALIGN_POWER(x, y) \
+    ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
+
 #define ALIGN_SIZE(size, align) \
  size = ((size + (align) - 1) / (align)) * (align);

@@ -12,6 +12,7 @@
 #include "checks.h"
 #include "align.h"
 #include "utils.h"
+#include "p2p.h"
 #include <sys/mman.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -87,6 +88,77 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
 #define MAX_ALLOC_TRACK_NGPU 32
 extern struct allocationTracker allocTracker[];

+#if CUDART_VERSION >= 11030
+
+#include <cuda.h>
+#include "cudawrap.h"
+
+static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
+  ncclResult_t result = ncclSuccess;
+  size_t granularity = 0;
+  CUdevice currentDev;
+  CUmemAllocationProp prop = {};
+  CUmemAccessDesc accessDesc = {};
+  CUmemGenericAllocationHandle handle;
+  int cudaDev;
+  int flag = 0;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUCHECK(cuDeviceGet(&currentDev, cudaDev));
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported
+  prop.location.id = currentDev;
+  // Query device to see if RDMA support is available
+  CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev));
+  if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
+  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+  ALIGN_SIZE(size, granularity);
+  /* Allocate the physical memory on the device */
+  CUCHECK(cuMemCreate(&handle, size, &prop, 0));
+  /* Reserve a virtual address range */
+  CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
+  /* Map the virtual address range to the physical allocation */
+  CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+  /* Now allow RW access to the newly mapped memory */
+  accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  accessDesc.location.id = currentDev;
+  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
+  if (handlep) *handlep = handle;
+  TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle);
+  return result;
+}
+
+static inline ncclResult_t ncclCuMemFree(void *ptr) {
+  if (ptr == NULL) return ncclSuccess;
+  ncclResult_t result = ncclSuccess;
+  CUmemGenericAllocationHandle handle;
+  size_t size = 0;
+  CUCHECK(cuMemRetainAllocationHandle(&handle, ptr));
+  CUCHECK(cuMemRelease(handle));
+  CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+  TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
+  CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
+  CUCHECK(cuMemRelease(handle));
+  CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
+  return result;
+}
+
+#else
+
+extern int ncclCuMemEnable();
+
+static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) {
+  WARN("CUMEM not supported prior to CUDA 11.3");
+  return ncclInternalError;
+}
+static inline ncclResult_t ncclCuMemFree(void *ptr) {
+  WARN("CUMEM not supported prior to CUDA 11.3");
+  return ncclInternalError;
+}
+
+#endif
+
 template <typename T>
 ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
  ncclResult_t result = ncclSuccess;
@@ -193,8 +265,13 @@ template <typename T>
 ncclResult_t ncclCudaFree(T* ptr) {
  ncclResult_t result = ncclSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
+  TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr);
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaFree(ptr), result, finish);
+  if (ncclCuMemEnable()) {
+    NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish);
+  } else {
+    CUDACHECKGOTO(cudaFree(ptr), result, finish);
+  }
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  return result;
@@ -20,6 +20,7 @@ ncclResult_t bootstrapNetInit();
 ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
 ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
+ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
@@ -9,7 +9,9 @@
 #include "comm.h"

 ncclResult_t initChannel(struct ncclComm* comm, int channelid);
-ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
+ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
+ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
 static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
  int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
  int peerNode = comm->rankToNode[peer];
@@ -18,11 +18,11 @@
    }                                                       \
 } while(false)

-#define CUDACHECKGOTO(cmd, res, label) do {                 \
+#define CUDACHECKGOTO(cmd, RES, label) do {                 \
    cudaError_t err = cmd;                                  \
    if( err != cudaSuccess ) {                              \
        WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
-        res = ncclUnhandledCudaError;                       \
+        RES = ncclUnhandledCudaError;                       \
        goto label;                                         \
    }                                                       \
 } while(false)
@@ -60,11 +60,11 @@
  } \
 } while(true)

-#define SYSCHECKGOTO(statement, res, label) do { \
+#define SYSCHECKGOTO(statement, RES, label) do { \
  if ((statement) == -1) {    \
    /* Print the back trace*/ \
-    res = ncclSystemError;    \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    RES = ncclSystemError;    \
+    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
    goto label; \
  } \
 } while (0);
@@ -72,16 +72,16 @@
 #define NEQCHECK(statement, value) do {   \
  if ((statement) != value) {             \
    /* Print the back trace*/             \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError);    \
+    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno));    \
    return ncclSystemError;     \
  }                             \
 } while (0);

-#define NEQCHECKGOTO(statement, value, res, label) do { \
+#define NEQCHECKGOTO(statement, value, RES, label) do { \
  if ((statement) != value) { \
    /* Print the back trace*/ \
-    res = ncclSystemError;    \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    RES = ncclSystemError;    \
+    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
    goto label; \
  } \
 } while (0);
@@ -89,57 +89,57 @@
 #define EQCHECK(statement, value) do {    \
  if ((statement) == value) {             \
    /* Print the back trace*/             \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError);    \
+    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno));    \
    return ncclSystemError;     \
  }                             \
 } while (0);

-#define EQCHECKGOTO(statement, value, res, label) do { \
+#define EQCHECKGOTO(statement, value, RES, label) do { \
  if ((statement) == value) { \
    /* Print the back trace*/ \
-    res = ncclSystemError;    \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    RES = ncclSystemError;    \
+    INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno));    \
    goto label; \
  } \
 } while (0);

 // Propagate errors up
 #define NCCLCHECK(call) do { \
-  ncclResult_t res = call; \
-  if (res != ncclSuccess && res != ncclInProgress) { \
+  ncclResult_t RES = call; \
+  if (RES != ncclSuccess && RES != ncclInProgress) { \
    /* Print the back trace*/ \
-    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    return res; \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
+    return RES; \
  } \
 } while (0);

-#define NCCLCHECKGOTO(call, res, label) do { \
-  res = call; \
-  if (res != ncclSuccess && res != ncclInProgress) { \
+#define NCCLCHECKGOTO(call, RES, label) do { \
+  RES = call; \
+  if (RES != ncclSuccess && RES != ncclInProgress) { \
    /* Print the back trace*/ \
-    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
    goto label; \
  } \
 } while (0);

 #define NCCLWAIT(call, cond, abortFlagPtr) do {         \
  volatile uint32_t* tmpAbortFlag = (abortFlagPtr);     \
-  ncclResult_t res = call;                \
-  if (res != ncclSuccess && res != ncclInProgress) {               \
-    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+  ncclResult_t RES = call;                \
+  if (RES != ncclSuccess && RES != ncclInProgress) {               \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
    return ncclInternalError;             \
  }                                       \
  if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
 } while (!(cond));

-#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \
+#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
  volatile uint32_t* tmpAbortFlag = (abortFlagPtr);             \
-  res = call;                             \
-  if (res != ncclSuccess && res != ncclInProgress) {               \
-    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+  RES = call;                             \
+  if (RES != ncclSuccess && RES != ncclInProgress) {               \
+    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
    goto label;                           \
  }                                       \
-  if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
+  if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
 } while (!(cond));

 #define NCCLCHECKTHREAD(a, args) do { \
@@ -63,11 +63,12 @@ struct ncclDevRedOpFull {
  MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128,  devredop, type))

 #define DECL3(func, devredop, type, undef) \
-  DECL4(func, RING,    devredop, type, undef) \
-  DECL4(func, TREE,    devredop, type, undef) \
+  DECL4(func, RING,           devredop, type, undef) \
+  DECL4(func, TREE,           devredop, type, undef) \
  DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
-  DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
-  DECL4(func, NVLS,    devredop, type, undef)
+  DECL4(func, COLLNET_CHAIN,  devredop, type, undef) \
+  DECL4(func, NVLS,           devredop, type, undef) \
+  DECL4(func, NVLS_TREE,      devredop, type, undef)

 #if defined(RCCL_BFLOAT16)
 #define DECL2(func, devredop, undefForFloat) \
@@ -101,19 +101,51 @@ struct ncclCommCallback {
  ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
 };

+struct ncclSharedResources {
+  int refCount;
+  struct ncclComm* owner; /* comm which creates this shared res. */
+  struct ncclChannelPeer* peers[MAXCHANNELS];
+  struct ncclDevChannelPeer* devPeers[MAXCHANNELS];
+  /* P2P operation counter, one per channel */
+  uint64_t p2pOpCount[MAXCHANNELS];
+  /* Collective operation counter */
+  uint64_t collOpCount;
+  int tpNRanks;
+  int tpNLocalRanks;
+  int tpNChannels;
+  int tpP2pNChannels;
+  int tpP2pChunkSize;
+  uint64_t magic;
+
+  // top parent rank to localRank translation table
+  int* tpRankToLocalRank;
+  // Internal streams
+  struct ncclStrongStream deviceStream, hostStream;
+
+  /* proxy related shared res */
+  struct ncclProxyState* proxyState;
+};
+
 struct ncclChannel {
-  struct ncclChannelPeer* peers;
-  struct ncclDevChannelPeer* devPeers;
+  struct ncclChannelPeer** peers;
+  struct ncclDevChannelPeer** devPeers;
  struct ncclRing ring;
  int* devRingUserRanks;
  struct ncclTree tree;
+
  struct ncclTree collnetChain;
  struct ncclDirect collnetDirect;
  struct ncclTree binTree;
  struct ncclNvls nvls;
+
  int id; // index of this channel
  uint32_t workFifoSent; // last used work index+1
-  uint64_t p2pOpCount;
+
+  /* comm split sharable resources */
+  struct ncclChannelPeer* collnetPeers;
+  struct ncclDevChannelPeer* collnetDevPeers;
+  struct ncclChannelPeer* nvlsPeers;
+  struct ncclDevChannelPeer* nvlsDevPeers;
 };

 struct ncclWorkList {
@@ -167,6 +199,10 @@ struct ncclComm {
  // List of destructors to run when comm is destructed
  struct ncclDestructor* destructorHead;

+  struct ncclSharedResources* sharedRes;
+  /* map to top parent ranks. */
+  int* topParentRanks;
+  int* topParentLocalRanks;
  struct ncclChannel channels[MAXCHANNELS];
  struct ncclPeerInfo* peerInfo;
  struct ncclTopoSystem* topo;
@@ -180,15 +216,16 @@ struct ncclComm {

  uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.

+  uint64_t commHash;
  int rank;    // my rank in the communicator
  int nRanks;  // number of GPUs in communicator
  int cudaDev; // my cuda device index
+  //int nvmlDev; // my nvml device index
  int compCap; // compute capability of the GPU
-  int minCompCap; // min compute capability in the communicator
+  int minCompCap, maxCompCap; // min/max compute capability in the communicator
  int64_t busId;   // my PCI bus ID in int format
  cpu_set_t cpuAffinity; // CPU affinity of the GPU
  int WarpSize;
-  int virtualId;
  int cudaArch; // matches __CUDA_ARCH__ of device

  int node;
@@ -207,12 +244,11 @@ struct ncclComm {

  // Counter for tracking CUDA launches (P2P and collectives included)
  uint64_t opCount;
-  // Collective operation counter
-  uint64_t collOpCount;

  // Channels for collectives
  int nChannels;
  int nvlsChannels;
+  int collNetChannels;
  // Channels (per peer) for p2p
  int p2pnChannels;
  int p2pnChannelsPerPeer;
@@ -237,6 +273,8 @@ struct ncclComm {

  // Flag to ask NCCL kernels to abort
  volatile uint32_t *abortFlag;
+  volatile uint32_t *childAbortFlag;
+  uint32_t *abortFlagRefCount;

  // Flags for enable P2P NET
  uint32_t p2pNet;
@@ -268,21 +306,24 @@ struct ncclComm {
  char intraPad2[64 - sizeof(uint64_t)];
  uint64_t intraBarrierGate; // only used if this is intraComm0

-  struct ncclProxyState proxyState;
-
+  struct ncclProxyState* proxyState;
+  int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
  // Whether this communicator uses collNet
  int collNetSupport;
+  uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
  int intraHighestTransportType;
+  int* collNetHeads;
+  int collNetHeadsNum;
+  /* sharable collNet proxy progress resource. */
+  struct ncclCollNetSharedRes* collNetSharedRes;

  // NVLink SHARP (NVLS) support
  int nvlsSupport;
-  void* nvlsResources;
+  /* sharable NVLS resource. */
+  struct ncclNvlsSharedRes* nvlsResources;

  size_t channelSize; // User requested work size (bytes) for channel partitions

-  // Internal streams
-  struct ncclStrongStream deviceStream, hostStream;
-
  // pools backed by comm->memPermanent
  struct ncclMemoryPool memPool_ncclProxyOp;
  struct ncclMemoryPool memPool_ncclKernelPlan;
@@ -319,13 +360,7 @@ struct ncclComm {
  volatile bool collTraceExit;
 #endif

-  // communicator mode
-  int blocking;
-  // CGA cluster size
-  int cgaClusterSize;
-  int minCTAs, maxCTAs;
-  // network interface name
-  char *netName;
+  ncclConfig_t config;
  // initState is to more conveniently reclaim resources when errors happen.
  ncclResult_t initState;
  // flag to indicate if ncclCommFinalize() is called
@@ -11,6 +11,9 @@
 #include <cuda_runtime.h>
 #include "checks.h"

+// Is cuMem API usage enabled
+extern int ncclCuMemEnable();
+
 #if CUDART_VERSION >= 11030
 #include <cudaTypedefs.h>
 #else
@@ -85,6 +88,7 @@ DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
 DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
 DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
 DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
 DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
 DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
 #if CUDA_VERSION >= 11070
@@ -21,12 +21,13 @@
 typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
 extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];

-#define NCCL_NUM_ALGORITHMS 5 // Tree/Ring/CollNet*
+#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
 #define NCCL_ALGO_TREE 0
 #define NCCL_ALGO_RING 1
 #define NCCL_ALGO_COLLNET_DIRECT 2
 #define NCCL_ALGO_COLLNET_CHAIN 3
 #define NCCL_ALGO_NVLS 4
+#define NCCL_ALGO_NVLS_TREE 5
 extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];

 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
@@ -112,10 +113,10 @@ struct ncclConnInfo {
 };

 struct ncclProxyConnector {
-  int rank;
-  int localRank;
+  int tpRank;
+  int tpLocalRank;
+  int sameProcess;
  struct ncclProxyConnection* connection;
-  struct ncclComm* comm;
 };

 struct ncclConnector {
@@ -124,7 +125,6 @@ struct ncclConnector {
  struct ncclTransportComm* transportComm;
  void* transportResources;
  struct ncclConnInfo conn;
-  struct ncclComm *comm;
 };

 struct ncclRing {
@@ -141,6 +141,9 @@ struct ncclRing {
 };


+// The root of each tree only has one node down (+1 intra-node).
+#define NCCL_MAX_TREE_ARITY_TOP 2
+// Nodes inside the binary tree can have to two nodes down (+1 intra-node).
 #define NCCL_MAX_TREE_ARITY 3
 struct ncclTree {
  int depth;
@@ -161,18 +164,24 @@ struct ncclDirect {

 #define NCCL_CONN_IDX_P2P_NET 2
 #define NCCL_MAX_NVLS_ARITY 8
+#define NCCL_MAX_NVLS_TREE_ARITY 3
 struct ncclNvls {
  int out;
  int nHeads;   // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down
  int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC)
  int up[NCCL_MAX_NVLS_ARITY];
  int down;
+  int treeUp;
+  int treeDown[NCCL_MAX_NVLS_TREE_ARITY];
+  int node;
+  int nNodes;
 };

 #define NCCL_MAX_CONNS 3
 struct ncclChannelPeer {
  struct ncclConnector send[NCCL_MAX_CONNS];
  struct ncclConnector recv[NCCL_MAX_CONNS];
+  int refCount;
 };

 struct ncclDevComm;
@@ -362,7 +371,7 @@ static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must
 #endif

 struct alignas(16) ncclDevChannel {
-  struct ncclDevChannelPeer *peers;
+  struct ncclDevChannelPeer** peers;
  struct ncclRing ring;
  struct ncclTree tree;
  struct ncclTree collnetChain;
@@ -298,7 +298,7 @@ static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
  gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
  NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
  NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
-  CUDACHECK(cudaFree(md->gdrDevMem));
+  NCCLCHECK(ncclCudaFree(md->gdrDevMem));
  free(md);

  return ncclSuccess;
@@ -59,9 +59,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
 #define NCCL_TOPO_CPU_TYPE_ROME 4
 #define NCCL_TOPO_CPU_TYPE_YONGFENG 1
 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
-ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
 ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id);
+ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
+ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);

 #define NCCL_TOPO_MAX_NODES 256

@@ -72,6 +74,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
 #define NCCL_TOPO_PATTERN_SPLIT_TREE 2      // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
 #define NCCL_TOPO_PATTERN_TREE 3            // All NIC traffic going to/from the same GPU
 #define NCCL_TOPO_PATTERN_RING 4            // Ring
+#define NCCL_TOPO_PATTERN_NVLS 5            // NVLS+SHARP and NVLS+Tree
 struct ncclTopoGraph {
  // Input / output
  int id; // ring : 0, tree : 1, collnet : 2
@@ -108,18 +111,16 @@ struct ncclTopoRanks {
  int treeToParent[MAXCHANNELS];
  int treeToChild0[MAXCHANNELS];
  int treeToChild1[MAXCHANNELS];
+  int nvlsHeads[MAXCHANNELS];
 };

-ncclResult_t ncclTopoPreset(struct ncclComm* comm,
-    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
-    struct ncclTopoRanks* topoRanks);
+ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);

 ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
-    struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc);
-
+    struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc);
 ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph);

-ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
+ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
 #include "info.h"
 ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);

@@ -36,6 +36,7 @@ struct ncclAsyncJob {
  void(*destructor)(void*);
  ncclGroupJobState_t state;
  volatile uint32_t *abortFlag; /* point to comm abortFlag */
+  volatile uint32_t *childAbortFlag; /* point to child abortFlag */
  ncclComm_t comm;
 };

@@ -67,6 +68,24 @@ extern __thread ncclResult_t ncclGroupError;
 extern __thread struct ncclComm* ncclGroupCommHead;
 extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
 extern __thread int ncclGroupBlocking;
+extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
+extern __thread struct ncclGroupJob ncclGroupJobMain;
+
+static inline void groupResetJobState() {
+  ncclGroupBlocking = -1;
+  ncclGroupJobMainPtr = NULL;
+  memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
+  return;
+}
+
+static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
+  ncclResult_t ret = ncclSuccess;
+  if (job) {
+    ret = ncclAsyncJobComplete(&job->base);
+    groupResetJobState();
+  }
+  return ret;
+}

 inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
  if (ncclGroupDepth > 0) {
@@ -91,7 +110,7 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
    ncclMemoryStackPush(&comm->memScoped);
  }

-  ncclGroupBlocking = comm->blocking;
+  ncclGroupBlocking = comm->config.blocking;
 }

 // Add comm to this thread's group needing preconnect
@@ -0,0 +1,44 @@
+#ifndef NCCL_IBV_SYMBOLS_H_
+#define NCCL_IBV_SYMBOLS_H_
+
+#ifdef NCCL_BUILD_RDMA_CORE
+#include <infiniband/verbs.h>
+#else
+#include "ibvcore.h"
+#endif
+
+#include "nccl.h"
+
+/* IB Verbs Function Pointers*/
+struct ncclIbvSymbols {
+  int (*ibv_internal_fork_init)(void);
+  struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
+  void (*ibv_internal_free_device_list)(struct ibv_device **list);
+  const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
+  struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
+  int (*ibv_internal_close_device)(struct ibv_context *context);
+  int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
+  void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
+  int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
+  int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
+  int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
+  int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
+  struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
+  int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
+  struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
+  struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access);
+  /* DMA-BUF support */
+  struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
+  int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
+  struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
+  int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
+  struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
+  int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
+  int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
+  const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
+};
+
+/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
+ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols);
+
+#endif  // NCCL_IBV_SYMBOLS_H_
@@ -26,6 +26,7 @@ typedef enum : uint8_t {
  ncclPatternCollnetChain,
  ncclPatternCollnetDirect,
  ncclPatternNvls,
+  ncclPatternNvlsTree,
  ncclPatternSend,
  ncclPatternRecv
 } ncclPattern_t;
@@ -94,7 +95,6 @@ struct ncclCudaStreamList {
  struct ncclCudaStreamList *next;
  cudaStream_t stream;
 };
-
 struct ncclTasks {
  struct Peer {
    bool sendSeen, recvSeen;
@@ -104,7 +104,8 @@ struct ncclTasks {
  struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
  size_t collBytesTotal;
  struct Peer* peers/*[nRanks]*/;
-  int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/;
+  int *p2pSendOrder, *p2pRecvOrder;
+  int p2pOrderSteps;
  int nTasksColl, nTasksP2p;

  // The list of user streams aggregated over all tasks present.
@@ -18,25 +18,6 @@ ncclResult_t ncclNetPluginInit();
 ncclResult_t ncclNetInit(struct ncclComm* comm);
 int ncclNetVersion(struct ncclComm* comm);

-// Translation to external API
-static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; }
-static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; }
-static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; }
-static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; }
-/* DMA-BUF support */
-static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
-static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; }
-static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
-static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; }
-static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; }
-
 // Test whether the current GPU support GPU Direct RDMA.
 ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);

@@ -126,7 +126,7 @@
 * Systems:
 *
 * \image html
- * https://raw.githubusercontent.com/jrhemstad/nvtx_wrappers/master/docs/example_range.png
+ * https://raw.githubusercontent.com/NVIDIA/NVTX/release-v3/docs/images/example_range.png
 *
 * Alternatively, use the \ref MACROS like `NVTX3_FUNC_RANGE()` to add
 * ranges to your code that automatically use the name of the enclosing function
@@ -561,18 +561,27 @@

 /* Temporary helper #defines, removed with #undef at end of header */

-#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET)
-#if defined(_MSC_VER) && _MSC_VER < 1914
-/* Microsoft's compiler prior to VS2017 Update 7 (15.7) uses an older parser
- * that does not work with domain::get's specialization for domain::global,
- * and would require extra conditions to make SFINAE work for the overloaded
- * get() functions.  This macro disables use of overloaded get() in order to
- * work with VS2015 and versions of VS2017 below 15.7, without penalizing
- * users of newer compilers.  Building with this flag set to 0 means errors
- * when defining tag structs (see documentation for domain, named_category,
- * and registered_string) will have more complex compiler error messages
- * instead of the clear static_assert messages from the get() overloads.
+/* Some compilers do not correctly support SFINAE, which is used in this API
+ * to detect common usage errors and provide clearer error messages (by using
+ * static_assert) than the compiler would produce otherwise.  These compilers
+ * will generate errors while compiling this file such as:
+ *
+ *  error: ‘name’ is not a member of ‘nvtx3::v1::domain::global’
+ *
+ * The following compiler versions are known to have this problem, and so are
+ * set by default to disable the SFINAE-based checks:
+ *
+ * - All MSVC versions prior to VS2017 Update 7 (15.7)
+ * - GCC 8.1-8.3 (the problem was fixed in GCC 8.4)
+ *
+ * If you find your compiler hits this problem, you can work around it by
+ * defining NVTX3_USE_CHECKED_OVERLOADS_FOR_GET to 0 before including this
+ * header, or you can add a check for your compiler version to this #if.
+ * Also, please report the issue on the NVTX github page.
 */
+#if !defined(NVTX3_USE_CHECKED_OVERLOADS_FOR_GET)
+#if defined(_MSC_VER) && _MSC_VER < 1914 \
+  || defined(__GNUC__) && __GNUC__ == 8 && __GNUC_MINOR__ < 4
 #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 0
 #else
 #define NVTX3_USE_CHECKED_OVERLOADS_FOR_GET 1
@@ -1,30 +1,33 @@
+/*
+* Copyright 2021-2023  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
 #ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
 #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
 #endif

-/*
- * Helper array to get the alignment for each predefined C language type.
- */
-
 typedef void* pointer_type;

-#if __STDC_VERSION__ >= 201112L /* or CPP11 */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
+#include <uchar.h>
 #include <stdalign.h>
+#endif
+
+/* `alignof` is available as of C11 or C++11 */
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L)
+
 #define nvtx_alignof(type) alignof(type)
 #define nvtx_alignof2(type,tname) alignof(type)
-#else /*  __STDC_VERSION__ >= 201112L */
-#ifndef __cplusplus

-#include <stddef.h>
-#define nvtx_alignof(type) offsetof(struct {char c; type d;}, d)
-#define nvtx_alignof2(type,tname) nvtx_alignof(type)
+#else /* (__STDC_VERSION__ >= 201112L) || (__cplusplus >= 201103L) */

-#else /* __cplusplus */
-
-#define MKTYPEDEF(TYPE) typedef struct {char c; TYPE d;} _nvtx_##TYPE
-#define MKTYPEDEF2(TYPE,TNAME) typedef struct {char c; TYPE d;} _nvtx_##TNAME
-#define nvtx_alignof(TNAME) offsetof(_nvtx_##TNAME, d)
-#define nvtx_alignof2(type,tname) offsetof(_nvtx_##tname, d)
+/* Create helper structs to determine type alignment. */
+#define MKTYPEDEF(type) typedef struct {char c; type d;} _nvtx_##type
+#define MKTYPEDEF2(type,tname) typedef struct {char c; type d;} _nvtx_##tname

 MKTYPEDEF(char);
 MKTYPEDEF2(unsigned char, uchar);
@@ -54,22 +57,33 @@ MKTYPEDEF(size_t);
 MKTYPEDEF(pointer_type);

 MKTYPEDEF(wchar_t);
-#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
-    {sizeof(char8_t), nvtx_alignof(char8_t)},
+
+/* `char8_t` is available as of C++20 or C23 */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
    MKTYPEDEF(char8_t);
 #endif
-#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
+
+/* `char16_t` and `char32_t` are available as of C++11 or C11 */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
    MKTYPEDEF(char16_t);
    MKTYPEDEF(char32_t);
 #endif

+/* C requires to include stddef.h to use `offsetof` */
+#ifndef __cplusplus
+#include <stddef.h>
+#endif
+
+#define nvtx_alignof(tname) offsetof(_nvtx_##tname, d)
+#define nvtx_alignof2(type, tname) offsetof(_nvtx_##tname, d)
+
+#endif /*  __STDC_VERSION__ >= 201112L */
+
 #undef MKTYPEDEF
 #undef MKTYPEDEF2

-#endif /* __cplusplus */
-#endif /*  __STDC_VERSION__ >= 201112L */
-
 /*
+ * Helper array to get the alignment for each predefined C/C++ language type.
 * The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`.
 */
 const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
@@ -109,13 +123,14 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_

    /*** Special character types ***/
    /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
-    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */
-#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
-    {sizeof(char8_t), nvtx_alignof(char8_t)},
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || (defined(__cplusplus) && __cplusplus >= 201811L)
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {sizeof(char8_t), nvtx_alignof(char8_t)},
 #else
-    {0, 0},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */ {0, 0},
 #endif
-#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 200704L)
    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)},
    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)}
 #else
@@ -125,4 +140,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
 };

 #undef nvtx_alignof
-#undef nvtx_alignof2
+#undef nvtx_alignof2
@@ -9,4 +9,21 @@
 #ifndef NCCL_P2P_H_
 #define NCCL_P2P_H_

+#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+
+typedef struct {
+  int data; // Currently only support an fd based descriptor
+} ncclCuDesc;
+
+typedef union {
+  // Legacy CUDA IPC
+  cudaIpcMemHandle_t devIpc;
+  // cuMem API support
+  ncclCuDesc cuDesc;
+} ncclIpcDesc;
+
+ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr);
+ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc);
+ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr);
+
 #endif
@@ -15,11 +15,13 @@
 #include "ipcsocket.h"
 #include <pthread.h>
 #include "shm.h"
+#include "p2p.h"

 enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
+enum { proxyRecv=0, proxySend=1 };

 struct ncclProxyArgs;
-typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*);
+typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);

 #define NCCL_PROXY_MAX_SUBS MAXCHANNELS
 static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
@@ -130,18 +132,11 @@ struct ncclProxySharedP2p {
  int size;
  char* cudaBuff;
  char* hostBuff;
-  cudaIpcMemHandle_t ipc;
+  // CUDA IPC
+  ncclIpcDesc ipcDesc;
  struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
 };

-struct ncclProxySharedCollNet {
-  int size;
-  char* cudaBuff;
-  char* hostBuff;
-  struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
-  void* resources;
-};
-
 struct ncclProxyPeer {
  struct ncclProxySharedP2p send;
  struct ncclProxySharedP2p recv;
@@ -165,7 +160,6 @@ struct ncclProxyProgressState {
  bool stop;
  struct ncclProxyPeer** localPeers;
  struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
-  struct ncclProxySharedCollNet collNet;
  struct ncclProxyArgs* active;
  struct ncclProxyArgs* pool;
  struct ncclProxyPool* pools;
@@ -192,12 +186,27 @@ struct ncclProxyAsyncOp {

 struct ncclProxyLocalPeer {
  struct ncclSocket sock;
-  int localRank;
+  int tpRank;
+  int tpLocalRank;
  ncclProxyAsyncOp* asyncOps;
  int asyncOpCounter;
 };

 struct ncclProxyState {
+  int refCount;
+  int tpRank;
+  int tpnRanks;
+  int tpLocalnRanks;
+  int cudaDev;
+  int p2pnChannels;
+  int p2pChunkSize;
+  int nChannels;
+  int buffSizes[NCCL_NUM_PROTOCOLS];
+  bool allocP2pNetLLBuffers;
+  bool dmaBufSupport;
+  ncclNet_t* ncclNet;
+  ncclCollNet_t* ncclCollNet;
+  volatile uint32_t* abortFlag;
  // Service thread
  pthread_t thread;
  struct ncclSocket* listenSock;
@@ -209,6 +218,7 @@ struct ncclProxyState {
  struct ncclSocket* peerSocks;
  struct ncclProxyOps* proxyOps;
  void** sharedDevMems;
+  struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS)

  // Progress thread
  struct ncclProxyProgressState progressState;
@@ -228,13 +238,14 @@ enum proxyConnectState {

 struct ncclProxyConnection {
  int send, transport, shared;
-  int localRank;
+  int tpLocalRank, sameProcess;
  struct ncclSocket* sock;
  struct ncclTransportComm* tcomm;
  struct ncclProxyArgs *proxyAppend;
  struct ncclProxyArgs **proxyAppendPtr;
  void* transportResources;
  proxyConnectState state;
+  struct ncclCollNetSharedRes* collNet;
 };

 typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@@ -250,7 +261,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* prox
 ncclResult_t ncclProxyStart(struct ncclComm* comm);
 ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
 ncclResult_t ncclProxyCreate(struct ncclComm* comm);
-ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn);
+ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn);
 enum ncclProxyMsgType {
  ncclProxyMsgInit = 1,
  ncclProxyMsgSharedInit = 2,
@@ -260,22 +271,24 @@ enum ncclProxyMsgType {
  ncclProxyMsgClose = 6,
  ncclProxyMsgAbort = 7,
  ncclProxyMsgStop = 8,
-  ncclProxyMsgConvertFd = 9 // cuMem API support
+  ncclProxyMsgConvertFd = 9, // cuMem API support (UDS)
 };

 // This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types
 // Call this function on the client, supplying a locally unique opId. Then, poll on the return value of
 // ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed
-ncclResult_t ncclProxyCallAsync(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);
+ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId);

 // This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received
-ncclResult_t ncclProxyCallBlocking(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
-ncclResult_t ncclPollProxyResponse(struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);
+ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
+ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId);

-ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
+ncclResult_t ncclProxyClientConvertFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int fd, int* convertedFd);
+
+ncclResult_t ncclProxyStop(struct ncclComm* comm);
 ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
+ncclResult_t ncclProxyDestroy(struct ncclComm* comm);

-enum { proxyRecv=0, proxySend=1 };
-ncclResult_t mscclSaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);
+ncclResult_t mscclSaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex);

 #endif
@@ -36,7 +36,6 @@ struct ncclComm;
 struct ncclPeerInfo {
  int rank;
  int cudaDev;
-  int netDev;
  int gdrSupport;
  bool hasFineGrain;
  uint64_t hostHash;
@@ -45,7 +44,6 @@ struct ncclPeerInfo {
  int64_t busId;
  struct ncclComm* comm;
  int cudaCompCap;
-  int virtualId;
 };

 #define CONNECT_SIZE 128
@@ -53,15 +51,46 @@ struct ncclConnect {
  char data[CONNECT_SIZE];
 };

+#if CUDART_VERSION >= 12010
+
+#define NVLS_HANDLE_SIZE 64
+struct ncclNvlsSharedRes {
+  int refCount;
+  CUmulticastObjectProp properties;
+  CUmemAccessDesc accessDesc;
+  int dev;
+  size_t size;
+  size_t granularity;
+  CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
+  char* mcBuff; // Multicast NVLS buffer address
+  CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
+  char* ucBuff; // Unicast NVLS buffer address
+  char shareableHandle[NVLS_HANDLE_SIZE];
+  int nChannels;
+};
+
+#endif /* CUDART_VERSION >= 12010 */
+
+struct ncclCollNetSharedRes {
+  int refCount;
+  int size;
+  char* cudaBuff;
+  char* hostBuff;
+  struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
+  void* resources;
+  int nChannels;
+  size_t buffSize;
+};
+
 struct ncclTransportComm {
  ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
  ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
  ncclResult_t (*free)(struct ncclConnector*);
-  ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels);
-  ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
-  ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
-  ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm);
-  ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*);
+  ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels);
+  ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
+  ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
+  ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState);
+  ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*);
 };

 struct ncclTransport {
@@ -74,10 +103,9 @@ struct ncclTransport {
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);

-#if CUDART_VERSION >= 12010
-ncclResult_t ncclNvlsSetup(struct ncclComm* comm);
+ncclResult_t ncclNvlsInit(struct ncclComm* comm);
+ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);
-#endif

 enum { collNetRecv=0, collNetSend=1 };
 int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type);