Merge remote-tracking branch 'nccl/master' into develop

2025-08-28 15:45:42 -05:00
@@ -193,10 +193,9 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
 #define MAX_ALLOC_TRACK_NGPU 128
 extern struct allocationTracker allocTracker[];

-#if CUDART_VERSION >= 11030
+#if ROCM_VERSION >= 70000

-#include <cuda.h>
-#include "cudawrap.h"
+#include "rocmwrap.h"

 // ncclCuMemAllocAddr takes memory handle and size and returns the mapped address pointer
 static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
@@ -262,7 +261,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
  prop.requestedHandleTypes = type;
  prop.location.id = currentDev;
  // Query device to see if RDMA support is available
-  CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
+  // CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev));
  if (flag) prop.allocFlags.gpuDirectRDMACapable = 1;
  CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  ALIGN_SIZE(size, granularity);
@@ -318,21 +317,21 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
 extern int ncclCuMemEnable();

 static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int type, size_t size) {
-  WARN("CUMEM not supported prior to CUDA 11.3");
+  WARN("CUMEM not supported prior to ROCm 7.0");
  return ncclInternalError;
 }
 static inline ncclResult_t ncclCuMemFree(void *ptr) {
-  WARN("CUMEM not supported prior to CUDA 11.3");
+  WARN("CUMEM not supported prior to ROCm 7.0");
  return ncclInternalError;
 }

 static inline ncclResult_t ncclCuMemAllocAddr(void **ptr, CUmemGenericAllocationHandle *handleIn, size_t size) {
-  WARN("CUMEM not supported prior to CUDA 11.3");
+  WARN("CUMEM not supported prior to ROCm 7.0");
  return ncclInternalError;
 }

 static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) {
-  WARN("CUMEM not supported prior to CUDA 11.3");
+  WARN("CUMEM not supported prior to ROCm 7.0");
  return ncclInternalError;
 }
 #endif
@@ -0,0 +1,13 @@
+/*************************************************************************
+ * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALLOCATOR_H_
+#define NCCL_ALLOCATOR_H_
+
+ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr);
+ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr);
+
+#endif
@@ -122,6 +122,10 @@ typedef ncclResult_t (*ncclCommDestroy_fn_t)(ncclComm_t comm);

 typedef ncclResult_t (*ncclCommAbort_fn_t)(ncclComm_t comm);

+typedef ncclResult_t (*ncclCommShrink_fn_t)(ncclComm_t comm, int* excludeRanksList,
+                                            int excludeRanksCount, ncclComm_t *newcomm, 
+                                            ncclConfig_t* config, int shrinkFlags);
+
 typedef ncclResult_t (*ncclCommSplit_fn_t)(ncclComm_t comm, int color, int key,
                                           ncclComm_t* newcomm, ncclConfig_t* config);

@@ -158,6 +162,10 @@ typedef ncclResult_t (*ncclCommRegister_fn_t)(const ncclComm_t comm, void* buff,

 typedef ncclResult_t (*ncclCommDeregister_fn_t)(const ncclComm_t comm, void* handle);

+typedef ncclResult_t (*ncclCommWindowRegister_fn_t)(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+
+typedef ncclResult_t (*ncclCommWindowDeregister_fn_t)(ncclComm_t comm, ncclWindow_t win);
+
 typedef struct rcclApiFuncTable
 {
    uint64_t                      size;
@@ -184,6 +192,7 @@ typedef struct rcclApiFuncTable
    ncclCommFinalize_fn_t         ncclCommFinalize_fn;
    ncclCommDestroy_fn_t          ncclCommDestroy_fn;
    ncclCommAbort_fn_t            ncclCommAbort_fn;
+    ncclCommShrink_fn_t           ncclCommShrink_fn;
    ncclCommSplit_fn_t            ncclCommSplit_fn;
    ncclGetErrorString_fn_t       ncclGetErrorString_fn;
    ncclGetLastError_fn_t         ncclGetLastError_fn;
@@ -198,6 +207,8 @@ typedef struct rcclApiFuncTable
    mscclUnloadAlgo_fn_t          mscclUnloadAlgo_fn;
    ncclCommRegister_fn_t         ncclCommRegister_fn;
    ncclCommDeregister_fn_t       ncclCommDeregister_fn;
+    ncclCommWindowRegister_fn_t   ncclCommWindowRegister_fn;
+    ncclCommWindowDeregister_fn_t ncclCommWindowDeregister_fn;
    ncclAllReduceWithBias_fn_t    ncclAllReduceWithBias_fn;

 } rcclApiFuncTable;
@@ -19,6 +19,28 @@
  #endif
 #endif

+template<typename Int>
+constexpr static __host__ __device__ Int minval(Int a) { return a; } // base case: the minimum of a single value is that value
+template<typename Int, typename ...More>
+constexpr static __host__ __device__ Int minval(Int a, Int b, More ...more) { // reduce the first two arguments, then recurse on the rest
+  #if __CUDA_ARCH__ // taken when __CUDA_ARCH__ is defined non-zero: use whichever min() is visible in that compilation
+    return minval(min(a, b), more...);
+  #else // otherwise a plain comparison; b is kept when the two compare equal
+    return minval(a < b ? a : b, more...);
+  #endif
+}
+
+template<typename Int>
+constexpr static __host__ __device__ Int maxval(Int a) { return a; } // base case: the maximum of a single value is that value
+template<typename Int, typename ...More>
+constexpr static __host__ __device__ Int maxval(Int a, Int b, More ...more) { // reduce the first two arguments, then recurse on the rest
+  #if __CUDA_ARCH__ // taken when __CUDA_ARCH__ is defined non-zero: use whichever max() is visible in that compilation
+    return maxval(max(a, b), more...);
+  #else // otherwise a plain comparison; b is kept when the two compare equal
+    return maxval(a > b ? a : b, more...);
+  #endif
+}
+
 #define DIVUP(x, y) \
    (((x)+(y)-1)/(y))

@@ -32,32 +54,150 @@
  size = ((size + (align) - 1) / (align)) * (align);

 template<typename X, typename Y, typename Z = decltype(X()+Y())>
-__host__ __device__ constexpr Z divUp(X x, Y y) {
+static __host__ __device__ constexpr Z divUp(X x, Y y) {
  return (x+y-1)/y;
 }

 template<typename X, typename Y, typename Z = decltype(X()+Y())>
-__host__ __device__ constexpr Z roundUp(X x, Y y) {
+static __host__ __device__ constexpr Z roundUp(X x, Y y) {
  return (x+y-1) - (x+y-1)%y;
 }
 template<typename X, typename Y, typename Z = decltype(X()+Y())>
-__host__ __device__ constexpr Z roundDown(X x, Y y) {
+static __host__ __device__ constexpr Z roundDown(X x, Y y) {
  return x - x%y;
 }

 // assumes second argument is a power of 2
 template<typename X, typename Z = decltype(X()+int())>
-__host__ __device__ constexpr Z alignUp(X x, int a) {
+static __host__ __device__ constexpr Z alignUp(X x, int a) {
  return (x + a-1) & Z(-a);
 }
 // assumes second argument is a power of 2
 template<typename X, typename Z = decltype(X()+int())>
-__host__ __device__ constexpr Z alignDown(X x, int a) {
+static __host__ __device__ constexpr Z alignDown(X x, int a) {
  return x & Z(-a);
 }

 template<typename Int>
-inline __host__ __device__ int countOneBits(Int x) {
+constexpr __host__ __device__ bool isPow2(Int x) {
+  return (x & (x-1)) == 0;
+}
+
+template<typename T>
+static __host__ __device__ T add4G(T base, int delta4G) { // returns base with delta4G added to its second 32-bit word
+  union { T tmp; uint32_t u32[2]; }; // anonymous union: overlay T with two 32-bit words
+  tmp = base;
+  u32[1] += delta4G; // NOTE(review): presumably the upper half of a 64-bit T on little-endian targets, i.e. steps of 4GiB — confirm
+  return tmp;
+}
+
+template<typename T>
+static __host__ __device__ T incWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) { // adds delta4G to the second 32-bit word of ptr, wrapping once at hi4G
+  union { T tmp; uint32_t u32[2]; }; // anonymous union: overlay T with two 32-bit words
+  tmp = ptr;
+  u32[1] += delta4G;
+  if (u32[1] >= hi4G) u32[1] -= hi4G-lo4G; // reached or passed hi4G: pull back by the window width (hi4G-lo4G); applied at most once
+  return tmp;
+}
+
+template<typename T>
+static __host__ __device__ T decWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) { // subtracts delta4G from the second 32-bit word of ptr, wrapping once at lo4G
+  union { T tmp; uint32_t u32[2]; }; // anonymous union: overlay T with two 32-bit words
+  tmp = ptr;
+  u32[1] -= delta4G;
+  if (u32[1] < lo4G) u32[1] += hi4G-lo4G; // fell below lo4G: push forward by the window width (hi4G-lo4G); applied at most once
+  return tmp;
+}
+
+// Produce the reciprocal of x for use in idivFast32/idivFast64 (and the idivmodFast*/imodFast* variants)
+constexpr __host__ __device__ uint32_t idivRcp32(uint32_t x) { // floor(2^32 / x) narrowed to 32 bits; x == 0 divides by zero
+  return uint32_t(uint64_t(0x100000000)/x); // x == 1 gives 2^32, which narrows to 0
+}
+constexpr __host__ __device__ uint64_t idivRcp64(uint64_t x) { // 64-bit counterpart of idivRcp32; x == 0 divides by zero
+  return uint64_t(-1)/x + isPow2(x); // floor((2^64-1)/x), plus one when isPow2(x) holds; for x == 1 the sum wraps to 0
+}
+
+static __host__ __device__ uint32_t mul32hi(uint32_t a, uint32_t b) { // upper 32 bits of the 64-bit product a*b
+#if __CUDA_ARCH__
+  return __umulhi(a, b);
+#else
+  return uint64_t(a)*b >> 32; // widen to 64 bits before multiplying, then drop the low 32 bits
+#endif
+}
+static __host__ __device__ uint64_t mul64hi(uint64_t a, uint64_t b) { // upper 64 bits of the 128-bit product a*b
+#if __CUDA_ARCH__
+  return __umul64hi(a, b);
+#else
+  return (uint64_t)(((unsigned __int128)a)*b >> 64); // relies on the compiler's unsigned __int128 extension for the wide product
+#endif
+}
+
+// Produce the reciprocal of x*y given their respective reciprocals. This incurs
+// no integer division on device.
+static __host__ __device__ uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) {
+  if (xrcp == 0) return yrcp; // a zero reciprocal is what idivRcp32 yields for 1, so the product's reciprocal is just y's
+  if (yrcp == 0) return xrcp; // same shortcut with the roles swapped
+  uint32_t rcp = mul32hi(xrcp, yrcp); // first estimate: upper half of the product of the two reciprocals
+  uint32_t rem = -x*y*rcp; // unsigned arithmetic: (2^32 - x*y*rcp) mod 2^32
+  if (x*y <= rem) rcp += 1; // another x*y still fits in what is left, so bump the estimate by one
+  return rcp;
+}
+static __host__ __device__ uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) {
+  if (xrcp == 0) return yrcp; // a zero reciprocal is what idivRcp64 yields for 1, so the product's reciprocal is just y's
+  if (yrcp == 0) return xrcp; // same shortcut with the roles swapped
+  uint64_t rcp = mul64hi(xrcp, yrcp); // first estimate: upper half of the product of the two reciprocals
+  uint64_t rem = -x*y*rcp; // unsigned arithmetic: (2^64 - x*y*rcp) mod 2^64
+  if (x*y <= rem) rcp += 1; // another x*y still fits in what is left, so bump the estimate by one
+  return rcp;
+}
+
+// Fast integer division where divisor has precomputed reciprocal.
+// idivFast32(x, y, idivRcp32(y)) == x/y (likewise for the 64-bit variants)
+static __host__ __device__ void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) {
+  // Defaults cover yrcp == 0 (what idivRcp32 produces for y == 1): quotient x, remainder 0.
+  uint32_t quotient = x, remainder = 0;
+  if (yrcp != 0) {
+    quotient = mul32hi(x, yrcp);       // estimate taken from the high half of x*yrcp
+    remainder = x - y*quotient;
+    if (remainder >= y) { quotient += 1; remainder -= y; }  // single correction step
+  }
+  *quo = quotient; *rem = remainder;
+}
+static __host__ __device__ void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) {
+  // Defaults cover yrcp == 0 (what idivRcp64 produces for y == 1): quotient x, remainder 0.
+  uint64_t quotient = x, remainder = 0;
+  if (yrcp != 0) {
+    quotient = mul64hi(x, yrcp);       // estimate taken from the high half of x*yrcp
+    remainder = x - y*quotient;
+    if (remainder >= y) { quotient += 1; remainder -= y; }  // single correction step
+  }
+  *quo = quotient; *rem = remainder;
+}
+
+static __host__ __device__ uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
+  uint32_t quotient, remainder;  // the remainder is computed but discarded
+  idivmodFast32(&quotient, &remainder, x, y, yrcp);
+  return quotient;
+}
+static __host__ __device__ uint64_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
+  // Returns x/y using the precomputed reciprocal yrcp. The result is 64-bit so quotients >= 2^32 are not truncated.
+  uint64_t q, r; idivmodFast64(&q, &r, x, y, yrcp);  // r is computed but not needed here
+  return q;
+}
+
+static __host__ __device__ uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
+  uint32_t quotient, remainder;  // the quotient is computed but discarded
+  idivmodFast32(&quotient, &remainder, x, y, yrcp);
+  return remainder;
+}
+static __host__ __device__ uint64_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
+  // Returns x%y using the precomputed reciprocal yrcp. The result is 64-bit so remainders >= 2^32 (possible when y > 2^32) are not truncated.
+  uint64_t q, r; idivmodFast64(&q, &r, x, y, yrcp);  // q is computed but not needed here
+  return r;
+}
+
+template<typename Int>
+static __host__ __device__ int countOneBits(Int x) {
 #if __CUDA_ARCH__
  if (sizeof(Int) <= sizeof(unsigned int)) {
    return __popc((unsigned int)x);
@@ -83,7 +223,7 @@ inline __host__ __device__ int countOneBits(Int x) {

 // Returns index of first one bit or returns -1 if mask is zero.
 template<typename Int>
-inline __host__ __device__ int firstOneBit(Int mask) {
+static __host__ __device__ int firstOneBit(Int mask) {
  int i;
 #if __CUDA_ARCH__
  if (sizeof(Int) <= sizeof(int)) {
@@ -108,14 +248,14 @@ inline __host__ __device__ int firstOneBit(Int mask) {
 }

 template<typename Int>
-inline __host__ __device__ int popFirstOneBit(Int* mask) {
+static __host__ __device__ int popFirstOneBit(Int* mask) {
  Int tmp = *mask;
  *mask &= *mask-1;
  return firstOneBit(tmp);
 }

 template<typename Int>
-inline __host__ __device__ int log2Down(Int x) {
+static __host__ __device__ int log2Down(Int x) {
  int w, n;
 #if __CUDA_ARCH__
  if (sizeof(Int) <= sizeof(int)) {
@@ -147,7 +287,7 @@ inline __host__ __device__ int log2Down(Int x) {
 }

 template<typename Int>
-inline __host__ __device__ int log2Up(Int x) {
+static __host__ __device__ int log2Up(Int x) {
  int w, n;
  if (x != 0) x -= 1;
 #if __CUDA_ARCH__
@@ -180,19 +320,19 @@ inline __host__ __device__ int log2Up(Int x) {
 }

 template<typename Int>
-inline __host__ __device__ Int pow2Up(Int x) {
+static __host__ __device__ Int pow2Up(Int x) {
  return Int(1)<<log2Up(x);
 }

 template<typename Int>
-inline __host__ __device__ Int pow2Down(Int x) {
+static __host__ __device__ Int pow2Down(Int x) {
  // True, log2Down can return -1, but we don't normally pass 0 as an argument...
  // coverity[negative_shift]
  return Int(1)<<log2Down(x);
 }

 template<typename UInt, int nSubBits>
-inline __host__ __device__ UInt reverseSubBits(UInt x) {
+static __host__ __device__ UInt reverseSubBits(UInt x) {
  if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) {
    switch (8*sizeof(UInt)) {
    case 16: x = __builtin_bswap16(x); break;
@@ -225,7 +365,7 @@ template<> struct ncclToUnsigned<unsigned long long> { using type = unsigned lon

 // Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's.
 template<typename Int>
-inline __host__ __device__ Int reverseBits(Int x, int nBits) {
+static __host__ __device__ Int reverseBits(Int x, int nBits) {
  using UInt = typename ncclToUnsigned<Int>::type;
  union { UInt ux; Int sx; };
  sx = x;
@@ -249,7 +389,7 @@ inline __host__ __device__ Int reverseBits(Int x, int nBits) {
 // has nearly the full range of uint32_t except it only keeps the top 3 bits
 // beneath the leading 1 bit and thus has a max value of 0xf0000000.

-inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
+static __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
  int log2x;
  #if __CUDA_ARCH__
    log2x = 31-__clz(x|1);
@@ -261,7 +401,7 @@ inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
  return exponent<<bitsPerPow2 | mantissa;
 }

-inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
+static __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {
  uint32_t exponent = x>>bitsPerPow2;
  uint32_t mantissa = (x & ((1u<<bitsPerPow2)-1)) | (exponent!=0 ? 0x8 : 0);
  if (exponent != 0) exponent -= 1;
@@ -270,16 +410,16 @@ inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) {

 constexpr uint32_t u32fp8MaxValue() { return 0xf0000000; }

-inline __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
+static __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
  return u32fpEncode(x, 3);
 }
-inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
+static __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
  return u32fpDecode(x, 3);
 }

 // The hash isn't just a function of the bytes but also where the bytes are split
 // into different calls to eatHash().
-inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
+static __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) {
  char const* ptr = (char const*)bytes;
  acc[0] ^= size;
  while (size != 0) {
@@ -302,11 +442,11 @@ inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size
 }

 template<typename T>
-inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
+static __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) {
  eatHash(acc, (const void*)bytes, sizeof(T));
 }

-inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
+static __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
  uint64_t h = acc[0];
  h ^= h >> 31;
  h *= 0xbac3bd562846de6b;
@@ -316,13 +456,13 @@ inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) {
  return h;
 }

-inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
+static __host__ __device__ uint64_t getHash(const void* bytes, size_t size) {
  uint64_t acc[2] = {1, 1};
  eatHash(acc, bytes, size);
  return digestHash(acc);
 }
 template<typename T>
-inline __host__ __device__ uint64_t getHash(const T* bytes) {
+static __host__ __device__ uint64_t getHash(const T* bytes) {
  return getHash((const void*)bytes, sizeof(T));
 }

@@ -19,6 +19,7 @@
 #include "graph.h"
 #include "nvmlwrap.h"
 #include "profiler.h"
+#include "allocator.h"
 #include "latency_profiler/CollTrace.h"
 #include "rccl_common.h"
 #include "recorder.h"
@@ -140,7 +141,6 @@ struct ncclSharedResources {
  int* tpRankToLocalRank;
  // Internal streams
  struct ncclStrongStream deviceStream, hostStream;
-  int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream
  int persistentRefs;
  cudaEvent_t launchEvent, scratchEvent;

@@ -229,6 +229,7 @@ struct ncclTaskColl {
  // Profiler plugin
  int eActivationMask;
  void* eventHandle;
+  uint8_t nChannels;
 };
 struct ncclTaskP2p {
  struct ncclTaskP2p* next;
@@ -243,6 +244,7 @@ struct ncclTaskP2p {
  // Profiler plugin
  int eActivationMask;
  void* eventHandle;
+  uint8_t nChannels;
 };

 struct ncclKernelPlan {
@@ -255,10 +257,14 @@ struct ncclKernelPlan {

  bool persistent; // aka captured in a graph
  bool isHostCbEnq;
+  bool isSymColl;
  enum ncclDevWorkStorageType workStorageType;
  bool kernelSpecialized;
-  void *kernelFn;
-  struct ncclDevKernelArgs* kernelArgs;
+  void* kernelFn;
+  union {
+    struct ncclDevKernelArgs* kernelArgs;
+    struct ncclSymDevArgs* kernelSymArgs;
+  };
  size_t kernelArgsSize;
  struct channelMasks channelMask;
  bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
@@ -367,6 +373,7 @@ struct ncclKernelPlanner {
  struct Peer* peers/*[nRanks]*/;
  int nTasksColl, nTasksP2p;
  bool persistent;
+  bool isSymColl;

  // The list of user streams aggregated over all tasks present.
  struct ncclCudaStreamList* streams;
@@ -430,12 +437,19 @@ struct ncclPeerInfo {
  int64_t busId;
  struct ncclComm* comm;
  int cudaCompCap;
+  size_t totalGlobalMem;
  // MNNVL support
  nvmlGpuFabricInfoV_t fabricInfo;
  int cuMemSupport;
  int version;
 };

+typedef enum ncclGroupTaskType { // selects one of the per-type group lists (see ncclComm::groupNext)
+  ncclGroupTaskTypeCollective = 0,
+  ncclGroupTaskTypeSymRegister = 1, // NOTE(review): presumably symmetric-memory registration work (cf. ncclComm::symRegTaskQueue) — confirm
+  ncclGroupTaskTypeNum = 2, // number of task types above; used as the length of per-type arrays
+} ncclGroupTaskType_t;
+
 struct ncclComm {
  uint64_t startMagic;
  struct ncclMemoryStack memPermanent, memScoped;
@@ -452,9 +466,10 @@ struct ncclComm {
  struct ncclTopoSystem* topo;
  struct ncclProxyConnector* gproxyConn;
  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> legacyRegCleanupQueue;
+  bool peerInfoValid;

-  int netPluginLoaded;
  ncclNet_t* ncclNet;
+  int netPluginIndex;
  int ncclNetVer;
  ncclNetDeviceType netDeviceType;
  ncclCollNet_t* ncclCollNet;
@@ -471,7 +486,6 @@ struct ncclComm {

  uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.

-  const char* commName;
  uint64_t commHash;
  int rank;    // my rank in the communicator
  int nRanks;  // number of GPUs in communicator
@@ -556,6 +570,7 @@ struct ncclComm {

  // Device side of the communicator (for cudaFree's)
  struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
+  struct ncclSymDevComm symDevComm;

  uint32_t workArgsBytes; // max size of kernel args
  uint32_t workFifoBytes; // size of workFifoBuf, power of 2
@@ -563,12 +578,10 @@ struct ncclComm {
  void* workFifoBufDev;
  void* workFifoBufGdrHandle;

-  // Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory.
-  uint32_t* workFifoConsumed/*[MAXCHANNELS]*/;
-  // Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS)
-  uint32_t workFifoConsumedLeast;
  // Monotonic number of bytes (mod 1<<32) sent to fifo.
  uint32_t workFifoProduced;
+  uint32_t workFifoProducedLastRecorded;
+  uint32_t workFifoConsumed;

  // Intra-process sync
  struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
@@ -584,10 +597,8 @@ struct ncclComm {
  struct ncclProxyState* proxyState;
  int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
  // Whether this communicator uses collNet
-  int collNetSupport;
  bool isOneRPN;
  uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
-  bool intraNodeP2pSupport;
  int* collNetHeads;
  int collNetHeadsNum;
  int* collNetDenseToUserRank;
@@ -609,7 +620,7 @@ struct ncclComm {

  // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
  // this comm is not yet in a group.
-  struct ncclComm* groupNext;
+  struct ncclComm* groupNext[ncclGroupTaskTypeNum];
  // Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
  struct ncclComm* preconnectNext;
  int localPersistentRefs; // number of persistent plan-lists capturing this comm
@@ -631,6 +642,7 @@ struct ncclComm {
  ncclUserRedOp *userRedOps;

  // Queue of things for the main thread to do
+  int reclaimSteps;
  struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;

  hipEvent_t doneEvent;
@@ -670,6 +682,9 @@ struct ncclComm {
  // group job to support multi-thread FT
  struct ncclGroupJob *groupJob;

+  // Flag indicating if this communicator shares resources with parent or children
+  bool shareResources;
+
  // Tuning plugin
  int tunerPluginLoaded;
  ncclTuner_t* tuner;
@@ -683,16 +698,25 @@ struct ncclComm {
  // buffer registration cache
  struct ncclRegCache regCache;
  int isAllNvlink;
+  bool isAllDirectP2p;
+  int symmetricSupport;
  bool useNetPXN;
  bool useGdr;
  int splitCount;

+  // symmetric buffer
+  uint8_t* baseUCSymPtr;
+  uint8_t* baseMCSymPtr;
+  size_t baseStride;
+  size_t symAllocHead;
+  CUmemGenericAllocationHandle symMCHandle;
+  struct ncclIntruQueue<struct ncclSymRegTask, &ncclSymRegTask::next> symRegTaskQueue;
+
  // Unroll factor for comm [RCCL]
  int unroll;
-
-  // custom collective
+  // custom collective [RCCL]
  bool enableCustColl;
-
+  
  uint64_t endMagic;
 };

@@ -724,15 +748,21 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome)
  return ncclSuccess;
 }

-inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) {
+inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm, bool waitSome) {
  ncclResult_t result = ncclSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  while (true) {
    struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue);
    if (cb == nullptr) break;
-    cudaError_t ok = cudaEventSynchronize(cb->event);
-    if (ok == cudaErrorNotReady) break;
+    cudaError_t ok;
+    if (waitSome) {
+      ok = cudaEventSynchronize(cb->event);
+      waitSome = false;
+    } else {
+      ok = cudaEventQuery(cb->event);
+      if (ok == cudaErrorNotReady) break;
+    }
    ncclIntruQueueDequeue(&comm->eventCallbackQueue);
    if (ok == cudaSuccess) {
      NCCLCHECKGOTO(cb->fn(comm, cb), result, finish);
@@ -58,4 +58,29 @@ static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
  return ncclSuccess;
 }

+// Formats the CPUs set in mask as comma-separated ranges (e.g. "0-2,5") into str and returns str.
+// At most len bytes are written, terminator included; output that does not fit is truncated.
+// When len is 0 (or str is NULL) nothing is written at all.
+static char* ncclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) {
+  if (str == NULL || len == 0) return str;
+  size_t c = 0;   // bytes written so far, always < len
+  int start = -1; // first CPU of the open range, -1 when none is open
+  str[0] = '\0';  // an empty mask yields an empty string
+  // Iterate through all possible CPU bits plus one extra position
+  for (int cpu = 0; cpu <= CPU_SETSIZE; cpu++) {
+    int isSet = (cpu == CPU_SETSIZE) ? 0 : CPU_ISSET(cpu, mask);
+    // Start of a new range
+    if (isSet && start == -1) start = cpu;
+    // End of a range, add comma between ranges
+    if (!isSet && start != -1) {
+      int n = (cpu-1 == start) ? snprintf(str+c, len-c, "%s%d", c ? "," : "", start)
+                               : snprintf(str+c, len-c, "%s%d-%d", c ? "," : "", start, cpu-1);
+      if (n < 0 || (size_t)n >= len-c) break; // snprintf error or truncated output: stop here
+      c += (size_t)n;
+      start = -1;
+    }
+  }
+  return str;
+}
+
 #endif
@@ -36,6 +36,10 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;
    }							      \
 } while(false)

+#define CUCALL(cmd) do {				      \
+    pfn_##cmd;				                \
+} while(false)
+
 #define CUCHECKGOTO(cmd, res, label) do {		      \
    CUresult err = pfn_##cmd;				      \
    if( err != CUDA_SUCCESS ) {				      \
@@ -66,49 +70,49 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;
    }									\
 } while(0)

-#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
+#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol

 #if CUDART_VERSION >= 11030
 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */
-DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
-DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
-DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
-DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
-DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
-DECLARE_CUDA_PFN_EXTERN(cuCtxCreate);
-DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
-DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent);
-DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
-DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice);
-DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute);
-DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel);
+DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000);
+DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000);
+DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000);
+DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
+DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
+DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 11040);
+DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
+DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
+DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
+DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
+DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000);
+DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel, 4000);
 #if CUDART_VERSION >= 11080
-DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx);
+DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx, 11060);
 #endif
 // cuMem API support
-DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve);
-DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree);
-DECLARE_CUDA_PFN_EXTERN(cuMemCreate);
-DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity);
-DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle);
-DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle);
-DECLARE_CUDA_PFN_EXTERN(cuMemMap);
-DECLARE_CUDA_PFN_EXTERN(cuMemRelease);
-DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle);
-DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess);
-DECLARE_CUDA_PFN_EXTERN(cuMemUnmap);
-DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle);
+DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
+DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
+DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle, 10020);
 #if CUDA_VERSION >= 11070
-DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
+DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
 #endif
 #if CUDA_VERSION >= 12010
 /* NVSwitch Multicast support */
-DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice);
-DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem);
-DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr);
-DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate);
-DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity);
-DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
+DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
 #endif
 #endif

@@ -14,6 +14,7 @@
 #include <hip/hip_bfloat16.h>
 #include "nccl_common.h"
 #include "bitops.h"
+#include "symmetric.h"
 #if defined(ENABLE_NPKIT)
 #include "npkit/npkit_struct.h"
 #endif
@@ -41,6 +42,30 @@ extern const char* funcNames[];
  #define NCCL_CUDA_ARCH 0
 #endif

+#ifdef __CUDA_ARCH_SPECIFIC__
+  #define NCCL_CUDA_ARCH_SPECIFIC __CUDA_ARCH_SPECIFIC__
+#elif defined(__CUDA_ARCH_HAS_FEATURE__)
+  #if __CUDA_ARCH_HAS_FEATURE__(SM90_ALL)
+    #define NCCL_CUDA_ARCH_SPECIFIC 900
+  #elif __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
+    #define NCCL_CUDA_ARCH_SPECIFIC 1000
+  #elif __CUDA_ARCH_HAS_FEATURE__(SM101_ALL)
+    #define NCCL_CUDA_ARCH_SPECIFIC 1010
+  #elif __CUDA_ARCH_HAS_FEATURE__(SM120_ALL)
+    #define NCCL_CUDA_ARCH_SPECIFIC 1200
+  #else
+    #define NCCL_CUDA_ARCH_SPECIFIC 0
+  #endif
+#else
+  #define NCCL_CUDA_ARCH_SPECIFIC 0
+#endif
+
+#ifdef __CUDA_ARCH_FAMILY_SPECIFIC__
+  #define NCCL_CUDA_ARCH_FAMILY_SPECIFIC __CUDA_ARCH_FAMILY_SPECIFIC__
+#else
+  #define NCCL_CUDA_ARCH_FAMILY_SPECIFIC 0
+#endif
+
 #include "net_device.h"

 enum ncclDevRedOp_t {
@@ -516,6 +541,14 @@ struct alignas(16) ncclDevChannel {
  uint64_t workCounter;
 };

+#define MAX_PROFILER_EVENTS_PER_CHANNEL 64 // number of entries in ncclDevProfiler::data
+struct ncclDevProfiler { // fixed-size table of (counter, timestamp) records
+  struct {
+    uint64_t counter;
+    uint64_t timestamp; // NOTE(review): clock source and units are not visible here — confirm with the code that writes it
+  } data[MAX_PROFILER_EVENTS_PER_CHANNEL];
+};
+
 struct ncclDevComm {
  int rank;
  int nRanks;
@@ -526,9 +559,6 @@ struct ncclDevComm {
  int isAllNvlink;
  int p2pnChannelsPerPeer;

-  // Work fifo return credits
-  uint32_t* workConsumed/*[MAXCHANNELS]*/;
-
  int* collNetDenseToUserRank;

  // Flag to ask NCCL kernels to abort
@@ -540,8 +570,8 @@ struct ncclDevComm {
  int* rankToLocalRank;

  // Profiler counters
-  uint64_t* workStarted/*[MAXCHANNELS]*/;
-  uint64_t* workCompleted/*[MAXCHANNELS]*/;
+  struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/;
+  struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/;

 #if defined(ENABLE_NPKIT)
  NpKitEventCollectContext* npKitEventCollectContexts;
@@ -641,7 +671,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int

 __host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
  // Our collective unroll should move to the same bytes&insns model as NVLS.
-  return cudaArch >= 800 ? (cudaArch == 1200 ? 6 : 8) : 4;
+  return cudaArch >= 800 ? (cudaArch / 100 == 12 ? 6 : 8) : 4;
 }

 __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
@@ -672,7 +702,6 @@ extern int const ncclDevKernelCount;
 extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];

 // Table of most specialized kernel function to run given func index.
-extern int const ncclDevFuncIdCount;
 extern int const ncclDevFuncRowToId[];
 extern void* const ncclDevKernelForFunc[/*funcIndex*/];
 extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/];
@@ -51,6 +51,8 @@ int ncclPxnDisable(struct ncclComm* comm);
 ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
 ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu);

+ncclResult_t ncclGetUserP2pLevel(int* level);
+
 #define MAX_XGMI_INTER_GPUS 4
 ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int64_t* id, int* dev);
 ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
@@ -81,7 +83,9 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex);
 ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);

+// Allows for up to 32 NICs per node on GB200-NVL72
 #define NCCL_TOPO_MAX_NODES 64
+ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType);

 // Init search. Needs to be done before calling ncclTopoCompute
 ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
@@ -10,9 +10,11 @@

 #include "nccl.h"
 #include "comm.h"
+#include "allocator.h"
+#include "register.h"

 ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
-void ncclGroupCommJoin(struct ncclComm* comm);
+void ncclGroupCommJoin(struct ncclComm* comm, int type);
 void ncclGroupCommPreconnect(struct ncclComm* comm);
 ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
 ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob);
@@ -53,13 +55,14 @@ ncclResult_t ncclAsyncLaunch(

 struct ncclGroupJob {
  struct ncclAsyncJob base;
-  struct ncclComm **groupCommHeadPtr;
-  struct ncclComm **groupCommPreconnectHeadPtr;
-  ncclResult_t *groupErrorPtr;
-  bool *abortFlagPtr;
-  int *groupBlockingPtr;
-  struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
-  bool initialized;
+  int groupRefCount;
+  bool nonBlockingInit;
+  bool joined;
+  struct ncclComm *groupCommHead[ncclGroupTaskTypeNum];
+  struct ncclComm *groupCommPreconnectHead;
+  ncclResult_t groupError;
+  bool abortFlag;
+  struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncJobs;
 };

 ncclResult_t ncclGroupStartInternal();
@@ -70,27 +73,9 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);

 extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
 extern __thread ncclResult_t ncclGroupError;
-extern __thread struct ncclComm* ncclGroupCommHead;
+extern __thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum];
 extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
 extern __thread int ncclGroupBlocking;
-extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
-extern __thread struct ncclGroupJob ncclGroupJobMain;
-
-static inline void groupResetJobState() {
-  ncclGroupBlocking = -1;
-  ncclGroupJobMainPtr = NULL;
-  memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
-  return;
-}
-
-static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
-  ncclResult_t ret = ncclSuccess;
-  if (job) {
-    ret = ncclAsyncJobComplete(&job->base);
-    groupResetJobState();
-  }
-  return ret;
-}

 inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
  if (ncclGroupDepth > 0) {
@@ -100,31 +85,32 @@ inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
 }

 // Add comm to this thread's group
-inline void ncclGroupCommJoin(struct ncclComm* comm) {
-  if (comm->groupNext == reinterpret_cast<struct ncclComm*>(0x1)) {
+inline void ncclGroupCommJoin(struct ncclComm* comm, int type) {
+  // 'type' indexes both ncclGroupCommHead[] and comm->groupNext[], so each task
+  // type has its own linked list. A groupNext[type] value of 0x1 is the marker
+  // written by ncclGroupCommLeave; the comm is only linked in when it carries it.
+  if (comm->groupNext[type] == reinterpret_cast<struct ncclComm*>(0x1)) {
     // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
     // the users program order yet insures siblings occur consecutively. This
     // is required by doLaunches() in "group.cc".
-    struct ncclComm** pp = &ncclGroupCommHead;
+    struct ncclComm** pp = &ncclGroupCommHead[type];
     while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
-      pp = &(*pp)->groupNext;
+      pp = &(*pp)->groupNext[type];
 
     // didn't find its clique, we need to insert it with ascending order based on commHash
     if (*pp == nullptr) {
-      pp = &ncclGroupCommHead;
-      while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext;
+      pp = &ncclGroupCommHead[type];
+      while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext[type];
     }
-    comm->groupNext = *pp;
+    comm->groupNext[type] = *pp;
     *pp = comm;
     // Comms gets a new memory stack scope upon joining. Each task batched for
     // this comm is allocated there.
     ncclMemoryStackPush(&comm->memScoped);
-    // Initialize planner
-    ncclKernelPlanner::Peer* tmp = comm->planner.peers;
-    memset(&comm->planner, 0, sizeof(comm->planner));
-    comm->planner.peers = tmp;
+    // The planner is only reset when joining for the collective task type; the
+    // peers pointer is saved before the memset and restored afterwards.
+    if (type == ncclGroupTaskTypeCollective) {
+      // Initialize planner
+      ncclKernelPlanner::Peer* tmp = comm->planner.peers;
+      memset(&comm->planner, 0, sizeof(comm->planner));
+      comm->planner.peers = tmp;
+    }
   }
-
+  // Runs on every call, whether or not the comm was newly linked above.
   ncclGroupBlocking = comm->config.blocking;
 }

@@ -137,8 +123,8 @@ inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
 }

 // Comm has left group
-inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) {
-  comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
+// NOTE(review): the forward declaration near the top of this header still reads
+// ncclGroupCommLeave(struct ncclComm* comm) without the 'type' parameter, which
+// leaves a second, undefined overload declared — confirm whether it should be updated.
+inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm, int type) {
+  // 0x1 is the value ncclGroupCommJoin tests for before linking the comm into
+  // the list for this task type.
+  comm->groupNext[type] = reinterpret_cast<struct ncclComm*>(0x1);
+  // Pops the memScoped stack; ncclGroupCommJoin pushes it when the comm is linked.
   ncclMemoryStackPop(&comm->memScoped);
   return ncclSuccess;
 }
@@ -0,0 +1,18 @@
+#ifndef NCCL_MLX5DV_CORE_H_
+#define NCCL_MLX5DV_CORE_H_
+
+/* Basic MLX5 direct verbs structs. Needed to dynamically load MLX5 direct verbs functions without
+ * explicitly including the MLX5 direct verbs header.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "ibvwrap.h"
+
+enum mlx5dv_reg_dmabuf_access  {
+	MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT		= (1<<0),
+};
+
+#endif  // NCCL_MLX5DV_CORE_H_
@@ -0,0 +1,23 @@
+#ifndef NCCL_MLX5DV_SYMBOLS_H_
+#define NCCL_MLX5DV_SYMBOLS_H_
+
+#ifdef NCCL_BUILD_MLX5DV
+#include <infiniband/mlx5dv.h>
+#else
+#include "mlx5/mlx5dvcore.h"
+#endif
+
+#include "nccl.h"
+
+/* MLX5 Direct Verbs function pointers.
+ * Each member holds the address of one mlx5dv entry point; the table is passed
+ * to buildMlx5dvSymbols(), declared later in this header. */
+struct ncclMlx5dvSymbols {
+  bool (*mlx5dv_internal_is_supported)(struct ibv_device *device);
+  int (*mlx5dv_internal_get_data_direct_sysfs_path)(struct ibv_context *context, char *buf, size_t buf_len);
+  /* DMA-BUF support */
+  struct ibv_mr * (*mlx5dv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
+  };
+
+/* Constructs MLX5 direct verbs symbols per rdma-core linking or dynamic loading mode */
+ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols);
+
+#endif  // NCCL_MLX5DV_SYMBOLS_H_
@@ -0,0 +1,41 @@
+/*************************************************************************
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2004, 2011-2012 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
+ *
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_MLX5DVWRAP_H_
+#define NCCL_MLX5DVWRAP_H_
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#ifdef NCCL_BUILD_MLX5DV
+#include <infiniband/mlx5dv.h>
+#else
+#include "mlx5/mlx5dvcore.h"
+#endif
+
+#include "core.h"
+#include "ibvwrap.h"
+#include <sys/types.h>
+#include <unistd.h>
+
+typedef enum mlx5dv_return_enum
+{
+    MLX5DV_SUCCESS = 0,                   //!< The operation was successful
+} mlx5dv_return_t;
+
+ncclResult_t wrap_mlx5dv_symbols(void);
+/* NCCL wrappers of MLX5 direct verbs functions */
+bool wrap_mlx5dv_is_supported(struct ibv_device *device);
+ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len);
+/* DMA-BUF support */
+ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
+struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access);
+
+#endif // NCCL_MLX5DVWRAP_H_
@@ -7,6 +7,9 @@
 #ifndef NCCL_DEBUG_H_
 #define NCCL_DEBUG_H_

+#include <cstdint>
+#include "nccl.h"
+
 typedef enum {
  NCCL_LOG_NONE = 0,
  NCCL_LOG_VERSION = 1,
@@ -39,6 +42,16 @@ typedef enum {

 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

+// NCCL core profiler callback for network defined events instrumentation
+enum {
+  ncclProfilerNetEventStart = 0,
+  ncclProfilerNetEventStop,
+  ncclProfilerNetEventUpdate,
+  ncclProfilerNetEventUpdateAndStop,
+};
+
+typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
+
 #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
 typedef enum {
  ncclFuncBroadcast = 0,
@@ -54,7 +67,7 @@ typedef enum {
  ncclNumFuncs = 10
 } ncclFunc_t;

-#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*
+#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/PAT
 #define NCCL_ALGO_UNDEF -1
 #define NCCL_ALGO_TREE 0
 #define NCCL_ALGO_RING 1
@@ -14,8 +14,6 @@

 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];

-ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
-ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
 ncclResult_t ncclNetInit(struct ncclComm* comm);
 ncclResult_t ncclNetFinalize(struct ncclComm* comm);

@@ -37,10 +37,11 @@
 #define NVTX_SID_CommInitRankScalable 17 // same schema as NVTX_SID_CommInitRank
 #define NVTX_SID_CommSplit            18
 #define NVTX_SID_CommFinalize         19
+#define NVTX_SID_CommShrink           20
 // When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below!

 // Define static schema ID for the reduction operation.
-#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 20 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
+#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 21 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START

 extern const nvtxDomainHandle_t ncclNvtxDomainHandle;

@@ -70,6 +70,16 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static cons
  )
 )

+NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommShrink, static constexpr,
+  NCCL_NVTX_PAYLOAD_ENTRIES(
+    (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr),
+    (int, nranks, TYPE_INT, nccl_nvtxNranksStr),
+    (int, myrank, TYPE_INT, nccl_nvtxRankStr),
+    (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr),
+    (int, num_exclude, TYPE_INT, "num_exclude")
+  )
+)
+
 NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr,
  NCCL_NVTX_PAYLOAD_ENTRIES(
    (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr)
@@ -29,10 +29,9 @@
 #define NCCL_NET_MAX_REQUESTS 32

 // Max number of ncclNet objects which can live in the same process
-#define NCCL_NET_MAX_PLUGINS 3
-
-// NCCL core profiler callback for network defined events instrumentation
-typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData);
+#ifndef NCCL_NET_MAX_PLUGINS
+#define NCCL_NET_MAX_PLUGINS 16
+#endif

 #include "net/net_v10.h"
 #include "net/net_v9.h"
@@ -19,43 +19,53 @@ enum {
 };

 typedef enum {
-  ncclProfilerProxyOpSendPosted,
-  ncclProfilerProxyOpSendRemFifoWait,
-  ncclProfilerProxyOpSendTransmitted,
-  ncclProfilerProxyOpSendDone,
-  ncclProfilerProxyOpRecvPosted,
-  ncclProfilerProxyOpRecvReceived,
-  ncclProfilerProxyOpRecvTransmitted,
-  ncclProfilerProxyOpRecvDone,
+  ncclProfilerProxyOpSendPosted        = 0,  // deprecated in v4
+  ncclProfilerProxyOpSendRemFifoWait   = 1,  // deprecated in v4
+  ncclProfilerProxyOpSendTransmitted   = 2,  // deprecated in v4
+  ncclProfilerProxyOpSendDone          = 3,  // deprecated in v4
+  ncclProfilerProxyOpRecvPosted        = 4,  // deprecated in v4
+  ncclProfilerProxyOpRecvReceived      = 5,  // deprecated in v4
+  ncclProfilerProxyOpRecvTransmitted   = 6,  // deprecated in v4
+  ncclProfilerProxyOpRecvDone          = 7,  // deprecated in v4
+  ncclProfilerProxyOpInProgress_v4     = 19,

  /* Legacy proxy profiler states */
-  ncclProfilerProxyStepSendGPUWait,
-  ncclProfilerProxyStepSendWait,
-  ncclProfilerProxyStepRecvWait,
-  ncclProfilerProxyStepRecvFlushWait,
-  ncclProfilerProxyStepRecvGPUWait,
+  ncclProfilerProxyStepSendGPUWait     = 8,
+  ncclProfilerProxyStepSendPeerWait_v4 = 20,
+  ncclProfilerProxyStepSendWait        = 9,
+  ncclProfilerProxyStepRecvWait        = 10,
+  ncclProfilerProxyStepRecvFlushWait   = 11,
+  ncclProfilerProxyStepRecvGPUWait     = 12,

  /* Legacy proxy control states */
-  ncclProfilerProxyCtrlIdle,
-  ncclProfilerProxyCtrlActive,
-  ncclProfilerProxyCtrlSleep,
-  ncclProfilerProxyCtrlWakeup,
-  ncclProfilerProxyCtrlAppend,
-  ncclProfilerProxyCtrlAppendEnd,
+  ncclProfilerProxyCtrlIdle            = 13,
+  ncclProfilerProxyCtrlActive          = 14,
+  ncclProfilerProxyCtrlSleep           = 15,
+  ncclProfilerProxyCtrlWakeup          = 16,
+  ncclProfilerProxyCtrlAppend          = 17,
+  ncclProfilerProxyCtrlAppendEnd       = 18,
+
+  /* Network defined event states */
+  ncclProfilerNetPluginUpdate          = 21,
+
+  /* Kernel event states */
+  ncclProfilerKernelChStop             = 22,
 } ncclProfilerEventState_t;

 typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t;
 typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t;
 typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t;
+typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t;

 #include <cstdint>
+#include "profiler/profiler_v4.h"
 #include "profiler/profiler_v3.h"
 #include "profiler/profiler_v2.h"
 #include "profiler/profiler_v1.h"

-typedef ncclProfiler_v3_t ncclProfiler_t;
-typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t;
-typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t;
+typedef ncclProfiler_v4_t ncclProfiler_t;
+typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t;
+typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t;

 #define NCCL_PROFILER_NET_VER_BITS  (16)
 #define NCCL_PROFILER_NET_VER_MASK  (~0U >> NCCL_PROFILER_NET_VER_BITS)
@@ -0,0 +1,123 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef PROFILER_V4_H_
+#define PROFILER_V4_H_
+
+typedef struct {
+  uint8_t type;                 // event type descriptor: ncclProfileColl, ...
+  void* parentObj;              // pointer to the profiler parent object (for coll is the group)
+  int rank;                     // originating rank
+  union {
+    struct {
+      uint64_t seqNumber;
+      const char* func;
+      void const* sendBuff;
+      void* recvBuff;
+      size_t count;
+      int root;
+      const char* datatype;
+      uint8_t nChannels;
+      uint8_t nWarps;
+      const char* algo;
+      const char* proto;
+    } coll;
+
+    struct {
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;
+      uint8_t nChannels;
+    } p2p;
+
+    struct {
+      pid_t pid;                // pid of the originating process
+      uint8_t channelId;        // channel id for this proxy operation
+      int peer;                 // remote rank for send/recv
+      int nSteps;               // number of steps for this proxy operation
+      int chunkSize;            // amount of data transferred by this proxy operation
+      int isSend;
+    } proxyOp;
+
+    struct {
+      int step;
+    } proxyStep;
+
+    struct {
+      uint8_t channelId;
+      uint64_t pTimer;          // start timestamp from GPU globaltimer
+    } kernelCh;
+
+    struct {
+      int64_t id;
+      void* data;
+    } netPlugin;
+  };
+} ncclProfilerEventDescr_v4_t;
+
+typedef union {
+  struct {
+    size_t transSize;
+  } proxyStep;
+
+  struct {
+    int appendedProxyOps;
+  } proxyCtrl;
+
+  struct {
+    void* data;
+  } netPlugin;
+
+  struct {
+    uint64_t pTimer;
+  } kernelCh;
+} ncclProfilerEventStateArgs_v4_t;
+
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  //  - commName       : user assigned communicator name
+  //  - commHash       : communicator id
+  //  - nNodes         : number of nodes in communicator
+  //  - nranks         : number of ranks in communicator
+  //  - rank           : rank identifier in communicator
+  //  - logfn          : logger function
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v4_t;
+
+#endif
@@ -21,8 +21,8 @@ struct ncclProxyConnector;

 struct ncclProfilerProxy {
  bool initialized;
-  uint64_t* workStarted/*[MAXCHANNELS]*/;
-  uint64_t* workCompleted/*[MAXCHANNELS]*/;
+  struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/;
+  struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/;
  uint64_t workCounter[MAXCHANNELS]; // host work counter
  struct ncclProxyConnector sendProxyConn[MAXCHANNELS];
  struct ncclProxyConnector recvProxyConn[MAXCHANNELS];
@@ -43,8 +43,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan);
 ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan);

 // Proxy Op Start/Stop Event Wrappers
-ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args);
-ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args);
+ncclResult_t ncclProfilerStartProxyOpEvent(int sub, struct ncclProxyArgs* args);
 ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args);

 // Proxy Step Start/Stop Event Wrappers
@@ -57,11 +56,11 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand
 ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle);

 // Kernel Channel Start/Stop Event Wrappers
-ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s);
-ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s);
+ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start);
+ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop);

 // Record Event Wrappers
-ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState);
+ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, ncclProfilerEventState_t eState);
 ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState);
 ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState);

@@ -118,6 +118,13 @@ struct ncclProxyOp {
  facebook_rccl::ProxyTraceExtraInfo traceInfo;
 };

+struct ncclProxySubArgs;
+
+// Per-step handle record; ncclProxySubArgs keeps NCCL_STEPS of these in pHandles[].
+struct ncclProxyEventHandle {
+  // Opaque handle for a step event.
+  void* stepEventHandle;
+  // NOTE(review): presumably points back at the ncclProxySubArgs that owns this entry — confirm.
+  struct ncclProxySubArgs* subArgPtr;
+};
+
 struct ncclProxySubArgs {
  struct ncclProxyConnection* connection;
  int reg;
@@ -150,13 +157,12 @@ struct ncclProxySubArgs {
  // Profiler plugin
  int eActivationMask;
  int rank;
-  uint64_t profilerSteps;
  pid_t pid;
  void* profilerContext;
  void* taskEventHandle;
  void* opEventHandle;
  void* kernelEventHandle;
-  void* stepEventHandles[NCCL_STEPS];
+  struct ncclProxyEventHandle pHandles[NCCL_STEPS];
  size_t transSize;
  uint64_t workCounter;

@@ -254,6 +260,8 @@ struct ncclProxyPeer {
 };

 struct ncclSharedNetComms {
+  int activeConnect[MAXCHANNELS];
+  int activeAccept[MAXCHANNELS];
  void* sendComm[MAXCHANNELS];
  void* recvComm[MAXCHANNELS];
  int sendRefCount[MAXCHANNELS];
@@ -29,18 +29,24 @@ struct ncclRegNetHandles {
  struct ncclRegNetHandles* next;
 };

+struct ncclSymRegTask {
+  struct ncclSymRegTask *next;
+  void* buff;
+  size_t baseSize;
+  CUmemGenericAllocationHandle memHandle;
+  struct ncclReg* regHandle;
+  size_t alignment;
+};
+
 struct ncclReg {
  // common attributes
-  size_t pages;
+  uintptr_t begAddr, endAddr; // page aligned
  int localRefs;
  int graphRefs;
-  uintptr_t addr;
  uint32_t state;
  // net reg
  struct ncclRegNetHandles* netHandleHead;
  // nvls reg
-  uintptr_t baseAddr;
-  size_t baseSize;
  CUdeviceptr regAddr;
  size_t regUCSize, regMCSize;
  int dev;
@@ -52,6 +58,10 @@ struct ncclReg {
  // general ipc reg
  struct ncclPeerRegIpcAddr regIpcAddrs;
  struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS];
+  // symmetric reg
+  void* baseSymPtr;
+  size_t symSize;
+  int winFlags;
 };

 struct ncclRegCache {
@@ -60,10 +70,14 @@ struct ncclRegCache {
  uintptr_t pageSize;
 };

+struct ncclWindow {
+  struct ncclReg* handle;
+};
+
 ncclResult_t ncclRegCleanup(struct ncclComm* comm);
-ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg);
 ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
 ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle);
 ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid);
+ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle);

 #endif
@@ -0,0 +1,33 @@
+#ifndef NCCL_REGISTER_INLINE_H_
+#define NCCL_REGISTER_INLINE_H_
+
+#include "comm.h"
+#include "register.h"
+
+// Look up the registration record whose [begAddr, endAddr] span covers the
+// 'size' bytes starting at 'data'. *outReg receives that record, or NULL when
+// no slot qualifies. Always returns ncclSuccess.
+// NOTE(review): stopping at the first slot with begAddr above 'data' presumes
+// the slots are ordered by ascending begAddr — confirm against the insert path.
+static inline ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** outReg) {
+  struct ncclRegCache* regCache = &comm->regCache;
+  uintptr_t const lo = (uintptr_t)data;
+  uintptr_t const hi = lo + size;
+  *outReg = NULL;
+  for (int i = 0; i != regCache->population; i++) {
+    struct ncclReg* cand = regCache->slots[i];
+    // This slot starts past 'data': give up without a match.
+    if (lo < cand->begAddr) break;
+    // 'data' starts inside or after begAddr; accept if the end fits too.
+    if (hi <= cand->endAddr) {
+      *outReg = cand;
+      break;
+    }
+  }
+  return ncclSuccess;
+}
+
+// Resolve 'data' to its address in the symmetric mapping of the registration
+// that covers it. On a hit, *outReg is the record and *symPtr is baseSymPtr
+// plus the offset of 'data' from the record's begAddr. Both outputs stay NULL
+// when no record covers the range or the record has no baseSymPtr.
+static inline ncclResult_t ncclRegFindSymmetric(struct ncclComm* comm, const void* data, size_t size, void** symPtr, struct ncclReg** outReg) {
+  *symPtr = NULL;
+  *outReg = NULL;
+  struct ncclReg* hit = NULL;
+  NCCLCHECK(ncclRegFind(comm, data, size, &hit));
+  if (hit == NULL || !hit->baseSymPtr) return ncclSuccess;
+  uintptr_t const offset = (uintptr_t)data - (uintptr_t)hit->begAddr;
+  *symPtr = (void*)((uintptr_t)hit->baseSymPtr + offset);
+  *outReg = hit;
+  return ncclSuccess;
+}
+
+#endif
@@ -15,25 +15,35 @@ typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, voi
 typedef hsa_status_t (*PFN_hsa_status_string)(hsa_status_t status, const char ** status_string);
 typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset);

+#ifdef __HIP_PLATFORM_AMD__
+#define CUPFN(symbol) symbol
+#else
 #define CUPFN(symbol) pfn_##symbol
+#endif

-// Check CUDA PFN driver calls
-#define CUCHECK(cmd) do {				      \
+// Check ROCr (HSA) calls made through the pfn_ function pointers.
+// 'cmd' is written without the pfn_ prefix; the macro prepends it. On any
+// status other than HSA_STATUS_SUCCESS it logs the text from
+// pfn_hsa_status_string and returns ncclUnhandledCudaError from the caller.
+// The message says "ROCr" because the status comes from the HSA runtime, not HIP.
+#define HSACHECK(cmd) do {				      \
     hsa_status_t err = pfn_##cmd;				      \
     if( err != HSA_STATUS_SUCCESS ) {				      \
       const char *errStr;				      \
       pfn_hsa_status_string(err, &errStr);	      \
       WARN("ROCr failure '%s'", errStr);		      \
+      return ncclUnhandledCudaError;			      \
+    }							      \
+} while(false)
+
+// Check HIP calls: cmd is invoked directly (no pfn_ prefix) and must yield a hipError_t
+#define CUCHECK(cmd) do {				      \
+    hipError_t err = cmd;				      \
+    if( err != hipSuccess ) {				      \
+      WARN("HIP failure '%s' at %s:%d", hipGetErrorString(err), __FILE__, __LINE__);		      \
      return ncclUnhandledCudaError;			      \
    }							      \
 } while(false)

 #define CUCHECKGOTO(cmd, res, label) do {		      \
-    hsa_status_t err = pfn_##cmd;				      \
-    if( err != HSA_STATUS_SUCCESS ) {				      \
-      const char *errStr;				      \
-      pfn_hsa_status_string(err, &errStr);	      \
-      WARN("ROCr failure '%s'", errStr);		      \
+    hipError_t err = cmd;				      \
+    if( err != hipSuccess ) {				      \
+      WARN("HIP failure '%s' at %s:%d", hipGetErrorString(err), __FILE__, __LINE__);		      \
      res = ncclUnhandledCudaError;			      \
      goto label;					      \
    }							      \
@@ -45,7 +55,7 @@ typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size
    if( err != HSA_STATUS_SUCCESS ) {						\
      const char *errStr;						\
      pfn_hsa_status_string(err, &errStr);			\
-      INFO(NCCL_ALL,"%s:%d ROCr failure '%s'", __FILE__, __LINE__, errStr);	\
+      INFO(NCCL_ALL,"%s:%d HIP failure '%s'", __FILE__, __LINE__, errStr);	\
    }									\
 } while(false)

@@ -69,8 +69,10 @@ struct ncclSocket {

 const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
 ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
-int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
-int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
+ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
+                                          union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found);
+ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs,
+                                int* nIfs);

 // Initialize a socket
 ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0);
@@ -0,0 +1,90 @@
+#ifndef NCCL_DEVICE_SYMMETRIC_H_
+#define NCCL_DEVICE_SYMMETRIC_H_
+
+#include "nccl.h"
+#include "nccl_common.h"
+#include "bitops.h"
+
+constexpr int ncclSymMaxBlocks = 64;
+constexpr int ncclSymMaxThreads = 512;
+constexpr int ncclSymLLMaxEltSize = 64;
+
+// Slot count for elements of eltSize bytes: the byte budget
+// ncclSymMaxThreads*ncclSymLLMaxEltSize divided by eltSize (integer division).
+constexpr __host__ __device__ int ncclSymLLMaxSlots(int eltSize = ncclSymLLMaxEltSize) {
+  return (ncclSymMaxThreads * ncclSymLLMaxEltSize) / eltSize;
+}
+
+// Bytes in one LL epoch for nRanks ranks: twice the larger of
+// ncclSymMaxThreads*nRanks*8 and the byte size of the maximum slot count at
+// the maximum element size.
+constexpr __host__ __device__ int ncclSymLLEpochSize(int nRanks) {
+  return /*LL Overhead*/2 * maxval(
+      ncclSymMaxThreads*nRanks*8,
+      ncclSymLLMaxSlots(ncclSymLLMaxEltSize)*ncclSymLLMaxEltSize);
+}
+
+// Fixed header followed by variable-size trailing storage; size() gives the
+// total byte count for a given rank count.
+struct alignas(16) ncclSymDevBase {
+  // One entry per block (ncclSymMaxBlocks) in each of the arrays below.
+  uint32_t llEpoch[ncclSymMaxBlocks];
+  uint32_t barEpochMc[ncclSymMaxBlocks], barEpochUc[ncclSymMaxBlocks];
+  uint32_t barInboxMc[ncclSymMaxBlocks];
+  // Flexible array member; size() budgets ncclSymMaxBlocks*nRanks entries for it.
+  uint32_t barInboxPerPeer[];
+
+  // Total bytes needed for nRanks ranks: this header, then the per-peer inbox
+  // array rounded up to 16 bytes, then two epochs of ncclSymLLEpochSize(nRanks)
+  // bytes for each of ncclSymMaxBlocks blocks.
+  static constexpr size_t size(int nRanks) {
+    return sizeof(ncclSymDevBase) +
+           alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16) +
+           ncclSymMaxBlocks * /*epochs=*/2 * ncclSymLLEpochSize(nRanks);
+  }
+};
+
+// Return the LL buffer for (block, low bit of epoch) within the storage that
+// trails *base. The walk is: header struct, then the 16-byte-rounded
+// barInboxPerPeer[] area, then two consecutive buffers of
+// ncclSymLLEpochSize(nRanks) bytes for each block.
+static __device__ uint4* ncclSymDevBase_getLLBuf(struct ncclSymDevBase* base, int nRanks, int block, uint32_t epoch) {
+  int const bytesPerEpoch = ncclSymLLEpochSize(nRanks);
+  // First byte after the header, then past barInboxPerPeer[].
+  char* llRegion = (char*)(base + 1) + alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16);
+  // Start of this block's pair of epoch buffers.
+  char* blockBufs = llRegion + block * /*epochs=*/2 * bytesPerEpoch;
+  // The low bit of epoch selects which of the two buffers to use.
+  return (uint4*)(blockBufs + (epoch & 1)*bytesPerEpoch);
+}
+
+struct ncclSymDevComm {
+  ncclSymDevBase* base;
+  ncclSymDevBase* baseMc;
+  uint32_t stride4G;
+  int nRanks, rank;
+  uint32_t nRanks_rcp32; // idivRcp32(nRanks)
+};
+
+struct alignas(16) ncclSymDevArgs {
+  struct ncclSymDevComm comm;
+  int rootRank;
+  uint64_t redOpArg; // must be collectively uniform
+  size_t nElts;
+  char* input;
+  char* output;
+};
+
+enum ncclSymKernelId {
+  ncclSymKernelId_AllReduce_AGxLL_R,
+  ncclSymKernelId_AllReduce_AGxLLMC_R,
+  ncclSymKernelId_AllReduce_RSxLD_AGxST,
+  ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC,
+
+  ncclSymKernelId_AllGather_LL,
+  ncclSymKernelId_AllGather_LLMC,
+  ncclSymKernelId_AllGather_ST,
+  ncclSymKernelId_AllGather_STMC,
+
+  ncclSymKernelId_ReduceScatter_LL,
+  ncclSymKernelId_ReduceScatter_LD,
+  ncclSymKernelId_ReduceScatter_LDMC,
+
+  ncclSymKernelId_Count
+};
+
+bool ncclSymImplemented(ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
+
+ncclResult_t ncclSymPickKernel(struct ncclComm* comm, ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps);
+
+// Generated by src/device/symmetric/generate.py
+extern int const ncclSymKernelCount;
+extern void* const ncclSymKernelList[];
+void* ncclSymGetKernelPtr(ncclSymKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
+const char* ncclSymKernelIdToString(int kernelId);
+
+#endif
@@ -23,6 +23,7 @@

 #include "proxy.h"
 #include "comm.h"
+#include "bootstrap.h"

 extern struct ncclTransport p2pTransport;
 extern struct ncclTransport shmTransport;
@@ -37,7 +38,15 @@ struct ncclConnector;
 struct ncclComm;

 #define CHANNEL_MASK_OFFSET(nranks, connIndex) (nranks * (connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0))
+
 #define CONNECT_SIZE 256
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+#define NCCL_MAX_PAGE_SIZE (512L * 1024L)
+#define NCCL_REC_PAGE_SIZE (4L * 1024L)
+#else
+#define NCCL_MAX_PAGE_SIZE (512L * 1024L * 1024L)
+#define NCCL_REC_PAGE_SIZE (2L * 1024L * 1024L)
+#endif
 struct ncclConnect {
  char data[CONNECT_SIZE];
 };
@@ -65,6 +74,7 @@ struct ncclNvlsSharedRes {
  char* ucBuff; // Unicast NVLS buffer address
  char* ucCredit; // Unicast NVLS credit address
  int nChannels;
+  int nHeads;
  struct ncclShmemCollBuff nvlsShmem;
  void *nvlsShmemHandle;
 };
@@ -104,7 +114,8 @@ struct ncclTransport {

 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, bool* needsProxy=NULL);
-ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode);
+ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode);
+ncclResult_t ncclTransportIsAllDirectP2p(struct ncclComm* comm, int* isAllDirectP2p);

 ncclResult_t ncclNvlsInit(struct ncclComm* comm);
 ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
@@ -139,5 +150,15 @@ ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, siz
 ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue);
 ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
 ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, bool* regNeedConnect);
+ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels);
+
+ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm);
+ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr);
+ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr);
+ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm);
+ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm);
+ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr);
+ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr);
+ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm);

 #endif
@@ -44,6 +44,12 @@ static long log2i(long n) {
  return log2Down(n);
 }

+// Three-way int comparator for qsort/bsearch: yields -1, 0 or 1 as *a is below, equal to or above *b
+static int compareInts(const void *a, const void *b) {
+    const int lhs = *(const int*)a, rhs = *(const int*)b;
+    return lhs < rhs ? -1 : (lhs > rhs ? 1 : 0);
+}
+
 inline uint64_t clockNano() {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);