2.22.3-1

Rework core for NVIDIA Trusted Computing * Compress work structs so that they are shared between channels * Utilize the full amount of kernel argument space permitted (4k) before resorting to work fifo. * Rework the task preprocessing phase. * Use a separate abortDevFlag which is kept in sync with abortFlag using cudaMemcpy operations. * Rename src/include/align.h to src/include/bitops.h Add lazy connection establishment for collective operations * Move buffer allocation and connection establishment to the first collective operation using that algorithm. * Accelerate init time and reduce memory usage. * Avoid allocating NVLS buffers if all calls are registered. * Compute algo/proto in ncclLaunchCollTasksInfo early on. * Connect peers in ncclCollPreconnectFunc if not connected already. * Also move shared buffer creation to the first send/recv call. Accelerate intra-node NVLink detection * Make each rank only detect NVLinks attached to its GPU. * Fuse XMLs to reconstruct the full NVLink topology Add init profiling to report time spent in different init phases. * Report timings of bootstrap, allgather, search, connect, etc. * Add new "PROFILE" category for NCCL_DEBUG_SUBSYS. Add support for PCI p2p on split PCI switches * Detect split PCI switches through a kernel module exposing switch information. * Update the topology XML and graph to add those inter-switch connections. Add cost estimation API * Add a new ncclGroupEndSimulate primitive to return the estimated time a group would take. Net/IB: Add separate traffic class for fifo messages * Add NCCL_IB_FIFO_TC to control the traffic class of fifo messages independently from NCCL_IB_TC. Merges PR #1194 Net/IB: Add support for IB router * Use flid instead of lid if subnets do not match * Warn if flid is 0 Optimizations and fixes for device network offload (unpack) * Double the default number of channels * Cache netDeviceType * Fix save/increment head logic to enable Tree support. 
Support ncclGroupStart/End for ncclCommAbort/Destroy * Allow Abort/Destroy to be called within a group when managing multiple GPUs with a single process. Improve Tuner API * Provide to the plugin the original cost table so that the plugin can leave unknown or disabled algo/proto combinations untouched. * Remove nvlsSupport and collnetSupport. Do not print version to stdout when using a debug file * Also print version from all processes with INFO debug level. Fixes issue #1271 Fix clang warnings in NVTX headers * Update NVTX headers to the latest version Fixes issue #1270 Disable port fusion in heterogeneous systems * Do not fuse ports if a mix of multi-port and single port are detected. Fix NVLS graphs search for dual NICs. * Fix NVLS graph search when we have more than one NIC per GPU. Fix crash with collnetDirect * Add separate graph search for collnetDirect, testing alltoall paths and working similarly to the NVLS search. Fix hang when nodes have different CPU types * Add the CPU type to the rank peer info. * Align all ranks on the CPU type after the first allgather. * Only use the aligned CPU type for all tuning operations. Fixes issue #1136 Fixes issue #1184 Fix performance of registered send/recv operations * Allow for single full size operations * Add INFO to confirm the registration of send/recv buffers. Move all sync ops to finalize stage * Ensure ncclCommDestroy is non-blocking if ncclCommFinalize has been called. Improve error reporting during SHM segment creation Improve support of various compilers Merges PR #1177 Merges PR #1228 Allow net and tuner plugins to be statically linked * Search for ncclNet or ncclTuner symbols in the main binary. Merges PR #979 Plugin examples includes cleanup * Harmonize err.h and common.h usage. * Add mixed plugin with both net and tuner.
2024-06-11 01:28:01 -07:00
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#endif
@@ -11,6 +11,7 @@ typedef enum { ncclSuccess                 =  0,
               ncclSystemError             =  2,
               ncclInternalError           =  3,
               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
               ncclRemoteError             =  6 } ncclResult_t;

 #endif
@@ -8,6 +8,7 @@
 #include <stdint.h>
 #include <stdlib.h>

+#include "common.h"
 #include "err.h"

 #define NCCL_NET_HANDLE_MAXSIZE 128
@@ -19,11 +20,6 @@
 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32

-typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
-
-typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
-
 #include "net_v8.h"
 #include "net_v7.h"
 #include "net_v6.h"
@@ -2,8 +2,8 @@
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 */

-#ifndef NCCL_ERR_H_
-#define NCCL_ERR_H_
+#ifndef NCCL_TYPES_H_
+#define NCCL_TYPES_H_

 /* Data types */
 typedef enum { ncclInt8       = 0, ncclChar       = 0,
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+#endif
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef NCCL_ERR_H_
+#define NCCL_ERR_H_
+
+/* Error type for plugins */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6 } ncclResult_t;
+
+#endif
@@ -8,15 +8,24 @@
 #ifndef NCCL_TUNER_H_
 #define NCCL_TUNER_H_

-#include "nccl.h"
+#include <stdint.h>
+#include <stdlib.h>

-typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
-
-typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+#include "common.h"
+#include "err.h"

 #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
-typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
+typedef enum {
+  ncclFuncBroadcast = 0,
+  ncclFuncReduce = 1,
+  ncclFuncAllGather = 2,
+  ncclFuncReduceScatter = 3,
+  ncclFuncAllReduce = 4,
+  ncclFuncSendRecv = 5,
+  ncclFuncSend = 6,
+  ncclFuncRecv = 7,
+  ncclNumFuncs = 8
+} ncclFunc_t;

 #define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
 #define NCCL_ALGO_UNDEF -1
@@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
 #define NCCL_PROTO_LL128 1
 #define NCCL_PROTO_SIMPLE 2

+#define NCCL_ALGO_PROTO_IGNORE -1.0
+
 // API to be implemented by external tuner
 typedef struct {
  // Name of the tuner
@@ -52,31 +63,33 @@ typedef struct {
  //   - context: tuner context object
  //   - collType: collective type , e.g., allreduce, allgather…
  //   - nBytes: collective size in bytes
-  //   - collNetSupport: whether collnet supports this type
-  //   - nvlsSupport: whether nvlink sharp supports this time
  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
  //
  // Outputs:
-  //   - algorithm: selected algorithm to be used for the given collective
-  //   - protocol: selected protocol to be used for the given collective
  //   - nChannels: number of channels (hence SMs) to be used.
  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
  // default tuning for the given collective.
  // Also, the plugin is allowed to not set any output, or set only the
  // algorithm and protocol, but not only the algorithm or only the protocol.
  // Unset fields will be set automatically by NCCL.
  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
-                              int collNetSupport, int nvlsSupport, int numPipeOps,
-                              int *algorithm, int *protocol, int* nChannels);
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int* nChannels);

  // Terminates the plugin and cleans up any resources that the plugin allocated.
  // context: tuner context object
  ncclResult_t (*destroy)(void* context);
-} ncclTuner_v2_t;
+} ncclTuner_v3_t;

-typedef ncclTuner_v2_t ncclTuner_t;
+typedef ncclTuner_v3_t ncclTuner_t;

-#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"

 #endif
@@ -11,14 +11,21 @@
 __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; }

 __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
-                              int collNetSupport, int nvlsSupport, int numPipeOps,
-                              int *algorithm, int *protocol, int* nChannels) { *algorithm = NCCL_ALGO_RING; *protocol = NCCL_PROTO_SIMPLE; return ncclSuccess; }
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int* nChannels) {
+  // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo
+  if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
+    collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
+  }
+  *nChannels = 1;
+  return ncclSuccess;
+}

 __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }

 #define PLUGIN_NAME "Example"

-const ncclTuner_v2_t ncclTunerPlugin_v2 = {
+const ncclTuner_v3_t ncclTunerPlugin_v3 = {
  .name = PLUGIN_NAME,
  .init = pluginInit,
  .getCollInfo = pluginGetCollInfo,
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 21
-NCCL_PATCH   := 5
+NCCL_MINOR   := 22
+NCCL_PATCH   := 3
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -201,7 +201,6 @@ ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFrom

 ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
  memset(handle, 0, sizeof(ncclBootstrapHandle));
-  NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));

  const char* env = ncclGetEnv("NCCL_COMM_ID");
  if (env) {
@@ -210,7 +209,9 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
      WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
      return ncclInvalidArgument;
    }
+    handle->magic = NCCL_MAGIC;
  } else {
+    NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
    memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
    NCCLCHECK(bootstrapCreateRoot(handle, false));
  }
@@ -626,7 +627,7 @@ ncclResult_t bootstrapClose(void* commState) {
  struct bootstrapState* state = (struct bootstrapState*)commState;
  if (state->unexpectedConnections != NULL) {
    unexpectedFree(state);
-    if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
+    if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) {
      WARN("Unexpected connections are not empty");
      return ncclInternalError;
    }
@@ -7,16 +7,17 @@
 #include "channel.h"
 #include "param.h"
 #include "gdrwrap.h"
+#include "transport.h"

 ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
  struct ncclChannel* channel = &comm->channels[channelId];
  if (channel->id != -1) return ncclSuccess;

  int nRanks = comm->nRanks;
-  int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
+  int nvlsRanks = comm->localRanks;
  int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */;
  channel->id = channelId;
-  channel->workFifoSent = 0;
+  channel->workFifoProduced = 0;

  struct ncclSharedResources* sharedRes = comm->sharedRes;

@@ -74,7 +75,8 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo

  NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));

-  int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
+  int nvlsRanks = comm->localRanks;
+
  if (share) {
    channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
    channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;
@@ -9,6 +9,69 @@
 #include "enqueue.h"
 #include "nccl.h"

+const char* ncclFuncToString(ncclFunc_t fn) { // Returns a string literal naming the given ncclFunc_t value.
+  switch (fn) {
+  case ncclFuncAllGather: return "AllGather";
+  case ncclFuncAllReduce: return "AllReduce";
+  case ncclFuncBroadcast: return "Broadcast";
+  case ncclFuncRecv: return "Recv";
+  case ncclFuncReduce: return "Reduce";
+  case ncclFuncReduceScatter: return "ReduceScatter";
+  case ncclFuncSendRecv: return "SendRecv";
+  case ncclFuncSend: return "Send";
+  default: return "Invalid"; // Any value without a case above; the sibling *ToString helpers return "Unknown" instead.
+  }
+}
+
+const char* ncclDevRedOpToString(ncclDevRedOp_t op) { // Returns a string literal naming the given device reduction op.
+  switch (op) {
+  case ncclDevSum: return "Sum";
+  case ncclDevProd: return "Prod";
+  case ncclDevMinMax: return "MinMax";
+  case ncclDevPreMulSum: return "PreMulSum";
+  case ncclDevSumPostDiv: return "SumPostDiv";
+  default: return "Unknown"; // Any ncclDevRedOp_t value without a case above.
+  }
+}
+
+const char* ncclDatatypeToString(ncclDataType_t type) { // Returns the enumerator's own name (with the "nccl" prefix) as a string literal.
+  switch (type) {
+  case ncclInt8: return "ncclInt8";
+  case ncclInt32: return "ncclInt32";
+  case ncclUint32: return "ncclUint32";
+  case ncclInt64: return "ncclInt64";
+  case ncclUint64: return "ncclUint64";
+  case ncclFloat16: return "ncclFloat16";
+  case ncclFloat32: return "ncclFloat32";
+  case ncclFloat64: return "ncclFloat64";
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+  case ncclBfloat16: return "ncclBfloat16"; // Case only compiled when __CUDA_BF16_TYPES_EXIST__ is defined.
+#endif
+  default: return "Unknown"; // NOTE(review): there is no case for ncclUint8 (public nccl.h type), so it would land here - confirm intended.
+  }
+}
+
+const char* ncclAlgoToString(int algo) { // Returns a string literal naming the given NCCL_ALGO_* constant.
+  switch (algo) {
+  case NCCL_ALGO_TREE: return "TREE";
+  case NCCL_ALGO_RING: return "RING";
+  case NCCL_ALGO_COLLNET_DIRECT: return "COLLNET_DIRECT";
+  case NCCL_ALGO_COLLNET_CHAIN: return "COLLNET_CHAIN";
+  case NCCL_ALGO_NVLS: return "NVLS";
+  case NCCL_ALGO_NVLS_TREE: return "NVLS_TREE";
+  default: return "Unknown"; // Any other integer, which includes NCCL_ALGO_UNDEF if it is not one of the cases above.
+  }
+}
+
+const char* ncclProtoToString(int proto) { // Returns a string literal naming the given NCCL_PROTO_* constant.
+  switch (proto) {
+  case NCCL_PROTO_LL: return "LL";
+  case NCCL_PROTO_LL128: return "LL128";
+  case NCCL_PROTO_SIMPLE: return "SIMPLE";
+  default: return "Unknown"; // Any integer that is not one of the three protocol constants above.
+  }
+}
+
 NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
@@ -8,7 +8,10 @@
 #include "nccl_net.h"
 #include <stdlib.h>
 #include <stdarg.h>
+#include <string.h>
+#include <strings.h>
 #include <sys/syscall.h>
+#include <chrono>
 #include "param.h"

 int ncclDebugLevel = -1;
@@ -16,14 +19,15 @@ static int pid = -1;
 static char hostname[1024];
 thread_local int ncclDebugNoWarn = 0;
 char ncclLastError[1024] = ""; // Global string for the last error in human readable form
-uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
+static uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
 FILE *ncclDebugFile = stdout;
-pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
-std::chrono::steady_clock::time_point ncclEpoch;
+static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
+static std::chrono::steady_clock::time_point ncclEpoch;
+static bool ncclWarnSetDebugInfo = false;

 static __thread int tid = -1;

-void ncclDebugInit() {
+static void ncclDebugInit() {
  pthread_mutex_lock(&ncclDebugLock);
  if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
  const char* nccl_debug = ncclGetEnv("NCCL_DEBUG");
@@ -83,6 +87,8 @@ void ncclDebugInit() {
        mask = NCCL_BOOTSTRAP;
      } else if (strcasecmp(subsys, "REG") == 0) {
        mask = NCCL_REG;
+      } else if (strcasecmp(subsys, "PROFILE") == 0) {
+        mask = NCCL_PROFILE;
      } else if (strcasecmp(subsys, "ALL") == 0) {
        mask = NCCL_ALL;
      }
@@ -94,6 +100,15 @@ void ncclDebugInit() {
    free(ncclDebugSubsys);
  }

+  const char* ncclWarnSetDebugInfoEnv = ncclGetEnv("NCCL_WARN_ENABLE_DEBUG_INFO");
+  if (ncclWarnSetDebugInfoEnv != NULL && strlen(ncclWarnSetDebugInfoEnv) > 0) {
+    int64_t value;
+    errno = 0;
+    value = strtoll(ncclWarnSetDebugInfoEnv, NULL, 0);
+    if (!errno)
+      ncclWarnSetDebugInfo = value;
+  }
+
  // Cache pid and hostname
  getHostName(hostname, 1024, '.');
  pid = getpid();
@@ -143,8 +158,6 @@ void ncclDebugInit() {
  pthread_mutex_unlock(&ncclDebugLock);
 }

-NCCL_PARAM(WarnSetDebugInfo, "WARN_ENABLE_DEBUG_INFO", 0);
-
 /* Common logging function used by the INFO, WARN and TRACE macros
 * Also exported to the dynamically loadable Net transport modules so
 * they can share the debugging mechanisms and output files
@@ -178,7 +191,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
  if (level == NCCL_LOG_WARN) {
    len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
                   hostname, pid, tid, cudaDev, filefunc, line);
-    if (ncclParamWarnSetDebugInfo()) ncclDebugLevel = NCCL_LOG_INFO;
+    if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO;
  } else if (level == NCCL_LOG_INFO) {
    len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
  } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
@@ -190,17 +203,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
                   hostname, pid, tid, cudaDev, timestamp, filefunc, line);
  }

-  if (len) {
-    va_list vargs;
-    va_start(vargs, fmt);
-    len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
-    va_end(vargs);
-    // vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
-    // Rewind len so that we can replace the final \0 by \n
-    if (len > sizeof(buffer)) len = sizeof(buffer)-1;
-    buffer[len++] = '\n';
-    fwrite(buffer, 1, len, ncclDebugFile);
-  }
+  va_list vargs;
+  va_start(vargs, fmt);
+  len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
+  va_end(vargs);
+  // vsnprintf may return len > sizeof(buffer) in the case of a truncated output.
+  // Rewind len so that we can replace the final \0 by \n
+  if (len > sizeof(buffer)) len = sizeof(buffer)-1;
+  buffer[len++] = '\n';
+  if (len) fwrite(buffer, 1, len, ncclDebugFile);
 }

 NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
@@ -10,30 +10,26 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-  __device__ __forceinline__ void runRing(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
-    const int nthreads = (int)args->nWarps * WARP_SIZE;
+  __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
    ncclRing *ring = &ncclShmem.channel.ring;
    const int *ringRanks = ring->userRanks;
    const int nranks = ncclShmem.comm.nRanks;
-    const size_t chunkCount = args->chunkCount;
-    const size_t channelCount = args->workCount;
-    const size_t gridOffset = args->workOffset;
-    const size_t count = args->count;
+    size_t count, partOffset, partCount, chunkCount;
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount);
    size_t offset;
    size_t dataOffset;
    int nelem;
    int rankDest;

-    T *inputBuf = (T*)args->sendbuff;
-    T *outputBuf = (T*)args->recvbuff;
+    T *inputBuf = (T*)work->sendbuff;
+    T *outputBuf = (T*)work->recvbuff;
    Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
-      (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
+      (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);

-    for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
+    for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) {
      /////////////// begin AllGather steps ///////////////
-      nelem = min(chunkCount, channelCount - elemOffset);
-      dataOffset = gridOffset + elemOffset;
+      nelem = min(chunkCount, partCount - elemOffset);
+      dataOffset = partOffset + elemOffset;

      // step 0: push data to next GPU
      rankDest = ringRanks[0];
@@ -64,52 +60,50 @@ namespace {
 }

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
+struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
    using Proto = ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>;
-    runRing<T, RedOp, Proto>(args);
+    runRing<T, RedOp, Proto>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runRing<T, RedOp, ProtoLL>(args);
+struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runRing<T, RedOp, ProtoLL128>(args);
+struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
+struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
-    const ssize_t count = args->count;
    const ssize_t rank = ncclShmem.comm.rank;
-    const size_t chunkCount = args->chunkCount;
-    size_t gridOffset = args->workOffset;
-    size_t channelCount = args->workCount;
+    size_t count, gridOffset, channelCount;
+    size_t chunkCount;
+    ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
    size_t offset;
    int nelem;

-    const int nThreadsBcast = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
-    const int nThreadsGather = args->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
+    const int nThreadsBcast = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE;
+    const int nThreadsGather = work->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast;
    const int tidEndGather = nThreadsGather;
    const int tidEndBcast = tidEndGather + nThreadsBcast;

-    if (!args->regUsed) {
+    if (!work->regUsed) {
      if (tid < tidEndGather) {
        // Gather
        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-          prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+          prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
+            work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
          nelem = min(chunkCount, channelCount - elemOffset);
@@ -119,8 +113,8 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
        // Bcast through NVLS
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
        Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
-          prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
-            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
+          prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL,
+            work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
          nelem = min(chunkCount, channelCount - elemOffset);
@@ -133,7 +127,7 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
        Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
          prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL,
-            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+            work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);

        /* used as sync */
        prims.scatter(0, 0, 0, 0, -1, 0);
@@ -144,8 +138,8 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
      } else if (tid < tidEndBcast) {
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-          prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, args->sendbuff, NULL,
-            args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, args);
+          prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL,
+            work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work);
        /* used as sync */
        prims.recv(0, 0);

@@ -161,10 +155,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
+struct RunWorkColl<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
  template<bool BcastSendNotRecv>
  struct Scatterer {
-    struct ncclWorkElem* args;
+    struct ncclDevWorkColl* work;
    ssize_t chunkSize;
    ssize_t railGridOffset;

@@ -179,13 +173,13 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
      struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
      int nNodes = ncclShmem.comm.nNodes;
      int nRails = direct->nHeads;
-      int bid = args->bid;
-      char* inbuf = (char*)args->sendbuff;
-      char* outbuf = (char*)args->recvbuff;
-      ssize_t sizePerRank = args->count*sizeof(T);
+      int part = ncclShmem.channelId - work->channelLo;
+      char* inbuf = (char*)work->sendbuff;
+      char* outbuf = (char*)work->recvbuff;
+      ssize_t sizePerRank = work->collnet.count*sizeof(T);
      bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank);

-      ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
+      ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank);
      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
      int railAllSize = railAllEnd - railAllBeg;
      if (tid < nDsts) dstSizes[tid] = railAllSize;
@@ -232,28 +226,27 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
    }
  };

-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    int tid = threadIdx.x;
-    const int nChannels = args->nChannels;
+  __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
+    const int part = ncclShmem.channelId - work->channelLo;
+    const int nChannels = work->channelHi - work->channelLo + 1;
    struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
    int const &nNodes = ncclShmem.comm.nNodes;
-    ssize_t chunkSize = int(args->chunkCount);
-    ssize_t const &sizePerRank = args->count;
-
+    ssize_t sizePerRank = work->collnet.count*sizeof(T);
+    size_t chunkSize = work->collnet.chunkCount;
    bool isMultiRail = (direct->nHeads > 1);
    int nWarps1 = 1;
    int nWarps2 = (isMultiRail ? 2 : 1);
    int nWarps3 = (isMultiRail ? 2 : 0);
-    float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);
+    float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3);
    nWarps3 = int(denom*nWarps3);
    nWarps2 = int(denom*nWarps2);
-    nWarps1 = args->nWarps - (nWarps2+nWarps3);
+    nWarps1 = work->nWarps - (nWarps2+nWarps3);

    using Proto = ProtoSimple<1, 1>;

    int tn = nWarps1*WARP_SIZE;
    if (tid < tn) {
-      if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
        if (tid == 0) {
          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
          Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
@@ -262,10 +255,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
      } else {
        // Phase 1: send to network
        Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
-          prims(tid, tn, nullptr, &direct->out, args->sendbuff, nullptr,
+          prims(tid, tn, nullptr, &direct->out, work->sendbuff, nullptr,
            /*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1);
        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
-          ssize_t railAllBeg = railGridOffset + args->bid * chunkSize;
+          ssize_t railAllBeg = railGridOffset + part * chunkSize;
          ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
          ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
          ssize_t railOneEnd = railOneBeg + sizePerRank;
@@ -280,7 +273,7 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC

    tn = nWarps2*WARP_SIZE;
    if (tid < tn) {
-      if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
        if (tid == 0) {
          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
          Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
@@ -293,10 +286,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
            /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0);
        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
          Scatterer</*BcastSendNotRecv=*/true> scat;
-          scat.args = args;
+          scat.work = work;
          scat.chunkSize = chunkSize;
          scat.railGridOffset = railGridOffset;
-          prims.process</*Recv=*/1, /*Send=*/1>(scat);
+          prims.template process</*Recv=*/1, /*Send=*/1>(scat);
        }
      }
      return;
@@ -311,10 +304,10 @@ struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
              /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0);
      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
        Scatterer</*BcastSendNotRecv=*/false> scat;
-        scat.args = args;
+        scat.work = work;
        scat.chunkSize = chunkSize;
        scat.railGridOffset = railGridOffset;
-        prims.process</*Recv=*/1, /*Send=*/0>(scat);
+        prims.template process</*Recv=*/1, /*Send=*/0>(scat);
      }
      return;
    }
@@ -10,28 +10,27 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-  __device__ __forceinline__ void runRing(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
-    const int nthreads = (int)args->nWarps * WARP_SIZE;
+  __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
    ncclRing *ring = &ncclShmem.channel.ring;
    int ringIx = ring->index;
-    ssize_t chunkCount = args->chunkCount;
    const int nranks = ncclShmem.comm.nRanks;
+    ssize_t gridOffset;
+    ssize_t channelCount;
+    ssize_t chunkCount;
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
    const ssize_t loopCount = nranks * chunkCount;
    ssize_t offset;
-    ssize_t gridOffset = args->workOffset;
-    ssize_t channelCount = args->workCount;
    int nelem;
    int chunk;

    Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
-      (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
+      (tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);

    for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
      ssize_t remCount = channelCount - elemOffset;
      ssize_t chunkOffset;

-      if (remCount < loopCount) chunkCount = args->lastChunkCount;
+      if (remCount < loopCount) chunkCount = alignUp(divUp(remCount, nranks), 16/sizeof(T));

      auto modRanks = [&]__device__(int r)->int {
        return r - (r >= nranks ? nranks : 0);
@@ -75,24 +74,24 @@ namespace {
      chunkOffset = chunk * chunkCount;
      offset = gridOffset + elemOffset + chunkOffset;
      nelem = (int)min(chunkCount, remCount - chunkOffset);
+
      prims.directRecv(offset, nelem);
    }
  }

  template<typename T, typename RedOp, typename Proto>
-  __device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
-    const int nthreads = (int)args->nWarps * WARP_SIZE;
+  __device__ __forceinline__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
    ncclTree *tree = &ncclShmem.channel.tree;
-    const size_t channelCount = args->workCount;
-    const size_t gridOffset = args->workOffset;
-    const size_t chunkCount = args->chunkCount;
+    size_t gridOffset;
+    size_t channelCount;
+    size_t chunkCount;
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
    size_t offset;
    int nelem;

    { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
-        (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
+        (tid, nthreads, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg);
      if (tree->up == -1) {
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
@@ -118,7 +117,7 @@ namespace {

    { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
      Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0> prims
-        (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
+        (tid, nthreads, &tree->up, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
      if (tree->up == -1) {
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
@@ -144,16 +143,14 @@ namespace {
  }

  template<typename T, typename RedOp, typename Proto>
-  __device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
-    const int nthreads = (int)args->nWarps * WARP_SIZE;
+  __device__ __forceinline__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
    ncclTree *tree = &ncclShmem.channel.tree;
-    const size_t chunkCount = args->chunkCount;
-    const size_t gridOffset = args->workOffset;
-    const size_t channelCount = args->workCount;
+    size_t gridOffset;
+    size_t channelCount;
+    size_t chunkCount;
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
    size_t offset;
    int nelem;
-
    int nthreadsSplit;
    if (Proto::Id == NCCL_PROTO_SIMPLE) {
      nthreadsSplit = nthreads/2;
@@ -167,7 +164,7 @@ namespace {
    if (tree->up == -1) {
      // Reduce and broadcast. Max number of recv is 2, max number of send is 2
      Primitives<T, RedOp, FanSymmetric<NCCL_MAX_TREE_ARITY_TOP>, /*Direct=*/1, Proto, 0>
-        prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
+        prims(tid, nthreads, tree->down, tree->down, work->sendbuff, work->recvbuff, work->redOpArg);
      for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
        offset = gridOffset + elemOffset;
        nelem = min(chunkCount, channelCount - elemOffset);
@@ -184,7 +181,7 @@ namespace {
       * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
       */
      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_TREE_ARITY, 1>, /*Direct=*/1, Proto, 0>
-        prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth);
+        prims(tid, nthreadsSplit, tree->down, &tree->up, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth);
      if (tree->down[0] == -1) {
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
@@ -203,8 +200,8 @@ namespace {
    else {
      // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
      Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_TREE_ARITY>, /*Direct=*/1, Proto, 0>
-        prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff,
-            args->redOpArg, 1*Proto::MaxGroupWidth);
+        prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, work->sendbuff, work->recvbuff,
+            work->redOpArg, 1*Proto::MaxGroupWidth);
      if (tree->down[0] == -1) {
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
@@ -224,34 +221,33 @@ namespace {
 }

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
+struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {  // AllReduce specialization for the ring algorithm with the Simple protocol.
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {  // Entry point; tid, nthreads and work are passed through unchanged.
    using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>;
-    runRing<T, RedOp, Proto>(args);
+    runRing<T, RedOp, Proto>(tid, nthreads, work);  // The whole collective is delegated to runRing, instantiated with the Proto alias above.
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
+struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {  // AllReduce specialization for the tree algorithm with the Simple protocol.
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {  // Entry point; picks one of two tree implementations at compile time.
    #if CUDART_VERSION >= 11020 && CUDART_VERSION < 11040 && __CUDA_ARCH__ >= 800
-      runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(args);
+      runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(tid, nthreads, work);  // Compiled only when CUDART_VERSION is in [11020, 11040) and __CUDA_ARCH__ >= 800.
    #else
-      runTreeSplit<T, RedOp, ProtoSimple<1, 1>>(args);
+      runTreeSplit<T, RedOp, ProtoSimple<1, 1>>(tid, nthreads, work);  // Compiled for every other toolkit/architecture combination.
    #endif
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
+struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
    static constexpr int COLLNET_COPY_THREADS = 96;
-    const int tid = threadIdx.x;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
+    const int bid = ncclShmem.channelId - work->channelLo;
+    const int nChannels = work->channelHi - work->channelLo + 1;
    struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
-    const ssize_t chunkSize = args->chunkCount;
-    const ssize_t size = args->count;
+    const ssize_t chunkSize = work->collnet.chunkCount;
+    const ssize_t size = work->collnet.count;
    const ssize_t loopSize = nChannels*direct->nHeads*chunkSize;

    const int hasUp = (direct->up[0] >= 0) ? 1 : 0;
@@ -259,7 +255,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
    const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0);
    const int nThreadsGather  =             ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
    const int nThreadsBcast   = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 2*COLLNET_COPY_THREADS);
-    const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
+    const int nThreadsReduce = work->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
    const int tidStartBcast = nThreadsGather;
    const int tidStartScatter = tidStartBcast + nThreadsBcast;
    const int tidStartReduce = tidStartScatter + nThreadsScatter;
@@ -269,12 +265,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
    if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
      // Scatter
      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
-        prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff,
-           args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args);
+        prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff,
+           work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work);
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
        ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
        int nelem = min(direct->nHeads*chunkSize, size-offset);
-        if (args->regUsed) {
+        if (work->regUsed) {
          prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
        } else {
          prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
@@ -284,12 +280,12 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
      if (hasDn) {
        // Reduce, send to network
        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
-          prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff,
-             args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args);
+          prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff,
+             work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
          ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
          int nelem = min(chunkSize, size-offset);
-          if (args->regUsed) {
+          if (work->regUsed) {
            prims.directRecvReduceSend(offset, nelem);
          } else {
            prims.recvReduceSend(offset, nelem);
@@ -297,7 +293,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
        }
      } else {
        // Directly send to network
-        if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
+        if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
          if (tid == tidStartReduce) {
            int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
            Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
@@ -305,8 +301,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
          __syncwarp();
        } else {
          Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
-          prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff,
-             args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
+          prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, work->sendbuff, work->recvbuff,
+             work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
            int nelem = min(chunkSize, size-offset);
@@ -317,8 +313,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
    } else if (tid < tidStartBcast && hasUp) {
      // Gather
      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
-        prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff,
-           args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args);
+        prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff,
+           work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work);
      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
        ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
        int nelem = min(direct->nHeads*chunkSize, size-offset);
@@ -328,15 +324,15 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
      if (hasDn) {
        // Recv from network, broadcast
        Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
-          prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff,
-             args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, args);
+          prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, work->sendbuff, work->recvbuff,
+             work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
          ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
          int nelem = min(chunkSize, size-offset);
          prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
        }
      } else {
-        if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
+        if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
          if (tid == tidStartBcast) {
            int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
            Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
@@ -345,8 +341,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
        } else {
          // Recv from network (no post thread needed)
          Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
-            prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
-              args->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0);
+            prims(tid - tidStartBcast, nThreadsBcast, &direct->out, nullptr, work->sendbuff, work->recvbuff,
+              work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0);
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize;
            int nelem = min(chunkSize, size - offset);
@@ -359,18 +355,16 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCC
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
+struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
-    ssize_t chunkSize = args->chunkCount;
    const bool hasOut = nvls->out != -1;
    const int nranks = ncclShmem.comm.nRanks;
    const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
-    const int bcastWarps = hasOut ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
-    const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
-    const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
-    const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
+    const int bcastWarps = hasOut ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 2) : 0;
+    const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasOut ? 3 : nranks <= 6 ? 7 : 5);
+    const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
+    const int gatherWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;

    const int nThreadsScatter = scatterWarps*WARP_SIZE;
    const int nThreadsGather  = gatherWarps*WARP_SIZE;
@@ -381,35 +375,37 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
    const int tidEndReduce = tidEndGather + nThreadsReduce;
    const int tidEndBcast = tidEndReduce + nThreadsBcast;

-    if (args->oneNode) {
+    if (work->oneNode) {
+      ssize_t gridOffset, channelCount, chunkSize;
+      ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkSize);
      const ssize_t loopCount = nvls->nHeads * chunkSize;
-      const ssize_t channelCount = args->workCount;
-      const ssize_t gridOffset = args->workOffset;
      ssize_t offset;
      int nelem;
+      int remCount = channelCount%(nvls->nHeads*chunkSize);
+      int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T));

      if (tid < tidEndScatter) {
        // Scatter
        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
        Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
-          prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+          prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
+            work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
-          if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
+          if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
          offset = gridOffset + elemOffset;
-          nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+          nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
          prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
        }
      } else if (tid < tidEndGather) {
        // Gather
        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-          prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-            args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
+          prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
+            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
-          if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
+          if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
          offset = gridOffset + elemOffset;
-          nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+          nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
          prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
        }
      } else if (tid < tidEndReduce) {
@@ -417,10 +413,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
          prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
-            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
+            work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
          ssize_t chunkOffset;
-          if (channelCount - elemOffset < loopCount) chunkSize = args->lastChunkCount;
+          if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize;
          chunkOffset = elemOffset + nvls->headRank * chunkSize;
          offset = gridOffset + chunkOffset;
          nelem = min(chunkSize, channelCount - chunkOffset);
@@ -428,30 +424,32 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
        }
      }
    } else {
-      const int bid = args->bid;
-      const ssize_t loopSize = args->nChannels * nvls->nHeads * chunkSize;
-      const ssize_t size = args->count;
+      const int bid = ncclShmem.channelId - work->channelLo;
+      const int nChannels = work->channelHi - work->channelLo + 1;
+      const ssize_t chunkSize = work->collnet.chunkCount;
+      const ssize_t loopSize = nChannels * nvls->nHeads * chunkSize;
+      const ssize_t size = work->collnet.count;

      if (tid < tidEndScatter) {
        // Scatter
        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
        Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
-          prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+          prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
+            work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
          ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
-          int nelem = args->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
+          int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset);
          prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
        }
      } else if (tid < tidEndGather) {
        // Gather
        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-          prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-            args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
+          prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
+            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
          ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize;
-          int nelem = args->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
+          int nelem = work->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset);
          prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
        }
      } else if (tid < tidEndReduce && nvls->headRank != -1) {
@@ -460,7 +458,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
          using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
          Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
            prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
-              args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
+              work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
            int nelem = min(chunkSize, size - offset);
@@ -471,7 +469,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
          using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
          Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
            prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
-              args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, args);
+              work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work);
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
            int nelem = min(chunkSize, size - offset);
@@ -483,7 +481,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
          prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
-            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
+            work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
          ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize;
          int nelem = min(chunkSize, size - offset);
@@ -495,25 +493,25 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SI
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
+struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
    const int treeUp = nvls->treeUp;
    const int* treeDown = nvls->treeDown;
-    ssize_t chunkCount = args->chunkCount;
+    ssize_t gridOffset, channelCount, chunkCount;
+    ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
    const ssize_t loopCount = nvls->nHeads * chunkCount;
-    const ssize_t channelCount = args->workCount;
-    const ssize_t gridOffset = args->workOffset;
    const int nranks = ncclShmem.comm.nRanks;
    const bool hasUp = treeUp != -1;
    const int totalWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
-    const int bcastWarps = hasUp ? (args->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0;
-    const int reduceWarps = args->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
-    const int scatterWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
-    const int gatherWarps = args->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
+    const int bcastWarps = hasUp ? (work->regUsed ? ((totalWarps - 2) >> 1) - 1 : 4) : 0;
+    const int reduceWarps = work->regUsed ? (totalWarps - bcastWarps - 2) : (hasUp ? 5 : nranks <= 6 ? 7 : 5);
+    const int scatterWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps + 1) >> 1;
+    const int gatherWarps = work->regUsed ? 1 : (totalWarps - reduceWarps - bcastWarps) >> 1;
    ssize_t offset;
    int nelem;
+    int remCount = channelCount%(nvls->nHeads*chunkCount);
+    int lastChunkCount = alignUp(divUp(remCount, nvls->nHeads), 16/sizeof(T));

    const int nThreadsScatter = scatterWarps*WARP_SIZE;
    const int nThreadsGather  = gatherWarps*WARP_SIZE;
@@ -528,24 +526,24 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
      // Scatter
      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
-        prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-          args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+        prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
+          work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
      for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
-        if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
+        if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
        offset = gridOffset + elemOffset;
-        nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+        nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
        prims.scatter(offset, nelem, chunkCount, chunkCount, -1, 0);
      }
    } else if (tid < tidEndGather) {
      // Gather
      using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
      Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
-        prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
-          args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
+        prims(tid - tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff,
+          work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
      for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
-        if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
+        if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
        offset = gridOffset + elemOffset;
-        nelem = args->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
+        nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
        prims.gather(offset, nelem, chunkCount, chunkCount, -1, 0);
      }
    } else if (tid < tidEndReduce && nvls->headRank != -1) {
@@ -554,10 +552,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
        Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/1, Proto, 0>
          prims(tid - tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
-            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
+            work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
          ssize_t chunkOffset;
-          if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
+          if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
          chunkOffset = elemOffset + nvls->headRank * chunkCount;
          offset = gridOffset + chunkOffset;
          nelem = min(chunkCount, channelCount - chunkOffset);
@@ -568,10 +566,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
        Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/1, Proto, 0>
          prims(tid - tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
-            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, args);
+            work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work);
        for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
          ssize_t chunkOffset;
-          if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
+          if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
          chunkOffset = elemOffset + nvls->headRank * chunkCount;
          offset = gridOffset + chunkOffset;
          nelem = min(chunkCount, channelCount - chunkOffset);
@@ -583,10 +581,10 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
      using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
      Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/1, Proto, 0>
        prims(tid - tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
-          args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
+          work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
      for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) {
        ssize_t chunkOffset;
-        if (channelCount - elemOffset < loopCount) chunkCount = args->lastChunkCount;
+        if (channelCount - elemOffset < loopCount) chunkCount = lastChunkCount;
        chunkOffset = elemOffset + nvls->headRank * chunkCount;
        offset = gridOffset + chunkOffset;
        nelem = min(chunkCount, channelCount - chunkOffset);
@@ -597,17 +595,15 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PRO
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
-    const int nthreads = args->nWarps*WARP_SIZE;
-    const int bid = args->bid;
-    const int nChannels = args->nChannels;
+struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    const int bid = ncclShmem.channelId - work->channelLo;
+    const int nChannels = work->channelHi - work->channelLo + 1;
    ncclTree *tree = &ncclShmem.channel.collnetChain;
-    ssize_t chunkSize = args->chunkCount;
+    ssize_t chunkSize = work->collnet.chunkCount;
    const ssize_t loopSize = int(nChannels*chunkSize);
    const int nranks = ncclShmem.comm.nRanks;
-    const ssize_t size = args->count;
+    const ssize_t size = work->collnet.count;

    int nthreadsSplit = nthreads/2;
    if (nthreadsSplit >= 256) nthreadsSplit += 64;
@@ -634,7 +630,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL

    if (tid < nthreadsSplit) {
      if (recv == -1) {
-        if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
+        if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
          if (groupTid == 0) {
            int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
            Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps);
@@ -642,8 +638,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
          __syncwarp();
        } else {
          Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-            prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
-              args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
+            prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
+              work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + bid * int(chunkSize);
            int nelem = min(chunkSize, size - offset);
@@ -652,8 +648,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
        }
      } else {
        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-          prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
-            args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
+          prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
+            work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
        for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
          ssize_t offset = gridOffset + bid * int(chunkSize);
          int nelem = min(chunkSize, size - offset);
@@ -665,7 +661,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
      if (recv == nranks) {
        // I'm the first in the broadcast chain, I need to perform the division (postOp)
        if (send == -1) {
-          if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
+          if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
            if (groupTid == 0) {
              int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE);
              Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps);
@@ -673,8 +669,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
            __syncwarp();
          } else {
            Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-              prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
-                args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
+              prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
+                work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
            for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
              ssize_t offset = gridOffset + bid * int(chunkSize);
              int nelem = min(chunkSize, size - offset);
@@ -683,8 +679,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
          }
        } else {
          Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-            prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
-              args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
+            prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
+              work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + bid * int(chunkSize);
            int nelem = min(chunkSize, size - offset);
@@ -693,8 +689,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
        }
      } else {
        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-          prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
-            args->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
+          prims(groupTid, groupNthreads, &recv, &send, work->sendbuff, work->recvbuff,
+            work->redOpArg, group * Proto::MaxGroupWidth, connIndex, connIndex);
        if (send == -1) {
          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
            ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -714,29 +710,29 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runRing<T, RedOp, ProtoLL>(args);
+struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runTreeSplit<T, RedOp, ProtoLL>(args);
+struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runTreeSplit<T, RedOp, ProtoLL>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runRing<T, RedOp, ProtoLL128>(args);
+struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runTreeSplit<T, RedOp, ProtoLL128>(args);
+struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runTreeSplit<T, RedOp, ProtoLL128>(tid, nthreads, work);
  }
 };
@@ -10,23 +10,22 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-  __device__ __forceinline__ void runRing(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
-    const int nthreads = (int)args->nWarps * WARP_SIZE;
+  __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
    ncclRing *ring = &ncclShmem.channel.ring;
    const int rank = ring->userRanks[0];
    const int nextRank = ring->userRanks[1];
-    const int root = args->root;
-    const size_t chunkCount = args->chunkCount;
-    const size_t channelCount = args->workCount;
-    const size_t gridOffset = args->workOffset;
+    const int root = work->root;
+    size_t chunkCount;
+    size_t channelCount;
+    size_t gridOffset;
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
    size_t offset;
    int nelem;

-    T *inputBuf = (T*)args->sendbuff;
-    T *outputBuf = (T*)args->recvbuff;
+    T *inputBuf = (T*)work->sendbuff;
+    T *outputBuf = (T*)work->recvbuff;
    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
-      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
+      prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg);

    for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
      offset = gridOffset + elemOffset;
@@ -48,23 +47,23 @@ namespace {
 }

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
+struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
    using Proto = ProtoSimple<BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS>;
-    runRing<T, RedOp, Proto>(args);
+    runRing<T, RedOp, Proto>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runRing<T, RedOp, ProtoLL>(args);
+struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runRing<T, RedOp, ProtoLL128>(args);
+struct RunWorkColl<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
  }
 };
@@ -14,11 +14,11 @@ __shared__ ncclShmemData ncclShmem;
 #endif

 struct RunWorkNop {
-  __device__ void run(ncclWork *w) {}
+  __device__ void run() {}
 };

-__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {
-  ncclKernelMain<-1, RunWorkNop>(comm, channelMask, workHead);
+__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {
+  ncclKernelMain<-1, RunWorkNop>(&args4K.args);
 }

 __device__ void ncclDevFunc_Nop() {}
@@ -10,10 +10,19 @@
 #include "collectives.h"
 #include "device.h"
 #include "op128.h"
+#include "reduce_kernel.h"
 #include "network/unpack/unpack_defs.h"

 #define COLL_UNROLL (ncclCollUnroll())

+#if __CUDA_ARCH__ >= 700
+// __grid_constant__ appears to break cuda-gdb
+//#define NCCL_GRID_CONSTANT __grid_constant__
+#define NCCL_GRID_CONSTANT
+#else
+#define NCCL_GRID_CONSTANT
+#endif
+
 typedef void(*ncclDevFuncPtr_t)();
 extern __device__ ncclDevFuncPtr_t const ncclDevFuncTable[];

@@ -31,18 +40,28 @@ struct ncclShmemGroup {
 };

 struct ncclShmemData {
-  struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
-  uint64_t redOpArgs[NCCL_MAX_ARITY+1];
+  struct ncclDevKernelArgs args;
  int channelId;
  int aborted;
  alignas(16) struct ncclDevComm comm;
  alignas(16) struct ncclDevChannel channel;
-  alignas(16) struct ncclWork work;
+
+  int batchIx, nextBatchIx;
+  enum ncclDevWorkType workType;
+  uint8_t directMode;
+  uint16_t funcId;
+  int nWorks;
+  int workSize;
+  uint32_t workConsumed;
+  struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
+  uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
+
+  alignas(16) char workStorage[1024];
+
  alignas(16) union {
    unpackShmem unpack;
  } devicePlugin;
 };
-static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");

 extern __shared__ ncclShmemData ncclShmem;
 #if __CUDA_ARCH__ >= 700
@@ -55,14 +74,62 @@ __device__ inline void* ncclScratchForWarp(int warp) {
  return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize();
 }

-__device__ inline bool barrierReduceAny(int bit) {
-  uint32_t popc;
-  asm ("{"
-    ".reg .pred barr_pred;"
-    "setp.eq.u32 barr_pred, %1, 1;"
-    "bar.red.popc.u32 %0, 2, barr_pred;"
-  "}" : "=r"(popc) : "r"(bit));
-  return popc != 0;
+__device__ inline void barrier_sync(int name) {
+  #if 0
+  asm volatile("barrier.sync %0;" :: "r"(name) : "memory");
+  #else
+  asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory");
+  #endif
+}
+__device__ inline void barrier_sync(int name, int nThreads) {
+  #if 0
+  asm volatile("barrier.sync %0, %1;" :: "r"(name), "r"(nThreads) : "memory");
+  #else
+  asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory");
+  #endif
+}
+__device__ inline void barrier_sync_aligned(int name) {
+  asm volatile("barrier.sync.aligned %0;" :: "r"(name) : "memory");
+}
+__device__ inline void barrier_sync_aligned(int name, int nThreads) {
+  asm volatile("barrier.sync.aligned %0, %1;" :: "r"(name), "r"(nThreads) : "memory");
+}
+
+__device__ inline bool barrier_red_or(bool vote, int name) {
+  int ans;
+  asm("{ .reg .pred p;"
+      "  setp.ne.s32 p, %1, 0;"
+      "  barrier.red.or.pred p, %2, p; "
+      "  selp.s32 %0, 1, 0, p; }"
+      : "=r"(ans) : "r"((int)vote), "r"(name) : "memory");
+  return bool(ans);
+}
+__device__ inline bool barrier_red_or(bool vote, int name, int nThreads) {
+  int ans;
+  asm("{ .reg .pred p;"
+      "  setp.ne.s32 p, %1, 0;"
+      "  barrier.red.or.pred p, %2, %3, p; "
+      "  selp.s32 %0, 1, 0, p; }"
+      : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory");
+  return bool(ans);
+}
+__device__ inline bool barrier_red_or_aligned(bool vote, int name) {
+  int ans;
+  asm("{ .reg .pred p;"
+      "  setp.ne.s32 p, %1, 0;"
+      "  barrier.red.or.pred.aligned p, %2, p; "
+      "  selp.s32 %0, 1, 0, p; }"
+      : "=r"(ans) : "r"((int)vote), "r"(name) : "memory");
+  return bool(ans);
+}
+__device__ inline bool barrier_red_or_aligned(bool vote, int name, int nThreads) {
+  int ans;
+  asm("{ .reg .pred p;"
+      "  setp.ne.s32 p, %1, 0;"
+      "  barrier.red.or.pred.aligned p, %2, %3, p; "
+      "  selp.s32 %0, 1, 0, p; }"
+      : "=r"(ans) : "r"((int)vote), "r"(name), "r"(nThreads) : "memory");
+  return bool(ans);
 }

 // Copy 16-byte aligned data. You must call with at least `(bytes+15)/16` threads.
@@ -71,158 +138,261 @@ inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int by
  if (offset < bytes) {
    uint64_t a=0, b=0;
    asm("ld.v2.u64 {%0,%1},[%2];" : "=l"(a),"=l"(b) : "l"((char const*)src + offset));
-    asm volatile("st.v2.u64 [%0],{%1,%2};" :: "l"((char*)dst + offset), "l"(a), "l"(b));
+    uint32_t udst = (uint32_t)__cvta_generic_to_shared(dst);
+    asm volatile("st.shared.v2.u64 [%0],{%1,%2};" :: "r"(udst + offset), "l"(a), "l"(b));
+  }
+}
+
+// Must run with at least 64 threads
+__device__ __forceinline__ void loadWorkBatchToShmem(
+    int tid, int tn, struct ncclDevKernelArgs const* args, int batchIx
+  ) {
+  int lane = tid%WARP_SIZE;
+  int workCursor = 0; // num works written in previous loop iterations.
+  while (true) {
+    struct ncclDevWorkBatch batch = ((struct ncclDevWorkBatch*)(args+1))[batchIx];
+
+    // fnsOfBitset[n] = index of n'th set bit in batch.offsetBitset.
+    // PTX has an instruction "fns" (find n-th set) but it expands to a lot of SASS.
+    // Since we know all lanes will be querying the same bitmask, we can compute it
+    // much faster using shared memory.
+    uint8_t* fnsOfBitset = (uint8_t*)ncclScratchForWarp(threadIdx.x/WARP_SIZE);
+    __syncwarp();
+    if (uint32_t(batch.offsetBitset) & (1u<<lane)) {
+      int nWorksBelow = __popc(uint32_t(batch.offsetBitset) & ((1u<<lane)-1));
+      fnsOfBitset[nWorksBelow] = lane;
+    }
+    int nWorksLow32 = __popc(uint32_t(batch.offsetBitset)); // just of low 32 bits
+    if (uint32_t(batch.offsetBitset>>32) & (1u<<lane)) {
+      int nWorksBelow = nWorksLow32;
+      nWorksBelow += __popc(uint32_t(batch.offsetBitset>>32) & ((1u<<lane)-1));
+      fnsOfBitset[nWorksBelow] = 32 + lane;
+    }
+    int nWorks = nWorksLow32 + __popc(uint32_t(batch.offsetBitset>>32)); // add high 32 bits
+    __syncwarp();
+
+    int workSize;
+    int nPacks; // total number of packs loaded, each pack is 16 bytes
+    int packInWork; // my pack index within work struct
+    int dstWork; // my work index in contiguous destination shmem
+    switch (batch.workType) {
+    case (int)ncclDevWorkTypeP2p:
+      workSize = sizeof(struct ncclDevWorkP2p);
+      nPacks = nWorks*(workSize/16);
+      packInWork = tid%(workSize/16);
+      dstWork = tid/(workSize/16);
+      break;
+    case (int)ncclDevWorkTypeColl:
+      workSize = sizeof(struct ncclDevWorkColl);
+      nPacks = nWorks*(workSize/16);
+      packInWork = tid%(workSize/16);
+      dstWork = tid/(workSize/16);
+      break;
+    case (int)ncclDevWorkTypeCollReg:
+    default:
+      workSize = sizeof(struct ncclDevWorkCollReg);
+      nPacks = nWorks*(workSize/16);
+      packInWork = tid%(workSize/16);
+      dstWork = tid/(workSize/16);
+      break;
+    }
+    if (tid == 0) {
+      ncclShmem.workSize = workSize;
+      ncclShmem.workConsumed = batch.offsetBase + (64-__clzll(batch.offsetBitset))*workSize;
+    }
+    // We deliberately replicate these div and mod calculations into the case
+    // blocks above so that they get constant divisor optimizations by the compiler.
+    //   packInWork = tid%(workSize/16);
+    //   dstWork = tid/(workSize/16);
+
+    // We can only assume we have 64 threads, which means we can read at most 1024 bytes
+    // here which is the per batch maximum.
+    if (tid < nPacks) {
+      int srcWork = fnsOfBitset[dstWork]; // find n'th set bit in batch.offsetBitset
+      ulong2 tmp;
+      // The loads done in these two cases must be kept separate since we are
+      // relying on the compiler to use "ld.param" in the first one. The parameter
+      // space is not generically addressable, so any attempt to load through
+      // a pointer that *might* be parameter space backed will cause the
+      // compiler to spill the parameter struct (4K!) to each thread's local space
+      // before creating a pointer (to the spill) and decimate perf.
+      //
+      // An example of what not to do would be the following:
+      //
+      // if (condition) {
+      //   // The compiler could spill parameter_variable to local space and take
+      //   // the address of that, since when src is loaded below it could also
+      //   // be global space.
+      //   src = &parameter_variable;
+      // } else {
+      //   src = &global_variable;
+      // }
+      // memcpy(dst, src, n);
+      if (ncclShmem.args.workStorageType == ncclDevWorkStorageTypeArgs) {
+        char* src = (char*)args + (batch.offsetBase + srcWork*workSize + packInWork*16);
+        tmp = *(ulong2*)src; // becomes ld.param.v2.u64
+      } else {
+        char* src = (char*)ncclShmem.args.workBuf + ((batch.offsetBase + srcWork*workSize + packInWork*16) & ncclShmem.args.workMask);
+        tmp = *(ulong2*)src; // becomes ld.v2.u64
+      }
+      char* dst = ncclShmem.workStorage;
+      dst += (workCursor + dstWork)*workSize + packInWork*16;
+      *(ulong2*)dst = tmp;
+    }
+    workCursor += nWorks;
+
+    if (batch.nextExtends) {
+      batchIx += batch.nextJump;
+      tid -= 64; // Rotate threads so we use the next two warps for next batch struct.
+      if (tid < 0) tid += tn;
+    } else {
+      if (tid == 0) {
+        ncclShmem.batchIx = batchIx;
+        ncclShmem.nextBatchIx = (batch.nextJump == 0) ? -1 : batchIx + batch.nextJump;
+        ncclShmem.workType = (enum ncclDevWorkType)batch.workType;
+        ncclShmem.nWorks = workCursor;
+        ncclShmem.funcId = batch.funcId;
+      }
+      break;
+    }
  }
 }

 template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
-struct RunWorkElement {
-  __device__ void run(ncclWorkElem*) {
+struct RunWorkColl {
+  __device__ void run(int tid, int tn, struct ncclDevWorkColl* work) {
    // Put NOT IMPLEMENTED behavior here.
  }
 };

 template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
-struct RunWork {
+struct RunWorkBatch;
+
+// Specialized for P2p in sendrecv.h
+template<typename T, typename RedOp>
+struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE>;
+
+// Specialized here for non-P2p (Coll and CollReg)
+template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
+struct RunWorkBatch {
  // This __forceinline__ is necessary. The compiler was inserting a function call
  // here from the LL ncclKernel.
-  __device__ __forceinline__ void run(ncclWork *w) {
-    int wid = threadIdx.x / WARP_SIZE;
-    ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0];
-    int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem);
-    #pragma unroll 1
-    while ((char*)we + stride <= (char*)(w+1) && we->isUsed) {
-      if (wid < we->nWarps) {
-        RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(we);
+  __device__ __forceinline__ void run() {
+    int tid = threadIdx.x;
+    int tn = blockDim.x;
+
+    if (RedOpArg<RedOp>::ArgUsed) {
+      int nWorks = ncclShmem.nWorks;
+      for (int w=tid; w < nWorks; w += tn) {
+        struct ncclDevWorkColl* work = (ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize);
+        if (work->redOpArgIsPtr) {
+          work->redOpArg = RedOpArg<RedOp>::loadArg(reinterpret_cast<void*>(work->redOpArg));
+        }
      }
-      we = (ncclWorkElem*)((char*)we + stride);
+      __syncthreads();
+    }
+
+    #pragma unroll 1
+    for (int w=0; w < ncclShmem.nWorks; w++) {
+      struct ncclDevWorkColl* work = (struct ncclDevWorkColl*)(ncclShmem.workStorage + w*ncclShmem.workSize);
+      if (w != 0) {
+        struct ncclDevWorkColl* workPrev = (struct ncclDevWorkColl*)(ncclShmem.workStorage + (w-1)*ncclShmem.workSize);
+        if (work->nWarps != workPrev->nWarps) __syncthreads();
+      }
+      int subtn = work->nWarps*WARP_SIZE;
+      if (tid < subtn) RunWorkColl<Fn, T, RedOp, Algo, Proto>().run(tid, subtn, work);
    }
  }
 };

-static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
-  if (we->isUsed && we->redOpArgIsPtr) {
-    /* redOpArg is a pointer to the scalar value, so we'll dereference it
-     * here so that redOpArg holds the bits of the scalar going forward.
-     * The tricky thing is we don't know its type T since that's encoded in
-     * the funcIndex. Because it would be difficult to get sizeof(T) from
-     * funcIndex, we'll cheat and just dereference the largest possible size
-     * given the alignment of the pointer. We might be reading in more bytes
-     * than we need but that's harmless.
-     */
-    if (we->redOpArg%2 != 0)
-      we->redOpArg = *reinterpret_cast<uint8_t*>(we->redOpArg);
-    else if (we->redOpArg%4 != 0)
-      we->redOpArg = *reinterpret_cast<uint16_t*>(we->redOpArg);
-    else if (we->redOpArg%8 != 0)
-      we->redOpArg = *reinterpret_cast<uint32_t*>(we->redOpArg);
-    else
-      we->redOpArg = *reinterpret_cast<uint64_t*>(we->redOpArg);
-  }
-}
-
-template<int SpecializedFnId, typename SpecializedRunWork>
-__device__ void ncclKernelMain(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) {
+template<int SpecializedFnId, typename SpecializedRunWorkBatch>
+__device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) {
  int tid = threadIdx.x;
+  int tn = blockDim.x;
+
+  // Copy kernel args to shmem and then only read those. Otherwise the compiler
+  // will end up putting the args into thread local stack which is very wasteful.
+  if (tid < sizeof(ncclDevKernelArgs)/sizeof(uint32_t)) {
+    ((uint32_t*)&ncclShmem.args)[tid] = ((uint32_t*)args)[tid];
+  }

  // To map blockId to channelId, we need the n'th set bit of channelMask which
  // is the inverse of counting the number of set bits among the the first n.
-  if (tid < WARP_SIZE) {
-    int x = tid;
-    if (channelMask & (1ull<<x)) {
-      int y = __popcll(channelMask & ((1ull<<x)-1));
-      if (blockIdx.x == y) ncclShmem.channelId = x;
-    }
-    if (32 < MAXCHANNELS) {
-      x = 32 + tid;
-      if (channelMask & (1ull<<x)) {
-        int y = __popcll(channelMask & ((1ull<<x)-1));
-        if (blockIdx.x == y) ncclShmem.channelId = x;
-      }
-    }
+  // PTX has the fns instruction which does this but is extremely slow. We can
+  // do better when we know all threads are querying the same bitmask.
+  if (tid < MAXCHANNELS && (args->channelMask & (1ull<<tid))) {
+    int n = __popcll(args->channelMask & ((1ull<<tid)-1));
+    if (blockIdx.x == n) ncclShmem.channelId = tid;
  }
-  __syncthreads(); // publish ncclShmem.channelId
-  int channelId = ncclShmem.channelId;
+  __syncthreads(); // publish ncclShmem.{args, channelId}
  /* set abort flag to 0 */
  if (tid == 0) ncclShmem.aborted = 0;

-  if (true) {
-    void *dst, *src;
-    int bytes;
-    // Use first 3 warps to load comm, channel, and work into ncclShmem
-    switch (tid/WARP_SIZE) {
-    case 0:
-      dst = &ncclShmem.comm;
-      src = comm;
-      bytes = sizeof(ncclDevComm);
+  // Use first 2 warps to load comm and channel, and the remaining warps to load the work batch.
+  switch (tid/WARP_SIZE) {
+  case 0:
+    { void* dst = &ncclShmem.comm;
+      void* src = ncclShmem.args.comm;
+      int bytes = sizeof(ncclDevComm);
      static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
-      break;
-    case 1:
-      // Get address of channel without incurring indirect load from ncclDevComm::channels
-      dst = &ncclShmem.channel;
-      src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
-      bytes = sizeof(ncclDevChannel);
+      copyToShmem16(tid, dst, src, bytes);
+    } break;
+  case 1:
+    { // Get address of channel without incurring indirect load from ncclDevComm::channels
+      void* dst = &ncclShmem.channel;
+      void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId];
+      int bytes = sizeof(ncclDevChannel);
      static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
-      break;
-    case 2:
-      dst = &ncclShmem.work;
-      src = workHead + blockIdx.x;
-      bytes = sizeof(ncclWork);
-      static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
-      break;
-    default:
-      bytes = 0;
-      break;
-    }
-    if (bytes) copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
+      copyToShmem16(tid-WARP_SIZE, dst, src, bytes);
+    } break;
+  default:
+    { int subtid = tid - 2*WARP_SIZE;
+      int subtn = tn - 2*WARP_SIZE;
+      loadWorkBatchToShmem(subtid, subtn, args, /*batchIx=*/blockIdx.x);
+    } break;
  }
  __syncthreads(); // publish ncclShmem

+  if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
+    // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads()
+    ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
+  }
+
  while (true) {
-    // Notify host that all fifo reads are complete.
-    if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) {
-      *ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks;
-    }
-
-    __syncwarp();
-    if (ncclShmem.work.header.type == ncclWorkTypeColl) {
-      if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]);
-    } else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) {
-      if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem);
-    }
-    __syncthreads();
-
-    if (0 <= SpecializedFnId && ncclShmem.work.header.funcIndex == (unsigned)SpecializedFnId) {
-      SpecializedRunWork().run(&ncclShmem.work);
+    if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) {
+      SpecializedRunWorkBatch().run();
    } else {
-      ncclDevFuncTable[ncclShmem.work.header.funcIndex]();
+      ncclDevFuncTable[ncclShmem.funcId]();
    }

-    int workIxNext = ncclShmem.work.header.workNext;
+    if (ncclShmem.nextBatchIx == -1) break;
+    int batchIx = ncclShmem.nextBatchIx;
    __syncthreads();
-    if (ncclShmem.work.header.isLast) break;
+    loadWorkBatchToShmem(tid, tn, args, batchIx);

-    copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork));
-
-    { // Check whether the last operation was aborted and make sure all threads exit
-      int aborted = tid == 0 ? *comm->abortFlag : 0;
-      if (barrierReduceAny(aborted)) // publish ncclShmem.work
-        break;
+    // Check whether the last operation was aborted and make sure all threads exit
+    bool aborted = false;
+    if (tid == 0) aborted = *ncclShmem.comm.abortFlag;
+    aborted = barrier_red_or_aligned(aborted, 0); // publish ncclShmem.work
+    if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) {
+      // ncclShmem.workConsumed written by loadWorkBatchToShmem before barrier_red_or_aligned()
+      ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed;
    }
+    if (aborted) break;
  }
 }

-__global__ void ncclDevKernel_Generic(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
+__global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K);
 __device__ void ncclDevFunc_Nop();

 #define DEFINE_ncclDevKernel(suffix, coll, redop, ty, algo, proto, specializedFnId) \
-  __global__ void ncclDevKernel_##suffix(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
-    ncclKernelMain<specializedFnId, RunWork<coll, ty, redop<ty>, algo, proto>>(comm, channelMask, workHead); \
+  __global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) { \
+    ncclKernelMain<specializedFnId, RunWorkBatch<coll, ty, redop<ty>, algo, proto>>(&args4K.args); \
  }

 #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \
  __device__ void ncclDevFunc_##suffix() { \
-    RunWork<coll, ty, redop<ty>, algo, proto>().run(&ncclShmem.work); \
+    RunWorkBatch<coll, ty, redop<ty>, algo, proto>().run(); \
  }

 #endif
@@ -233,6 +233,8 @@ with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
  out('#include "device.h"\n')
  out("\n")

+  out("extern int const ncclDevFuncIdCount = %d;\n" % len(primary_funcs))
+
  # The mapping from function rows to valid primary function ids.
  out("extern int const ncclDevFuncRowToId[] = {\n")
  index = 0
@@ -251,7 +253,7 @@ with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
    cudart, _ = required_cuda(*kfn)
    sym = paste("_", "ncclDevKernel", *kfn)
    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
-    out("__global__ void %s(struct ncclDevComm*, uint64_t, struct ncclWork*);\n" % sym)
+    out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym)
    if cudart != 0: out("#endif\n")
  out("\n")

@@ -10,7 +10,7 @@
 #include "unpack_defs.h"

 #include "op128.h"
-#include "align.h"
+#include "bitops.h"
 #include "device.h"
 #include "common.h"

@@ -35,16 +35,16 @@ inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group,
  struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
  ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta;
  ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf;
-  ncclShmem.groups[group].devicePlugin.unpack.head = handle->head;
+  ncclShmem.groups[group].devicePlugin.unpack.head[index] = handle->head;
 }

-inline __device__ void ncclNetDeviceIncrementHead(const int group) {
-  ncclShmem.groups[group].devicePlugin.unpack.head++;
+inline __device__ void ncclNetDeviceIncrementHead(const int group, const int index) {
+  ncclShmem.groups[group].devicePlugin.unpack.head[index]++;
 }

-inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group) {
+inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group, const int index) {
  struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle;
-  handle->head = ncclShmem.groups[group].devicePlugin.unpack.head;
+  handle->head = ncclShmem.groups[group].devicePlugin.unpack.head[index];
 }

 template <uint8_t sz>
@@ -183,7 +183,7 @@ inline __device__ void ncclNetDeviceUnpack</*Recv=*/1>(
    // Pack data from the internal iovec to the supplied flat srcs buffer using all the threads
    // + Src is necessary in the case of accessing the user buffer directly
    ncclNetDeviceUnpackInner(tid, tidInBlock, nworkers, group /* in case they need to use split warps shared memory partitioning*/,
-        ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head);
+      ix, ncclShmem.groups[group].srcs[ix + Src], workSize, ncclShmem.groups[group].devicePlugin.unpack.head[ix]);
  }
 }

@@ -54,7 +54,7 @@ struct unpackShmem {

 struct unpackGroupShmem {
  int unpackNetDeviceIndexMask; // We store a single unpackNetDeviceIndex because only one peer can be network recv
-  uint64_t head;
+  uint64_t head[NET_UNPACK_MAX_NPEERS];
  struct netUnpackMeta* g_meta[NET_UNPACK_MAX_NPEERS]; // head of handle to index into meta for meta copy
 };

@@ -44,10 +44,11 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
  inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }

  inline __device__ void barrier() {
-    if (nthreads == WARP_SIZE)
+    if (nthreads == WARP_SIZE) {
      __syncwarp();
-    else
-      asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
+    } else {
+      barrier_sync(15-group, nthreads);
+    }
  }

  uint32_t abort = 0;
@@ -323,7 +324,8 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
  __device__  Primitives(
      const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
+      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr,
+      bool userBufReg=false, int stepSize_=0
    ):
    redOp(redOpArg),
    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
@@ -50,7 +50,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
  inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }

  inline __device__ void barrier() {
-    asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
+    barrier_sync(15-group, nthreads);
  }

  uint32_t abort = 0;
@@ -364,7 +364,8 @@ public:
  __device__ Primitives(
      const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclWorkElem* e = nullptr, int stepSize_=0
+      uint8_t connIndexRecv=0, uint8_t connIndexSend=0, struct ncclDevWorkColl* e = nullptr,
+      bool userBufReg=false, int stepSize_=0
    ):
    redOp(redOpArg),
    tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
@@ -23,7 +23,7 @@ class Primitives<
                       ConnFifoEnabled = 0x100,
                       DirectWrite = 0x200,
                       DirectRead = 0x400,
-                       ThreadsSynced = 0x800,
+                       // 0x800 is free to use
                       NvlsMinPolling = 0x1000,
                       NetDeviceUnpack = 0x2000,
                       AnyNetDeviceUnpack = 0x4000,
@@ -44,53 +44,38 @@ class Primitives<
  uint64_t *connStepPtr;
  uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
  int      connStepSize; // Connection step size
-  void*    mhandle;
  void*    netDeviceHandle;

  // Don't use barrier 0 as it's used by the final sync
  __device__ void barrier() {
-    flags |= ThreadsSynced;
    if (nthreads == WARP_SIZE) __syncwarp();
    else {
      int bar = 15-group;
-      asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nthreads) : "memory");
+      barrier_sync(bar, nthreads);
    }
  }
  __device__ void subBarrier() {
    if (nworkers == WARP_SIZE) __syncwarp();
    else {
-      int bar = (nworkers==nthreads ? 15 : 8) - group;
-      asm volatile("bar.sync %0, %1;" :: "r"(bar), "r"(nworkers) : "memory");
+      int bar = 15-group - (nworkers!=nthreads ? 1 : 0);
+      barrier_sync(bar, nworkers);
    }
  }

  __device__ bool barrierAny(int vote) {
-    flags |= ThreadsSynced;
    if (nthreads == WARP_SIZE) {
      return __any_sync(~0u, vote);
    } else {
-      int ans, bar = 15-group;
-      asm volatile(
-        "{ .reg .pred p;"
-        "  setp.ne.s32 p, %1, 0;"
-        "  bar.red.or.pred p, %2, %3, p; "
-        "  selp.s32 %0, 1, 0, p; }"
-        : "=r"(ans) : "r"(vote), "r"(bar), "r"(nthreads) : "memory");
-      return ans != 0;
+      int name = 15-group;
+      return barrier_red_or(vote, name, nthreads);
    }
  }
  __device__ bool subBarrierAny(int vote) {
    if (nworkers == WARP_SIZE) {
      return __any_sync(~0u, vote);
    } else {
-      int ans, bar = (nworkers==nthreads ? 15 : 8) - group;
-      asm volatile(
-        "{ .reg .pred p;"
-        "  setp.ne.s32 p, %1, 0;"
-        "  bar.red.or.pred p, %2, %3, p; "
-        "  selp.s32 %0, 1, 0, p; }"
-        : "=r"(ans) : "r"(vote), "r"(bar), "r"(nworkers) : "memory");
-      return ans != 0;
+      int name = 15-group - (nworkers!=nthreads ? 1 : 0);
+      return barrier_red_or(vote, name, nworkers);
    }
  }

@@ -164,8 +149,8 @@ class Primitives<
      else {
        ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize;
      }
-      if ((flags & (AnyNetDeviceUnpack)) && (flags & (Recv*RoleWaitRecv))) {
-        ncclNetDeviceIncrementHead(group);
+      if (flags & NetDeviceUnpack) {
+        ncclNetDeviceIncrementHead(group, index);
      }
      step += StepPerSlice;
    }
@@ -436,7 +421,7 @@ private:
    }
  }

-  __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
+  __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
    if (flags & (RoleWaitRecv|RolePostRecv)) {
      auto *conn = &peer->recv[connIndex];
      if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) {
@@ -488,7 +473,7 @@ private:
    }
  }

-  __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
+  __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclDevWorkColl* e) {
    if (flags & (RoleWaitSend|RolePostSend)) {
      auto *conn = &peer->send[connIndex];
      step = conn->step;
@@ -538,13 +523,13 @@ private:
  __device__ Primitives(
      int tid, int nthreads, int const *recvPeers, int const *sendPeers,
      void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
-      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr, struct ncclWorkElemP2p* p2p = nullptr, int stepSize_=0
+      uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr,bool userBufReg=false, int stepSize_=0
    ):
    tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
    stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) {

    // For send operations, we need an extra warp to overlap the threadfence and the copy
-    this->nworkers = nthreads - (MaxSend > 0 && nthreads-WARP_SIZE >= 64 ? WARP_SIZE : 0);
+    this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0);

    int nrecv=0, nsend=0;
    while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
@@ -572,7 +557,7 @@ private:
    loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
    loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);

-    if (p2p && p2p->reg) flags |= UserBufferMode;
+    if (userBufReg) flags |= UserBufferMode;

    if (barrierAny(flags & NetDeviceUnpack)) {
      flags |= AnyNetDeviceUnpack;
@@ -584,13 +569,12 @@ private:
      }
    }

-    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
+    setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e);
  }

  __device__ ~Primitives() {
    // Ensure ncclShmem.groups[].send/recvConns are available
-    if (!(flags & ThreadsSynced))
-      barrier();
+    barrier();
    // Save steps for the next operation
    if (flags & (RolePostSend|RolePostRecv)) {
      auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
@@ -606,8 +590,8 @@ private:
      while (*ptr != -1) if (checkAbort(spins)) break;
    }

-    if ((flags & (AnyNetDeviceUnpack)) && (flags & (RoleWaitRecv))) {
-      ncclNetDeviceSaveHead(netDeviceHandle, group);
+    if (flags & NetDeviceUnpack) {
+      ncclNetDeviceSaveHead(netDeviceHandle, group, index);
    }

    // Make sure all threads are done writing back conn->step and done using
@@ -615,7 +599,7 @@ private:
    barrier();
  }

-  __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
+  __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* e) {
    if (tid==0) {
      ncclShmem.groups[group].userInput = (void*)inputBuf;
      ncclShmem.groups[group].userOutput = (void*)outputBuf;
@@ -625,7 +609,7 @@ private:
    bool sendAcceptor = (flags == (flags|RoleWaitSend|DirectWrite)) || (flags == (flags|RoleWaitSend|NvlsDirectWrite));
    bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
    bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead) || (flags == (flags|RoleWaitRecv|NvlsDirectRead)); // receiver accepts direct buffer
-    int regUsed = e != nullptr ? e->elem.regUsed : 0;
+    int regUsed = e != nullptr ? e->coll.regUsed : 0;

    if (Direct && recvProvider) {
      int spins = 0;
@@ -10,22 +10,21 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-  __device__ __forceinline__ void runRing(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
-    const int nthreads = (int)args->nWarps * WARP_SIZE;
+  __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
    ncclRing *ring = &ncclShmem.channel.ring;
    const int nranks = ncclShmem.comm.nRanks;
    const int rank = ncclShmem.comm.rank;
    const int prevRank = ring->userRanks[nranks-1];
-    const int root = args->root;
-    const size_t chunkCount = args->chunkCount;
-    const size_t channelCount = args->workCount;
-    const size_t gridOffset = args->workOffset;
+    const int root = work->root;
+    size_t chunkCount;
+    size_t channelCount;
+    size_t gridOffset;
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount);
    size_t offset;
    int nelem;

    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
-      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
+      prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);

    if (prevRank == root) {
      for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
@@ -52,23 +51,23 @@ namespace {
 }

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
+struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
    using Proto = ProtoSimple<REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS>;
-    runRing<T, RedOp, Proto>(args);
+    runRing<T, RedOp, Proto>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runRing<T, RedOp, ProtoLL>(args);
+struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runRing<T, RedOp, ProtoLL128>(args);
+struct RunWorkColl<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
  }
 };
@@ -37,6 +37,7 @@ template<typename T>
 struct FuncSum  { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; };
 template<typename T>
 struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; };
+
 template<typename T>
 struct FuncMinMax {
  using EltType = T;
@@ -47,9 +48,30 @@ struct FuncMinMax {
    isMinNotMax = (opArg&1)==0;
  }
 };
+
 template<typename T> struct FuncPreMulSum;
 template<typename T> struct FuncSumPostDiv;

+////////////////////////////////////////////////////////////////////////////////
+// Trait class for handling the reduction argument.
+
+template<typename Fn>
+struct RedOpArg { // default case: no argument
+  static constexpr bool ArgUsed = false;
+  __device__ static uint64_t loadArg(void *ptr) { return 0; }
+};
+
+template<typename T>
+struct RedOpArg<FuncMinMax<T>> {
+  static constexpr bool ArgUsed = true;
+  __device__ static uint64_t loadArg(void *ptr) {
+    union { uint64_t u64; T val; };
+    u64 = 0;
+    val = *(T*)ptr;
+    return u64;
+  }
+};
+
 ////////////////////////////////////////////////////////////////////////////////
 // Trait classes for reduction functions. Given a function (FuncSum, etc.)
 // and a number of elements in a pack, will reduce, preOp, or postOp a pack
@@ -356,6 +378,17 @@ struct Apply_PostOp<Fn, /*EltPerPack=*/0> {
 ////////////////////////////////////////////////////////////////////////////////
 // FuncPreMulSum

+template<typename T>
+struct RedOpArg<FuncPreMulSum<T>> {
+  static constexpr bool ArgUsed = true;
+  __device__ static uint64_t loadArg(void *ptr) {
+    union { uint64_t u64; T val; };
+    u64 = 0;
+    val = *(T*)ptr;
+    return u64;
+  }
+};
+
 // General definition for all integral types, float, and double.
 template<typename T>
 struct FuncPreMulSum {
@@ -486,6 +519,14 @@ struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
 ////////////////////////////////////////////////////////////////////////////////
 // FuncSumPostDiv

+template<typename T>
+struct RedOpArg<FuncSumPostDiv<T>> {
+  static constexpr bool ArgUsed = true;
+  __device__ static uint64_t loadArg(void *ptr) {
+    return *(uint64_t*)ptr;
+  }
+};
+
 template<typename T, bool IsFloating=IsFloatingPoint<T>::value>
 struct FuncSumPostDiv_IntOnly;

@@ -658,7 +699,7 @@ struct Apply_LoadMultimem {
    static constexpr bool IsFloat = IsFloatingPoint<T>::value;
    static constexpr int BigPackSize =
      IsFloat && IsSum && sizeof(T) < 8 ? 16 :
-      IsFloat && IsSum ? 8 :
+      IsFloat && IsSum ? sizeof(T) :
      IsFloat && IsMinMax && sizeof(T)==2 ? 16 :
      !IsFloat && (IsSum||IsMinMax) && sizeof(T)>=4 ? sizeof(T) :
      /*multimem.ld_reduce not supported:*/ 0;
@@ -10,23 +10,22 @@

 namespace {
  template<typename T, typename RedOp, typename Proto>
-  __device__ __forceinline__ void runRing(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
-    const uint32_t nthreads = (uint32_t)args->nWarps * WARP_SIZE;
+  __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
    ncclRing *ring = &ncclShmem.channel.ring;
    int const *ringRanks = ring->userRanks;
-    const size_t chunkCount = args->chunkCount;
    const int nranks = ncclShmem.comm.nRanks;
-    size_t channelCount = args->workCount;
-    size_t gridOffset = args->workOffset;
+    size_t count;
+    size_t gridOffset;
+    size_t channelCount;
+    size_t chunkCount;
+    ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
    size_t offset;
    size_t dataOffset;
-    size_t count = args->count;
    uint32_t nelem;
    int rankDest;

    Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
-      prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
+      prims(tid, nthreads, &ring->prev, &ring->next, work->sendbuff, work->recvbuff, work->redOpArg);

    for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
      nelem = min(chunkCount, channelCount - elemOffset);
@@ -54,56 +53,56 @@ namespace {
 }

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
+struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
    using Proto = ProtoSimple<REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS>;
-    runRing<T, RedOp, Proto>(args);
+    runRing<T, RedOp, Proto>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runRing<T, RedOp, ProtoLL>(args);
+struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runRing<T, RedOp, ProtoLL>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    runRing<T, RedOp, ProtoLL128>(args);
+struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    runRing<T, RedOp, ProtoLL128>(tid, nthreads, work);
  }
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    const int tid = threadIdx.x;
+struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
+  __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) {
    struct ncclNvls* nvls = &ncclShmem.channel.nvls;
-    const size_t chunkCount = args->chunkCount;
-    const size_t count = args->count;
+    size_t count;
+    size_t gridOffset;
+    size_t channelCount;
+    size_t chunkCount;
+    ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount);
    const int rank = ncclShmem.comm.rank;
    const int nranks = ncclShmem.comm.nRanks;
-    size_t gridOffset = args->workOffset;
-    size_t channelCount = args->workCount;
    size_t offset;
    int nelem;

    /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync; 
     * if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth
     * and the rest are allocated to scatter. */
-    const int nThreadsReduce = args->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
-    const int nThreadsScatter = args->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce);
+    const int nThreadsReduce = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE);
+    const int nThreadsScatter = work->regUsed ? WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce);
    const int tidEndScatter = nThreadsScatter;
    const int tidEndReduce = tidEndScatter + nThreadsReduce;

-    if (!args->regUsed) {
+    if (!work->regUsed) {
      if (tid < tidEndScatter) {
        // Scatter
        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
        Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
-          prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
-            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+          prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL,
+            work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
          nelem = min(chunkCount, channelCount - elemOffset);
@@ -113,8 +112,8 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
        // Reduce through NVLS
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
        Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
-          prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
-            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
+          prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff,
+            work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0);
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          offset = gridOffset + elemOffset;
          nelem = min(chunkCount, channelCount - elemOffset);
@@ -127,7 +126,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
        using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
        Primitives<T, RedOp, FanSymmetric<NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
          prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL,
-            args->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
+            work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1);
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          prims.scatter(0, 0, 0, 0, -1, 0);
        }
@@ -138,8 +137,8 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
        // Reduce through NVLS
        using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
        Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
-          prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, args->recvbuff,
-            args->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, args);
+          prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff,
+            work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work);
        for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) {
          size_t outOffset = gridOffset + elemOffset;
          size_t inpOffset = outOffset + rank * count;
@@ -155,10 +154,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROT
 };

 template<typename T, typename RedOp>
-struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
+struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
  template<bool ReduceSendNotRecv>
  struct Scatterer {
-    struct ncclWorkElem* args;
+    struct ncclDevWorkColl* work;
    int chunkSize;
    ssize_t railGridOffset;

@@ -173,11 +172,11 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
      struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
      int nNodes = ncclShmem.comm.nNodes;
      int nRails = direct->nHeads;
-      int bid = args->bid;
-      void* inbuf = (void*)args->sendbuff;
-      ssize_t sizePerRank = args->count;
+      int part = ncclShmem.channelId - work->channelLo;
+      void* inbuf = (void*)work->sendbuff;
+      ssize_t sizePerRank = work->collnet.count;

-      ssize_t railAllBeg = min(railGridOffset + bid*chunkSize, nNodes*sizePerRank);
+      ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank);
      ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank);
      int railAllSize = railAllEnd - railAllBeg;
      if (tid < nDsts) dstSizes[tid] = railAllSize;
@@ -204,7 +203,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
                     /*MultimemSrcs=*/0, 1+MinSrcs, 1+MaxSrcs,
                     /*MultimemDsts,MinDsts,MaxDsts=*/0,1,1,
                     /*PreOpSrcs=*/1>
-            (tid, tn, args->redOpArg, &args->redOpArg, false,
+            (tid, tn, work->redOpArg, &work->redOpArg, false,
             /*nSrcs=*/1+nSrcs, [=]__device__(int s) {
               return s==0 ? (T*)inbuf + userOneBeg
                           : (T*)srcPtrs[s-1] + railAllOffset;
@@ -223,23 +222,23 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
    }
  };

-  __device__ __forceinline__ void run(ncclWorkElem *args) {
-    int tid = threadIdx.x;
-    const int nChannels = args->nChannels;
+  __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) {
+    const int part = ncclShmem.channelId - work->channelLo;
+    const int nChannels = work->channelHi - work->channelLo + 1;
    struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
    int const &nNodes = ncclShmem.comm.nNodes;
-    ssize_t chunkSize = int(args->chunkCount);
-    ssize_t sizePerRank = args->count;
+    ssize_t chunkSize = int(work->collnet.chunkCount);
+    ssize_t sizePerRank = work->collnet.count;

    if (direct->out == -1) __trap();
    bool isMultiRail = (direct->nHeads > 1);
    int nWarps1 = (isMultiRail ? 2 : 0);
    int nWarps2 = (isMultiRail ? 2 : 1);
    int nWarps3 = 1;
-    float denom = float(args->nWarps)/float(nWarps1+nWarps2+nWarps3);
+    float denom = float(work->nWarps)/float(nWarps1+nWarps2+nWarps3);
    nWarps3 = int(denom*nWarps3);
    nWarps2 = int(denom*nWarps2);
-    nWarps1 = args->nWarps - (nWarps2+nWarps3);
+    nWarps1 = work->nWarps - (nWarps2+nWarps3);

    using Proto = ProtoSimple<1, 1>;

@@ -248,13 +247,13 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
      // Phase 1: Scatter inputs to peers
      Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
        prims(tid, tn, nullptr, direct->heads+1, nullptr, nullptr,
-              args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
+              work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
      for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) {
        Scatterer</*ReduceSendNotRecv=*/true> scat;
-        scat.args = args;
+        scat.work = work;
        scat.chunkSize = chunkSize;
        scat.railGridOffset = railGridOffset;
-        prims.process</*Recv=*/0, /*Send=*/1>(scat);
+        prims.template process</*Recv=*/0, /*Send=*/1>(scat);
      }
      return;
    }
@@ -262,7 +261,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,

    tn = nWarps2*WARP_SIZE;
    if (tid < tn) {
-      if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
        if (tid == 0) {
          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
          Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps);
@@ -272,13 +271,13 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
        // Phase 2: Reduce from peers + local input -> send to network
        Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
          prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr,
-            args->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
+            work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1);
        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
          Scatterer</*ReduceSendNotRecv=*/false> scat;
-          scat.args = args;
+          scat.work = work;
          scat.chunkSize = chunkSize;
          scat.railGridOffset = railGridOffset;
-          prims.process</*Recv=*/1, /*Send=*/1>(scat);
+          prims.template process</*Recv=*/1, /*Send=*/1>(scat);
        }
      }
      return;
@@ -287,7 +286,7 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,

    tn = nWarps3*WARP_SIZE;
    if (tid < tn) {
-      if (args->regUsed == NCCL_COLLNET_REG_BUFFER) {
+      if (work->regUsed == NCCL_COLLNET_REG_BUFFER) {
        if (tid == 0) {
          int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE);
          Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps);
@@ -296,10 +295,10 @@ struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_COLLNET_DIRECT,
      } else {
        // Phase 3: recv from network
        Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
-          prims(tid, tn, &direct->out, nullptr, nullptr, args->recvbuff,
-            args->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0);
+          prims(tid, tn, &direct->out, nullptr, nullptr, work->recvbuff,
+            work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0);
        for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) {
-          ssize_t railAllBeg = railGridOffset + args->bid * chunkSize;
+          ssize_t railAllBeg = railGridOffset + part * chunkSize;
          ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank);
          ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank;
          ssize_t railOneEnd = railOneBeg + sizePerRank;
@@ -9,83 +9,159 @@
 #include "primitives.h"

 template<typename T, typename RedOp>
-struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
+struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
+  static_assert(sizeof(T)==1, "SendRecv only works on single byte types T.");
+
  template<typename Proto>
-  __device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
-    void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
-    ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
-    if (args->peer == ncclShmem.comm.rank) {
-      struct ncclWorkElemP2p* recvArgs = args-1;
-      void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
-      if (buff != recvBuff) {
-        reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
-          (tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
-      }
-    } else {
-      int chunkSize = args->chunkSize/sizeof(T);
-      if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
-      int const peer = args->peer;
-      Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
-        (tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));
-      size_t offset = 0;
-      do {
-        int nelem = min(size_t(chunkSize), count-offset);
-        prims.directSend(offset, offset, nelem);
-        offset += nelem;
-      } while(offset < count && args->reg == 0);
-    }
+  __device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
+    size_t bytes = work->sendBytes;
+    int chunkSize = u32fp8Decode(work->sendChunkSize_u32fp8);
+    Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1>
+      prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr,
+            /*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
+            /*userBufferMode=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize);
+    size_t cursor = 0;
+    do {
+      int n = min(size_t(chunkSize), bytes-cursor);
+      prims.directSend(cursor, cursor, n);
+      cursor += n;
+    } while (cursor < bytes && work->sendRegistered == 0);
  }

  template<typename Proto>
-  __device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
-    if (args->peer != ncclShmem.comm.rank) {
-      void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
-      ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
-      int chunkSize = args->chunkSize/sizeof(T);
-      if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
-      int const peer = args->peer;
-      Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
-        (tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, args, ncclShmem.comm.p2pChunkSize/sizeof(T));
-      size_t offset = 0;
-      do {
-        int nelem = min(size_t(chunkSize), count-offset);
-        prims.directRecv(offset, nelem);
-        offset += nelem;
-      } while(offset < count && args->reg == 0);
-    }
+  __device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) {
+    size_t bytes = work->recvBytes;
+    int chunkSize = u32fp8Decode(work->recvChunkSize_u32fp8);
+    Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1>
+      prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr,
+            /*redOpArg(ignored)=*/0, group, 1, 1, nullptr,
+            /*userBufferMode=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize);
+    size_t cursor = 0;
+    do {
+      int n = min(size_t(chunkSize), bytes-cursor);
+      prims.directRecv(cursor, n);
+      cursor += n;
+    } while (cursor < bytes && work->recvRegistered == 0);
  }

-  __device__ __forceinline__ void run(ncclWork *work) {
-    struct ncclWorkElemP2p* args = work->p2pElems;
-    int ngroups = args->ngroups;
-    int tid = threadIdx.x;
-    int wid = tid / WARP_SIZE;
-    // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3
-    // warps for send, 2 warps for recv).
-    // warpStarts were rounded thanks to int division, but for group number we need to round the other way around
-    // So we mirror wid then mirror again the group.
-    #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
-    uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
-    args += group;
-    tid -= args->warpStart * WARP_SIZE;
-    int nthreads = args->nWarps * WARP_SIZE;
+  __device__ __forceinline__ void run() {
+    const int tid = threadIdx.x;
+    const int tn = blockDim.x;
+    const int wid = tid/WARP_SIZE;
+    const int nWarps = tn/WARP_SIZE;
+    const int lane = tid%WARP_SIZE;

-    if (args->p2pType == ncclWorkP2pTypeUnused) return;
-    if (tid >= nthreads || args->peer == -1) return;
+    struct Shared {
+      uint32_t workSendMask; // bitmasks of which work indices have send/recv
+      uint32_t workRecvMask;
+    };
+    Shared* shared = (Shared*)ncclScratchForWarp(0);

-    // Select Proto here
-    // This is to allow the same kernel to run multiple primitives on different warps (thread groups)
-    if ((group%2) == 0) {
-      if (args->proto == NCCL_PROTO_LL) {
-        runRecv<ProtoLL>(tid, nthreads, group, args);
+    struct ncclDevWorkP2p* works = (ncclDevWorkP2p*)ncclShmem.workStorage;
+    int nWorks = ncclShmem.nWorks;
+
+    if (wid == 0) {
+      // Modify the memory range of each work[] to reflect this channel's
+      // partition of the work. Since integer divides are very heavy it's
+      // best to do them all in one warp.
+      int workIx = lane%16;
+      int isSend = lane < 16 ? 0 : 1;
+      bool hasWork = false;
+      if (workIx < nWorks) {
+        struct ncclDevWorkP2p* work = &works[workIx];
+        size_t bytes = isSend ? work->sendBytes : work->recvBytes;
+        int nParts = isSend ? work->nSendChannels : work->nRecvChannels;
+        int part = ncclP2pChannelToPart(work->nP2pChannels, work->channelBase, ncclShmem.channelId);
+        hasWork = (part < nParts);
+        if (nParts != 0) {
+          size_t partBeg, partEnd;
+          ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd);
+          (isSend ? work->sendAddr : work->recvAddr) = (char*)(isSend ? work->sendAddr : work->recvAddr) + partBeg;
+          (isSend ? work->sendBytes : work->recvBytes) = partEnd - partBeg;
+        }
+      }
+      uint32_t mask = __ballot_sync(~0u, hasWork);
+      if (lane == 0) {
+        shared->workSendMask = mask>>16;
+        shared->workRecvMask = mask & 0xffff;
+      }
+    }
+
+    // The fastest way to compute a warp uniform division x/y in [0,32) is to
+    // use each lane to guess a solution and count the ones that don't exceed
+    // the numerator:
+    //   __popc(__ballot_sync(~0u, y*(lane+1) <= x))
+    // That takes 1/3 the time of standard division and about 3/4 the time of
+    // approximate floating point division:
+    //   __float2int_rd(__fdividef(float(x),float(y))).
+
+    // nWarpPerWork = nWarps/nWorks
+    int nWarpPerWork = __popc(__ballot_sync(~0u, nWorks*(lane+1) <= nWarps));
+    int nRecvWarpPerWork = nWarpPerWork<=4 ? nWarpPerWork/2 : (nWarpPerWork-1)/2;
+    int nSendWarpPerWork = nWarpPerWork<=4 ? nRecvWarpPerWork : nRecvWarpPerWork+1;
+    // This might reduce nWarpPerWork which is probably desirable. It is better
+    // to have a balanced number of reading and writing threads even if that
+    // leaves warps unused.
+    nWarpPerWork = nSendWarpPerWork + nRecvWarpPerWork;
+    // The work index this warp belongs to: workIx = wid/nWarpPerWork
+    int workIx = __popc(__ballot_sync(~0u, (lane+1)*nWarpPerWork <= wid));
+
+    __syncthreads(); // Wait for works[] and shared->* to be updated by warp=0
+
+    uint32_t workSendMask = shared->workSendMask;
+    uint32_t workRecvMask = shared->workRecvMask;
+
+    __syncthreads(); // release scratch space used by shared->*
+    if (nWorks <= workIx) return;
+
+    // Thread range for whole work (send & recv combined)
+    int subtid = tid - workIx*nWarpPerWork*WARP_SIZE;
+    int subtn = nWarpPerWork*WARP_SIZE;
+
+    // A send primitive of sufficient size requires 2 CUDA barrier ids.
+    constexpr int nSendWarpsForExtraGroup = NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE/WARP_SIZE;
+    // Count up all group ids used below this workIx:
+    int group, extra;
+    // Each recv gets one group id:
+    group = __popc(workRecvMask & ((1<<workIx)-1));
+    // Sends accompanying recvs get one and maybe an extra:
+    extra = (nSendWarpPerWork >= nSendWarpsForExtraGroup) ? 1 : 0;
+    group += __popc((workSendMask & workRecvMask) & ((1<<workIx)-1))*(1+extra);
+    // Sends without recvs use more warps so compute extra accordingly:
+    extra = (nWarpPerWork >= nSendWarpsForExtraGroup) ? 1 : 0;
+    group += __popc((workSendMask & ~workRecvMask) & ((1<<workIx)-1))*(1+extra);
+
+    struct ncclDevWorkP2p* work = &works[workIx];
+    bool hasSend = 1 & (workSendMask>>workIx);
+    bool hasRecv = 1 & (workRecvMask>>workIx);
+    bool isCopy = work->sendRank == ncclShmem.comm.rank;
+    bool isSend = !hasRecv || (hasSend && subtid < nSendWarpPerWork*WARP_SIZE);
+
+    if (!isCopy && hasSend && hasRecv) {
+      // Translate thread ids to reflect just this send or recv as opposed to whole work.
+      if (isSend) {
+        subtn = nSendWarpPerWork*WARP_SIZE;
      } else {
-        runRecv<ProtoSimple<1,1>>(tid, nthreads, group, args);
+        subtid -= nSendWarpPerWork*WARP_SIZE;
+        subtn = nRecvWarpPerWork*WARP_SIZE;
+        group += 1 + (nSendWarpPerWork >= nSendWarpsForExtraGroup ? 1 : 0);
+      }
+    }
+
+    if (isCopy) {
+      reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
+        (subtid, subtn, 0, nullptr, false, 1, &work->sendAddr, 1, &work->recvAddr, (ssize_t)work->sendBytes);
+    } else if (isSend) {
+      if (work->sendProtoLL) {
+        runSend<ProtoLL>(subtid, subtn, group, work);
+      } else {
+        runSend<ProtoSimple<1,1>>(subtid, subtn, group, work);
      }
    } else {
-      if (args->proto == NCCL_PROTO_LL) {
-        runSend<ProtoLL>(tid, nthreads, group, args);
+      if (work->recvProtoLL) {
+        runRecv<ProtoLL>(subtid, subtn, group, work);
      } else {
-        runSend<ProtoSimple<1,1>>(tid, nthreads, group, args);
+        runRecv<ProtoSimple<1,1>>(subtid, subtn, group, work);
      }
    }
  }
@@ -5,7 +5,9 @@
 ************************************************************************/

 #include "comm.h"
+#include "device.h"
 #include "graph.h"
+#include "transport.h"
 #include "trees.h"
 #include "rings.h"
 #include "topo.h"
@@ -84,6 +86,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
      topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = nvlsIntra[0];
    }
  }
+  memcpy(comm->nvlsHeads, topoRanks->nvlsHeads, sizeof(int) * topoRanks->nvlsHeadNum);

  return ncclSuccess;
 }
@@ -188,7 +191,7 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph*
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    char line[1024];
-    sprintf(line, "CollNet channel %d rank %d ", c, rank);
+    sprintf(line, "CollNetDirect channel %d rank %d ", c, rank);
    int nDown = 0;
    for (int i=0; i<nHeads; i++) {
      if (rank == heads[i]) { // is head
@@ -334,10 +337,14 @@ int ncclMinNchannels() {
  if (minNchannels < 0) minNchannels = 0;
  return minNchannels;
 }
+
+extern int64_t ncclParamWorkArgsBytes();
+
 int ncclMaxNchannels() {
  int maxNchannels = MAXCHANNELS;
  if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings();
  if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
+  maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes()));
  if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
  if (maxNchannels < 1) {
    WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
@@ -363,6 +370,8 @@ void exchangeValues(int* v0, int* v1) {
  *v0 = tmp;
 }

+NCCL_PARAM(UnpackDoubleNChannels, "UNPACK_DOUBLE_NCHANNELS", 1);
+
 ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
  // Gather data from all ranks
  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1, *nvlsHeads;
@@ -444,13 +453,13 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa

  // Setup CollNet
  if (comm->collNetSupport == 1) {
-    struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT];
+    struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
    // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
-    if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
+    if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) {
      int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
      nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
    }
-    NCCLCHECK(connectCollNet(comm, collNetGraph));
+    NCCLCHECK(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]));
  }

  // Use 4 compute channels per search channel to reach peak BW on <8 PPN
@@ -458,6 +467,12 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
  }

+  // Double the number of channels when using unpack networking (greater than 1 node)
+  // We won't automatically double past 16 channels, users can specify 32 if they want
+  if (comm->netDeviceType == NCCL_NET_DEVICE_UNPACK && comm->nNodes > 1 && nChannels < 16 && ncclParamUnpackDoubleNChannels()) {
+     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
+  }
+
  // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
  // We permit combining max, then min, to only use the first channels, then duplicate them.
  if (comm->sharedRes->owner != comm) {
@@ -10,6 +10,8 @@
 #include "comm.h"
 #include "net.h"
 #include "channel.h"
+#include "transport.h"
+#include "device.h"

 // Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths

@@ -732,12 +734,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp

 NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1);
 NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
-
-static int nextPow2(int v) {
-  int pow2 = 1;
-  while (pow2 < v) pow2 <<= 1;
-  return pow2;
-}
+extern int64_t ncclParamWorkArgsBytes();

 ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
  /* here we already honor comm->max/minCTAs for p2pnChannels. */
@@ -759,19 +756,17 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
    }
  }

-  // Round to next pow2 nChannelsPerPeer and nChannels
-  comm->p2pnChannelsPerPeer = nextPow2(minChannels);
-  comm->p2pnChannels = nextPow2(comm->p2pnChannels);
+  // Make nChannelsPerPeer and nChannels powers of 2. This is relied on when
+  // mapping p2p peers to channels.
+  comm->p2pnChannelsPerPeer = pow2Up(minChannels);
+  comm->p2pnChannels = pow2Up(comm->p2pnChannels);
+
+  comm->p2pnChannels = std::min(comm->p2pnChannels, pow2Down(ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes())));
+  comm->p2pnChannelsPerPeer = std::min(comm->p2pnChannelsPerPeer, comm->p2pnChannels);

  // Init channels that weren't used so far
  for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));

-  // We want to spread channels used when there aren't many and progressively
-  // fill the whole space of nChannels. To do so we mirror the bits in the
-  // nChannels space.
-  for (int c=0; c<comm->p2pnChannels; c++) {
-    comm->p2pChannels[c] = mirrorBits(c, comm->p2pnChannels);
-  }
  return ncclSuccess;
 }

@@ -8,6 +8,7 @@
 #include "core.h"
 #include "graph.h"
 #include "topo.h"
+#include "transport.h"
 #include "xml.h"
 #include <math.h>

@@ -51,6 +52,15 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
  return ncclSuccess;
 }

+ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm) {
+  // We assume there is at least one CPU and that the CPUs have the same
+  // architecture and vendor.
+  const struct ncclTopoNodeSet* cpus = &comm->topo->nodes[CPU];
+  comm->cpuArch = cpus->nodes[0].cpu.arch;
+  comm->cpuVendor = cpus->nodes[0].cpu.vendor;
+  return ncclSuccess;
+}
+
 static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, int type, struct ncclTopoLink** revLink) {
  for (int l=0; l<node2->nlinks; l++) {
    struct ncclTopoLink* link = node2->links+l;
@@ -104,7 +114,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
 }

 // Try to go from node type1/index1 to no type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1).
-static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) {
+static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, float mult, struct ncclTopoNode** node) {
  // First handle easy cases
  *node = system->nodes[type2].nodes+index2;
  if (type1 == -1) return ncclSuccess;
@@ -334,6 +344,42 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo
  return ncclSuccess;
 }

+ncclResult_t ncclTopoSearchTryCollnetDirect(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) {
+  int fwdg = 0;
+  int bwdg = 0;
+  struct ncclTopoNode* gpu = NULL;
+  float mul = 1.0 / (float)(system->nodes[GPU].count - 1);
+  do {
+    NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, mul, &gpu));
+  } while (gpu && ++fwdg < system->nodes[GPU].count);
+
+  if (gpu != NULL) {
+    do {
+      NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, mul, &gpu));
+    } while (gpu && ++bwdg < system->nodes[GPU].count);
+    if (gpu != NULL) {
+      // Both directions worked. We already have the head, so fill in all the other intra ranks.
+      int step = 1;
+      for (int index = 0; index < ngpus; ++index) {
+        if (index != g) {
+          graph->intra[graph->nChannels * ngpus + step] = system->nodes[GPU].nodes[index].gpu.rank;
+          step++;
+        }
+      }
+      NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, NULL, ngpus, -1, -1, 0, time));
+    }
+    while (bwdg) {
+      bwdg--;
+      NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, bwdg, GPU, g, -mul, &gpu));
+    }
+  }
+  while (fwdg) {
+    fwdg--;
+    NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, fwdg, -mul, &gpu));
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoSearchTryNvls(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int g, int ngpus, int *time) {
  struct ncclTopoNode* nvs;
  struct ncclTopoNode* gpu;
@@ -514,6 +560,8 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
    }
  } else if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
    NCCLCHECK(ncclTopoSearchTryNvls(system, graph, saveGraph, g, ngpus, time));
+  } else if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) {
+    NCCLCHECK(ncclTopoSearchTryCollnetDirect(system, graph, saveGraph, g, ngpus, time));
  } else if (step < system->nodes[GPU].count-1) {
    // Go to next GPU
    int next[NCCL_TOPO_MAX_NODES];
@@ -552,9 +600,10 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
  int* nets;
  NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
  int netCount;
+  int graphFound = 0;
  NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount));
  for (int i=0; i<netCount; i++) {
-    if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && i>0) continue;
+    if ((graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) continue;
    int n = nets[(graph->nChannels+i)%netCount];
    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
    if (graph->collNet && net->net.collSupport == 0) continue;
@@ -571,12 +620,22 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
      }
    }

-    if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
+    if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) {
      // NVLS search only tries to find NIC:GPU combinations to compute the heads.
      if (graph->nChannels < netCount) {
        int gpu;
+        int duplicate = 0;
        NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu));
+        // Check whether there is a duplicate head when one GPU connects to multiple NICs
+        for (int gc = 0; gc < graph->nChannels; gc++) {
+          if (graph->intra[gc * system->nodes[GPU].count] == system->nodes[GPU].nodes[gpu].gpu.rank) {
+            duplicate = 1;
+            break;
+          }
+        }
+        if (duplicate) continue;
        if (gpu != -1) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu));
+        graphFound = 1;
      }
    } else {
      if (graph->nChannels > 0) {
@@ -891,8 +950,9 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
  int ccMin;
  NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL));
  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess;
-  // NVLS search must have ngpus heads at most.
-  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = system->nodes[GPU].count;
+  // NVLS and COLLNET_DIRECT search must have ngpus heads at most.
+  if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT)
+    graph->maxChannels = system->nodes[GPU].count;

  if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;

@@ -1104,7 +1164,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i
 exit:
  return ret;
 fail:
-  WARN("Could not find NIC for rank %d in NVLS graph\n", comm->rank);
+  WARN("Could not find NIC for rank %d in NVLS graph", comm->rank);
  goto exit;
 }

@@ -11,6 +11,7 @@
 #include "nvmlwrap.h"
 #include "net.h"
 #include "coll_net.h"
+#include "transport.h"
 #include <sys/stat.h>
 #include <fcntl.h>
 #include "xml.h"
@@ -51,7 +52,12 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode*
    return ncclSuccess;
  }
  for (int l=0; l<node->nlinks; l++) {
-    if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
+    // Go up the PCI tree to find the CPU. Follow only PCI switches.
+    if (node->links[l].type == LINK_PCI
+	&& (node->links[l].remNode->type == PCI
+	    || node->links[l].remNode->type == CPU)) {
+      NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
+    }
    if (*cpu != NULL) return ncclSuccess;
  }
  return ncclSuccess;
@@ -109,11 +115,6 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo
  n->type = type;
  n->id = id;
  if (type == GPU) {
-    // Create link to itself (used in some corner cases)
-    n->nlinks=1;
-    n->links[0].type = LINK_LOC;
-    n->links[0].remNode = n;
-    n->links[0].bw = LOC_BW;
    n->gpu.dev = NCCL_TOPO_UNDEF;
    n->gpu.rank = NCCL_TOPO_UNDEF;
    n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
@@ -279,8 +280,10 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN

  for (int l=0; l<node->nlinks; l++) {
    struct ncclTopoLink* link = node->links+l;
-    if (link->type == LINK_LOC) continue;
-    if (link->type != LINK_PCI || link->remNode != prevNode) {
+    if (link->type == LINK_LOC) {
+      sprintf(line+offset, "+ %s[%2.1f] - %s/%lX", topoLinkTypeStr[link->type], link->bw, topoNodeTypeStr[link->remNode->type], link->remNode->id);
+      INFO(NCCL_GRAPH, "%s", line);
+    } else if (link->type != LINK_PCI || link->remNode != prevNode) {
      sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->bw);
      int nextOffset = strlen(line);
      if (link->type == LINK_PCI) {
@@ -443,7 +446,9 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s

    for (int s=0; s<xmlPci->nSubs; s++) {
      struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
-      NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
+      if (strcmp(xmlSubPci->name, "pcilink") != 0) { // PCI links will be added later
+        NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId));
+      }
    }
  }

@@ -579,6 +584,38 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
  return ncclSuccess;
 }

+ncclResult_t ncclTopoAddPciLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
+  if (strcmp(node->name, "pcilink") == 0) {
+    struct ncclTopoNode* pci = NULL;
+    int64_t pBusId;
+    NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
+    pBusId = NCCL_TOPO_ID(systemId, pBusId);
+    NCCLCHECK(ncclTopoGetNode(system, &pci, PCI, pBusId));
+    if (pci == NULL) {
+      WARN("Add PCI Link error : could not find PCI SW %lx", pBusId);
+      return ncclInternalError;
+    }
+    struct ncclTopoNode* remote = NULL;
+    const char* target;
+    NCCLCHECK(xmlGetAttrStr(node, "target", &target));
+    int64_t busId;
+    NCCLCHECK(busIdToInt64(target, &busId));
+    NCCLCHECK(ncclTopoGetNode(system, &remote, PCI, NCCL_TOPO_ID(systemId, busId)));
+    if (remote) NCCLCHECK(ncclTopoConnectNodes(pci, remote, LINK_LOC, LOC_BW));
+  } else {
+    if (strcmp(node->name, "cpu") == 0) {
+      NCCLCHECK(ncclGetSystemId(system, node, &systemId));
+    }
+    const char* busId;
+    NCCLCHECK(xmlGetAttr(node, "busid", &busId));
+    for (int s=0; s<node->nSubs; s++) {
+      NCCLCHECK(ncclTopoAddPciLinks(node->subs[s], system, busId ? busId : parentBusId, systemId));
+    }
+  }
+  return ncclSuccess;
+}
+
+
 ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId, int systemId) {
  if (strcmp(node->name, "c2c") == 0) {
    struct ncclTopoNode* gpu = NULL;
@@ -626,6 +663,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem

  NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0));
  NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0));
+  NCCLCHECK(ncclTopoAddPciLinks(topNode, *topoSystem, NULL, 0));

  NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
  NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
@@ -668,6 +706,18 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN
  return ncclSuccess;
 }

+ncclResult_t ncclTopoRefreshBcmP2pLinks(void) {
+  // The switch topology is refreshed by reading the sysfs entry below.
+  // When the entry cannot be opened there is nothing to refresh and we still succeed.
+  FILE* refreshFile = fopen("/sys/kernel/pci_switch_link/refresh_switch_toplogy", "r");
+  if (refreshFile == NULL) return ncclSuccess;
+  int dummy;
+  if (fread(&dummy, sizeof(dummy), 1, refreshFile) != 1) {
+    INFO(NCCL_GRAPH, "Failed to read refresh_switch_toplogy");
+  }
+  fclose(refreshFile);
+  return ncclSuccess;
+}

 ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
  struct ncclXml* xml;
@@ -687,18 +737,17 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
    NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
  }

-  // Auto-detect GPUs if needed
-  for (int r=0; r<comm->nRanks; r++) {
-    if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
-      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-      NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
-      struct ncclXmlNode* node;
-      NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
-      if (node == NULL) continue;
-      NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
-      NCCLCHECK(xmlSetAttrInt(node, "rank", r));
-      NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
-    }
+  NCCLCHECK(ncclTopoRefreshBcmP2pLinks());
+
+  // Detect only the GPU managed by this process.  We'll get any others through XML fusion.
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  NCCLCHECK(int64ToBusId(comm->peerInfo[comm->rank].busId, busId));
+  struct ncclXmlNode* node;
+  NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
+  if (node) {
+    NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
+    NCCLCHECK(xmlSetAttrInt(node, "rank", comm->rank));
+    NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport));
  }
  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
  // so we start with collnet so that it has precedence.
@@ -728,6 +777,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
  for (int n=0; n<netDevCount; n++) {
    ncclNetProperties_t props;
    NCCLCHECK(comm->ncclNet->getProperties(n, &props));
+    comm->netDeviceType = props.netDeviceType;
    struct ncclXmlNode* netNode;
    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
    NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@@ -745,24 +795,46 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
  // Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
  NCCLCHECK(ncclTopoTrimXml(xml));

+  // XML topo fusion.
+  int* localRanks;
+  int localRank = -1, nLocalRanks = 0;
  if (comm->MNNVL) {
    // MNNVL clique support
-    char* mem;
-    NCCLCHECK(ncclCalloc(&mem, comm->clique.size * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
-    struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*comm->cliqueRank);
-    memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
-    NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
-    NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
-    struct ncclXml* cliqueXml;
-    NCCLCHECK(xmlAlloc(&cliqueXml, comm->clique.size*NCCL_TOPO_XML_MAX_NODES));
-    for (int i = 0; i < comm->clique.size; i++) {
-      struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
-      NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
-      NCCLCHECK(ncclTopoFuseXml(cliqueXml, peerXml));
+    nLocalRanks = comm->clique.size;
+    localRank = comm->cliqueRank;
+    localRanks = comm->clique.ranks;
+  } else {
+    // Intra-node fusion.  Much of the comm is not initialized yet at this point so we need to do our own calculations.
+    NCCLCHECK(ncclCalloc(&localRanks, comm->nRanks));
+    for (int i = 0; i < comm->nRanks; i++) {
+      if (comm->peerInfo[i].hostHash == comm->peerInfo[comm->rank].hostHash) {
+        if (i == comm->rank)
+          localRank = nLocalRanks;
+        localRanks[nLocalRanks++] = i;
+      }
    }
-    free(xml);
-    xml = cliqueXml;
  }
+  char* mem;
+  NCCLCHECK(ncclCalloc(&mem, nLocalRanks * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
+  struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*localRank);
+  memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
+  NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
+  NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, localRanks, localRank, nLocalRanks, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
+  if (comm->MNNVL) {
+    // Ensure that we have enough room when fusing topos from multiple nodes.
+    free(xml);
+    NCCLCHECK(xmlAlloc(&xml, nLocalRanks*NCCL_TOPO_XML_MAX_NODES));
+  } else {
+    // In the intra-node case there's no need to enlarge the topo xml.
+    xml->maxIndex = 0;
+    free(localRanks);
+  }
+  for (int i = 0; i < nLocalRanks; i++) {
+    struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
+    NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
+    NCCLCHECK(ncclTopoFuseXml(xml, peerXml));
+  }
+  free(mem);

  xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
  if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
@@ -218,7 +218,7 @@ static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id
      return ncclSuccess;
    }
  }
-  WARN("Could not find NET with id %lx\n", id);
+  WARN("Could not find NET with id %lx", id);
  return ncclInternalError;
 }

@@ -110,11 +110,9 @@ NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);

 static float getNetOverhead(struct ncclComm* comm) {
  if (ncclParamNetOverhead() != -2) return ncclParamNetOverhead() * .001;
-  int cpuArch, cpuVendor, cpuModel;
-  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
-  if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0;
-  if (cpuArch == NCCL_TOPO_CPU_ARCH_X86 && cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0;
-  else return 1.0;
+  if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_INTEL) return 1.0; // CPU type is read from comm: no topology query, and no NCCLCHECK inside a float function
+  if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86 && comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD) return 2.0; // x86 AMD is the only case that returns 2.0
+  return 1.0; // every other arch/vendor combination
 }

 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
@@ -317,6 +315,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
    }
    if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
    if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
+    if (a == NCCL_ALGO_RING && pEnable == 0) comm->ringbdw[c][p] = 0;
  }

  for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) {
@@ -415,15 +414,15 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
  {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .6,  .7,  .8,  .7,  .7,  .8,  .9,  .9 }
 };

-ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup) {
-  float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
-  float lat = info->comm->latencies[info->coll][algorithm][protocol];
+ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup) {
+  float bw = comm->bandwidths[coll][algorithm][protocol];
+  float lat = comm->latencies[coll][algorithm][protocol];

  if (backup) {
    *backup = false;
    if (algorithm == NCCL_ALGO_RING && bw == 0.0f) {
      /* try back up RING algorithm */
-      bw = info->comm->ringbdw[info->coll][protocol];
+      bw = comm->ringbdw[coll][protocol];
      *backup = true;
    }
  }
@@ -431,15 +430,14 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
  if (bw == 0) {
    *time = -1.0; return ncclSuccess;
  }
-  int logSize = log2i(info->nBytes>>6);
-  if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
-  if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels;
-  if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
-      && info->coll == ncclFuncAllReduce && info->nBytes/(info->comm->nChannels*info->comm->nRanks) >= 64) {
-    lat *= info->comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
+  int logSize = log2i(nBytes>>6);
+  if (algorithm == NCCL_ALGO_TREE && logSize >= 0 && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize];
+  if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1
+      && coll == ncclFuncAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) {
+    lat *= comm->minCompCap < 80 ? 1.9 : 1.4; // Plateau effect of ring
  }
  // Tree pipelining saves latency in aggregation cases
-  int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_WORK_ELEMENTS);
-  *time = lat * latCount + (info->nBytes) / (1000 * bw);
+  int latCount = algorithm == NCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, NCCL_MAX_DEV_WORK_BATCH_COLLS);
+  *time = lat * latCount + nBytes / (1000 * bw);
  return ncclSuccess;
 }
@@ -272,56 +272,34 @@ ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml)
  return ncclSuccess;
 }

-ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) {
-  struct ncclXmlNode* topNode;
-  NCCLCHECK(xmlFindTag(dst, "system", &topNode));
+static ncclResult_t xmlTopoFuseXmlRecursive(struct ncclXml* dst, struct ncclXmlNode* dstParent, struct ncclXmlNode* srcParent) {
+  // Fuse srcParent's children into dstParent: recurse into children dstParent already has, copy the others.
+  for (int s = 0; s < srcParent->nSubs; s++) {
+    struct ncclXmlNode* match = NULL;
+    NCCLCHECK(xmlFindNode(dstParent, srcParent->subs[s], &match));
+    if (match != NULL) {
+      NCCLCHECK(xmlTopoFuseXmlRecursive(dst, match, srcParent->subs[s]));
+      continue;
+    }
+    NCCLCHECK(xmlAddTree(dst, dstParent, srcParent->subs[s]));
+  }
+  return ncclSuccess;
+}

-  if (topNode == NULL) {
+ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src) { // Merge src's "system" tree into dst's, reusing nodes dst already has.
+  struct ncclXmlNode* topNodeDst;
+  NCCLCHECK(xmlFindTag(dst, "system", &topNodeDst));
+
+  if (topNodeDst == NULL) { // dst has no "system" node yet: take src's tree as is.
    xmlAddTree(dst, NULL, src->nodes);
    return ncclSuccess;
  }

-  // Fuse the CPUs with the first XML
-  struct ncclXmlNode* srcCpu;
-  NCCLCHECK(xmlFindTag(src, "cpu", &srcCpu));
-  while (srcCpu) {
-    const char* srcNumaId;
-    const char* srcHostHash;
-    NCCLCHECK(xmlGetAttr(srcCpu, "numaid", &srcNumaId));
-    if (srcNumaId == NULL) {
-      WARN("TopoFuseXmls : could not find CPU numa ID.");
-      return ncclInternalError;
-    }
-    xmlGetAttr(srcCpu, "host_hash", &srcHostHash);
-    if (srcHostHash == NULL)
-      srcHostHash = "0";
+  struct ncclXmlNode* topNodeSrc;
+  NCCLCHECK(xmlFindTag(src, "system", &topNodeSrc)); // NOTE(review): topNodeSrc is used below without a NULL check - confirm src always contains a "system" node.

-    // Search through the destination for a duplicate.  Note that
-    // this makes the complexity of this whole function O(n^2), but n
-    // is expected to be small.
-    struct ncclXmlNode* dstCpu;
-    NCCLCHECK(xmlFindTag(dst, "cpu", &dstCpu));
-    while (dstCpu) {
-      const char* dstNumaId;
-      const char* dstHostHash;
-      NCCLCHECK(xmlGetAttr(dstCpu, "numaid", &dstNumaId));
-      if (dstNumaId == NULL) {
-        WARN("TopoFuseXmls : could not find CPU numa ID.");
-        return ncclInternalError;
-      }
-      xmlGetAttr(dstCpu, "host_hash", &dstHostHash);
-      if (dstHostHash == NULL)
-        dstHostHash = "0";
-      if (strcmp(srcNumaId, dstNumaId) == 0 && strcmp(srcHostHash, dstHostHash) == 0)
-        break;
+  NCCLCHECK(xmlTopoFuseXmlRecursive(dst, topNodeDst, topNodeSrc)); // The children of both "system" nodes are merged level by level.

-      NCCLCHECK(xmlFindNextTag(dst, "cpu", dstCpu, &dstCpu));
-    }
-    // Only add the CPU if no duplicate was found
-    if (dstCpu == NULL)
-      NCCLCHECK(xmlAddTree(dst, topNode, srcCpu));
-    NCCLCHECK(xmlFindNextTag(src, "cpu", srcCpu, &srcCpu));
-  }
  return ncclSuccess;
 }

@@ -335,6 +313,11 @@ ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclX
  return ncclSuccess;
 }

+ncclResult_t ncclTopoXmlLoadPciLink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { // Loader registered for the "pcilink" tag.
+  NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); // No sub-element handlers are passed (NULL table, count 0).
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
  NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
  return ncclSuccess;
@@ -357,8 +340,8 @@ ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlN
 }

 ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
-  struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic} };
-  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 3));
+  struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic}, { "pcilink", ncclTopoXmlLoadPciLink} }; // "pcilink" children of a <pci> node now have a loader
+  NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 4)); // The count must match the number of entries in handlers[].
  return ncclSuccess;
 }

@@ -423,6 +406,28 @@ static ncclResult_t getPciPath(const char* busId, char** path) {
  return ncclSuccess;
 }

+#include <dirent.h>
+static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** peers) {
+  // Fills *peers with *nlinks entries of BUSID_SIZE bytes, allocated here and handed to the caller (NULL/0 when none).
+  *nlinks = 0; *peers = NULL;
+  ncclResult_t ret = ncclSuccess;
+  char dirPath[] = "/sys/kernel/pci_switch_link/virtual_switch_links/0000:00:00.0";
+  memcpylower(dirPath+sizeof("/sys/kernel/pci_switch_link/virtual_switch_links/")-1, busId, BUSID_SIZE-1);
+  DIR *dir = opendir(dirPath);
+  if (dir == NULL) return ncclSuccess; // No link directory for this bus id: report zero links.
+  struct dirent* file;
+  while (ret == ncclSuccess && (file = readdir(dir)) != NULL) {
+    if (strlen(file->d_name) != BUSID_SIZE-1) continue; // Only entries as long as a bus id are considered.
+    char* path;
+    if (getPciPath(file->d_name, &path) == ncclSystemError) continue;
+    free(path);
+    ret = ncclRealloc(peers, (*nlinks)*BUSID_SIZE, ((*nlinks)+1)*BUSID_SIZE);
+    if (ret == ncclSuccess) memcpy((*peers)+BUSID_SIZE*(*nlinks)++, file->d_name, BUSID_SIZE);
+  }
+  closedir(dir); // Always close the handle; an NCCLCHECK return inside the loop used to leak it.
+  return ret;
+}
+
 ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
  char filePath[PATH_MAX];
  sprintf(filePath, "%s/%s", path, fileName);
@@ -541,10 +546,11 @@ ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct n
 // There can be trailing chars.
 int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); }
 int checkBDFFormat(char* bdf) {
-  if (bdf[4] != ':' || bdf[7] != ':' || bdf[10] != '.') return 0;
-  if (isHex(bdf[0]) == 0 || isHex(bdf[1] == 0) || isHex(bdf[2] == 0) || isHex(bdf[3] == 0) ||
-      isHex(bdf[5] == 0) || isHex(bdf[6] == 0) || isHex(bdf[8] == 0) || isHex(bdf[9] == 0) ||
-      isHex(bdf[11] == 0)) return 0;
+  if (strlen(bdf) != 12) return 0; // Exactly 12 characters: strings with trailing characters are rejected by this check.
+  if ((bdf[4] != ':') || (bdf[7] != ':') || (bdf[10] != '.')) return 0; // Separators sit at fixed positions.
+  if ((isHex(bdf[0]) == 0) || (isHex(bdf[1]) == 0) || (isHex(bdf[2]) == 0) || (isHex(bdf[3]) == 0) ||
+      (isHex(bdf[5]) == 0) || (isHex(bdf[6]) == 0) || (isHex(bdf[8]) == 0) || (isHex(bdf[9]) == 0) ||
+      (isHex(bdf[11]) == 0)) return 0; // The comparison is now applied to isHex()'s result, not to its argument.
  return 1;
 }

@@ -608,6 +614,24 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
      NCCLCHECK(xmlSetAttr(pciNode, "link_width", ""));
    }
  }
+
+  const char* vendor;
+  NCCLCHECK(xmlGetAttr(pciNode, "vendor", &vendor));
+  if (vendor != NULL && strcmp(vendor, "0x1000") == 0) { // BCM switch, look for P2P connections
+    int nlinks;
+    char* peers;
+    NCCLCHECK(getBcmLinks(busId, &nlinks, &peers));
+    for (int l=0; l<nlinks; l++) {
+      char* target = peers+l*BUSID_SIZE;
+      struct ncclXmlNode* linkNode;
+      NCCLCHECK(xmlGetSubKv(pciNode, "pcilink", &linkNode, "target", target));
+      if (linkNode == NULL) {
+        NCCLCHECK(xmlAddNode(xml, pciNode, "pcilink", &linkNode));
+        NCCLCHECK(xmlSetAttr(linkNode, "target", target));
+      }
+    }
+  }
+
  struct ncclXmlNode* parent = pciNode->parent;
  if (parent == NULL) {
    if (path) {
@@ -911,25 +935,33 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
  return ncclSuccess;
 }

-ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node) {
+ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node, int* keep) { // *keep is set to non-zero when node or one of its descendants carried keep="1".
  const char* str;
  NCCLCHECK(xmlGetAttr(node, "keep", &str));
  if (str && strcmp(str, "1") == 0) {
    NCCLCHECK(xmlUnsetAttr(node, "keep"));
+    *keep = 1; // This node is marked: its subtree is kept without being visited.
  } else {
    // Copy nSubs and subs as they could change as we trim recursively.
    struct ncclXmlNode* subs[MAX_SUBS];
    int nSubs = node->nSubs;
    memcpy(subs, node->subs, node->nSubs*sizeof(struct ncclXmlNode*));
+    *keep = 0; // Start from zero and add up what the children report.
    for (int s=0; s<nSubs; s++) {
-      NCCLCHECK(ncclTopoTrimXmlRec(subs[s]));
+      int k = 0;
+      NCCLCHECK(ncclTopoTrimXmlRec(subs[s], &k));
+      *keep += k; // Counts the children that reported something to keep.
+    }
+    if (*keep == 0 && // Trim PCI switches or CPU with no used GPU/NIC under them.
+        (strcmp(node->name, "pci") == 0 || strcmp(node->name, "cpu") == 0)) {
+      NCCLCHECK(xmlRemoveNode(node));
    }
-    if (node->nSubs == 0) NCCLCHECK(xmlRemoveNode(node));
  }
  return ncclSuccess;
 }
 ncclResult_t ncclTopoTrimXml(struct ncclXml* xml) {
-  NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes));
+  int keep = 0; // Required by the recursive helper; the value reported for the root is not used here.
+  NCCLCHECK(ncclTopoTrimXmlRec(xml->nodes, &keep));
  return ncclSuccess;
 }

@@ -55,7 +55,7 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha
 /* Remove unneeded parts */
 ncclResult_t ncclTopoTrimXml(struct ncclXml* xml);

-/* Fuse multiple system XMLs into one, skipping duplicate CPUs */
+/* Fuse multiple system XMLs into one, skipping duplicate entries */
 ncclResult_t ncclTopoFuseXml(struct ncclXml* dst, struct ncclXml* src);
 /* Relocate pointers in XML to (de-)serialize the structure */
 ncclResult_t ncclTopoConvertXml(struct ncclXml* xml, uintptr_t base, int exp);
@@ -172,6 +172,29 @@ static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struc
  return ncclSuccess;
 }

+static ncclResult_t xmlFindNode(struct ncclXmlNode* parentNode, struct ncclXmlNode* searchNode, struct ncclXmlNode** node) {
+  // Look among the direct children of parentNode (no recursion) for a node with the same name,
+  // type and attributes as searchNode. *node stays NULL when no child matches.
+  *node = NULL;
+  for (int s=0; s<parentNode->nSubs; s++) {
+    struct ncclXmlNode* candidate = parentNode->subs[s];
+    if (strcmp(candidate->name, searchNode->name) != 0) continue;
+    if (candidate->type != searchNode->type || candidate->nAttrs != searchNode->nAttrs) continue;
+    // Attribute counts already match, so only searchNode's attributes are walked.
+    bool sameAttrs = true;
+    for (int a=0; sameAttrs && a<searchNode->nAttrs; a++) {
+      const char* val;
+      NCCLCHECK(xmlGetAttr(candidate, searchNode->attrs[a].key, &val));
+      sameAttrs = (val != NULL) && (strcmp(val, searchNode->attrs[a].value) == 0);
+    }
+    if (sameAttrs) {
+      *node = candidate;
+      return ncclSuccess;
+    }
+  }
+  return ncclSuccess;
+}
+
 static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) {
  int index;
  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
@@ -10,6 +10,7 @@
 #include "transport.h"
 #include "channel.h"
 #include <assert.h>
+#include "bootstrap.h"

 __thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting
 __thread ncclResult_t ncclGroupError = ncclSuccess;
@@ -31,6 +32,7 @@ ncclResult_t ncclAsyncLaunch(
  ) {
  ncclResult_t ret = ncclSuccess;

+  job->destroyFlag = comm->destroyFlag;
  if (ncclGroupDepth == 0) {
    ret = func(job);
    if (ret != ncclSuccess && undo) undo(job);
@@ -40,11 +42,15 @@ ncclResult_t ncclAsyncLaunch(
    job->undo = undo;
    job->destructor = destructor;
    job->abortFlag = comm->abortFlag;
+    job->abortFlagDev = comm->abortFlagDev;
    job->childAbortFlag = comm->childAbortFlag;
+    job->childAbortFlagDev = comm->childAbortFlagDev;
    job->state = ncclGroupJobRunning;
    job->comm = comm;
    /* check if there are blocking and nonblocking comms at the same time in group. */
-    if (ncclGroupBlocking == -1) {
+    if (comm->destroyFlag) {
+      ncclGroupBlocking = 1;
+    } else if (ncclGroupBlocking == -1) {
      /* first met communicator */
      ncclGroupBlocking = comm->config.blocking;
    } else if (ncclGroupBlocking != comm->config.blocking) {
@@ -98,11 +104,23 @@ exit:
  return ret;
 }

+NCCL_API(ncclResult_t, ncclGroupSimulateEnd, ncclSimInfo_t* simInfo);
+ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) { // Thin wrapper: passes simInfo to ncclGroupEndInternal and returns its result.
+  ncclResult_t ret = ncclSuccess;
+  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NCCLCHECKGOTO(ncclGroupEndInternal(simInfo), ret, exit); // On failure, ret holds the error and control jumps to exit.
+  TRACE_CALL("ncclGroupSimulateEnd()"); // Reached only when the call above did not jump to exit.
+exit:
+  return ret;
+}
+
 struct ncclPreconnectJob {
  struct ncclAsyncJob base;
  struct ncclComm* comm;
+  bool* algoNeedConnect; // One flag per algorithm, indexed by NCCL_ALGO_*; read and freed by ncclCollPreconnectFunc.
 };
-ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) {
+
+ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) {
  struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
  struct ncclComm* comm = job->comm;
  CUDACHECK(cudaSetDevice(comm->cudaDev));
@@ -111,6 +129,57 @@ ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) {
  return ncclSuccess;
 }

+ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) {
+  // Async job body: runs the connect / buffer setup step of every algorithm flagged in job->algoNeedConnect.
+  struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
+  struct ncclComm* comm = job->comm;
+  ncclResult_t ret = ncclSuccess;
+  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); // not CUDACHECK: the exit path must still free algoNeedConnect
+  if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) {
+    if (job->algoNeedConnect[i]) {
+      switch (i) {
+        case NCCL_ALGO_RING: {
+          NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail);
+          break;
+        }
+        case NCCL_ALGO_TREE: {
+          NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail);
+          break;
+        }
+        case NCCL_ALGO_NVLS: {
+          /* If we are using NVLS_TREE algo, we must mark NVLS algo to set up
+           * NVLS intra-node buffer */
+          NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail);
+          break;
+        }
+        case NCCL_ALGO_NVLS_TREE: {
+          NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail);
+          break;
+        }
+        case NCCL_ALGO_COLLNET_CHAIN: {
+          NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail);
+          break;
+        }
+        case NCCL_ALGO_COLLNET_DIRECT: {
+          NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail);
+          break;
+        }
+        default: {
+          ret = ncclInternalError;
+          goto fail;
+        }
+      }
+    }
+  }
+
+exit:
+  free(job->algoNeedConnect); // the flag array is released here on success and on failure
+  return ret;
+fail:
+  goto exit;
+}
+
 static ncclResult_t doLaunches(struct ncclComm* head) {
  ncclResult_t result = ncclSuccess;
  struct ncclComm* cliqueComm0 = head->intraComm0;
@@ -124,7 +193,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
    struct ncclComm* comm = cliqueHead;
    bool capturingYes = false, capturingNo = false;
    do {
-      (ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true;
+      (ncclCudaGraphValid(comm->planner.capturingGraph) ? capturingYes : capturingNo) = true;
      CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
      NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
      if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
@@ -150,19 +219,19 @@ static ncclResult_t doLaunches(struct ncclComm* head) {
          // Barrier reduction result tells us if this was the final round.
          moreRounds = 0 != ncclCommIntraBarrierOut(comm);
        } else {
-          moreRounds |= comm->unlaunchedPlansHead != nullptr;
+          moreRounds |= comm->planner.unlaunchedPlansHead != nullptr;
        }
        if (moreRounds) {
          // Pop next unlaunched kernel
-          struct ncclKernelPlan* plan = comm->unlaunchedPlansHead;
+          struct ncclKernelPlan* plan = comm->planner.unlaunchedPlansHead;
          if (plan != nullptr) {
-            comm->unlaunchedPlansHead = plan->next;
+            comm->planner.unlaunchedPlansHead = plan->next;
            CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
            NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure);
            NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
          }
          // Barrier reduction input indicates if we require further rounds.
-          if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0);
+          if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0);
          if (plan != nullptr) {
            NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure);
          }
@@ -210,37 +279,29 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
    // is needed.
    comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
    for (int i = 0; i < comm->nRanks; i++) {
-      comm->tasks.peers[i].sendSeen = false;
-      comm->tasks.peers[i].recvSeen = false;
      comm->connectSend[i] = 0UL;
      comm->connectRecv[i] = 0UL;
    }
-    comm->unlaunchedPlansHead = nullptr;
    // Reclaim abandoned kernel plan memory. Note ncclWork structs were already
    // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
-    while (!ncclIntruQueueEmpty(&comm->planQueue)) {
-      struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue);
+    while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) {
+      struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue);
      // Persistent plans will be reclaimed via the callbackQueue when the
      // graph drops its UserObject reference.
      if (!plan->persistent) {
-        for (int c = 0; c < MAXCHANNELS; c++) {
-          while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) {
-            struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue);
-            ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
-          }
+        while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) {
+          struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue);
+          ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
        }
        ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
      }
    }
-    // Reset comm->tasks to empty.
-    comm->tasks.nTasksColl = 0;
-    comm->tasks.nTasksP2p = 0;
-    comm->tasks.workBytesTotal = 0;
-    comm->tasks.streams = nullptr;
-    ncclIntruQueueConstruct(&comm->tasks.collQueue);
-    for (int i = 0; i < comm->nRanks; i++) {
-      ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
-      ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
+
+    { // Reset comm->planner to empty.
+      ncclKernelPlanner::Peer* tmp = comm->planner.peers;
+      memset(&comm->planner, 0, sizeof(comm->planner));
+      comm->planner.peers = tmp;
+      memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0]));
    }

    if (!comm->config.blocking)
@@ -260,37 +321,10 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g
  return;
 }

-static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
-  int savedDev;
+static ncclResult_t asyncJobLaunch(struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain, volatile bool *groupAbortFlag) {
  ncclResult_t ret = ncclSuccess;
  bool jobsDone = false;
  bool errorJobAbortFlag = false;
-  struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
-  struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
-  struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
-  struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
-  volatile bool *groupAbortFlag = gjob->abortFlagPtr;
-
-  CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
-
-  if (groupCommPreconnectHeadMain != nullptr) {
-    struct ncclComm* comm = groupCommPreconnectHeadMain;
-    do {
-      struct ncclPreconnectJob* job;
-      NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
-      job->base.func = ncclPreconnectFunc;
-      job->base.undo = nullptr;
-      job->base.destructor = free;
-      job->base.state = ncclGroupJobRunning;
-      job->base.abortFlag = comm->abortFlag;
-      job->comm = comm;
-      ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
-
-      struct ncclComm* next = comm->preconnectNext;
-      comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
-      comm = next;
-    } while (comm != nullptr);
-  }

  if (!ncclIntruQueueEmpty(asyncJobsMain)) {
    struct ncclAsyncJob* job = ncclIntruQueueHead(asyncJobsMain);
@@ -321,9 +355,13 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
          assert(state == ncclGroupJobJoined);
        }

-        if (__atomic_load_n(groupAbortFlag, __ATOMIC_RELAXED) || errorJobAbortFlag == true) {
-          __atomic_store_n(job->abortFlag, 1, __ATOMIC_RELAXED);
-          if (job->childAbortFlag) __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELAXED);
+        if (!job->destroyFlag && (__atomic_load_n(groupAbortFlag, __ATOMIC_ACQUIRE) || errorJobAbortFlag == true)) {
+          __atomic_store_n(job->abortFlag, 1, __ATOMIC_RELEASE);
+          __atomic_store_n(job->abortFlagDev, 1, __ATOMIC_RELEASE);
+          if (job->childAbortFlag) {
+            __atomic_store_n(job->childAbortFlag, 1, __ATOMIC_RELEASE);
+            __atomic_store_n(job->childAbortFlagDev, 1, __ATOMIC_RELEASE);
+          }
        }

        job = job->next;
@@ -335,17 +373,86 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_) {
    if (ret != ncclSuccess) goto fail;
  }

-  if (groupCommHeadMain != nullptr) {
-    NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
-  }
-
  while (!ncclIntruQueueEmpty(asyncJobsMain)) {
    struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
-    if (job->comm && !job->comm->config.blocking)
+    if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
      (void) ncclCommSetAsyncError(job->comm, ret);
    if (job->destructor) job->destructor((void*)job);
  }

+exit:
+  return ret;
+fail:
+  goto exit;
+}
+
+static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) {
+  int savedDev;
+  ncclResult_t ret = ncclSuccess;
+  struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
+  struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
+  struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
+  struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;
+  bool *groupAbortFlag = gjob->abortFlagPtr;
+
+  CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
+
+  if (!simInfo && groupCommPreconnectHeadMain != nullptr) {
+    struct ncclComm* comm = groupCommPreconnectHeadMain;
+    do {
+      struct ncclPreconnectJob* job;
+      NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
+      job->base.func = ncclP2PPreconnectFunc;
+      job->base.undo = nullptr;
+      job->base.destructor = free;
+      job->base.state = ncclGroupJobRunning;
+      job->base.abortFlag = comm->abortFlag;
+      job->base.abortFlagDev = comm->abortFlagDev;
+      job->comm = comm;
+      ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
+
+      struct ncclComm* next = comm->preconnectNext;
+      comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
+      comm = next;
+    } while (comm != nullptr);
+  }
+
+  NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
+
+  /* Connect channels at runtime if cumem is supported */
+  if (groupCommHeadMain != nullptr) {
+    struct ncclComm* comm = groupCommHeadMain;
+
+    do {
+      bool needConnect = false;
+      bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
+      memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);
+
+      NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);
+
+      if (comm->cuMemSupport && needConnect) { // queue one async connect job per comm that reported needConnect
+        struct ncclPreconnectJob* job;
+        NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
+        job->base.func = ncclCollPreconnectFunc;
+        job->base.undo = nullptr;
+        job->base.destructor = free;
+        job->base.state = ncclGroupJobRunning;
+        job->base.abortFlag = comm->abortFlag; job->base.abortFlagDev = comm->abortFlagDev; // asyncJobLaunch stores to both flags on abort; a NULL abortFlagDev would be dereferenced
+        job->comm = comm;
+        NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
+        memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); // snapshot: the stack array is reset for the next comm
+        ncclIntruQueueEnqueue(asyncJobsMain, &job->base);
+      }
+      comm = comm->groupNext;
+    } while (comm);
+
+    NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);
+  }
+
+  if ((!simInfo) && (groupCommHeadMain != nullptr)) {
+    NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
+  }
+
  while (groupCommHeadMain != nullptr) {
    struct ncclComm* comm = groupCommHeadMain;
    struct ncclComm* next = comm->groupNext;
@@ -365,8 +472,17 @@ fail:
  goto exit;
 }

-ncclResult_t ncclGroupEndInternal() {
+static ncclResult_t groupLaunchNonBlocking(struct ncclAsyncJob *job_) { // single-argument wrapper used as the async job entry point
+  return groupLaunch(job_ /* simInfo defaults to NULL */);
+}
+
+ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
  ncclResult_t ret = ncclSuccess;
+  ncclSimInfo_t internalSimInfo = NCCL_SIM_INFO_INITIALIZER;
+  ncclSimInfo_t* internalSimInfoPtr = NULL;
+  size_t realSize = 0;
+
+  internalSimInfo.magic = 0;

  if (ncclGroupDepth == 0) {
    WARN("ncclGroupEnd: not in a group call.");
@@ -378,6 +494,18 @@ ncclResult_t ncclGroupEndInternal() {

  if ((ret = ncclGroupError) != ncclSuccess) goto fail;

+  if (simInfo) {
+    memcpy((void*)&realSize, (void*)&simInfo->size, sizeof(size_t));
+    realSize = realSize > sizeof(ncclSimInfo_t) ? sizeof(ncclSimInfo_t) : realSize;
+    memcpy((void*)&internalSimInfo, (void*)simInfo, realSize);
+    if (internalSimInfo.magic != 0x74685283) {
+      WARN("ncclSimInfo_t argument not initialized via NCCL_SIM_INFO_INITIALIZER");
+      ret = ncclInvalidArgument;
+      goto fail;
+    }
+    internalSimInfoPtr = &internalSimInfo;
+  }
+
  if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) {
    ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead;
    ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead;
@@ -410,12 +538,13 @@ ncclResult_t ncclGroupEndInternal() {
        } while (comm);
      }

-      ncclGroupJobMainPtr->base.func = groupLaunch;
+      ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking;
      SYSCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), ret, fail);
      ret = ncclInProgress;
    } else {
      /* blocking group */
-      NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base), ret, fail);
+      NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail);
+      if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize);
      groupResetJobState(ncclGroupJobMainPtr);
    }
  }
@@ -438,7 +567,7 @@ ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) {

 ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) {
  if (groupJob && groupJob->initialized) {
-    __atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELAXED);
+    __atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELEASE);
    NCCLCHECK(ncclGroupJobComplete(groupJob));
  }
  return ncclSuccess;
@@ -1,47 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_ALIGN_H_
-#define NCCL_ALIGN_H_
-
-#define DIVUP(x, y) \
-    (((x)+(y)-1)/(y))
-
-#define ROUNDUP(x, y) \
-    (DIVUP((x), (y))*(y))
-
-#define ALIGN_POWER(x, y) \
-    ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-#if !__CUDA_ARCH__
-  #ifndef __host__
-    #define __host__
-  #endif
-  #ifndef __device__
-    #define __device__
-  #endif
-#endif
-
-template<typename X, typename Y, typename Z = decltype(X()+Y())>
-__host__ __device__ constexpr Z divUp(X x, Y y) {
-  return (x+y-1)/y;
-}
-
-template<typename X, typename Y, typename Z = decltype(X()+Y())>
-__host__ __device__ constexpr Z roundUp(X x, Y y) {
-  return (x+y-1) - (x+y-1)%y;
-}
-
-// assumes second argument is a power of 2
-template<typename X, typename Z = decltype(X()+int())>
-__host__ __device__ constexpr Z alignUp(X x, int a) {
-  return (x+a-1) & Z(-a);
-}
-
-#endif
@@ -9,7 +9,7 @@

 #include "nccl.h"
 #include "checks.h"
-#include "align.h"
+#include "bitops.h"
 #include "utils.h"
 #include "p2p.h"
 #include <sys/mman.h>
@@ -19,18 +19,25 @@

 uint64_t clockNano(); // from utils.h with which we have a circular dependency

+template<typename T>
+constexpr size_t ncclSizeOfT() { return sizeof(T); }
+template<>
+constexpr size_t ncclSizeOfT<void>() { return 1; }
+
 template <typename T>
 ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
  ncclResult_t result = ncclSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish);
-  memset(*ptr, 0, nelem*sizeof(T));
+  if (nelem > 0) {
+    CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*ncclSizeOfT<T>(), cudaHostAllocMapped), result, finish);
+    memset(*ptr, 0, nelem*ncclSizeOfT<T>());
+  }
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T));
-  INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
+  if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", nelem*ncclSizeOfT<T>());
+  INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
  return result;
 }
 #define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
@@ -42,14 +49,18 @@ inline ncclResult_t ncclCudaHostFree(void* ptr) {

 template <typename T>
 ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
-  void* p = malloc(nelem*sizeof(T));
-  if (p == NULL) {
-    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
-    return ncclSystemError;
+  if (nelem > 0) {
+    void* p = malloc(nelem*ncclSizeOfT<T>());
+    if (p == NULL) {
+      WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
+      return ncclSystemError;
+    }
+    //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), p);
+    memset(p, 0, nelem*ncclSizeOfT<T>());
+    *ptr = (T*)p;
+  } else {
+    *ptr = NULL;
  }
-  //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p);
-  memset(p, 0, nelem*sizeof(T));
-  *ptr = (T*)p;
  return ncclSuccess;
 }
 #define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
@@ -60,16 +71,16 @@ ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
  if (nelem == oldNelem) return ncclSuccess;

  T* oldp = *ptr;
-  T* p = (T*)malloc(nelem*sizeof(T));
+  T* p = (T*)malloc(nelem*ncclSizeOfT<T>());
  if (p == NULL) {
-    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+    WARN("Failed to malloc %ld bytes", nelem*ncclSizeOfT<T>());
    return ncclSystemError;
  }
-  memcpy(p, oldp, oldNelem*sizeof(T));
+  memcpy(p, oldp, oldNelem*ncclSizeOfT<T>());
  free(oldp);
-  memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T));
+  memset(p+oldNelem, 0, (nelem-oldNelem)*ncclSizeOfT<T>());
  *ptr = (T*)p;
-  INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr);
+  INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*ncclSizeOfT<T>(), nelem*ncclSizeOfT<T>(), *ptr);
  return ncclSuccess;
 }

@@ -111,7 +122,7 @@ static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHand
  accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1));
  if (handlep) *handlep = handle;
-  TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle);
+  TRACE(NCCL_ALLOC, "CuMem Alloc Size %zu pointer %p handle %llx", size, *ptr, handle);
  return result;
 }

@@ -123,7 +134,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
  CUCHECK(cuMemRetainAllocationHandle(&handle, ptr));
  CUCHECK(cuMemRelease(handle));
  CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-  TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle);
+  TRACE(NCCL_ALLOC, "CuMem Free Size %zu pointer %p handle 0x%llx", size, ptr, handle);
  CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size));
  CUCHECK(cuMemRelease(handle));
  CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
@@ -151,15 +162,17 @@ ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, in
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (ncclCuMemEnable()) {
-    NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
-  } else {
-    CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
+  if (nelem > 0) {
+    if (ncclCuMemEnable()) {
+      NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
+    } else {
+      CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
+    }
  }
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
-  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
+  if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA malloc %ld bytes", nelem*ncclSizeOfT<T>());
+  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
  return result;
 }
 #define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__)
@@ -170,21 +183,23 @@ ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, in
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  // Need a side stream so as not to interfere with graph capture.
-  cudaStream_t stream;
-  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  if (ncclCuMemEnable()) {
-    NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
-  } else {
-    CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
+  if (nelem > 0) {
+    // Need a side stream so as not to interfere with graph capture.
+    cudaStream_t stream;
+    CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    if (ncclCuMemEnable()) {
+      NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
+    } else {
+      CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
+    }
+    CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
+    CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
+    CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
  }
-  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
-  CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
-  CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish);
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T));
-  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
+  if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc %ld bytes", nelem*ncclSizeOfT<T>());
+  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
  return result;
 }
 #define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
@@ -195,16 +210,18 @@ ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  *ptr = nullptr;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (ncclCuMemEnable()) {
-    NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish);
-  } else {
-    CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
+  if (nelem > 0) {
+    if (ncclCuMemEnable()) {
+      NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT<T>()), result, finish);
+    } else {
+      CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT<T>()), result, finish);
+    }
+    CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*ncclSizeOfT<T>(), stream), result, finish);
  }
-  CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T));
-  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
+  if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA calloc async %ld bytes", nelem*ncclSizeOfT<T>());
+  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
  return result;
 }
 #define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__)
@@ -230,7 +247,7 @@ ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stre
  ncclResult_t result = ncclSuccess;
  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish);
+  CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*ncclSizeOfT<T>(), cudaMemcpyDefault, stream), result, finish);
 finish:
  CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
  return result;
@@ -256,13 +273,17 @@ finish:
 // allocated on separate pages as those pages will be marked DONTFORK
 // and if they are shared, that could cause a crash in a child process
 inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
-  size_t page_size = sysconf(_SC_PAGESIZE);
-  void* p;
-  int size_aligned = ROUNDUP(size, page_size);
-  int ret = posix_memalign(&p, page_size, size_aligned);
-  if (ret != 0) return ncclSystemError;
-  memset(p, 0, size);
-  *ptr = p;
+  if (size > 0) { // zero-size requests yield *ptr == NULL and still return ncclSuccess
+    size_t page_size = sysconf(_SC_PAGESIZE);
+    void* p;
+    size_t size_aligned = ROUNDUP(size, page_size); // size_t, not int: an int truncates requests of 2GiB or more
+    int ret = posix_memalign(&p, page_size, size_aligned); // page-aligned and padded to whole pages
+    if (ret != 0) return ncclSystemError;
+    memset(p, 0, size);
+    *ptr = p;
+  } else {
+    *ptr = NULL;
+  }
  INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr);
  return ncclSuccess;
 }
@@ -0,0 +1,277 @@
+/*************************************************************************
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_BITOPS_H_
+#define NCCL_BITOPS_H_
+
+#include <stdint.h>
+
+#if !__NVCC__
+  #ifndef __host__
+    #define __host__
+  #endif
+  #ifndef __device__
+    #define __device__
+  #endif
+#endif
+
+#define DIVUP(x, y) \
+    (((x)+(y)-1)/(y))
+
+#define ROUNDUP(x, y) \
+    (DIVUP((x), (y))*(y))
+
+#define ALIGN_POWER(x, y) \
+    ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
+
+#define ALIGN_SIZE(size, align) \
+  size = ((size + (align) - 1) / (align)) * (align);
+
+template<typename X, typename Y, typename Z = decltype(X()+Y())>
+__host__ __device__ constexpr Z divUp(X x, Y y) {
+  return (x+y-1)/y;
+}
+
+template<typename X, typename Y, typename Z = decltype(X()+Y())>
+__host__ __device__ constexpr Z roundUp(X x, Y y) {
+  return (x+y-1) - (x+y-1)%y;
+}
+template<typename X, typename Y, typename Z = decltype(X()+Y())>
+__host__ __device__ constexpr Z roundDown(X x, Y y) {
+  return x - x%y;
+}
+
+// assumes second argument is a power of 2
+template<typename X, typename Z = decltype(X()+int())>
+__host__ __device__ constexpr Z alignUp(X x, int a) {
+  return (x + a-1) & Z(-a);
+}
+// assumes second argument is a power of 2
+template<typename X, typename Z = decltype(X()+int())>
+__host__ __device__ constexpr Z alignDown(X x, int a) {
+  return x & Z(-a);
+}
+
+template<typename Int>
+inline __host__ __device__ int countOneBits(Int x) {
+#if __CUDA_ARCH__
+  if (sizeof(Int) <= sizeof(unsigned int)) {
+    return __popc((unsigned int)x);
+  } else if (sizeof(Int) <= sizeof(unsigned long long)) {
+    return __popcll((unsigned long long)x);
+  } else {
+    static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
+    return -1;
+  }
+#else
+  if (sizeof(Int) <= sizeof(unsigned int)) {
+    return __builtin_popcount((unsigned int)x);
+  } else if (sizeof(Int) <= sizeof(unsigned long)) {
+    return __builtin_popcountl((unsigned long)x);
+  } else if (sizeof(Int) <= sizeof(unsigned long long)) {
+    return __builtin_popcountll((unsigned long long)x);
+  } else {
+    static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
+    return -1;
+  }
+#endif
+}
+
+// Returns index of first one bit or returns -1 if mask is zero.
+template<typename Int>
+inline __host__ __device__ int firstOneBit(Int mask) {
+  int i;
+#if __CUDA_ARCH__
+  if (sizeof(Int) <= sizeof(int)) {
+    i = __ffs((int)mask);
+  } else if (sizeof(Int) <= sizeof(long long)) {
+    i = __ffsll((long long)mask);
+  } else {
+    static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size.");
+  }
+#else
+  if (sizeof(Int) <= sizeof(int)) {
+    i = __builtin_ffs((int)mask);
+  } else if (sizeof(Int) <= sizeof(long)) {
+    i = __builtin_ffsl((long)mask);
+  } else if (sizeof(Int) <= sizeof(long long)) {
+    i = __builtin_ffsll((long long)mask);
+  } else {
+    static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size.");
+  }
+#endif
+  return i-1;
+}
+
+template<typename Int>
+inline __host__ __device__ int popFirstOneBit(Int* mask) {
+  Int tmp = *mask;
+  *mask &= *mask-1;
+  return firstOneBit(tmp);
+}
+
+template<typename Int>
+inline __host__ __device__ int log2Down(Int x) {
+  int w, n;
+#if __CUDA_ARCH__
+  if (sizeof(Int) <= sizeof(int)) {
+    w = 8*sizeof(int);
+    n = __clz((int)x);
+  } else if (sizeof(Int) <= sizeof(long long)) {
+    w = 8*sizeof(long long);
+    n = __clzll((long long)x);
+  } else {
+    static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size.");
+  }
+#else
+  if (x == 0) {
+    return -1;
+  } else if (sizeof(Int) <= sizeof(unsigned int)) {
+    w = 8*sizeof(unsigned int);
+    n = __builtin_clz((unsigned int)x);
+  } else if (sizeof(Int) <= sizeof(unsigned long)) {
+    w = 8*sizeof(unsigned long);
+    n = __builtin_clzl((unsigned long)x);
+  } else if (sizeof(Int) <= sizeof(unsigned long long)) {
+    w = 8*sizeof(unsigned long long);
+    n = __builtin_clzll((unsigned long long)x);
+  } else {
+    static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
+  }
+#endif
+  return (w-1)-n;
+}
+
+template<typename Int>
+inline __host__ __device__ int log2Up(Int x) {
+  int w, n;
+  if (x != 0) x -= 1;
+#if __CUDA_ARCH__
+  if (sizeof(Int) <= sizeof(int)) {
+    w = 8*sizeof(int);
+    n = __clz((int)x);
+  } else if (sizeof(Int) <= sizeof(long long)) {
+    w = 8*sizeof(long long);
+    n = __clzll((long long)x);
+  } else {
+    static_assert(sizeof(Int) <= sizeof(long long), "Unsupported integer size.");
+  }
+#else
+  if (x == 0) {
+    return 0;
+  } else if (sizeof(Int) <= sizeof(unsigned int)) {
+    w = 8*sizeof(unsigned int);
+    n = __builtin_clz((unsigned int)x);
+  } else if (sizeof(Int) <= sizeof(unsigned long)) {
+    w = 8*sizeof(unsigned long);
+    n = __builtin_clzl((unsigned long)x);
+  } else if (sizeof(Int) <= sizeof(unsigned long long)) {
+    w = 8*sizeof(unsigned long long);
+    n = __builtin_clzll((unsigned long long)x);
+  } else {
+    static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer size.");
+  }
+#endif
+  return w-n;
+}
+
+template<typename Int>
+inline __host__ __device__ Int pow2Up(Int x) {
+  return Int(1)<<log2Up(x);
+}
+
+template<typename Int>
+inline __host__ __device__ Int pow2Down(Int x) {
+  return Int(1)<<log2Down(x);
+}
+
+template<typename UInt, int nSubBits>
+inline __host__ UInt reverseSubBits(UInt x) {
+  if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) {
+    switch (8*sizeof(UInt)) {
+    case 16: x = __builtin_bswap16(x); break;
+    case 32: x = __builtin_bswap32(x); break;
+    case 64: x = __builtin_bswap64(x); break;
+    default: static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type.");
+    }
+    return reverseSubBits<UInt, 8>(x);
+  } else if (nSubBits == 1) {
+    return x;
+  } else {
+    UInt m = UInt(-1)/((UInt(1)<<(nSubBits/2))+1);
+    x = (x & m)<<(nSubBits/2) | (x & ~m)>>(nSubBits/2);
+    return reverseSubBits<UInt, nSubBits/2>(x);
+  }
+}
+
+template<typename T> struct ncclToUnsigned;
+template<> struct ncclToUnsigned<char> { using type = unsigned char; };
+template<> struct ncclToUnsigned<signed char> { using type = unsigned char; };
+template<> struct ncclToUnsigned<unsigned char> { using type = unsigned char; };
+template<> struct ncclToUnsigned<signed short> { using type = unsigned short; };
+template<> struct ncclToUnsigned<unsigned short> { using type = unsigned short; };
+template<> struct ncclToUnsigned<signed int> { using type = unsigned int; };
+template<> struct ncclToUnsigned<unsigned int> { using type = unsigned int; };
+template<> struct ncclToUnsigned<signed long> { using type = unsigned long; };
+template<> struct ncclToUnsigned<unsigned long> { using type = unsigned long; };
+template<> struct ncclToUnsigned<signed long long> { using type = unsigned long long; };
+template<> struct ncclToUnsigned<unsigned long long> { using type = unsigned long long; };
+
+// Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's.
+template<typename Int>
+inline __host__ __device__ Int reverseBits(Int x, int nBits) {
+  using UInt = typename ncclToUnsigned<Int>::type;
+  union { UInt ux; Int sx; };
+  sx = x;
+  #if __CUDA_ARCH__
+    if (sizeof(Int) <= sizeof(unsigned int)) {
+      ux = __brev(ux);
+    } else if (sizeof(Int) <= sizeof(unsigned long long)) {
+      ux = __brevll(ux);
+    } else {
+      static_assert(sizeof(Int) <= sizeof(unsigned long long), "Unsupported integer type.");
+    }
+  #else
+    ux = reverseSubBits<UInt, 8*sizeof(UInt)>(ux);
+  #endif
+  ux = nBits==0 ? 0 : ux>>(8*sizeof(UInt)-nBits);
+  return sx;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Custom 8 bit floating point format for approximating 32 bit uints. This format
+// has nearly the full range of uint32_t except it only keeps the top 3 bits
+// beneath the leading 1 bit and thus has a max value of 0xf0000000.
+
+inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) {
+  int log2x;
+  #if __CUDA_ARCH__
+    log2x = 31-__clz(x|1);
+  #else
+    log2x = 31-__builtin_clz(x|1);
+  #endif
+  uint32_t mantissa = x>>(log2x >= bitsPerPow2 ? log2x-bitsPerPow2 : 0) & ((1u<<bitsPerPow2)-1);
+  uint32_t exponent = log2x >= bitsPerPow2 ? log2x-(bitsPerPow2-1) : 0;
+  return exponent<<bitsPerPow2 | mantissa;
+}
+
+inline __host__ __device__ uint32_t u32fpDecode(uint32_t x, int bitsPerPow2) { // inverse of u32fpEncode for the same bitsPerPow2
+  uint32_t exponent = x>>bitsPerPow2; // bits above the mantissa field
+  uint32_t mantissa = (x & ((1u<<bitsPerPow2)-1)) | (exponent!=0 ? (1u<<bitsPerPow2) : 0); // re-insert the implicit leading 1 (was hard-coded 0x8, i.e. bitsPerPow2==3 only)
+  if (exponent != 0) exponent -= 1; // exponents 0 and 1 both mean "no shift"
+  return mantissa<<exponent;
+}
+
+constexpr uint32_t u32fp8MaxValue() { return 0xf0000000; }
+
+inline __host__ __device__ uint8_t u32fp8Encode(uint32_t x) {
+  return u32fpEncode(x, 3);
+}
+inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) {
+  return u32fpDecode(x, 3);
+}
+
+#endif
@@ -7,42 +7,25 @@
 #ifndef NCCL_CHANNEL_H_
 #define NCCL_CHANNEL_H_
 #include "comm.h"
+#include "utils.h"
+
+#include <algorithm>

 ncclResult_t initChannel(struct ncclComm* comm, int channelid);
 ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
 ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
-static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
-  int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
-  int peerNode = comm->rankToNode[peer];
-  int peerIndex = comm->rankToLocalRank[peer];
-  int nsteps = comm->maxLocalRanks;
-  int rankIndex = comm->rankToLocalRank[comm->rank];
-  int step, delta;
-  if (coll == ncclFuncSend) {
-    step = (nsteps + peerIndex - rankIndex)%nsteps;
-    delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
-  } else if (coll == ncclFuncRecv) {
-    step = (nsteps + rankIndex - peerIndex)%nsteps;
-    delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
+
+inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) {
+  if (comm->nNodes > 1) {
+    int nodeDelta = p2pRound/comm->maxLocalRanks;
+    int localDelta = p2pRound%comm->maxLocalRanks;
+    int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH);
+    base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH;
+    return base & 0xff;
  } else {
-    return ncclInternalError;
+    return p2pRound & 0xff;
  }
-  *channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
-  return ncclSuccess;
-}
-
-static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
-  //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
-  *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels;
-  return ncclSuccess;
-}
-
-static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) {
-  int base;
-  NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base));
-  NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId));
-  return ncclSuccess;
 }

 #endif
@@ -123,23 +123,23 @@
 } while (0);

 #define NCCLWAIT(call, cond, abortFlagPtr) do {         \
-  volatile uint32_t* tmpAbortFlag = (abortFlagPtr);     \
+  uint32_t* tmpAbortFlag = (abortFlagPtr);     \
  ncclResult_t RES = call;                \
  if (RES != ncclSuccess && RES != ncclInProgress) {               \
    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
    return ncclInternalError;             \
  }                                       \
-  if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
+  if (tmpAbortFlag && __atomic_load_n(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECK(*tmpAbortFlag, 0); /* NULL flag pointer skips the abort check */ \
 } while (!(cond));

 #define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
-  volatile uint32_t* tmpAbortFlag = (abortFlagPtr);             \
+  uint32_t* tmpAbortFlag = (abortFlagPtr);             \
  RES = call;                             \
  if (RES != ncclSuccess && RES != ncclInProgress) {               \
    if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES);    \
    goto label;                           \
  }                                       \
-  if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
+  if (tmpAbortFlag && __atomic_load_n(tmpAbortFlag, __ATOMIC_ACQUIRE)) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); /* NULL flag pointer skips the abort check */ \
 } while (!(cond));

 #define NCCLCHECKTHREAD(a, args) do { \
@@ -8,6 +8,8 @@
 #define NCCL_COLLECTIVES_H_

 #include "nccl.h"
+#include "nccl_common.h"
+#include "device.h"

 // CHUNKSIZE must be a multiple of SLICESIZE
 #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
@@ -22,6 +24,12 @@
 #define REDUCE_CHUNKSTEPS 1
 #define NCCL_MAX_SLICE_PER_CHUNK 2  // max value for CHUNKSTEPS/SLICESTEPS, must accord with above

+const char* ncclFuncToString(ncclFunc_t op);
+const char* ncclDevRedOpToString(ncclDevRedOp_t op);
+const char* ncclDatatypeToString(ncclDataType_t type);
+const char* ncclAlgoToString(int algo);
+const char* ncclProtoToString(int proto);
+
 inline int ncclTypeSize(ncclDataType_t type) {
  switch (type) {
  case ncclInt8:
@@ -7,7 +7,7 @@
 #ifndef NCCL_COMM_H_
 #define NCCL_COMM_H_

-#include "transport.h"
+//#include "transport.h"
 #include "p2p.h"
 #include "collectives.h"
 #include "nccl_tuner.h"
@@ -15,6 +15,7 @@
 #include "strongstream.h"
 #include "nccl_net.h"
 #include "register.h"
+#include "graph.h"

 #if CUDART_VERSION < 9000
 struct cudaLaunchParams {
@@ -144,7 +145,7 @@ struct ncclChannel {
  struct ncclNvls nvls;

  int id; // index of this channel
-  uint32_t workFifoSent; // last used work index+1
+  uint32_t workFifoProduced; // +1 successor of last used work fifo byte

  /* comm split sharable resources */
  struct ncclChannelPeer* collnetPeers;
@@ -153,22 +154,15 @@ struct ncclChannel {
  struct ncclDevChannelPeer* nvlsDevPeers;
 };

-struct ncclWorkList {
+struct ncclWorkBatchList {
+  struct ncclWorkBatchList* next;
+  struct ncclDevWorkBatch batch;
+};
+struct alignas(16) ncclWorkList { // List node header; the work struct itself follows the node in memory.
  struct ncclWorkList* next;
-  struct ncclWork work;
-};
-
-struct ncclPointerList {
-  struct ncclPointerList* next;
-  void *ptr;
-};
-
-struct ncclNvlsMcHandleList {
-  struct ncclNvlsMcHandleList *next;
-  CUmemGenericAllocationHandle mcHandle;
-  CUdeviceptr ptr;
-  int dev;
-  size_t size;
+  enum ncclDevWorkType workType;
+  int size; // Size of struct following this node
+  // ncclDevWorkColl, ncclDevWorkCollReg, ncclDevWorkP2p[]...
 };

 struct ncclCollnetHandleList {
@@ -188,33 +182,190 @@ struct ncclKernelPlan {
  struct ncclKernelPlan* next;

  bool persistent; // aka captured in a graph
+  enum ncclDevWorkStorageType workStorageType;
  bool kernelSpecialized;
  void *kernelFn;
-  int channelUbound; // only channels c < channelUbound are present
-  int channelCount; // number of channels present
-  uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask)
+  struct ncclDevKernelArgs* kernelArgs;
+  size_t kernelArgsSize;
+  uint64_t channelMask; // bitset of which channels are present
  bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
  int threadPerBlock;
-  // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel()
-  struct ncclWork* workHead;

-  int collOpCount; // zero based for this plan
+  int collOpCount; // Number of collectives in this plan.
+  int nWorkBatches; // Number of work batches.
+  size_t workBytes; // Sum size of all work (in the fifo) in bytes.
+  struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
+  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> cleanupQueue;
+  void* workBufPersistent;

-  struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
-  struct ncclIntruQueue<struct ncclNvlsMcHandleList, &ncclNvlsMcHandleList::next> nvlsMcHandleQueue;
-  struct ncclIntruQueue<struct ncclCollnetHandleList, &ncclCollnetHandleList::next> collnetHandleQueue;
+  struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
+};

-  struct Channel {
-    int nWork;
-    union {
-      int nWorkElem; // used for coll and reg coll
-      int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1
-    };
-    size_t collBytes;
-    struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
-    struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
-  } channels[MAXCHANNELS];
-  size_t maxBytesPerChannel;
+////////////////////////////////////////////////////////////////////////////////
+
+struct ncclTaskColl { // One collective task; `next` links it into the sorter list and the planner's collTaskQueue.
+  struct ncclTaskColl* next;
+  ncclFunc_t func;
+  void const* sendbuff;
+  void* recvbuff;
+  size_t count;
+  int root;
+  ncclDataType_t datatype;
+  ncclRedOp_t opHost;
+  struct ncclDevRedOpFull opDev;
+  int chunkSteps, sliceSteps;
+  // Computed later:
+  size_t trafficBytes;
+  int32_t nMaxChannels:8;
+  int32_t nWarps:8;
+  int32_t algorithm:8, protocol:8;
+  uint32_t isCollnet:1, isNvls:1;
+  uint32_t devFuncId:30;
+  enum ncclRegBufferType regBufType;
+  // Number of cleanup-queue elements associated with this collective. NOTE(review): presumably planner->collCleanupQueue; this comment used to name ipcMemQueue, which the planner does not have -- confirm.
+  int nCleanupQueueElts;
+
+  void* sendMhandle;
+  void* recvMhandle;
+};
+struct ncclTaskP2p {
+  struct ncclTaskP2p* next;
+  void* buff;
+  size_t bytes;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Roughly sorts ncclTaskColl's by their size descending. This structure is
+// self-referential, meaning that pointers it contains internally may point
+// into the structure itself. This means that it is NOT memcpy-moveable:
+
+struct ncclTaskCollSorter {
+  static constexpr int UnitLog2 = 10; // 1K
+  static constexpr size_t UnitSize = 1<<UnitLog2;
+  static constexpr int MaxLog2 = 30; // 1GB
+  static constexpr size_t MaxSize = 1ull<<MaxLog2;
+  // Number of bins between powers of 2. For 4 bins, the worst case out-of-order
+  // relative magnitude is (5/4)-1 = 25%
+  static constexpr int BitsPerPow2 = 2;
+  static constexpr int BinsPerPow2 = 1<<BitsPerPow2;
+  static constexpr int BinCount = 1 + (MaxLog2-UnitLog2)*BinsPerPow2;
+
+  struct ncclTaskColl* head;
+  struct ncclTaskColl* tail;
+  // Least bin such that it and all above are empty.
+  int binEdge;
+  // Pointer to the pointer to this bin's head node which is either the
+  // previous node's `next` field or `head`.
+  struct ncclTaskColl** bins[BinCount];
+};
+
+inline void ncclTaskCollSorterInsert( // Insert task `x`, keyed by `size`, into the sorter's single linked list.
+    struct ncclTaskCollSorter* me, struct ncclTaskColl* x, size_t size
+  ) {
+  constexpr int UnitLog2 = ncclTaskCollSorter::UnitLog2;
+  constexpr size_t MaxSize = ncclTaskCollSorter::MaxSize;
+  constexpr int BitsPerPow2 = ncclTaskCollSorter::BitsPerPow2;
+  constexpr int BinCount = ncclTaskCollSorter::BinCount;
+  int bin = u32fpEncode(std::min(MaxSize, size)>>UnitLog2, BitsPerPow2); // size is clamped to MaxSize and scaled down by UnitLog2 bits before encoding
+  bin = BinCount-1 - bin; // flip the index so the bins run in descending order
+
+  if (me->bins[bin] == nullptr) { // first task for this bin: decide where the bin sits in the list
+    if (me->binEdge <= bin) { // this bin and every bin above it are empty, so x is appended at the list's end
+      me->binEdge = bin+1;
+      me->bins[bin] = me->tail ? &me->tail->next : &me->head;
+      me->tail = x;
+    } else {
+      // Find successor non-empty bin after this one.
+      int succ = bin+1;
+      while (me->bins[succ] == nullptr) succ++; // relies on the binEdge invariant: binEdge > bin means a non-empty bin exists above
+      // The link that pointed at our successor's head now points at our head.
+      me->bins[bin] = me->bins[succ];
+      // x is the first node of this bin and therefore its tail, so x->next
+      // becomes the link that points at our successor's head.
+      me->bins[succ] = &x->next;
+    }
+  }
+  // Push x as the new head of this bin.
+  x->next = *me->bins[bin];
+  *me->bins[bin] = x;
+}
+
+inline bool ncclTaskCollSorterEmpty(struct ncclTaskCollSorter* me) {
+  return me->head == nullptr;
+}
+
+// Detach and return the sorter's linked list of coll tasks (nullptr if it holds none). The sorter is zeroed only when it held at least one task.
+inline struct ncclTaskColl* ncclTaskCollSorterDequeueAll(struct ncclTaskCollSorter* me) {
+  struct ncclTaskColl* head = me->head;
+  if (head != nullptr) memset(me, 0, sizeof(*me)); // clears head, tail, binEdge and every bins[] entry in one go
+  return head;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+struct ncclCudaStreamList {
+  struct ncclCudaStreamList *next;
+  cudaStream_t stream;
+};
+
+struct ncclKernelPlanner { // Planning state; embedded in ncclComm as `planner`.
+  //////////////////////////////////////////////////////////////////////////////
+  // State for accumulating tasks between ncclGroupStart/End()
+  //////////////////////////////////////////////////////////////////////////////
+
+  struct Peer {
+    bool sendSeen, recvSeen;
+    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
+    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
+  };
+  struct ncclTaskCollSorter collSorter;
+  struct Peer* peers/*[nRanks]*/;
+  int nTasksColl, nTasksP2p;
+  bool persistent;
+
+  // The list of user streams aggregated over all tasks present.
+  struct ncclCudaStreamList* streams;
+  // The most recent user stream. Ignored if streams==nullptr
+  cudaStream_t streamRecent;
+  // The graph capturing all user streams or invalid if none. Thus we restrict the
+  // user that all streams must be captured in the same graph or not captured
+  // at all. Technically we could probably relax this, but that would mean
+  // collecting a different planner state (formerly `ncclTasks`) per graph and one for non-graph.
+  struct ncclCudaGraph capturingGraph;
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Lists of tasks to be assembled into plans.
+  //////////////////////////////////////////////////////////////////////////////
+
+  struct ncclIntruQueue<struct ncclTaskColl, &ncclTaskColl::next> collTaskQueue;
+  struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> collWorkQueue;
+  struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next> collCleanupQueue;
+
+  //////////////////////////////////////////////////////////////////////////////
+  // State for building current (Work-In-Progress) plan:
+  //////////////////////////////////////////////////////////////////////////////
+
+  struct WipPlan {
+    struct Channel {
+      struct {
+        int workBytes; // Sum size of work metadata referenced by this batch.
+        int nP2ps; // Number of p2p works in this batch
+        int p2pRounds[NCCL_MAX_DEV_WORK_P2P_PER_BATCH]; // which rounds are present in this batch.
+      } wipBatch; // work-in-progress batch which will be next tail of workBatchQueue
+      int nWorkBatchesP2p; // number of p2p batches for this channel.
+      struct ncclIntruQueue<struct ncclWorkBatchList, &ncclWorkBatchList::next> workBatchQueue;
+      struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
+    } channels[MAXCHANNELS];
+  } wipPlan;
+
+  //////////////////////////////////////////////////////////////////////////////
+  // State for launching built plans:
+  //////////////////////////////////////////////////////////////////////////////
+
+  // List of kernel plans built from tasks.
+  struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
+  // First of the unlaunched kernels in `planQueue`
+  struct ncclKernelPlan* unlaunchedPlansHead;
 };

 #define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28.
@@ -233,12 +384,18 @@ struct ncclComm {
  struct ncclPeerInfo* peerInfo;
  struct ncclTopoSystem* topo;

+  int netPluginLoaded;
  ncclNet_t* ncclNet;
+  ncclNetDeviceType netDeviceType;
  ncclCollNet_t* ncclCollNet;
  void* bootstrap;
  // Bitmasks for ncclTransportP2pSetup
  uint64_t* connectSend;
  uint64_t* connectRecv;
+  struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS];
+  bool initAlgoChannels[NCCL_NUM_ALGORITHMS];
+  bool runtimeConn; // if dynamic connection is supported
+  int cuMemSupport;

  uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.

@@ -253,6 +410,9 @@ struct ncclComm {
  cpu_set_t cpuAffinity; // CPU affinity of the GPU
  int cudaArch; // matches __CUDA_ARCH__ of device

+  int cpuArch;   // architecture - As defined in src/include/graph.h, e.g. x86/arm/ppc/mixed
+  int cpuVendor; // vendor - As defined in src/include/graph.h
+
  int node;
  int nNodes;
  int localRank;
@@ -278,10 +438,11 @@ struct ncclComm {
  int nChannels; // connection nChannels
  int collChannels; // enqueue nChannels
  int nvlsChannels; // enqueue nChannels
+  // all nvls heads stored to check if we can splitShare
+  int nvlsHeads[MAXCHANNELS];
  // Channels (per peer) for p2p
  int p2pnChannels;
  int p2pnChannelsPerPeer;
-  int p2pChannels[MAXCHANNELS];

  // Should this comm allocate LL buffers for network P2P connections?
  bool allocP2pNetLLBuffers;
@@ -303,23 +464,28 @@ struct ncclComm {
  ncclResult_t asyncResult;

  // Flag to ask NCCL kernels to abort
-  volatile uint32_t *abortFlag;
-  volatile uint32_t *childAbortFlag;
-  uint32_t *abortFlagRefCount;
+  uint32_t* abortFlag;
+  uint32_t* abortFlagDev;
+  int* abortFlagRefCount;
+  uint32_t* childAbortFlag;
+  uint32_t* childAbortFlagDev;
+  uint32_t destroyFlag;

  // Device side of the communicator (for cudaFree's)
  struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm

-  // Operation pool.
-  int workFifoDepth; // size of workFifoHeap[], power of 2
-  struct ncclWork* workFifoHeap;
-  struct ncclWork* devWorkFifoHeap;
-  void* workFifoHeapGdrHandle;
+  uint32_t workArgsBytes; // max size of kernel args
+  uint32_t workFifoBytes; // size of workFifoBuf, power of 2
+  void* workFifoBuf;
+  void* workFifoBufDev;
+  void* workFifoBufGdrHandle;

-  // Work completion notificaion
-  uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
-  uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
-  uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.
+  // Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory.
+  uint32_t* workFifoConsumed/*[MAXCHANNELS]*/;
+  // Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS)
+  uint32_t workFifoConsumedLeast;
+  // Monotonic number of bytes (mod 1<<32) sent to fifo.
+  uint32_t workFifoProduced;

  // Intra-process sync
  struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
@@ -337,7 +503,7 @@ struct ncclComm {
  // Whether this communicator uses collNet
  int collNetSupport;
  bool collNetRegSupport;
-  uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
+  uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes];
  int intraHighestTransportType;
  int* collNetHeads;
  int collNetHeadsNum;
@@ -355,16 +521,16 @@ struct ncclComm {
  // pools backed by comm->memPermanent
  struct ncclMemoryPool memPool_ncclProxyOp;
  struct ncclMemoryPool memPool_ncclKernelPlan;
-  struct ncclMemoryPool memPool_ncclPointerList;
-  struct ncclMemoryPool memPool_ncclNvlsHandleList;
-  struct ncclMemoryPool memPool_ncclCollnetHandleList;
+
  // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
  // this comm is not yet in a group.
  struct ncclComm* groupNext;
  // Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
  struct ncclComm* preconnectNext;
  int persistentRefs; // number of persistent plan-lists capturing this comm
-  struct ncclTasks tasks;
+  struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule;
+
+  struct ncclKernelPlanner planner;

  // user-created reduction ops
  int userRedOpCapacity, userRedOpFreeHead;
@@ -373,11 +539,6 @@ struct ncclComm {
  // Queue of things for the main thread to do
  struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;

-  // List of kernel plans built form tasks.
-  struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
-  // First of the unlaunched kernels in `planQueue`
-  struct ncclKernelPlan* unlaunchedPlansHead;
-
  ncclConfig_t config;
  // initState is to more conveniently reclaim resources when errors happen.
  ncclResult_t initState;
@@ -389,6 +550,7 @@ struct ncclComm {
  struct ncclGroupJob *groupJob;

  // Tuning plugin
+  int tunerPluginLoaded;
  ncclTuner_t* tuner;
  void *tunerContext;
  // buffer registration cache
@@ -80,6 +80,10 @@ DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent);
 DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
 DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice);
 DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute);
+DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel);
+#if CUDART_VERSION >= 11080
+DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx);
+#endif
 // cuMem API support
 DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve);
 DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree);
@@ -10,21 +10,14 @@
 #include "nccl.h"
 #include "nccl_common.h"
 #include <stdio.h>
-#include <chrono>
-#include <type_traits>

-#include <limits.h>
-#include <string.h>
 #include <pthread.h>

 // Conform to pthread and NVTX standard
 #define NCCL_THREAD_NAMELEN 16

 extern int ncclDebugLevel;
-extern uint64_t ncclDebugMask;
-extern pthread_mutex_t ncclDebugLock;
 extern FILE *ncclDebugFile;
-extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);

 void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));

@@ -32,13 +25,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
 extern thread_local int ncclDebugNoWarn;
 extern char ncclLastError[];

+#define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
 #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
 #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
 #define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)

 #ifdef ENABLE_TRACE
 #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
-extern std::chrono::steady_clock::time_point ncclEpoch;
 #else
 #define TRACE(...)
 #endif
@@ -9,8 +9,10 @@

 #include "nccl.h"
 #include "nccl_common.h"
-#include "align.h"
+#include "bitops.h"
+#include <algorithm>
 #include <stdint.h>
+#include <sys/types.h>

 extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];

@@ -21,6 +23,12 @@ extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
 #define NCCL_MAX_OPS 2048
 #define NCCL_STEPS 8

+#ifdef __CUDA_ARCH__
+  #define NCCL_CUDA_ARCH __CUDA_ARCH__
+#else
+  #define NCCL_CUDA_ARCH 0
+#endif
+
 #include "net_device.h"

 enum ncclDevRedOp_t {
@@ -52,8 +60,11 @@ union ncclLLFifoLine {

 #define WARP_SIZE 32
 #define MAXCHANNELS 32
+#define NCCL_MAX_LOCAL_RANKS 64
 #define NCCL_MAX_NTHREADS 640
+#define NCCL_MIN_NTHREADS (4*WARP_SIZE)
 #define NCCL_SIMPLE_MAX_NTHREADS 512
+#define NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE (3*WARP_SIZE)
 #define NCCL_LL_MAX_NTHREADS 512
 #define NCCL_LL_LINES_PER_THREAD 8
 #ifdef TEST_LL_CLEANUP
@@ -84,6 +95,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
 #define NCCL_IPC_READ     0x10
 #define NCCL_NVLS_MIN_POLL 0x20

+// Number of named barriers supported by CUDA
+#define NCCL_MAX_GROUPS 16
+
 #define NCCL_MAX_COLLNET_SIZE (1L << 29)

 enum ncclRegBufferType {
@@ -196,112 +210,155 @@ struct ncclChannelPeer {

 struct ncclDevComm;

-/* ncclWork is to be a power of two, currently 8x64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclWorkElem. */
-#define NCCL_WORK_SIZE 512
+struct alignas(16) ncclDevWorkP2p {
+  void *sendAddr, *recvAddr;
+  size_t sendBytes, recvBytes;
+  int sendRank, recvRank;
+  // From the part index, nP2pChannels, and channelBase the device code can
+  // calculate which part of the transfer a channel is responsible for.
+  uint8_t nP2pChannels; // Always equal to comm->p2pnChannels
+  uint8_t channelBase; // Channel owning first part.
+  // Zero channels indicates no work in that direction.
+  uint8_t nSendChannels, nRecvChannels;
+  // Chunk size stored in 8 bits via u32fp8Encode/Decode.
+  uint8_t sendChunkSize_u32fp8, recvChunkSize_u32fp8;

-enum ncclWorkType : uint8_t {
-   ncclWorkTypeUnused=0,
-   ncclWorkTypeColl=1,
-   ncclWorkTypeP2p=2,
-   ncclWorkTypeRegColl=3
-};
-enum ncclWorkP2PType : uint8_t {
-  ncclWorkP2pTypeUnused=0,
-  ncclWorkP2pTypeSend,
-  ncclWorkP2pTypeRecv
+  uint8_t sendProtoLL:1, recvProtoLL:1;
+  uint8_t sendRegistered:1, recvRegistered:1;
 };

-struct ncclWorkHeader {
-  union {
-    int32_t workNext;  // when isLast=0: Offset from kernel argument workHead
-    uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
-  };
-  uint16_t funcIndex;
-  uint8_t isLast:1; // last work for this kernel
-  uint8_t inFifo:1; // is this work in the fifo
-  enum ncclWorkType type;
-};
+// Compute the byte range [*partBeg, *partEnd) that part index `part` (out of `nParts`) covers in a transfer of `bytes` bytes. Both bounds are clamped to `bytes`.
+inline __host__ __device__ void ncclP2pPartBounds(int nParts, int part, size_t bytes, size_t* partBeg, size_t* partEnd) {
+  size_t partBytes = alignUp(divUp(bytes, nParts), 4<<10); // per-part size; presumably ceil(bytes/nParts) rounded up to a multiple of 4096 -- alignUp/divUp are defined elsewhere, confirm
+  #if __CUDA_ARCH__
+    *partBeg = min((part+0)*partBytes, bytes);
+    *partEnd = min((part+1)*partBytes, bytes);
+  #else
+    *partBeg = std::min<size_t>((part+0)*partBytes, bytes);
+    *partEnd = std::min<size_t>((part+1)*partBytes, bytes);
+  #endif
+}

-struct ncclWorkElem {
-  union {
-    uint8_t flagBits;
-    struct {
-      uint8_t isUsed:1, redOpArgIsPtr:1, oneNode:1;
-    };
-  };
-  uint8_t regUsed;
-  uint8_t nWarps;
-  uint8_t direct;
+// implemented in channel.h
+inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound);
+
+// ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. The device code
+// uses ncclP2pChannelToPart to determine which part "this" channel is responsible for.
+inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part) {
+  // Only works because nP2pChannels is pow2
+  int nChannelsLog2 = countOneBits(nP2pChannels-1);
+  int delta = reverseBits(part, nChannelsLog2);
+  return (base + delta) & (nP2pChannels-1);
+}
+inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel) {
+  // Only works because nP2pChannels is pow2
+  int nChannelsLog2 = countOneBits(nP2pChannels-1);
+  int delta = (channel-base) & (nP2pChannels-1);
+  return reverseBits(delta, nChannelsLog2);
+}
+
+struct alignas(16) ncclDevWorkColl {
+  // Running on channels [channelLo..channelHi], hi is inclusive.
+  //   nChannels == (channelHi - channelLo) + 1
+  uint32_t channelLo:8, channelHi:8;
+  uint32_t nWarps:8;
+  uint32_t redOpArgIsPtr:1, regUsed:2, oneNode:1, direct:4;
  uint32_t root;
-  const void *sendbuff;
-  void *recvbuff;
-
-  size_t count;
-  uint64_t redOpArg;
-  uint64_t chunkCount:25, workCount:39;
+  void* recvbuff;
+  void* sendbuff;
  union {
+    // Continuous-byte-distribution scheduling. The lo and hi channels are of
+    // different size than the channels in the middle.
    struct {
-      uint64_t lastChunkCount:25;
-      uint64_t workOffset:39;
-    };
+      size_t countLo, countMid, countHi;
+      // Chunk counts where units are ncclProtoGrainSize(protocol) bytes
+      uint64_t chunkGrainsLo:21, chunkGrainsMid:21, chunkGrainsHi:21;
+    } cbd;
+    // Collnet scheduling. All channels divide work evenly.
    struct {
-      uint64_t bid:32;
-      uint64_t nChannels:32;
-    };
+      size_t count; // Total size, not divided per channel.
+      uint32_t chunkCount;
+    } collnet;
  };
+  uint64_t redOpArg;
 };

-#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem))
-static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9");

-struct ncclWorkElemP2p {
-  int peer : 30;
-  int proto : 2;
+__host__ __device__ constexpr int ncclProtoGrainSize(int proto) { // Grain size in bytes for `proto` (the unit of the chunkGrains* fields in ncclDevWorkColl).
+  return proto == NCCL_PROTO_LL ? 16 :
+         proto == NCCL_PROTO_LL128 ? WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD/NCCL_LL128_LINEELEMS*NCCL_LL128_DATAELEMS*sizeof(uint64_t) :
+         proto == NCCL_PROTO_SIMPLE ? 512 :
+         -1; // any value other than the three protocols tested above
+}

-  enum ncclWorkP2PType p2pType;
-  uint8_t reg:1;
-  uint8_t nWarps:5;
-  uint8_t warpStart;
-  uint8_t ngroups;
-  // Important not to use any fields with greater than 4-byte alignment since
-  // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
-  // there were 8-byte fields.
-  //void* buff;
-  uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
-  //size_t count;
-  uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
-  int chunkSize;
-};
+template<typename Int> // Int: integer type used for all four outputs
+__host__ __device__ inline void ncclCollCbdPart( // Compute channel `channelId`'s part of a collective described by work->cbd.
+    struct ncclDevWorkColl* work, uint32_t channelId, int proto, int eltSize,
+    Int* count, Int* partOffset, Int* partCount, Int* chunkCount
+  ) {
+  int eltPerGrain = ncclProtoGrainSize(proto)/eltSize; // converts chunk sizes from grains to elements
+  int nMidChannels = work->channelHi - work->channelLo - 1; // channels strictly between lo and hi; -1 when lo == hi
+  // We can assume that nMidChannels<0 implies countMid==0, which lets us assume
+  // that countMid*nMidChannels == 0.
+  if (count != nullptr) { // `count` is optional; the other three outputs are always written
+    *count = work->cbd.countLo + work->cbd.countMid*nMidChannels + work->cbd.countHi;
+  }
+  if (channelId == work->channelLo) {
+    *partOffset = 0;
+    *partCount = work->cbd.countLo;
+    *chunkCount = work->cbd.chunkGrainsLo*eltPerGrain;
+  } else if (channelId == work->channelHi) {
+    *partOffset = work->cbd.countLo + nMidChannels*work->cbd.countMid;
+    *partCount = work->cbd.countHi;
+    *chunkCount = work->cbd.chunkGrainsHi*eltPerGrain;
+  } else {
+    int mid = channelId - work->channelLo - 1; // zero-based index among the middle channels
+    *partOffset = work->cbd.countLo + mid*work->cbd.countMid;
+    *partCount = work->cbd.countMid;
+    *chunkCount = work->cbd.chunkGrainsMid*eltPerGrain;
+  }
+}

-static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16");
-#define NCCL_MAX_WORK_ELEMENTS_P2P 16
-
-struct ncclWorkElemReg {
-  struct ncclWorkElem elem;
+struct alignas(16) ncclDevWorkCollReg {
+  struct ncclDevWorkColl coll;
  void* dnInputs[NCCL_MAX_DIRECT_ARITY+1];
  void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
  void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
 };

-#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg))
-static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2");
-
-// Number of named barriers supported by CUDA
-#define NCCL_MAX_GROUPS 16
-
-struct ncclWork {
-  struct ncclWorkHeader header;
-  union {
-    char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)];
-    struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
-    struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
-    struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
-  };
+enum ncclDevWorkType: uint8_t {
+  ncclDevWorkTypeP2p,
+  ncclDevWorkTypeColl,
+  ncclDevWorkTypeCollReg
+};
+
+constexpr size_t ncclDevWorkSize(enum ncclDevWorkType type) {
+  return type == ncclDevWorkTypeP2p ? sizeof(ncclDevWorkP2p) :
+         type == ncclDevWorkTypeColl ? sizeof(ncclDevWorkColl) : sizeof(ncclDevWorkCollReg);
+}
+
+#define NCCL_MAX_DEV_WORK_BATCH_BYTES 1024
+#define NCCL_MAX_DEV_WORK_BATCH_COLLS (NCCL_MAX_DEV_WORK_BATCH_BYTES/sizeof(ncclDevWorkColl))
+#define NCCL_MAX_DEV_WORK_P2P_PER_BATCH 8
+struct alignas(16) ncclDevWorkBatch {
+  union {
+    struct {
+      // nextExtends: should next one be merged into this one.
+      // nextJump=0: end of this channel's batch list
+      // nextJump>0: batches[thisIndex+nextJump] is next batch in this list
+      uint32_t nextJump:14, nextExtends:1;
+      uint32_t workType:2, funcId:15; // the bitfield widths (14+1+2+15) total 32 bits, the size of `flags`
+    };
+    // Unioning bitfields with underlying type hints compiler to emit the best
+    // SASS LD/ST accesses.
+    uint32_t flags;
+  };
+  // Rolling offset in fifo where this batch's work structs begin
+  uint32_t offsetBase;
+  // Set of relative offsets from offsetBase for this channel's subset of the batch:
+  // For each bit index i set in offsetBitset, find work at fifo offset: offsetBase + i*sizeof(WorkStructType)
+  uint64_t offsetBitset;
 };
-static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE");
-static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0");

 struct ncclDevChannelPeer {
  // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo
@@ -328,9 +385,8 @@ struct ncclDevComm {
  int buffSizes[NCCL_NUM_PROTOCOLS];
  int p2pChunkSize;

-  // Operation list for aggregation
-  int workFifoDepth;
-  struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
+  // Work fifo return credits
+  uint32_t* workConsumed/*[MAXCHANNELS]*/;

  int* collNetDenseToUserRank;

@@ -346,11 +402,37 @@ struct alignas(16) ncclDevCommAndChannels {
  struct ncclDevChannel channels[MAXCHANNELS];
 };

-#ifdef __CUDA_ARCH__
-  #define NCCL_CUDA_ARCH __CUDA_ARCH__
-#else
-  #define NCCL_CUDA_ARCH 0
-#endif
+enum ncclDevWorkStorageType: uint8_t {
+  ncclDevWorkStorageTypeArgs=0,
+  ncclDevWorkStorageTypeFifo=1,
+  ncclDevWorkStorageTypePersistent=2
+};
+
+struct alignas(16) ncclDevKernelArgs {
+  struct ncclDevComm* comm;
+  uint64_t channelMask;
+  enum ncclDevWorkStorageType workStorageType;
+  uint32_t workMask;
+  void* workBuf;
+  // A channel's first batch is at `blockIdx.x`. Use `nextJump` to follow rest of list.
+  // struct ncclDevWorkBatch batches[];
+};
+
+__host__ __device__ constexpr int ncclMaxKernelArgsSize(/*int cudaDriver, */int cudaArch=NCCL_CUDA_ARCH) {
+  //return (cudaArch < 700 || cudaDriver < 12010) ? 4<<10 : (32<<10)-4;
+  return 4<<10;
+}
+
+template<size_t capacity>
+struct alignas(16) ncclDevKernelArgsStorage {
+  union {
+    struct ncclDevKernelArgs args;
+    ulong2 storage[capacity/sizeof(ulong2)];
+  };
+};
+
+typedef ncclDevKernelArgsStorage<(4<<10)> ncclDevKernelArgs4K;
+//typedef ncclDevKernelArgsStorage<(32<<10)-4> ncclDevKernelArgs31K;

 template<typename T>
 __host__ __device__ constexpr T min_constexpr(T a) { return a; }
@@ -366,6 +448,10 @@ __host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) {
  return max_constexpr<T>((a > b ? a : b), c...);
 }

+constexpr int ncclDevMaxChannelsForArgsBytes(size_t argsBytes) { // Number of ncclDevWorkBatch entries that fit after the ncclDevKernelArgs header, capped at MAXCHANNELS.
+  return min_constexpr<size_t>(MAXCHANNELS, (argsBytes - sizeof(struct ncclDevKernelArgs))/sizeof(struct ncclDevWorkBatch)); // NOTE(review): the subtraction is unsigned, so argsBytes smaller than the header wraps and yields MAXCHANNELS rather than 0 -- confirm callers never pass that
+}
+
 // Calculate the unroll factor given:
 // * bytePerPack: number of bytes accessed per instruction
 // * insns: max permissible unroll value
@@ -412,6 +498,7 @@ extern int const ncclDevKernelCount;
 extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];

 // Table of most specialized kernel function to run given func index.
+extern int const ncclDevFuncIdCount;
 extern int const ncclDevFuncRowToId[];
 extern void* const ncclDevKernelForFunc[/*funcIndex*/];
 extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/];
@@ -24,5 +24,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru
 ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
 ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
 ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
+ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo);

 #endif // End include guard
@@ -8,6 +8,7 @@
 #define NCCL_GDRWRAP_H_

 #include "nccl.h"
+#include "alloc.h"
 #include <stdint.h> // for standard [u]intX_t types
 #include <stdio.h>
 #include <stdlib.h>
@@ -194,7 +195,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
  char *devMem;
  void *gdrMap;

-  mapSize = sizeof(T)*nelem;
+  mapSize = ncclSizeOfT<T>()*nelem;

  // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
  ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
@@ -203,7 +204,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
  uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
  size_t align = alignedAddr - (uint64_t)devMem;

-  //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
+  //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zu size %zu", alignedAddr, devMem, align, mapSize);
  NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh));

  NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize));
@@ -226,7 +227,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
  *ptr = (T *)((char *)gdrMap+off);
  if (devPtr) *devPtr = (T *)(devMem+off+align);

-  TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
+  TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zu at %p",
       md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);

  return ncclSuccess;
@@ -235,7 +236,7 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
 template <typename T>
 static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
  gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
-  NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T)));
+  NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*ncclSizeOfT<T>()));
  return ncclSuccess;
 }

@@ -29,6 +29,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
 ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
 ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
 int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
+ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);

 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
@@ -46,9 +47,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
 #define NCCL_TOPO_CPU_ARCH_X86 1
 #define NCCL_TOPO_CPU_ARCH_POWER 2
 #define NCCL_TOPO_CPU_ARCH_ARM 3
+#define NCCL_TOPO_CPU_ARCH_MIXED 4
 #define NCCL_TOPO_CPU_VENDOR_INTEL 1
 #define NCCL_TOPO_CPU_VENDOR_AMD 2
 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
+#define NCCL_TOPO_CPU_VENDOR_MIXED 4
 #define NCCL_TOPO_CPU_TYPE_BDW 1
 #define NCCL_TOPO_CPU_TYPE_SKL 2
 #define NCCL_TOPO_CPU_TYPE_YONGFENG 1
@@ -70,6 +73,7 @@ ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
 #define NCCL_TOPO_PATTERN_TREE 3            // All NIC traffic going to/from the same GPU
 #define NCCL_TOPO_PATTERN_RING 4            // Ring
 #define NCCL_TOPO_PATTERN_NVLS 5            // NVLS+SHARP and NVLS+Tree
+#define NCCL_TOPO_PATTERN_COLLNET_DIRECT 6  // Collnet Direct
 struct ncclTopoGraph {
  // Input / output
  int id; // ring : 0, tree : 1, collnet : 2
@@ -113,7 +117,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
    struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent);

 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
-#include "info.h"
-ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL);
+ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup=nullptr);

 #endif
@@ -35,9 +35,12 @@ struct ncclAsyncJob {
  void(*undo)(struct ncclAsyncJob*);
  void(*destructor)(void*);
  ncclGroupJobState_t state;
-  volatile uint32_t *abortFlag; /* point to comm abortFlag */
-  volatile uint32_t *childAbortFlag; /* point to child abortFlag */
+  uint32_t* abortFlag; /* point to comm abortFlag */
+  uint32_t* abortFlagDev; /* point to comm abortFlagDev */
+  uint32_t* childAbortFlag; /* point to child abortFlag */
+  uint32_t* childAbortFlagDev; /* point to child abortFlagDev */
  ncclComm_t comm;
+  int destroyFlag;
 };

 ncclResult_t ncclAsyncLaunch(
@@ -52,14 +55,14 @@ struct ncclGroupJob {
  struct ncclComm **groupCommHeadPtr;
  struct ncclComm **groupCommPreconnectHeadPtr;
  ncclResult_t *groupErrorPtr;
-  volatile bool *abortFlagPtr;
+  bool *abortFlagPtr;
  int *groupBlockingPtr;
  struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
  bool initialized;
 };

 ncclResult_t ncclGroupStartInternal();
-ncclResult_t ncclGroupEndInternal();
+ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo = NULL);
 ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);

 ////////////////////////////////////////////////////////////////////////////////
@@ -114,6 +117,10 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) {
    // Comms gets a new memory stack scope upon joining. Each task batched for
    // this comm is allocated there.
    ncclMemoryStackPush(&comm->memScoped);
+    // Initialize planner
+    ncclKernelPlanner::Peer* tmp = comm->planner.peers;
+    memset(&comm->planner, 0, sizeof(comm->planner));
+    comm->planner.peers = tmp;
  }

  ncclGroupBlocking = comm->config.blocking;
@@ -8,28 +8,9 @@
 #define NCCL_INFO_H_

 #include "nccl.h"
-#include "device.h"
 #include "collectives.h"
 #include "core.h"
 #include "utils.h"
-#include "strongstream.h"
-#define NCCL_MAX_LOCAL_RANKS 64
-
-typedef enum : uint8_t {
-  ncclPatternRing,
-  ncclPatternRingTwice,
-  ncclPatternPipelineFrom,
-  ncclPatternPipelineTo,
-  ncclPatternTreeUp,
-  ncclPatternTreeDown,
-  ncclPatternTreeUpDown,
-  ncclPatternCollnetChain,
-  ncclPatternCollnetDirect,
-  ncclPatternNvls,
-  ncclPatternNvlsTree,
-  ncclPatternSend,
-  ncclPatternRecv
-} ncclPattern_t;

 // Used to pass NCCL call information between functions
 struct ncclInfo {
@@ -47,110 +28,6 @@ struct ncclInfo {
  // Algorithm details
  int chunkSteps;
  int sliceSteps;
-  // Computed later
-  ncclDevRedOpFull opFull;
-  ncclPattern_t pattern;
-  size_t nBytes;
-  size_t aggnBytes;
-  size_t workBytes;
-  size_t sendbuffSize;
-  size_t recvbuffSize;
-  int stepSize;
-  int chunkCount;
-  int chunkSize;
-  int channelId;
-  int workFuncIndex;
-  ncclRegBufferType regBufType;
-  void* regBufSend[NCCL_MAX_LOCAL_RANKS];
-  void* regBufRecv[NCCL_MAX_LOCAL_RANKS];
-  // collnet buffer reg handles
-  void* sendMhandle;
-  void* recvMhandle;
-  // Need to initialize
-  int nThreads;
-  int nChannels;
-  int algorithm;
-  int protocol;
-  bool userTuned;
-  struct ncclInfo *next;
-};
-
-inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
-  info->nBytes = info->workBytes = info->count * ncclTypeSize(info->datatype);
-  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) {
-    info->count = info->workBytes;
-    info->datatype = ncclInt8;
-  }
-  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank
-
-  /* compute buffer size for NVLS buffer registration */
-  if (info->coll == ncclFuncAllGather) {
-    info->sendbuffSize = info->workBytes;
-    info->recvbuffSize = info->sendbuffSize * nRanks;
-  } else if (info->coll == ncclFuncReduceScatter) {
-    info->recvbuffSize = info->workBytes;
-    info->sendbuffSize = info->recvbuffSize * nRanks;
-  } else {
-    info->sendbuffSize = info->recvbuffSize = info->workBytes;
-  }
-  return ncclSuccess;
-}
-
-struct ncclTaskColl {
-  struct ncclTaskColl* next;
-  ncclFunc_t func;
-  void const* sendbuff;
-  void* recvbuff;
-  size_t count;
-  int root;
-  ncclDataType_t datatype;
-  ncclDevRedOpFull op;
-  int chunkSteps, sliceSteps;
-  struct ncclInfo info;
-};
-struct ncclTaskP2p {
-  ncclTaskP2p *next;
-  void *buff;
-  size_t bytes;
-  // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track
-  // of where it left off.
-  int chunk;
-};
-
-struct ncclCudaStreamList {
-  struct ncclCudaStreamList *next;
-  cudaStream_t stream;
-};
-struct ncclTasks {
-  struct Peer {
-    bool sendSeen, recvSeen;
-    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
-    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
-  };
-  struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collQueue;
-  // Queue for user-tuned executed collectives
-  struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collTunedQueue;
-  // Queue for continuous bytes distribution (CBD) collectives
-  struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collCBDQueue;
-  // Queue for collnet
-  struct ncclIntruQueue<struct ncclInfo, &ncclInfo::next> collnetQueue;
-  size_t workBytesTotal;
-  int usableChannels;
-  bool sorted;
-  struct Peer* peers/*[nRanks]*/;
-  int *p2pSendOrder, *p2pRecvOrder;
-  int p2pOrderSteps;
-  int nTasksColl, nTasksP2p;
-
-  // The list of user streams aggregated over all tasks present.
-  struct ncclCudaStreamList* streams;
-  // The most recent user stream. Ignored if streams==nullptr
-  cudaStream_t streamRecent;
-  // The graph capturing all user streams or invalid if none. Thus we restrict the
-  // user that all streams must be captured in the same graph or not captured
-  // at all. Technically we could probably relax this, but that would mean
-  // collecting a different `ncclTasks` per graph and one for non-graph.
-  struct ncclCudaGraph capturingGraph;
 };

 #endif
@@ -7,8 +7,33 @@
 #ifndef NCCL_DEBUG_H_
 #define NCCL_DEBUG_H_

-typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef enum {
+  NCCL_LOG_NONE = 0,
+  NCCL_LOG_VERSION = 1,
+  NCCL_LOG_WARN = 2,
+  NCCL_LOG_INFO = 3,
+  NCCL_LOG_ABORT = 4,
+  NCCL_LOG_TRACE = 5
+} ncclDebugLogLevel;
+
+typedef enum {
+  NCCL_INIT = 0x1,
+  NCCL_COLL = 0x2,
+  NCCL_P2P = 0x4,
+  NCCL_SHM = 0x8,
+  NCCL_NET = 0x10,
+  NCCL_GRAPH = 0x20,
+  NCCL_TUNING = 0x40,
+  NCCL_ENV = 0x80,
+  NCCL_ALLOC = 0x100,
+  NCCL_CALL = 0x200,
+  NCCL_PROXY = 0x400,
+  NCCL_NVLS = 0x800,
+  NCCL_BOOTSTRAP = 0x1000,
+  NCCL_REG = 0x2000,
+  NCCL_PROFILE = 0x4000,
+  NCCL_ALL = ~0
+} ncclDebugLogSubSys;

 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

@@ -40,4 +65,5 @@ typedef enum {
 #define NCCL_PROTO_LL128 1
 #define NCCL_PROTO_SIMPLE 2

+#define NCCL_ALGO_PROTO_IGNORE -1.0
 #endif
@@ -11,6 +11,54 @@
 #include "nccl.h"
 #include "nccl_common.h"

+// API to be implemented by external tuner
+typedef struct {
+  // Name of the tuner
+  const char* name;
+
+  // Initializes tuner states.
+  // Inputs:
+  //   - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+  //   - nNodes: number of nodes in current communicator.
+  //   - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+  // Outputs:
+  //   - context: tuner context object
+  ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+  // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+  // Inputs:
+  //   - context: tuner context object
+  //   - collType: collective type, e.g., allreduce, allgather…
+  //   - nBytes: collective size in bytes
+  //   - numPipeOps: number of operations in the group
+  //   - numAlgo: number of algorithms in collCostTable
+  //   - numProto: number of protocols in collCostTable
+  //
+  // Outputs:
+  //   - nChannels: number of channels (hence SMs) to be used.
+  //
+  // InOut:
+  //   - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+  //                    NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+  //
+  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+  // default tuning for the given collective.
+  // Also, the plugin is allowed to not set any output, or set only the
+  // algorithm and protocol, but not only the algorithm or only the protocol.
+  // Unset fields will be set automatically by NCCL.
+  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+                              int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+                              int* nChannels);
+
+  // Terminates the plugin and cleans up any resources that the plugin allocated.
+  // context: tuner context object
+  ncclResult_t (*destroy)(void* context);
+} ncclTuner_v3_t;
+
+typedef ncclTuner_v3_t ncclTuner_t;
+
+#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"
+
 // API to be implemented by external tuner
 typedef struct {
  // Name of the tuner
@@ -36,7 +84,7 @@ typedef struct {
  //
  // Outputs:
  //   - algorithm: selected algorithm to be used for the given collective
-  //   - protocol: selected protocol to be used for the given collective
+  //   - protocol: selected protocol to be used for the given collective
  //   - nChannels: number of channels (hence SMs) to be used.
  //
  // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
@@ -46,15 +94,11 @@ typedef struct {
  // Unset fields will be set automatically by NCCL.
  ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
                              int collNetSupport, int nvlsSupport, int numPipeOps,
-                              int *algorithm, int *protocol, int* nChannels);
+                              int* algorithm, int* protocol, int* nChannels);

  // Terminates the plugin and cleans up any resources that the plugin allocated.
  // context: tuner context object
  ncclResult_t (*destroy)(void* context);
 } ncclTuner_v2_t;

-typedef ncclTuner_v2_t ncclTuner_t;
-
-#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
-
 #endif
@@ -14,8 +14,10 @@

 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];

-ncclResult_t ncclNetPluginInit();
+ncclResult_t ncclNetPluginLoad(struct ncclComm* comm);
+ncclResult_t ncclNetPluginUnload(struct ncclComm* comm);
 ncclResult_t ncclNetInit(struct ncclComm* comm);
+ncclResult_t ncclNetFinalize(struct ncclComm* comm);
 int ncclNetVersion(struct ncclComm* comm);

 // Test whether the current GPU support GPU Direct RDMA.
@@ -253,6 +253,38 @@ typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t;
 */
 #define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2)

+/**
+ * Confidential Compute Feature Status values
+ */
+#define NVML_CC_SYSTEM_FEATURE_DISABLED 0
+#define NVML_CC_SYSTEM_FEATURE_ENABLED  1
+
+typedef struct nvmlConfComputeSystemState_st {
+    unsigned int environment;
+    unsigned int ccFeature;
+    unsigned int devToolsMode;
+} nvmlConfComputeSystemState_t;
+
+/**
+ * Confidential Compute Multigpu mode values
+ */
+#define NVML_CC_SYSTEM_MULTIGPU_NONE 0
+#define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1
+
+/**
+ * Confidential Compute System settings
+ */
+typedef struct {
+    unsigned int version;
+    unsigned int environment;
+    unsigned int ccFeature;
+    unsigned int devToolsMode;
+    unsigned int multiGpuMode;
+} nvmlSystemConfComputeSettings_v1_t;
+
+typedef nvmlSystemConfComputeSettings_v1_t nvmlSystemConfComputeSettings_t;
+#define nvmlSystemConfComputeSettings_v1 NVML_STRUCT_VERSION(SystemConfComputeSettings, 1)
+
 /* End of nvml.h */
 #endif // NCCL_NVML_DIRECT

@@ -268,6 +300,11 @@ extern int ncclNvmlDeviceCount;
 extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
 extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];

+struct ncclNvmlCCStatus {
+    bool CCEnabled;
+    bool multiGpuCCEnabled;
+};
+
 // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly.
 // Outsiders need only call it if they want to inspect the ncclNvml global
 // tables above.
@@ -283,5 +320,6 @@ ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* ma
 ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
 ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
 ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo);
+ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status);

 #endif // End include guard
@@ -63,7 +63,7 @@ class payload_schema {
    nullptr,
    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
    NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
-    nullptr, 0, 0, 0};
+    nullptr, 0, 0, 0, 0, nullptr};
 };

 // Create NVTX push/pop range with parameters
@@ -25,9 +25,9 @@
 *
 * \section INITIALIZATION_SECTION Initialization
 *
- * Typically the tool's library that plugs into NVTX is indirectly 
- * loaded via enviromental properties that are platform specific. 
- * For some platform or special cases, the user may be required 
+ * Typically the tool's library that plugs into NVTX is indirectly
+ * loaded via environmental properties that are platform specific.
+ * For some platform or special cases, the user may be required
 * to instead explicity initialize instead though.   This can also
 * be helpful to control when the API loads a tool's library instead
 * of what would typically be the first function call to emit info.
@@ -37,16 +37,16 @@
 *
 * Markers and ranges are used to describe events at a specific time (markers)
 * or over a time span (ranges) during the execution of the application
- * respectively. 
+ * respectively.
 *
 * \subsection MARKERS Markers
- * 
+ *
 * Markers denote specific moments in time.
- * 
- * 
+ *
+ *
 * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
 * how to specify the domain.
- * 
+ *
 * \subsection THREAD_RANGES Thread Ranges
 *
 * Thread ranges denote nested time ranges. Nesting is maintained per thread
@@ -59,9 +59,9 @@
 *
 * \subsection PROCESS_RANGES Process Ranges
 *
- * Process ranges denote a time span that can expose arbitrary concurrency, as 
+ * Process ranges denote a time span that can expose arbitrary concurrency, as
 * opposed to thread ranges that only support nesting. In addition the range
- * start event can happen on a different thread than the end marker. For the 
+ * start event can happen on a different thread than the end marker. For the
 * correlation of a start/end pair an unique correlation ID is used that is
 * returned from the start API call and needs to be passed into the end API
 * call.
@@ -87,15 +87,15 @@
 *
 * The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create
 * a named domain.
- * 
+ *
 * Each domain maintains its own
 * - categories
 * - thread range stacks
 * - registered strings
 *
- * The function ::nvtxDomainDestroy marks the end of the domain. Destroying 
- * a domain unregisters and destroys all objects associated with it such as 
- * registered strings, resource objects, named categories, and started ranges. 
+ * The function ::nvtxDomainDestroy marks the end of the domain. Destroying
+ * a domain unregisters and destroys all objects associated with it such as
+ * registered strings, resource objects, named categories, and started ranges.
 *
 * \section RESOURCE_NAMING Resource Naming
 *
@@ -105,41 +105,41 @@
 * The functions can be called multiple times during the execution of an
 * application, however, in that case it is implementation dependent which
 * name will be reported by the tool.
- * 
+ *
 * \subsection CATEGORY_NAMING Category Naming
 *
- * Some function in this library support associating an integer category 
- * to enable filtering and sorting.  The category naming functions allow 
- * the application to associate a user friendly name with the integer 
- * category.  Support for domains have been added in NVTX_VERSION_2 to 
- * avoid collisions when domains are developed independantly. 
+ * Some function in this library support associating an integer category
+ * to enable filtering and sorting.  The category naming functions allow
+ * the application to associate a user friendly name with the integer
+ * category.  Support for domains have been added in NVTX_VERSION_2 to
+ * avoid collisions when domains are developed independently.
 *
 * \subsection RESOURCE_OBJECTS Resource Objects
 *
- * Resource objects are a generic mechanism for attaching data to an application 
- * resource.  The identifier field makes the association to a pointer or handle, 
- * while the type field helps provide deeper understanding of the identifier as 
+ * Resource objects are a generic mechanism for attaching data to an application
+ * resource.  The identifier field makes the association to a pointer or handle,
+ * while the type field helps provide deeper understanding of the identifier as
 * well as enabling differentiation in cases where handles generated by different
 * APIs may collide.  The resource object may also have an associated message to
- * associate with the application resource, enabling further annotation of this 
+ * associate with the application resource, enabling further annotation of this
 * object and how it is used.
- * 
+ *
 * The resource object was introduced in NVTX_VERSION_2 to supersede existing naming
 * functions and allow the application resource identified by those functions to be
 * associated to a domain.  The other naming functions are still supported for backward
 * compatibility but will be associated only to the default domain.
 *
 * \subsection RESOURCE_NAMING_OS Resource Naming
- * 
- * Some operating system resources creation APIs do not support providing a user friendly 
- * name, such as some OS thread creation APIs.  This API support resource naming though 
- * both through resource objects and functions following the pattern 
- * nvtxName[RESOURCE_TYPE][A|W](identifier, name).  Resource objects introduced in NVTX_VERSION 2 
+ *
+ * Some operating system resources creation APIs do not support providing a user friendly
+ * name, such as some OS thread creation APIs.  This API support resource naming though
+ * both through resource objects and functions following the pattern
+ * nvtxName[RESOURCE_TYPE][A|W](identifier, name).  Resource objects introduced in NVTX_VERSION 2
 * supersede the other functions with a a more general method of assigning names to OS resources,
- * along with associating them to domains too.  The older nvtxName* functions are only associated 
+ * along with associating them to domains too.  The older nvtxName* functions are only associated
 * with the default domain.
 * \section EXTENSIONS Optional Extensions
- * Optional extensions will either appear within the existing sections the extend or appear 
+ * Optional extensions will either appear within the existing sections the extend or appear
 * in the "Related Pages" when they introduce new concepts.
 */

@@ -159,7 +159,11 @@
 #define NVTX_INLINE_STATIC __inline static
 #else /*defined(__GNUC__)*/
 #define NVTX_API
+#if defined(__cplusplus) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
 #define NVTX_INLINE_STATIC inline static
+#else
+#define NVTX_INLINE_STATIC __inline__ static
+#endif
 #endif /* Platform */

 #if defined(NVTX_NO_IMPL)
@@ -212,7 +216,7 @@
 extern "C" {
 #endif /* __cplusplus */

-/** 
+/**
 * Result Codes
 */

@@ -281,12 +285,12 @@ typedef enum nvtxColorType_t
 * ------------------------------------------------------------------------- */
 typedef enum nvtxMessageType_t
 {
-    NVTX_MESSAGE_UNKNOWN          = 0,    /**< Message payload is unused. */
+    NVTX_MESSAGE_UNKNOWN          = 0,    /**< Message attribute is unused. */
    NVTX_MESSAGE_TYPE_ASCII       = 1,    /**< A character sequence is used as payload. */
    NVTX_MESSAGE_TYPE_UNICODE     = 2,     /**< A wide character sequence is used as payload. */
    /* NVTX_VERSION_2 */
    NVTX_MESSAGE_TYPE_REGISTERED  = 3,    /**< A unique string handle that was registered
-                                                with \ref nvtxDomainRegisterStringA() or 
+                                                with \ref nvtxDomainRegisterStringA() or
                                                \ref nvtxDomainRegisterStringW(). */
 } nvtxMessageType_t;

@@ -338,7 +342,7 @@ NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved);
 * ------------------------------------------------------------------------- */
 typedef enum nvtxPayloadType_t
 {
-    NVTX_PAYLOAD_UNKNOWN = 0,   /**< Color payload is unused. */
+    NVTX_PAYLOAD_UNKNOWN = 0,   /**< Payload attribute is unused. */
    NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1,   /**< A 64 bit unsigned integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_INT64 = 2,   /**< A 64 bit signed integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_DOUBLE = 3,   /**< A 64 bit floating point value is used as payload. */
@@ -714,10 +718,10 @@ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message);
 /* ------------------------------------------------------------------------- */
 /** \brief Ends a process range.
 *
-* \param domain - The domain 
+* \param domain - The domain
 * \param id - The correlation ID returned from a nvtxRangeStart call.
 *
-* \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd. 
+* \remarks This function is offered for completeness but is an alias for ::nvtxRangeEnd.
 * It does not need a domain param since that is associated iwth the range ID at ::nvtxDomainRangeStartEx
 *
 * \par Example:
@@ -929,10 +933,10 @@ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void);

 /*  ------------------------------------------------------------------------- */
 /** \cond SHOW_HIDDEN
-* \brief Resource typing helpers.  
+* \brief Resource typing helpers.
 *
-* Classes are used to make it easy to create a series of resource types 
-* per API without collisions 
+* Classes are used to make it easy to create a series of resource types
+* per API without collisions
 */
 #define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX)))
 #define NVTX_RESOURCE_CLASS_GENERIC 1
@@ -1062,7 +1066,7 @@ typedef struct nvtxResourceAttributes_v0
    int32_t identifierType;            /* values from enums following the pattern nvtxResource[name]Type_t */

    /**
-    * \brief Identifier for the resource. 
+    * \brief Identifier for the resource.
    * \anchor RESOURCE_IDENTIFIER_FIELD
    *
    * An identifier may be a pointer or a handle to an OS or middleware API object.
@@ -1093,7 +1097,7 @@ typedef struct nvtxResourceAttributes_v0

 typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t;

-/* \cond SHOW_HIDDEN 
+/* \cond SHOW_HIDDEN
 * \version \NVTX_VERSION_2
 */
 #define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) )
@@ -1106,7 +1110,7 @@ typedef struct nvtxResourceHandle* nvtxResourceHandle_t;
 /** \brief Create a resource object to track and associate data with OS and middleware objects
 *
 * Allows users to associate an API handle or pointer with a user-provided name.
-* 
+*
 *
 * \param domain - Domain to own the resource object
 * \param attribs - Attributes to be associated with the resource
@@ -1240,7 +1244,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t*
 * POSIX pthread_t type returned by pthread_self() may not comply with these
 * expectations. Please use OS-specific thread ID instead of pthread_t.
 *
- * The thread name is associated to the default domain.  To support domains 
+ * The thread name is associated to the default domain.  To support domains
 * use resource objects via ::nvtxDomainResourceCreate.
 *
 * \param threadId - The ID of the thread to name.
@@ -1457,7 +1461,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain);
 } /* extern "C" */
 #endif /* __cplusplus */

-#define NVTX_IMPL_GUARD /* Ensure other headers cannot included directly */
+#define NVTX_IMPL_GUARD /* Ensure other headers cannot be included directly */

 #include "nvtxDetail/nvtxTypes.h"

@@ -0,0 +1,335 @@
+/**
+ * The NVTX counters extension is intended to collect counter values of various
+ * sources. It uses the NVTX payload extension to specify the data layout of a
+ * counter group.
+ *
+ * A counter group is a set of counters that are collected together (at the same
+ * time). Counters are always registered as a group. Hence, a single counter is
+ * represented by a group with one counter.
+ *
+ * A sample refers to all values for a given timestamp. These values must
+ * include counter values and may include multiple instances of a counter group.
+ *
+ * The NVTX domain handle is the first argument to all counter collect
+ * functions. 0/NULL/nullptr represents the default domain (no domain).
+ */
+
+#include "nvToolsExtPayload.h"
+
+#ifndef NVTOOLSEXT_COUNTERS_H
+#define NVTOOLSEXT_COUNTERS_H
+
+/**
+ * \brief The compatibility ID is used for versioning of this extension.
+ */
+#ifndef NVTX_EXT_COUNTERS_COMPATID
+#define NVTX_EXT_COUNTERS_COMPATID 0x0101
+#endif
+
+/**
+ * \brief The module ID identifies the payload extension. It has to be unique
+ * among the extension modules.
+ */
+#ifndef NVTX_EXT_COUNTERS_MODULEID
+#define NVTX_EXT_COUNTERS_MODULEID 4
+#endif
+
+
+/** Identifies an invalid scope and indicates an error if returned by `nvtxScopeRegister`. */
+#define NVTX_SCOPE_NONE                   0 /* no scope */
+
+#define NVTX_SCOPE_ROOT                   1
+
+#define NVTX_SCOPE_CURRENT_HW_MACHINE     2 /* Node/machine name, Device? */
+#define NVTX_SCOPE_CURRENT_HW_SOCKET      3
+#define NVTX_SCOPE_CURRENT_HW_CPU         4
+#define NVTX_SCOPE_CURRENT_HW_CPU_LOGICAL 5
+/* Innermost HW execution context at registration time */
+#define NVTX_SCOPE_CURRENT_HW_INNERMOST   6
+
+/* Virtualized hardware, virtual machines, OS (if you don't know any better) */
+#define NVTX_SCOPE_CURRENT_HYPERVISOR     7
+#define NVTX_SCOPE_CURRENT_VM             8
+#define NVTX_SCOPE_CURRENT_KERNEL         9
+#define NVTX_SCOPE_CURRENT_CONTAINER     10
+#define NVTX_SCOPE_CURRENT_OS	         11
+
+/* Software scopes */
+#define NVTX_SCOPE_CURRENT_SW_PROCESS 	 12 /* Process scope */
+#define NVTX_SCOPE_CURRENT_SW_THREAD  	 13 /* Thread scope */
+#define NVTX_SCOPE_CURRENT_SW_FIBER      14
+/* Innermost SW execution context at registration time */
+#define NVTX_SCOPE_CURRENT_SW_INNERMOST  15
+
+/** Static (user-provided) scope IDs (feed forward) */
+#define NVTX_SCOPE_ID_STATIC_START  (1 << 24)
+
+/** Dynamically (tool) generated scope IDs */
+#define NVTX_SCOPE_ID_DYNAMIC_START 4294967296  /* 1 << 32 */
+
+
+/** Identifier of the semantic extension for counters. */
+#define NVTX_SEMANTIC_ID_COUNTERS_V1 5
+
+/***  Flags to augment the counter value. ***/
+#define NVTX_COUNTERS_FLAG_NONE       0
+
+/**
+ * Convert the fixed point value to a normalized floating point.
+ * Use the signedness of the underlying type this flag is applied to.
+ * Unsigned [0f : 1f] or signed [-1f : 1f]
+ */
+#define NVTX_COUNTERS_FLAG_NORM       (1 << 1)
+
+/**
+ * Tools should apply scale and limits when graphing, ideally in a "soft" way
+ * to see when limits are exceeded.
+ */
+#define NVTX_COUNTERS_FLAG_LIMIT_MIN  (1 << 2)
+#define NVTX_COUNTERS_FLAG_LIMIT_MAX  (1 << 3)
+#define NVTX_COUNTERS_FLAG_LIMITS \
+    (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX)
+
+/** Counter time scope **/
+#define NVTX_COUNTERS_FLAG_TIME_POINT       (1 << 5)
+#define NVTX_COUNTERS_FLAG_TIME_SINCE_LAST  (2 << 5)
+#define NVTX_COUNTERS_FLAG_TIME_UNTIL_NEXT  (3 << 5)
+#define NVTX_COUNTERS_FLAG_TIME_SINCE_START (4 << 5)
+
+/** Counter value type **/
+#define NVTX_COUNTERS_FLAG_VALUE_ABSOLUTE   (1 << 10)
+#define NVTX_COUNTERS_FLAG_VALUE_DELTA      (2 << 10) // delta to previous counter sample
+
+/** Counter visualization hints **/
+#define NVTX_COUNTERS_FLAG_INTERPOLATE      (1 << 14)
+
+/** Datatypes for limits union (value of `limitType`). */
+#define NVTX_COUNTERS_LIMIT_I64 0
+#define NVTX_COUNTERS_LIMIT_U64 1
+#define NVTX_COUNTERS_LIMIT_F64 2
+
+/** Reasons for the missing sample value. */
+#define NVTX_COUNTERS_SAMPLE_ZERO        0
+#define NVTX_COUNTERS_SAMPLE_UNCHANGED   1
+#define NVTX_COUNTERS_SAMPLE_UNAVAILABLE 2
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \brief Specify additional properties of a counter or counter group.
+ */
+typedef struct nvtxSemanticsCounter_v1
+{
+    /** Header of the semantic extension (with identifier, version, etc.). */
+    struct nvtxSemanticsHeader_v1 header;
+
+    /**
+     * Flag if normalization, scale limits, etc. should be applied to counter
+     * values.
+     */
+    uint64_t flags;
+
+    /** Unit of the counter value (case insensitive) */
+    const char* unit;
+
+    /** Should be 1 if not used. */
+    uint64_t unitScaleNumerator;
+
+    /** Should be 1 if not used. */
+    uint64_t unitScaleDenominator;
+
+    /** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */
+    int64_t limitType;
+
+    /** Soft graph limit. */
+    union limits_t {
+        int64_t i64[2];
+        uint64_t u64[2];
+        double d[2];
+    } limits;
+} nvtxSemanticsCounter_t;
+
+typedef struct nvtxCountersAttr_v1
+{
+    size_t structSize;
+
+    /**
+     * A schema ID referring to the data layout of the counter group or a
+     * predefined NVTX payloads number type.
+     */
+    uint64_t schemaId;
+
+    /** Name of the counter group. */
+    const char* name;
+
+    /** Identifier of the scope of the counters. */
+    uint64_t scopeId;
+
+    /**
+     * (Optional) Specify additional semantics for a counter (group). The
+     * semantics provided are applied to all counters in a group. If the
+     * semantics should only refer to a single counter in a group, the semantics
+     * field of the payload entry has to be used. Accepted semantics are
+     * `nvtxSemanticsCounter_t` and `nvtxSemanticsTime_t`.
+     */
+    const nvtxSemanticsHeader_t* semantics;
+} nvtxCountersAttr_t;
+
+/* Forward declaration of opaque counter group registration structure */
+struct nvtxCountersRegistration_st;
+typedef struct nvtxCountersRegistration_st nvtxCountersRegistration;
+
+/* \brief Counters Handle Structure.
+* \anchor COUNTERS_HANDLE_STRUCTURE
+*
+* This structure is opaque to the user and is used as a handle to reference a counter group.
+* This type is returned from tools when using the NVTX API to create a counters group.
+*/
+typedef nvtxCountersRegistration* nvtxCountersHandle_t;
+
+typedef struct nvtxCountersBatch_v1
+{
+    /** Handle to attributes (data layout, scope, etc.) of a counter (group). */
+    nvtxCountersHandle_t hCounter;
+
+    /** Array of counter samples. */
+    const void* counters;
+
+    /** Size of the `counters` array (in bytes). */
+    size_t cntArrSize;
+
+    /** Array of timestamps or reference-time plus delta pair. `NULL` is used if
+    timestamps are part of the counter (group) layout. */
+    const void* timestamps;
+
+    /** Size of the `timestamps` array or definition (in bytes). */
+    size_t tsSize;
+} nvtxCountersBatch_t;
+
+/**
+ * \brief Register a counter group.
+ *
+ * @param hDomain NVTX domain handle.
+ * @param attr Pointer to the attributes of the counter (group).
+ *
+ * @return Counter handle identifying a counter or counter group.
+ *         The counter handle is unique within the NVTX domain.
+ */
+NVTX_DECLSPEC nvtxCountersHandle_t NVTX_API nvtxCountersRegister(
+    nvtxDomainHandle_t hDomain,
+    const nvtxCountersAttr_t* attr);
+
+/**
+ * \brief Sample one integer counter by value immediately (the NVTX tool determines the timestamp).
+ *
+ * @param hDomain handle of the NVTX domain.
+ * @param hCounter handle of the NVTX counter (group).
+ * @param value 64-bit integer counter value.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxCountersSampleInt64(
+    nvtxDomainHandle_t hDomain,
+    nvtxCountersHandle_t hCounter,
+    int64_t value);
+
+/**
+ * \brief Sample one floating point counter by value immediately (the NVTX tool determines the timestamp).
+ *
+ * @param hDomain handle of the NVTX domain.
+ * @param hCounter handle of the NVTX counter (group).
+ * @param value 64-bit floating-point counter value.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxCountersSampleFloat64(
+    nvtxDomainHandle_t hDomain,
+    nvtxCountersHandle_t hCounter,
+    double value);
+
+/**
+ * \brief Sample a counter group by reference immediately (the NVTX tool determines the timestamp).
+ *
+ * @param hDomain handle of the NVTX domain.
+ * @param hCounter handle of the NVTX counter (group).
+ * @param values pointer to one or more counter values.
+ * @param size size of the counter value(s) in bytes.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxCountersSample(
+    nvtxDomainHandle_t hDomain,
+    nvtxCountersHandle_t hCounter,
+    void* values,
+    size_t size);
+
+/**
+ * \brief Sample without value.
+ *
+ * @param hDomain handle of the NVTX domain.
+ * @param hCounter handle of the NVTX counter (group).
+ * @param reason reason for the missing sample value.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxCountersSampleNoValue(
+    nvtxDomainHandle_t hDomain,
+    nvtxCountersHandle_t hCounter,
+    uint8_t reason);
+
+/**
+ * \brief Submit a batch of counters in the given domain.
+ *        Timestamps are part of the counter sample data.
+ *
+ * The size of a data sampling point is defined by the `staticSize` field of the
+ * payload schema. An NVTX tool can assume that the counter samples are stored
+ * as an array with each entry being `staticSize` bytes.
+ *
+ * @param hDomain handle of the NVTX domain
+ * @param hCounter handle of the counter group (includes counter data decoding schema)
+ * @param counters blob containing counter data and timestamps
+ * @param size size of the counter data blob in bytes
+ */
+NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatch(
+    nvtxDomainHandle_t hDomain,
+    nvtxCountersHandle_t hCounter,
+    const void* counters,
+    size_t size);
+
+/**
+ * \brief Submit a batch of counters in the given domain.
+ *        Timestamps are separated from the counter data.
+ *
+ * @param hDomain handle of the NVTX domain
+ * @param counterBatch Pointer to the counter data to be submitted.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatchEx(
+    nvtxDomainHandle_t hDomain,
+    const nvtxCountersBatch_t* counterBatch);
+
+
+#define NVTX3EXT_CBID_nvtxCountersRegister           0
+#define NVTX3EXT_CBID_nvtxCountersSampleInt64        1
+#define NVTX3EXT_CBID_nvtxCountersSampleFloat64      2
+#define NVTX3EXT_CBID_nvtxCountersSample             3
+#define NVTX3EXT_CBID_nvtxCountersSampleNoValue      4
+#define NVTX3EXT_CBID_nvtxCountersSubmitBatch        5
+#define NVTX3EXT_CBID_nvtxCountersSubmitBatchEx      6
+
+#ifdef __GNUC__
+#pragma GCC visibility push(internal)
+#endif
+
+#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */
+#include "nvtxDetail/nvtxExtTypes.h"
+#undef NVTX_EXT_TYPES_GUARD
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_EXT_IMPL_COUNTERS_GUARD /* Ensure other headers cannot be included directly */
+#include "nvtxDetail/nvtxExtImplCounters_v1.h"
+#undef NVTX_EXT_IMPL_COUNTERS_GUARD
+#endif /*NVTX_NO_IMPL*/
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* NVTOOLSEXT_COUNTERS_H */
@@ -30,7 +30,7 @@ extern "C" {
 */

 /*  ------------------------------------------------------------------------- */
-/* \cond SHOW_HIDDEN 
+/* \cond SHOW_HIDDEN
 * \brief Used to build a non-colliding value for resource types separated class
 * \version \NVTX_VERSION_2
 */
@@ -133,7 +133,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
 #endif /* __cplusplus */

 #ifndef NVTX_NO_IMPL
-#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */
+#define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot be included directly */
 #include "nvtxDetail/nvtxImplCuda_v3.h"
 #undef NVTX_IMPL_GUARD_CUDA
 #endif /*NVTX_NO_IMPL*/
@@ -31,7 +31,7 @@ extern "C" {
 */

 /*  ------------------------------------------------------------------------- */
-/* \cond SHOW_HIDDEN 
+/* \cond SHOW_HIDDEN
 * \brief Used to build a non-colliding value for resource types separated class
 * \version \NVTX_VERSION_2
 */
@@ -109,7 +109,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t*
 #endif /* __cplusplus */

 #ifndef NVTX_NO_IMPL
-#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */
+#define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot be included directly */
 #include "nvtxDetail/nvtxImplCudaRt_v3.h"
 #undef NVTX_IMPL_GUARD_CUDART
 #endif /*NVTX_NO_IMPL*/
@@ -0,0 +1,694 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvToolsExt.h"
+
+#ifndef NVTOOLSEXTV3_MEM_V1
+#define NVTOOLSEXTV3_MEM_V1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#define NVTX_EXT_MODULEID_MEM 1
+
+/* \cond SHOW_HIDDEN
+ * \brief A compatibility ID value used in structures and initialization to
+ * identify version differences.
+ */
+#define NVTX_EXT_COMPATID_MEM 0x0102
+
+/* \cond SHOW_HIDDEN
+ * \brief This value is returned by functions that return `nvtxMemHeapHandle_t`,
+ * if a tool is not attached.
+ */
+#define NVTX_MEM_HEAP_HANDLE_NO_TOOL ((nvtxMemHeapHandle_t)(intptr_t)-1)
+
+/* \cond SHOW_HIDDEN
+ * \brief This value is returned by functions that return `nvtxMemRegionHandle_t`
+ * if a tool is not attached.
+ */
+#define NVTX_MEM_REGION_HANDLE_NO_TOOL ((nvtxMemRegionHandle_t)(intptr_t)-1)
+
+/* \cond SHOW_HIDDEN
+ * \brief This value is returned by functions that return `nvtxMemPermissionsHandle_t`
+ * if a tool is not attached.
+ */
+#define NVTX_MEM_PERMISSIONS_HANDLE_NO_TOOL ((nvtxMemPermissionsHandle_t)-1)
+
+
+/* \cond SHOW_HIDDEN
+ * \brief This should not be used and is considered an error but defined to
+ * detect an accidental use of zero or NULL.
+ */
+#define NVTX_MEM_HEAP_USAGE_UNKNOWN 0x0
+
+
+/* \cond SHOW_HIDDEN
+ * \brief This should not be used and is considered an error but defined to
+ * detect an accidental use of zero or NULL.
+ */
+#define NVTX_MEM_TYPE_UNKNOWN 0x0
+
+
+/*  ------------------------------------------------------------------------- */
+/** \defgroup MEMORY Memory
+ * See page \ref PAGE_MEMORY.
+ * @{
+ */
+
+/**
+ * \brief To indicate the full process virtual address space as a heap for
+ * functions where a nvtxMemHeapHandle_t is accepted.
+ *
+ * By default, the heap always has read-write-execute permissions without creating regions.
+ * Regions created in this heap have read-write access by default but not execute.
+ */
+#define NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE ((nvtxMemHeapHandle_t)0)
+
+/** \brief This heap is a sub-allocator.
+ *
+ * Heap created with this usage should not be accessed by the user until regions are registered.
+ * Regions from a heap with this usage have read-write access by default but not execute.
+ */
+#define NVTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR 0x1
+
+/**
+ * \brief This is a heap of memory that has an explicit layout.
+ *
+ * The layout could be static or dynamic (calculated). This often represents an algorithm's
+ * structures that are packed together. By default this heap is assumed to be accessible for
+ * scopes where the memory is naturally accessible by hardware. Regions may be used to further
+ * annotate or restrict access. A tool may have an option to be more strict, but special
+ * consideration must be made for `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`.
+ *
+ * The behavior of this usage is similar to NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE but
+ * a tool can use it to track special behaviors and reservation.
+ *
+ * Memory in a heap with this usage has read-write permissions by default but not execute without
+ * creating regions. Regions created in this heap have the same default permission access.
+ */
+#define NVTX_MEM_HEAP_USAGE_TYPE_LAYOUT 0x2
+
+
+/**
+ * \brief Standard process userspace virtual addresses for linear allocations.
+ *
+ * APIs that map into this space, such as CUDA UVA should use this type.
+ *
+ * Relevant functions: cudaMalloc, cudaMallocManaged, cudaHostAlloc, cudaMallocHost
+ * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported
+ *
+ * nvtxMemHeapRegister receives a heapDesc of type nvtxMemVirtualRangeDesc_t
+ */
+#define NVTX_MEM_TYPE_VIRTUAL_ADDRESS 0x1
+
+
+/**
+ * \brief To indicate you are modifying permissions to the process-wide
+ * full virtual address space.
+ *
+ * This is a companion object to `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`.
+ */
+#define NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE ((nvtxMemPermissionsHandle_t)0)
+
+#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_NONE 0x0
+#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ 0x1
+#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE 0x2
+#define NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_ATOMIC 0x4
+
+
+/* \cond SHOW_HIDDEN
+ * \brief Forward declaration of opaque memory heap structure.
+ */
+struct nvtxMemHeap_v1;
+typedef struct nvtxMemHeap_v1 nvtxMemHeap_t;
+
+/** \brief A handle returned by a tool to represent a memory heap. */
+typedef nvtxMemHeap_t* nvtxMemHeapHandle_t;
+
+/* \cond SHOW_HIDDEN
+ * \brief Forward declaration of opaque memory region structure.
+ */
+struct nvtxMemRegion_v1;
+typedef struct nvtxMemRegion_v1 nvtxMemRegion_t;
+
+/** \brief A handle returned by a tool to represent a memory region. */
+typedef nvtxMemRegion_t* nvtxMemRegionHandle_t;
+
+/** \brief A reference to a memory region (by pointer or handle).
+ * Which member of the union will be determined by a type or flag field outside.
+ */
+typedef union nvtxMemRegionRef_t
+{
+    void const* pointer;
+    nvtxMemRegionHandle_t handle;
+} nvtxMemRegionRef_t;
+
+/* \cond SHOW_HIDDEN
+ * \brief Forward declaration of opaque memory permissions structure
+ */
+struct nvtxMemPermissions_v1;
+typedef struct nvtxMemPermissions_v1 nvtxMemPermissions_t;
+
+/** \brief A handle returned by a tool to represent a memory permissions mask. */
+typedef nvtxMemPermissions_t* nvtxMemPermissionsHandle_t;
+
+
+typedef struct nvtxMemVirtualRangeDesc_v1
+{
+    size_t  size;
+    void const*  ptr;
+} nvtxMemVirtualRangeDesc_v1 ;
+typedef nvtxMemVirtualRangeDesc_v1 nvtxMemVirtualRangeDesc_t;
+
+
+/** \brief structure to describe a heap in process virtual memory. */
+typedef struct nvtxMemHeapDesc_v1
+{
+    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
+    uint16_t structSize; /* Size of the structure. */
+    uint32_t reserved0;
+
+    /** \brief Usage characteristics of the heap
+     *
+     * Usage characteristics help tools like memcheckers, sanitizers,
+     * as well as other debugging and profiling tools to determine some
+     * special behaviors they should apply to the heap and its regions.
+     * The value follows the convention NVTX_MEM_HEAP_USAGE_*
+     *
+     * Default Value is 0, which is invalid.
+     */
+    uint32_t usage;
+
+    /** \brief Memory type characteristics of the heap
+     *
+     * The 'type' indicates how to interpret the ptr field of the heapDesc.
+     * This is intended to support many additional types of memory, beyond
+     * standard process virtual memory, such as API specific memory only
+     * addressed by handles or multi-dimensional memory requiring more complex
+     * descriptions to handle features like strides, tiling, or interlace.
+     *
+     * The value conforms to NVTX_MEM_TYPE_*
+     *
+     * The value in the field 'type' identifies the descriptor type that will
+     * be in the field 'typeSpecificDesc'.  'typeSpecificDesc' is void* because
+     * it is extensible.  Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS,
+     * then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t.
+     *
+     * Default Value is 0, which is invalid.
+     */
+    uint32_t type;
+
+    /** \brief size of the heap memory descriptor pointed to by typeSpecificDesc
+     *
+     * Default Value is 0 which is invalid.
+     */
+    size_t typeSpecificDescSize;
+
+    /** \brief Pointer to the heap memory descriptor
+     *
+     * The value in the field 'type' identifies the descriptor type that will
+     * be in the field 'typeSpecificDesc'.  'typeSpecificDesc' is void* because
+     * it is extensible.  Example usage is if type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS,
+     * then typeSpecificDesc points to a nvtxMemVirtualRangeDesc_t.
+     *
+     * Default Value is 0, which is invalid.
+     */
+    void const* typeSpecificDesc;
+
+    /** \brief ID of the category the event is assigned to.
+     *
+     * A category is a user-controlled ID that can be used to group
+     * events.  The tool may use category IDs to improve filtering or
+     * enable grouping of events in the same category. The functions
+     * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used
+     * to name a category.
+     *
+     * Default Value is 0.
+     */
+    uint32_t category;
+
+    /** \brief Message type specified in this attribute structure.
+     *
+     * Defines the message format of the attribute structure's \ref MESSAGE_FIELD
+     * "message" field.
+     *
+     * Default Value is `NVTX_MESSAGE_UNKNOWN`.
+     */
+    uint32_t messageType;            /* nvtxMessageType_t */
+
+    /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD
+     *
+     * The text message that is attached to an event.
+     */
+    nvtxMessageValue_t message;
+
+} nvtxMemHeapDesc_v1 ;
+typedef nvtxMemHeapDesc_v1 nvtxMemHeapDesc_t;
+
+/**
+ * \brief Create a memory heap to represent an object or range of memory that will be further
+ * sub-divided into regions.
+ *
+ * The handle used to address the heap will depend on the heap's type.  Where the heap is virtual
+ * memory accessible, the address of the heap's memory itself is its handle. This will likewise
+ * be returned from the function.
+ *
+ * For more advanced types, where the heap is not virtual memory accessible, the tools may be
+ * responsible for returning a void const * that uniquely identifies the object. Please see
+ * the description of each heap type for more details on whether this is expected to be uniquely
+ * generated by the tool or otherwise.
+ */
+NVTX_DECLSPEC nvtxMemHeapHandle_t NVTX_API nvtxMemHeapRegister(
+    nvtxDomainHandle_t domain,
+    nvtxMemHeapDesc_t const* desc);
+
+ /** \brief Destroy a memory heap. */
+NVTX_DECLSPEC void NVTX_API nvtxMemHeapUnregister(
+    nvtxDomainHandle_t domain,
+    nvtxMemHeapHandle_t heap);/* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported */
+
+/**
+ * \brief Resetting the memory heap wipes out any changes, as if it were a fresh heap.
+ *
+ * This includes invalidating all regions and their handles.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxMemHeapReset(
+    nvtxDomainHandle_t domain,
+    nvtxMemHeapHandle_t heap); /* NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is supported */
+
+/**
+ * \brief Register a region of memory inside of a heap.
+ *
+ * The heap refers to the heap within which the region resides. This can be from
+ * `nvtxMemHeapRegister`, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or one provided
+ * from other extension API.
+ *
+ * The regionType arg will define which type is used in regionDescElements.
+ * The most commonly used type is `NVTX_MEM_TYPE_VIRTUAL_ADDRESS`.
+ * In this case regionDescElements is an array of `nvtxMemVirtualRangeDesc_t`.
+ *
+ * The regionCount arg is how many elements are in regionDescElements and regionHandleElementsOut.
+ *
+ * The regionHandleElementsOut arg points to an array where the tool will provide region handles. If
+ * a pointer is provided, it is expected to have regionCount elements. This pointer can be NULL if
+ * regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, the user can use the pointer to the
+ * virtual memory to reference the region in other related functions which accept nvtxMemRegionRef_t.
+ */
+typedef struct nvtxMemRegionsRegisterBatch_v1
+{
+    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
+    uint16_t structSize; /* Size of the structure. */
+
+    uint32_t regionType; /* NVTX_MEM_TYPE_* */
+
+    nvtxMemHeapHandle_t heap;
+
+    size_t regionCount;
+    size_t regionDescElementSize;
+    void const* regionDescElements; /* This will also become the handle for this region. */
+    nvtxMemRegionHandle_t* regionHandleElementsOut; /* This will also become the handle for this region. */
+
+} nvtxMemRegionsRegisterBatch_v1;
+typedef nvtxMemRegionsRegisterBatch_v1 nvtxMemRegionsRegisterBatch_t;
+
+ /** \brief Register a region of memory inside of a heap of linear process virtual memory
+ */
+NVTX_DECLSPEC void NVTX_API nvtxMemRegionsRegister(
+    nvtxDomainHandle_t domain,
+    nvtxMemRegionsRegisterBatch_t const* desc);
+
+
+
+/**
+ * \brief Register a region of memory inside of a heap.
+ *
+ * The heap refers to the heap within which the region resides.
+ * This can be from nvtxMemHeapRegister, NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE, or
+ * one provided from other extension API.
+ *
+ * The regionType arg will define which type is used in regionDescArray.
+ * The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS.
+ *
+ * The regionCount arg is how many elements are in regionDescArray and regionHandleArrayOut.
+ *
+ * The regionHandleArrayOut arg points to an array where the tool will provide region handles. If
+ * a pointer is provided, it is expected to have regionCount elements. This pointer can be NULL if
+ * regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS. In this case, the user can use the pointer to the
+ * virtual memory to reference the region in other related functions which accept nvtxMemRegionRef_t.
+ */
+typedef struct nvtxMemRegionsResizeBatch_v1
+{
+    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
+    uint16_t structSize; /* Size of the structure. */
+
+    uint32_t regionType; /* NVTX_MEM_TYPE_* */
+
+    size_t regionDescCount;
+    size_t regionDescElementSize;
+    void const* regionDescElements; /* This will also become the handle for this region. */
+
+} nvtxMemRegionsResizeBatch_v1;
+typedef nvtxMemRegionsResizeBatch_v1 nvtxMemRegionsResizeBatch_t;
+
+ /** \brief Resize a region of memory inside of a heap of linear process virtual memory
+  *  NOTE(review): wording inferred from the function name only - confirm. */
+NVTX_DECLSPEC void NVTX_API nvtxMemRegionsResize(
+    nvtxDomainHandle_t domain,
+    nvtxMemRegionsResizeBatch_t const* desc);
+
+
+#define NVTX_MEM_REGION_REF_TYPE_UNKNOWN 0x0
+#define NVTX_MEM_REGION_REF_TYPE_POINTER 0x1
+#define NVTX_MEM_REGION_REF_TYPE_HANDLE 0x2
+
+/**
+ * \brief Register a region of memory inside of a heap.
+ *
+ * The heap refers to the heap within which the region resides.
+ * This can be from nvtxMemHeapRegister, `NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE`, or
+ * one provided from other extension API.
+ *
+ * The regionType arg will define which type is used in `regionDescArray`.
+ * The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS.
+ *
+ * The regionCount arg is how many elements are in regionDescArray and regionHandleArrayOut.
+ *
+ * The regionHandleArrayOut arg points to an array where the tool will provide region handles.
+ * If a pointer is provided, it is expected to have regionCount elements.
+ * This pointer can be NULL if regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS.  In this case,
+ * the user can use the pointer to the virtual memory to reference the region in other
+ * related functions which accept a nvtxMemRegionRef_t.
+ */
+typedef struct nvtxMemRegionsUnregisterBatch_v1
+{
+    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
+    uint16_t structSize; /* Size of the structure. */
+
+    uint32_t refType; /* NVTX_MEM_REGION_REF_TYPE_* */
+
+    size_t refCount; /* count of elements in refArray */
+    size_t refElementSize;
+    nvtxMemRegionRef_t const* refElements; /* This will also become the handle for this region. */
+
+} nvtxMemRegionsUnregisterBatch_v1;
+typedef nvtxMemRegionsUnregisterBatch_v1 nvtxMemRegionsUnregisterBatch_t;
+
+/**
+ * \brief Unregistration for regions of process virtual memory
+ *
+ * This is not necessary if the nvtx heap destroy function has been called that
+ * contains this object.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxMemRegionsUnregister(
+    nvtxDomainHandle_t domain,
+    nvtxMemRegionsUnregisterBatch_t const* desc);
+
+typedef struct nvtxMemRegionNameDesc_v1
+{
+    uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */
+    uint32_t nameType; /* nvtxMessageType_t */
+
+    nvtxMemRegionRef_t region;
+    nvtxMessageValue_t name;
+
+    uint32_t category;
+    uint32_t reserved0;
+} nvtxMemRegionNameDesc_v1;
+typedef nvtxMemRegionNameDesc_v1 nvtxMemRegionNameDesc_t;
+
+
+typedef struct nvtxMemRegionsNameBatch_v1
+{
+    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
+    uint16_t structSize; /* Size of the structure. */
+
+    uint32_t reserved0;
+
+    size_t regionCount;
+    size_t regionElementSize;
+    nvtxMemRegionNameDesc_t const* regionElements;
+    size_t reserved1;
+} nvtxMemRegionsNameBatch_v1 ;
+typedef nvtxMemRegionsNameBatch_v1 nvtxMemRegionsNameBatch_t;
+
+
+ /** \brief Name or rename a region. */
+NVTX_DECLSPEC void NVTX_API nvtxMemRegionsName(
+    nvtxDomainHandle_t domain,
+    nvtxMemRegionsNameBatch_t const* desc);
+
+/** \brief There are no permissions for this memory. */
+#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_NONE 0x0
+
+/** \brief The memory is readable. */
+#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_READ 0x1
+
+/** \brief The memory is writable. */
+#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_WRITE 0x2
+
+/** \brief The memory is for atomic RW. */
+#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_ATOMIC 0x4
+
+/**
+ * \brief The memory access permissions are reset for a region.
+ *
+ * This is as if never set, rather than documented defaults.  As a result, any flags
+ * indicating how unspecified regions are handled will affect this area.
+ *
+ * This should not be used with READ, WRITE, nor ATOMIC, as those flags would have no effect.
+ */
+#define NVTX_MEM_PERMISSIONS_REGION_FLAGS_RESET 0x8
+
+
+typedef struct nvtxMemPermissionsAssignRegionDesc_v1
+{
+    uint32_t flags; /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */
+    uint32_t regionRefType; /* NVTX_MEM_REGION_REF_TYPE_* */
+    nvtxMemRegionRef_t region;
+
+} nvtxMemPermissionsAssignRegionDesc_v1 ;
+typedef nvtxMemPermissionsAssignRegionDesc_v1 nvtxMemPermissionsAssignRegionDesc_t;
+
+
+typedef struct nvtxMemPermissionsAssignBatch_v1
+{
+    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
+    uint16_t structSize; /* Size of the structure. */
+
+    uint32_t reserved0;
+
+    nvtxMemPermissionsHandle_t permissions;
+
+    size_t regionCount;
+    size_t regionElementSize;
+    nvtxMemPermissionsAssignRegionDesc_t const* regionElements;
+
+    size_t reserved1;
+} nvtxMemPermissionsAssignBatch_v1 ;
+typedef nvtxMemPermissionsAssignBatch_v1 nvtxMemPermissionsAssignBatch_t;
+
+
+ /** \brief Change the permissions of a region of process virtual memory. */
+NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsAssign(
+    nvtxDomainHandle_t domain,
+    nvtxMemPermissionsAssignBatch_t const* desc);
+
+
+/**
+ * \brief Create a permissions object for fine grain thread-local control in
+ * multi-threading scenarios
+ *
+ * Unlike the global permissions object (NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE), a new
+ * permissions object is empty. There are no regions registered to it, so more memory is accessible
+ * if bound(bind) without calls to nvtxMemPermissionsSetAccess* first. The permissions are not
+ * active until nvtxMemPermissionsBind. See `nvtxMemPermissionsBind` for more details.
+ *
+ * Use the flags NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_* to control  how the regions in
+ * this permission object will interact with global permissions when bound. You may choose to
+ * either replace global memory regions setting or overlay on top of them. The most common uses are
+ * as follows:
+ *     * To limit tools to validate writing exclusively specified in this object but inherit all
+ *       global read access regions use `NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE`
+ *     * To limit tools to validate both read & write permissions exclusively specified in this
+ *        object use NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_READ
+ *                   & NVTX_MEM_PERMISSIONS_CREATE_FLAGS_EXCLUDE_GLOBAL_WRITE
+ *
+ * Also see `nvtxMemPermissionsBind` & `nvtxMemPermissionsSetAccess*`.
+ */
+NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemPermissionsCreate(
+    nvtxDomainHandle_t domain,
+    int32_t creationflags); /* NVTX_MEM_PERMISSIONS_CREATE_FLAGS_* */
+
+/**
+ * \brief Destroy the permissions object.
+ *
+ * If bound(bind), destroy will also unbind it.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsDestroy(
+    nvtxDomainHandle_t domain,
+    nvtxMemPermissionsHandle_t permissionsHandle); /* only supported on objects from nvtxMemPermissionsCreate */
+
+/** \brief Reset the permissions object back to its created state. */
+NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsReset(
+    nvtxDomainHandle_t domain,
+    nvtxMemPermissionsHandle_t permissionsHandle);
+/* NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and other special handles are supported */
+
+
+#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_NONE 0x0
+
+ /** \brief Upon binding, with the thread, exclude parent scope write regions instead of overlaying on top of them.
+  *
+  * EX A developer may choose to first prevent all writes except the ones specified to avoid
+  * OOB writes, since there are typically fewer regions written to than read from.
+ **/
+#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_WRITE 0x2
+
+ /** \brief Upon binding, with the thread, exclude parent scope read regions instead of overlaying on top of them.
+  *
+  * EX After eliminating any errors when applying strict writes, a developer may then choose to
+  * annotate and enforce strict reads behaviors in segments of code.
+ **/
+#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_READ 0x1
+
+ /** \brief Upon binding, with the thread, exclude parent scope atomic RW regions instead of overlaying on top of them.
+  *
+  * EX After eliminating any errors from read and write, a developer may choose to ensure
+  * that atomics are in their own region, removing standard read/write, and replacing with
+  * this strict atomic only access.  This way they know that conventional reads or writes
+  * will not cause unexpected issues.
+ **/
+#define NVTX_MEM_PERMISSIONS_BIND_FLAGS_STRICT_ATOMIC 0x4
+
+
+#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_UNKNOWN 0x0
+
+ /** \brief Bind to thread scope.  In this case, tools should validate that local thread's
+  * execution is honoring the permissions as well as the state of NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE
+  * at the time of binding.  If this is not bound then NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE should be
+  * used to validate the memory.
+  *
+  * Not all tools will support every scope, such as a GPU sanitizer.
+ **/
+#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD 0x1
+
+/**
+ * \brief Bind to CUDA stream scope.
+ *
+ * In this case, work enqueued to a CUDA stream should be validated by the tool,
+ * when it executes, that it respects the permissions of the permissions object at the
+ * point of binding, as well as the appropriate nvtxMemCudaGetDeviceWidePermissions at
+ * the time of binding. If this is not bound then nvtxMemCudaGetDeviceWidePermissions
+ * at the time of stream enqueue should be used to validate the memory.
+ *
+ * This could apply to work done either on the GPU like a kernel launch or to
+ * CPU based callbacks like cudaStreamAddCallback if the tools supports it.
+ *
+ * Binding applies locally to a CPU thread so that if N CPU threads are enqueuing
+ * work to the same stream (like the default stream) there cannot be a race
+ * condition between thread binding vs launching their work. IE users should
+ * expect the permissions bound in the thread to be honored by the subsequent
+ * work (launches, copies, etc) invoked from the CPU thread until unbound.
+ */
+#define NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM 0x2
+
+
+/**
+ * \brief Bind the permissions object into a particular scope on the caller thread
+ *
+ * Permissions do not take effect until binding. Binding permissions is a thread local
+ * activity that overrides global behaviors.  This is to avoid multi-threaded race conditions.
+ *
+ * The scope dictates what type of processing it applies to, and when in some cases.
+ * EX1: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CPU_THREAD applies to CPU code accessing memory while bound.
+ * EX2: NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM applies to CUDA streams, and the permissions
+ * must be recorded and applied when the work in the stream dequeues to execute.  In this case
+ * it could be GPU or CPU, if the tool supports both.
+ *
+ * Bind can be called again on the same object and thread to take any updates to the
+ * specified permission object or the inherited properties.
+ *
+ * Bind flags support changing how the binding process inherits region access control.
+ * In the case of thread scope this is NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE and from CUDA_STREAM
+ * this is nvtxMemCudaGetDeviceWidePermissions.  Choosing stricter modes allows the user to
+ * further reduce the access with less work, since memory by default, behaves as natural
+ * until the NVTX annotations instruct a tool to treat it another way.  See strict flags
+ * for more details.
+ *
+ * Also see nvtxMemPermissionsUnbind
+ */
+NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsBind(
+    nvtxDomainHandle_t domain,
+    nvtxMemPermissionsHandle_t permissions, /* special object like NVTX_MEM_PERMISSIONS_HANDLE_PROCESS_WIDE are not supported */
+    uint32_t bindScope, /* NVTX_MEM_PERMISSIONS_BIND_SCOPE_* */
+    uint32_t bindFlags); /* NVTX_MEM_PERMISSIONS_BIND_FLAGS_* */
+
+/**
+ * \brief Unbind the permissions object bound to the caller thread.
+ *
+ * Upon unbind, the thread local permissions for a scope are restored to the default
+ * behavior defined by the scope.
+ */
+NVTX_DECLSPEC void NVTX_API nvtxMemPermissionsUnbind(
+    nvtxDomainHandle_t domain,
+    uint32_t bindScope);
+
+/** @} */ /*END defgroup*/
+
+typedef enum NvtxExtMemCallbackId
+{
+    /* CBID 0 is invalid */
+    NVTX3EXT_CBID_nvtxMemHeapRegister                  = 1,
+    NVTX3EXT_CBID_nvtxMemHeapUnregister                = 2,
+    NVTX3EXT_CBID_nvtxMemHeapReset                     = 3,
+    NVTX3EXT_CBID_nvtxMemRegionsRegister               = 4,
+    NVTX3EXT_CBID_nvtxMemRegionsResize                 = 5,
+    NVTX3EXT_CBID_nvtxMemRegionsUnregister             = 6,
+    NVTX3EXT_CBID_nvtxMemRegionsName                   = 7,
+    NVTX3EXT_CBID_nvtxMemPermissionsAssign             = 8,
+    NVTX3EXT_CBID_nvtxMemPermissionsCreate             = 9,
+    NVTX3EXT_CBID_nvtxMemPermissionsDestroy            = 10,
+    NVTX3EXT_CBID_nvtxMemPermissionsReset              = 11,
+    NVTX3EXT_CBID_nvtxMemPermissionsBind               = 12,
+    NVTX3EXT_CBID_nvtxMemPermissionsUnbind             = 13,
+
+    /* 14-16 in nvtxExtImplMemCudaRt_v1.h */
+    NVTX3EXT_CBID_nvtxMemCudaGetProcessWidePermissions = 14,
+    NVTX3EXT_CBID_nvtxMemCudaGetDeviceWidePermissions  = 15,
+    NVTX3EXT_CBID_nvtxMemCudaSetPeerAccess             = 16,
+
+    NVTX3EXT_CBID_MEM_FN_NUM                           = 17
+} NvtxExtMemCallbackId;
+
+#ifdef __GNUC__
+#pragma GCC visibility push(internal)
+#endif
+
+/* Extension types are required for the implementation and the NVTX handler. */
+#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot be included directly */
+#include "nvtxDetail/nvtxExtTypes.h"
+#undef NVTX_EXT_TYPES_GUARD
+
+#ifndef NVTX_NO_IMPL
+/* Ensure other headers cannot be included directly */
+#define NVTX_EXT_IMPL_MEM_GUARD
+#include "nvtxDetail/nvtxExtImplMem_v1.h"
+#undef NVTX_EXT_IMPL_MEM_GUARD
+#endif /*NVTX_NO_IMPL*/
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* NVTOOLSEXTV3_MEM_V1 */
@@ -0,0 +1,150 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+#ifndef NVTOOLSEXTV3_MEM_CUDART_V1
+#define NVTOOLSEXTV3_MEM_CUDART_V1
+
+#include "nvToolsExtMem.h"
+
+#include "cuda.h"
+#include "cuda_runtime.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+
+/** \brief The memory is from a CUDA runtime array.
+ *
+ * Relevant functions: cudaMallocArray,  cudaMalloc3DArray
+ * Also cudaArray_t from other types such as cudaMipmappedArray_t
+ *
+ * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported
+ *
+ * nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo()
+ * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCudaArrayRangeDesc_t
+ */
+#define NVTX_MEM_TYPE_CUDA_ARRAY 0x11
+
+/** \brief structure to describe memory in a CUDA array object
+ */
+typedef struct nvtxMemCudaArrayRangeDesc_v1
+{
+    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
+    uint16_t structSize; /* Size of the structure. */
+    uint32_t reserved0;
+    cudaArray_t  src;
+    size_t offset[3];
+    size_t extent[3];
+} nvtxMemCudaArrayRangeDesc_v1;
+typedef nvtxMemCudaArrayRangeDesc_v1 nvtxMemCudaArrayRangeDesc_t;
+
+
+/** \brief The memory is from a CUDA device array.
+ *
+ * Relevant functions: cuArrayCreate,  cuArray3DCreate
+ * Also CUarray from other types such as CUmipmappedArray
+ *
+ * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported
+ *
+ * nvtxMemHeapRegister receives a heapDesc of type CUarray because the description can be retrieved by tools through cuArrayGetDescriptor()
+ * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCuArrayRangeDesc_t
+ */
+#define NVTX_MEM_TYPE_CU_ARRAY 0x12
+
+/** \brief structure to describe memory in a CUDA array object
+ */
+typedef struct nvtxMemCuArrayRangeDesc_v1
+{
+    uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
+    uint16_t structSize; /* Size of the structure. */
+    uint32_t reserved0;
+    CUarray  src;
+    size_t offset[3];
+    size_t extent[3];
+} nvtxMemCuArrayRangeDesc_v1;
+typedef nvtxMemCuArrayRangeDesc_v1 nvtxMemCuArrayRangeDesc_t;
+
+/* Reserving 0x2-0xF for more common types */
+
+#define NVTX_MEM_CUDA_PEER_ALL_DEVICES -1
+
+/** \brief Get the permission object that represents the CUDA runtime device
+ * or cuda driver context
+ *
+ * This object will allow developers to adjust permissions applied to work executed
+ * on the GPU.  It may be inherited or overridden by permissions object bound
+ * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags.
+ *
+ * Ex. change the peer to peer access permissions between devices in entirety
+ * or punch through special holes
+ *
+ * By default, all memory is accessible that naturally would be to a CUDA kernel until
+ * modified otherwise by nvtxMemCudaSetPeerAccess or changing regions.
+ *
+ * This object should also represent the CUDA driver API level context.
+*/
+NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetProcessWidePermissions(
+    nvtxDomainHandle_t domain);
+
+/** \brief Get the permission object that represents the CUDA runtime device
+ * or cuda driver context
+ *
+ * This object will allow developers to adjust permissions applied to work executed
+ * on the GPU.  It may be inherited or overridden by permissions object bound
+ * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags.
+ *
+ * Ex. change the peer to peer access permissions between devices in entirety
+ * or punch through special holes
+ *
+ * By default, all memory is accessible that naturally would be to a CUDA kernel until
+ * modified otherwise by nvtxMemCudaSetPeerAccess or changing regions.
+ *
+ * This object should also represent the CUDA driver API level context.
+*/
+NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetDeviceWidePermissions(
+    nvtxDomainHandle_t domain,
+    int device);
+
+/** \brief Change the default behavior for all memory mapped in from a particular device.
+ *
+ * While typically all memory defaults to readable and writable, users may desire to limit
+ * access to reduced default permissions such as read-only on a per-device basis.
+ *
+ * Regions can be used to further override smaller windows of memory.
+ *
+ * devicePeer can be NVTX_MEM_CUDA_PEER_ALL_DEVICES
+ *
+*/
+NVTX_DECLSPEC void NVTX_API nvtxMemCudaSetPeerAccess(
+    nvtxDomainHandle_t domain,
+    nvtxMemPermissionsHandle_t permissions,
+    int devicePeer, /* device number such as from cudaGetDevice() or NVTX_MEM_CUDA_PEER_ALL_DEVICES */
+    uint32_t flags); /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */
+
+/** @} */ /*END defgroup*/
+
+#ifdef __GNUC__
+#pragma GCC visibility push(internal)
+#endif
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_EXT_IMPL_MEM_CUDART_GUARD /* Ensure other headers cannot be included directly */
+#include "nvtxDetail/nvtxExtImplMemCudaRt_v1.h"
+#undef NVTX_EXT_IMPL_MEM_CUDART_GUARD
+#endif /*NVTX_NO_IMPL*/
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* NVTOOLSEXTV3_MEM_CUDART_V1 */
@@ -30,11 +30,11 @@ extern "C" {
 */

 /*  ------------------------------------------------------------------------- */
-/* \cond SHOW_HIDDEN 
+/* \cond SHOW_HIDDEN
 * \brief Used to build a non-colliding value for resource types separated class
 * \version \NVTX_VERSION_2
 */
-#define NVTX_RESOURCE_CLASS_OPENCL 6 
+#define NVTX_RESOURCE_CLASS_OPENCL 6
 /** \endcond */

 /*  ------------------------------------------------------------------------- */
@@ -183,7 +183,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name)
 #endif /* __cplusplus */

 #ifndef NVTX_NO_IMPL
-#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
+#define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot be included directly */
 #include "nvtxDetail/nvtxImplOpenCL_v3.h"
 #undef NVTX_IMPL_GUARD_OPENCL
 #endif /*NVTX_NO_IMPL*/
@@ -0,0 +1,170 @@
+/*
+* Copyright 2023  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvtxDetail/nvtxExtPayloadHelperInternal.h"
+
+
+/* This is just an empty marker (for readability), which can be omitted. */
+/* TODO: Fix issue with trailing comma at end of entry list. */
+#define NVTX_PAYLOAD_ENTRIES
+
+
+/**
+ * Use this macro for payload entries that are defined by a schema (nested
+ * payload schema).
+ */
+#define NVTX_PAYLOAD_NESTED(schemaId) _NVTX_PAYLOAD_NESTED(schemaId)
+
+
+/**
+ * \brief Define a payload schema for an existing C `struct` definition.
+ *
+ *  This macro does
+ *   1) create schema description (array of schema entries).
+ *   2) set the schema attributes for a static data layout.
+ *
+ * It can be used in static code or within a function context.
+ *
+ * Example:
+ *  NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName",
+ *      NVTX_PAYLOAD_ENTRIES(
+ *          (index, TYPE_INT, "integer value"),
+ *          (dpfloat, TYPE_DOUBLE, "fp64 value"),
+ *          (text, TYPE_CSTRING, "text", NULL, 24)
+ *      )
+ *  )
+ *
+ * It is required to at least provide the struct name and the payload entries.
+ * The first two fields (member name and NVTX entry type) of each payload entry
+ * are required.
+ *
+ * The optional parameters are only allowed to be passed in the predefined order.
+ * Hence, `payload_flags` requires `payload_schema` to be given and
+ * `prefix` requires `payload_flags` and `payload_schema` to be given.
+ * The payload entries are always the last parameter. A maximum of 16 schema
+ * entries is supported.
+ *
+ * It is recommended to use `NVTX_PAYLOAD_SCHEMA_REGISTER` to register the schema.
+ *
+ * @param struct_id The name of the struct.
+ * @param schema_name (Optional 1) name of the payload schema. Default is `NULL`.
+ * @param prefix (Optional 2) prefix before the schema and attributes variables,
+ *               e.g. `static const`. Leave this empty, if no prefix is desired.
+ * @param schema_flags (Optional 3) flags to augment the payload schema.
+ *                     Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`.
+ * @param schema_id (Optional 4) User-defined payload schema ID.
+ * @param entries (Mandatory) Payload schema entries. This is always the last
+ *                parameter to the macro.
+ */
+#define NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \
+    _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__)
+
+
+/**
+ * \brief Define a C struct together with a matching schema.
+ *
+ * This macro does
+ *   1) define the payload type (typedef struct).
+ *   2) create schema description (array of schema entries).
+ *   3) set the schema attributes for a static data layout.
+ *
+ * The macro can be used in static code or within a function context.
+ *
+ * It defines the schema attributes in `struct_id##Attr`. Thus, it is recommended
+ * to use `NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id)` to register the schema.
+ *
+ * Example:
+ *  NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name",
+ *      NVTX_PAYLOAD_ENTRIES(
+ *          (int, index, TYPE_INT, "integer value"),
+ *          (double, dpfloat, TYPE_DOUBLE, "fp64 value"),
+ *          (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24)
+ *      )
+ *  )
+ *
+ * The first three fields (C type, member, entry type) of each entry are required.
+ * A fixed-size array or string requires a special notation with the member
+ * name and the size separated by comma and put into brackets (see last entry
+ * in the example).
+ *
+ * The optional parameters are positional (only allowed to be passed in the
+ * predefined order). A maximum of 16 schema entries is supported.
+ *
+ * @param struct_id The name of the struct.
+ * @param schema_name (Optional 1) name of the payload schema. Default is `NULL`.
+ * @param prefix (Optional 2) prefix before the schema and attributes variables,
+ *               e.g. `static const`. Leave this empty, if no prefix is desired.
+ * @param schema_flags (Optional 3) flags to augment the payload schema.
+ *                     Default is `NVTX_PAYLOAD_SCHEMA_FLAG_NONE`.
+ * @param schema_id (Optional 4) User-defined payload schema ID.
+ * @param entries (Mandatory) The schema entries. This is always the last
+ *                parameter to the macro.
+ */
+#define NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \
+    _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__)
+
+/**
+ * \brief Initialize and register the NVTX binary payload schema.
+ *
+ * This does essentially the same as `NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but in
+ * addition the schema is registered. The schema ID will be defined as follows:
+ * `const uint64_t struct_id##_schemaId`.
+ *
+ * @param domain The NVTX domain handle (0 for default domain).
+ * All other parameters are similar to `NVTX_DEFINE_STRUCT_WITH_SCHEMA`.
+ */
+#define NVTX_DEFINE_STRUCT_WITH_SCHEMA_AND_REGISTER(domain, struct_id, ...) \
+    _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, __VA_ARGS__) \
+    const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr);
+
+/**
+ * \brief Define payload schema for an existing `struct` and register the schema.
+ *
+ * This does essentially the same as `NVTX_DEFINE_SCHEMA_FOR_STRUCT`, but in
+ * addition, the schema is registered and `uint64_t struct_id##_schemaId` set.
+ *
+ * @param domain The NVTX domain handle (0 for default domain).
+ * All other parameters are similar to `NVTX_DEFINE_SCHEMA_FOR_STRUCT`.
+ */
+#define NVTX_DEFINE_SCHEMA_FOR_STRUCT_AND_REGISTER(domain, struct_id, ...) \
+    _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, __VA_ARGS__) \
+    const uint64_t struct_id##_schemaId = nvtxPayloadSchemaRegister(domain, &struct_id##Attr);
+
+/**
+ * \brief Create a type definition for the given struct ID and members.
+ *
+ * This is a convenience macro. A normal `typedef` can be used instead.
+ *
+ * Example usage:
+ *   NVTX_DEFINE_STRUCT(your_struct,
+ *           (double, fp64),
+ *           (uint8_t, u8),
+ *           (float, fp32[3])
+ *   )
+ *
+ * @param struct_id The name of the struct.
+ * @param members The members of the struct.
+ */
+#define NVTX_DEFINE_STRUCT(struct_id, ...) \
+    _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, __VA_ARGS__)
+
+/**
+ * \brief Register an NVTX binary payload schema.
+ *
+ * This is a convenience macro, which takes the same `struct_id` that has been
+ * used in other helper macros. Instead, `nvtxPayloadSchemaRegister` can also be
+ * used, but `&struct_id##Attr` has to be passed.
+ *
+ * @param domain The NVTX domain handle (0 for default domain).
+ * @param struct_id The name of the struct.
+ *
+ * @return NVTX schema ID
+ */
+#define NVTX_PAYLOAD_SCHEMA_REGISTER(domain, struct_id) \
+    nvtxPayloadSchemaRegister(domain, &struct_id##Attr);
+
@@ -0,0 +1,88 @@
+/*
+* Copyright 2024  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+/**
+ * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
+ */
+
+#ifndef NVTX_SEMANTIC_ID_COUNTERS_V1
+#define NVTX_SEMANTIC_ID_COUNTERS_V1 2
+
+/**
+ * Flags to extend the semantics of counters.
+ */
+#define NVTX_COUNTERS_FLAGS_NONE  0
+
+/**
+ * Convert the fixed point value to a normalized floating point value.
+ * Unsigned [0f : 1f] or signed [-1f : 1f] is determined by the underlying type
+ * this flag is applied to.
+ */
+#define NVTX_COUNTERS_FLAG_NORMALIZE    (1 << 1)
+
+/**
+ *  Visual tools should apply scale and limits when graphing.
+ */
+#define NVTX_COUNTERS_FLAG_LIMIT_MIN    (1 << 2)
+#define NVTX_COUNTERS_FLAG_LIMIT_MAX    (1 << 3)
+#define NVTX_COUNTERS_FLAG_LIMITS \
+    (NVTX_COUNTERS_FLAG_LIMIT_MIN | NVTX_COUNTERS_FLAG_LIMIT_MAX)
+
+/**
+ * Counter time scopes.
+ */
+#define NVTX_COUNTERS_FLAG_TIMESCOPE_POINT        (1 << 5)
+#define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_LAST   (2 << 5)
+#define NVTX_COUNTERS_FLAG_TIMESCOPE_UNTIL_NEXT   (3 << 5)
+#define NVTX_COUNTERS_FLAG_TIMESCOPE_SINCE_START  (4 << 5)
+
+/**
+ * Counter value types.
+ */
+#define NVTX_COUNTERS_FLAG_VALUETYPE_ABSOLUTE (1 << 10)
+/** Delta to previous value of same counter type. */
+#define NVTX_COUNTERS_FLAG_VALUETYPE_DELTA    (2 << 10)
+
+/**
+ * Datatypes for the `limits` union.
+ */
+#define NVTX_COUNTERS_LIMIT_I64 0
+#define NVTX_COUNTERS_LIMIT_U64 1
+#define NVTX_COUNTERS_LIMIT_F64 2
+
+/**
+ *\brief Specify counter semantics.
+ */
+typedef struct nvtxSemanticsCounter_v1 {
+    /** Header of the semantic extensions (with identifier, version, etc.). */
+    struct nvtxSemanticsHeader_v1 header;
+
+    /** Flags to provide more context about the counter value. */
+    uint64_t flags;
+
+    /** Unit of the counter value (case-insensitive). */
+    const char*  unit;
+
+    /** Should be 1 if not used. */
+    uint64_t unitScaleNumerator;
+
+    /** Should be 1 if not used. */
+    uint64_t unitScaleDenominator;
+
+    /** Determines the used union member. Use defines `NVTX_COUNTERS_LIMIT_*`. */
+    int64_t limitType;
+
+    /** Graph limits {minimum, maximum}. */
+    union limits_t {
+        int64_t  i64[2];
+        uint64_t u64[2];
+        double   d[2];
+    } limits;
+} nvtxSemanticsCounter_t;
+
+#endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */
@@ -0,0 +1,30 @@
+/*
+* Copyright 2024  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+/**
+ * NVTX semantic headers require nvToolsExtPayload.h to be included beforehand.
+ */
+
+#ifndef NVTX_SEMANTIC_ID_SCOPE_V1
+#define NVTX_SEMANTIC_ID_SCOPE_V1 1
+
+/**
+ * \brief Specify the NVTX scope for a payload entry.
+ *
+ * This allows the scope to be set for a specific value or counter in a payload.
+ * The scope must be known at schema registration time.
+ */
+typedef struct nvtxSemanticsScope_v1
+{
+    struct nvtxSemanticsHeader_v1 header;
+
+    /** Specifies the scope of a payload entry, e.g. a counter or timestamp. */
+    uint64_t scopeId;
+} nvtxSemanticsScope_t;
+
+#endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */
@@ -15,23 +15,23 @@
 extern "C" {
 #endif /* __cplusplus */

-/* \cond SHOW_HIDDEN 
+/* \cond SHOW_HIDDEN
 * \version \NVTX_VERSION_2
 */
 #define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
 /** \endcond */


-/** 
+/**
 * \page PAGE_SYNCHRONIZATION Synchronization
 *
 * This section covers a subset of the API that allow users to track additional
-* synchronization details of their application.   Naming OS synchronization primitives 
-* may allow users to better understand the data collected by traced synchronization 
+* synchronization details of their application.   Naming OS synchronization primitives
+* may allow users to better understand the data collected by traced synchronization
 * APIs.  Additionally, a user defined synchronization object can allow the users to
 * to tell the tools when the user is building their own synchronization system
 * that do not rely on the OS to provide behaviors and instead use techniques like
-* atomic operations and spinlocks.  
+* atomic operations and spinlocks.
 *
 * See module \ref SYNCHRONIZATION for details.
 *
@@ -59,7 +59,7 @@ extern "C" {
 *
 *     bool Lock() {
 *          nvtxDomainSyncUserAcquireStart(hSync);
-*          bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic 
+*          bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic

 *          if (acquired) {
 *              nvtxDomainSyncUserAcquireSuccess(hSync);
@@ -76,12 +76,12 @@ extern "C" {
 *     }
 * };
 * \endcode
-* 
+*
 * \version \NVTX_VERSION_2
 */

 /*  ------------------------------------------------------------------------- */
-/* \cond SHOW_HIDDEN 
+/* \cond SHOW_HIDDEN
 * \brief Used to build a non-colliding value for resource types separated class
 * \version \NVTX_VERSION_2
 */
@@ -154,8 +154,8 @@ typedef struct nvtxSyncUser* nvtxSyncUser_t;
 /** \brief User Defined Synchronization Object Attributes Structure.
 * \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
 *
-* This structure is used to describe the attributes of a user defined synchronization 
-* object.  The layout of the structure is defined by a specific version of the tools 
+* This structure is used to describe the attributes of a user defined synchronization
+* object.  The layout of the structure is defined by a specific version of the tools
 * extension library and can change between different versions of the Tools Extension
 * library.
 *
@@ -259,7 +259,7 @@ typedef struct nvtxSyncUserAttributes_v0
 typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;

 /* ------------------------------------------------------------------------- */
-/** \brief Create a user defined synchronization object 
+/** \brief Create a user defined synchronization object
 * This is used to track non-OS synchronization working with spinlocks and atomics
 *
 * \param domain - Domain to own the resource
@@ -317,7 +317,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle
 /* ------------------------------------------------------------------------- */
 /** \brief Signal to tools of failure in acquiring a user defined synchronization object
 * This should be called after \ref nvtxDomainSyncUserAcquireStart
-* 
+*
 * \param handle - A handle to the object to operate on.
 *
 * \sa
@@ -374,7 +374,7 @@ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
 #endif /* __cplusplus */

 #ifndef NVTX_NO_IMPL
-#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */
+#define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot be included directly */
 #include "nvtxDetail/nvtxImplSync_v3.h"
 #undef NVTX_IMPL_GUARD_SYNC
 #endif /*NVTX_NO_IMPL*/
@@ -12,6 +12,11 @@
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
+ *
+ *  Licensed under the Apache License v2.0 with LLVM Exceptions.
+ *  See https://llvm.org/LICENSE.txt for license information.
+ *
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

 /* Temporary helper #defines, #undef'ed at end of header */
@@ -1937,9 +1942,9 @@ class event_attributes {
        0,                              // color value
        NVTX_PAYLOAD_UNKNOWN,           // payload type
        0,                              // reserved 4B
-        0,                              // payload value (union)
+        {0},                            // payload value (union)
        NVTX_MESSAGE_UNKNOWN,           // message type
-        0                               // message value (union)
+        {0}                             // message value (union)
      }
  {
  }
@@ -2003,20 +2008,20 @@ class event_attributes {
    attributes_.messageType = m.get_type();
  }

-   /**
-   * @brief Variadic constructor where the first argument is a binary payload.
+  /**
+   * @brief Variadic constructor where the first argument is an extended payload.
   *
-   * Sets the value of the `EventAttribute`s message based on `m` and forwards
+   * Sets the `ullValue` of the `EventAttribute`s payload and forwards
   * the remaining variadic parameter pack to the next constructor.
   *
   */
  template <typename... Args>
-  NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* bpl, Args const&... args) noexcept
+  NVTX3_CONSTEXPR_IF_CPP14 explicit event_attributes(nvtxPayloadData_t const* p, Args const&... args) noexcept
    : event_attributes(args...)
  {
-    attributes_.payloadType = NVTX_PAYLOAD_TYPE_BINARY;
+    attributes_.payloadType = NVTX_PAYLOAD_TYPE_EXT;
    attributes_.reserved0 = 1; // NCCL uses only a single binary payload per event.
-    attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(bpl);
+    attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p);
  }

  ~event_attributes() = default;
@@ -0,0 +1,31 @@
+/*
+* Copyright 2023  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_HELPER_MACROS_H
+#define NVTX_EXT_HELPER_MACROS_H
+
+/* Combine tokens */
+#define _NVTX_EXT_CONCAT(a, b) a##b
+#define NVTX_EXT_CONCAT(a, b) _NVTX_EXT_CONCAT(a, b)
+
+/* Resolves to the number of arguments passed. */
+#define NVTX_EXT_NUM_ARGS(...) \
+    NVTX_EXT_SELECTA16(__VA_ARGS__, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, throwaway)
+#define NVTX_EXT_SELECTA16(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16, ...) a16
+
+/* Cast argument(s) to void to prevent unused variable warnings. */
+#define _NVTX_EXT_VOIDIFY1(a1) (void)a1;
+#define _NVTX_EXT_VOIDIFY2(a1, a2) (void)a1; (void)a2;
+#define _NVTX_EXT_VOIDIFY3(a1, a2, a3) (void)a1; (void)a2; (void)a3;
+#define _NVTX_EXT_VOIDIFY4(a1, a2, a3, a4) (void)a1; (void)a2; (void)a3; (void)a4;
+
+/* Mark function arguments as unused. */
+#define NVTX_EXT_HELPER_UNUSED_ARGS(...) \
+    NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+#endif /* NVTX_EXT_HELPER_MACROS_H */
@@ -14,7 +14,12 @@
 #define NVTX_EXT_IMPL_H
 /* ---- Include required platform headers ---- */

-#if defined(_WIN32) 
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+
+#if defined(_WIN32)

 #include <Windows.h>

@@ -22,27 +27,19 @@
 #include <unistd.h>

 #if defined(__ANDROID__)
-#include <android/api-level.h> 
+#include <android/api-level.h>
 #endif

 #if defined(__linux__) || defined(__CYGWIN__)
 #include <sched.h>
 #endif

+#include <sys/types.h>
 #include <limits.h>
 #include <dlfcn.h>
 #include <fcntl.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <unistd.h>
 #include <errno.h>
-
-#include <string.h>
-#include <sys/types.h>
 #include <pthread.h>
-#include <stdlib.h>
-#include <wchar.h>

 #endif

@@ -66,26 +63,35 @@
 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */
-
-// #ifdef __GNUC__
-// #pragma GCC visibility push(hidden)
-// #endif
-
+/*
+#ifdef __GNUC__
+#pragma GCC visibility push(hidden)
+#endif
+*/
 #define NVTX_EXTENSION_FRESH 0
 #define NVTX_EXTENSION_DISABLED 1
 #define NVTX_EXTENSION_STARTING 2
 #define NVTX_EXTENSION_LOADED 3

-NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0;
+/* Globals of the extension loader: only the injection init function pointer lives here. Function slots are local to each extension. */
+typedef struct nvtxExtGlobals1_t
+{
+    NvtxExtInitializeInjectionFunc_t injectionFnPtr;
+} nvtxExtGlobals1_t;
+
+NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1) =
+{
+    (NvtxExtInitializeInjectionFunc_t)0 /* injectionFnPtr: starts out null */
+};

 #define NVTX_EXT_INIT_GUARD
 #include "nvtxExtInit.h"
 #undef NVTX_EXT_INIT_GUARD
-
-// #ifdef __GNUC__
-// #pragma GCC visibility pop
-// #endif
-
+/*
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+*/
 #ifdef __cplusplus
 } /* extern "C" */
 #endif /* __cplusplus */
@@ -0,0 +1,148 @@
+/*
+* Copyright 2023-2024  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_IMPL_COUNTERS_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExtCounters.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#define NVTX_EXT_IMPL_GUARD
+#include "nvtxExtImpl.h"
+#undef NVTX_EXT_IMPL_GUARD
+
+#ifndef NVTX_EXT_IMPL_COUNTERS_V1
+#define NVTX_EXT_IMPL_COUNTERS_V1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* Macros to create versioned symbols of the form NAME_v<NVTX_VERSION>_bpl<COMPATID>; the L2 level expands the version/compat-ID macros before pasting. NOTE(review): the `_bpl` infix is the same one the payload extension uses -- confirm it is intended for counters. */
+#define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
+    NAME##_v##VERSION##_bpl##COMPATID
+#define NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
+    NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
+#define NVTX_EXT_COUNTERS_VERSIONED_ID(NAME) \
+    NVTX_EXT_COUNTERS_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COUNTERS_COMPATID)
+
+#ifdef NVTX_DISABLE
+
+#include "nvtxExtHelperMacros.h"
+
+#define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \
+ret_val fn_name signature { \
+    NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
+    return ((ret_val)(intptr_t)-1); \
+}
+
+#else /* NVTX_DISABLE */
+
+/*
+ * Function slots for the counters extension. First entry is the module state,
+ * initialized to `0` (`NVTX_EXTENSION_FRESH`).
+ */
+#define NVTX_EXT_COUNTERS_SLOT_COUNT 63
+NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
+NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX_EXT_COUNTERS_SLOT_COUNT + 1]
+    = {0};
+
+/* Describes the counters module (a single segment of function slots) and hands it to nvtxExtInitOnce. The forward declaration avoids warnings about a missing prototype. */
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(void);
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)()
+{
+    intptr_t* fnSlots = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots) + 1; /* skip entry 0 (module state) */
+    nvtxExtModuleSegment_t segment = {
+        0, /* unused (only one segment) */
+        NVTX_EXT_COUNTERS_SLOT_COUNT,
+        fnSlots
+    };
+
+    nvtxExtModuleInfo_t module = {
+        NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
+        NVTX_EXT_COUNTERS_MODULEID, NVTX_EXT_COUNTERS_COMPATID,
+        1, &segment, /* number of segments, segments */
+        NULL, /* no export function needed */
+        /* NULL: no type size/alignment information is passed for this extension */
+        NULL
+    };
+
+    NVTX_INFO( "%s\n", __FUNCTION__  );
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
+        NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)); /* array base is passed as the module state pointer */
+}
+
+#define NVTX_EXT_COUNTERS_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \
+typedef ret_type (*fn_name##_impl_fntype)signature; \
+    NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
+    intptr_t slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+    if (slot != NVTX_EXTENSION_DISABLED) { \
+        if (slot != NVTX_EXTENSION_FRESH) { \
+            return (*(fn_name##_impl_fntype)slot) arg_names; \
+        } else { \
+            NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersInitOnce)(); \
+            /* Re-read function slot after extension initialization. */ \
+            slot = NVTX_EXT_COUNTERS_VERSIONED_ID(nvtxExtCountersSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+            if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
+                return (*(fn_name##_impl_fntype)slot) arg_names; \
+            } \
+        } \
+    } \
+    NVTX_EXT_FN_RETURN_INVALID(ret_type) \
+}
+
+#endif /*NVTX_DISABLE*/
+
+/* Non-void functions. */
+#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
+
+NVTX_EXT_COUNTERS_IMPL_FN_V1(nvtxCountersHandle_t, nvtxCountersRegister,
+    (nvtxDomainHandle_t domain, const nvtxCountersAttr_t* attr),
+    (domain, attr))
+
+#undef NVTX_EXT_FN_RETURN_INVALID
+/* END: Non-void functions. */
+
+/* void functions. `return` is defined to nothing (until the matching #undef) so that every `return` in the shared NVTX_EXT_COUNTERS_IMPL_FN_V1 body disappears and the body stays valid for a void return type. */
+#define NVTX_EXT_FN_RETURN_INVALID(rtype)
+#define return
+
+NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleInt64,
+    (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, int64_t value),
+    (domain, hCounter, value))
+
+NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleFloat64,
+    (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, double value),
+    (domain, hCounter, value))
+
+NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSample,
+    (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, void* values, size_t size),
+    (domain, hCounter, values, size))
+
+NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSampleNoValue,
+    (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounter, uint8_t reason),
+    (domain, hCounter, reason))
+
+NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatch,
+    (nvtxDomainHandle_t domain, nvtxCountersHandle_t hCounters,
+    const void* counters, size_t size), (domain, hCounters, counters, size))
+
+NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatchEx,
+    (nvtxDomainHandle_t domain, const nvtxCountersBatch_t* countersBatch),
+    (domain, countersBatch))
+
+#undef return
+#undef NVTX_EXT_FN_RETURN_INVALID
+/* END: void functions. */
+
+/* Keep NVTX_EXT_COUNTERS_IMPL_FN_V1 defined for a future version of this extension. */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* NVTX_EXT_IMPL_COUNTERS_V1 */
@@ -0,0 +1,74 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_IMPL_MEM_CUDART_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExtMemCudaRt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifdef NVTX_DISABLE
+
+#include "nvtxExtHelperMacros.h"
+
+#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
+ret_val fn_name signature { \
+    NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
+    return ((ret_val)(intptr_t)-1); \
+}
+
+#else  /* NVTX_DISABLE */
+
+#define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \
+typedef ret_type ( * fn_name##_impl_fntype )signature; \
+    NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
+    intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+    if (slot != NVTX_EXTENSION_DISABLED) { \
+        if (slot != NVTX_EXTENSION_FRESH) { \
+            return (*(fn_name##_impl_fntype)slot) arg_names; \
+        } else { \
+            NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \
+            /* Re-read function slot after extension initialization. */ \
+            slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+            if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
+                return (*(fn_name##_impl_fntype)slot) arg_names; \
+            } \
+        } \
+    } \
+    NVTX_EXT_FN_RETURN_INVALID(ret_type) \
+}
+
+#endif /*NVTX_DISABLE*/
+
+/* Non-void functions. */
+#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
+
+NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetProcessWidePermissions, (nvtxDomainHandle_t domain), (domain))
+
+NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemCudaGetDeviceWidePermissions, (nvtxDomainHandle_t domain, int device), (domain, device))
+
+#undef NVTX_EXT_FN_RETURN_INVALID
+/* END: Non-void functions. */
+
+/* void functions. `return` is defined to nothing (until the matching #undef) so that every `return` in the shared NVTX_EXT_FN_IMPL body disappears and the body stays valid for a void return type. */
+#define NVTX_EXT_FN_RETURN_INVALID(rtype)
+#define return
+
+NVTX_EXT_FN_IMPL(void, nvtxMemCudaSetPeerAccess, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, int devicePeer, uint32_t flags), (domain, permissions, devicePeer, flags))
+
+#undef return
+#undef NVTX_EXT_FN_RETURN_INVALID
+/* END: void functions. */
+
+#undef NVTX_EXT_FN_IMPL
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
@@ -0,0 +1,133 @@
+/*
+* Copyright 2009-2020,2023  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_IMPL_MEM_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExtMem.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#define NVTX_EXT_IMPL_GUARD
+#include "nvtxExtImpl.h"
+#undef NVTX_EXT_IMPL_GUARD
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#define NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) NAME##_v##VERSION##_mem##COMPATID
+#define NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) NVTXMEM_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
+#define NVTX_EXT_MEM_VERSIONED_ID(NAME) NVTXMEM_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_MEM)
+
+#ifdef NVTX_DISABLE
+
+#include "nvtxExtHelperMacros.h"
+
+#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
+ret_val fn_name signature { \
+    NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
+    return ((ret_val)(intptr_t)-1); \
+}
+
+#else  /* NVTX_DISABLE */
+
+/*
+ * Function slots for the memory extension. First entry is the module
+ * state, initialized to `0` (`NVTX_EXTENSION_FRESH`).
+ */
+NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
+NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_MEM_FN_NUM + 2]
+    = {0};
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)() /* NOTE(review): no forward declaration here, unlike the counters/payload InitOnce functions */
+{
+    intptr_t* fnSlots = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots) + 1; /* skip entry 0 (module state) */
+    nvtxExtModuleSegment_t segment = {
+        0, /* unused (only one segment) */
+        NVTX3EXT_CBID_MEM_FN_NUM,
+        fnSlots
+    };
+
+    nvtxExtModuleInfo_t module = {
+        NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
+        NVTX_EXT_MODULEID_MEM, NVTX_EXT_COMPATID_MEM,
+        1, &segment, /* number of segments, segments */
+        NULL, /* no export function needed */
+        NULL /* no type size/alignment information is passed for this extension */
+    };
+
+    NVTX_INFO( "%s\n", __FUNCTION__  );
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
+        NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)); /* array base is passed as the module state pointer */
+}
+
+#define NVTX_EXT_FN_IMPL(ret_type, fn_name, signature, arg_names) \
+typedef ret_type ( * fn_name##_impl_fntype )signature; \
+    NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
+    intptr_t slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+    if (slot != NVTX_EXTENSION_DISABLED) { \
+        if (slot != NVTX_EXTENSION_FRESH) { \
+            return (*(fn_name##_impl_fntype)slot) arg_names; \
+        } else { \
+            NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemInitOnce)(); \
+            /* Re-read function slot after extension initialization. */ \
+            slot = NVTX_EXT_MEM_VERSIONED_ID(nvtxExtMemSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+            if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
+                return (*(fn_name##_impl_fntype)slot) arg_names; \
+            } \
+        } \
+    } \
+    NVTX_EXT_FN_RETURN_INVALID(ret_type) \
+}
+
+#endif /*NVTX_DISABLE*/
+
+/* Non-void functions. */
+#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
+
+NVTX_EXT_FN_IMPL(nvtxMemHeapHandle_t, nvtxMemHeapRegister, (nvtxDomainHandle_t domain, nvtxMemHeapDesc_t const* desc), (domain, desc))
+
+NVTX_EXT_FN_IMPL(nvtxMemPermissionsHandle_t, nvtxMemPermissionsCreate, (nvtxDomainHandle_t domain, int32_t creationflags), (domain, creationflags))
+
+#undef NVTX_EXT_FN_RETURN_INVALID
+/* END: Non-void functions. */
+
+/* void functions. `return` is defined to nothing (until the matching #undef) so that every `return` in the shared NVTX_EXT_FN_IMPL body disappears and the body stays valid for a void return type. */
+#define NVTX_EXT_FN_RETURN_INVALID(rtype)
+#define return
+
+NVTX_EXT_FN_IMPL(void, nvtxMemHeapUnregister, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap))
+
+NVTX_EXT_FN_IMPL(void, nvtxMemHeapReset, (nvtxDomainHandle_t domain, nvtxMemHeapHandle_t heap), (domain, heap))
+
+NVTX_EXT_FN_IMPL(void, nvtxMemRegionsRegister, (nvtxDomainHandle_t domain, nvtxMemRegionsRegisterBatch_t const* desc), (domain, desc))
+
+NVTX_EXT_FN_IMPL(void, nvtxMemRegionsResize, (nvtxDomainHandle_t domain,nvtxMemRegionsResizeBatch_t const* desc), (domain, desc))
+
+NVTX_EXT_FN_IMPL(void, nvtxMemRegionsUnregister, (nvtxDomainHandle_t domain,nvtxMemRegionsUnregisterBatch_t const* desc), (domain, desc))
+
+NVTX_EXT_FN_IMPL(void, nvtxMemRegionsName, (nvtxDomainHandle_t domain,nvtxMemRegionsNameBatch_t const* desc), (domain, desc))
+
+NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsAssign, (nvtxDomainHandle_t domain,nvtxMemPermissionsAssignBatch_t const* desc), (domain, desc))
+
+NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsDestroy, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions))
+
+NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsReset, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions), (domain, permissions))
+
+NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsBind, (nvtxDomainHandle_t domain, nvtxMemPermissionsHandle_t permissions, uint32_t bindScope, uint32_t bindFlags), (domain, permissions, bindScope, bindFlags))
+
+NVTX_EXT_FN_IMPL(void, nvtxMemPermissionsUnbind, (nvtxDomainHandle_t domain, uint32_t bindScope), (domain, bindScope))
+
+#undef return
+#undef NVTX_EXT_FN_RETURN_INVALID
+/* END: void functions. */
+
+#undef NVTX_EXT_FN_IMPL
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
@@ -0,0 +1,155 @@
+/*
+* Copyright 2021-2023  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#define NVTX_EXT_IMPL_GUARD
+#include "nvtxExtImpl.h"
+#undef NVTX_EXT_IMPL_GUARD
+
+#ifndef NVTX_EXT_IMPL_PAYLOAD_V1
+#define NVTX_EXT_IMPL_PAYLOAD_V1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* Macros to create versioned symbols. */
+#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
+    NAME##_v##VERSION##_bpl##COMPATID
+#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
+    NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
+#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \
+    NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_PAYLOAD_COMPATID)
+
+#ifdef NVTX_DISABLE
+
+#include "nvtxExtHelperMacros.h"
+
+#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_val, fn_name, signature, arg_names) \
+ret_val fn_name signature { \
+    NVTX_EXT_HELPER_UNUSED_ARGS arg_names \
+    return ((ret_val)(intptr_t)-1); \
+}
+
+#else /* NVTX_DISABLE */
+
+#include "nvtxExtPayloadTypeInfo.h"
+
+/*
+ * Function slots for the payload extension. First entry is the module state,
+ * initialized to `0` (`NVTX_EXTENSION_FRESH`).
+ */
+#define NVTX_EXT_PAYLOAD_SLOT_COUNT 63
+NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
+NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX_EXT_PAYLOAD_SLOT_COUNT + 1]
+    = {0};
+
+/* Describes the payload module (a single segment of function slots plus the type-info table) and hands it to nvtxExtInitOnce. The forward declaration avoids warnings about a missing prototype. */
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(void);
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
+{
+    intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1; /* skip entry 0 (module state) */
+    nvtxExtModuleSegment_t segment = {
+        0, /* unused (only one segment) */
+        NVTX_EXT_PAYLOAD_SLOT_COUNT,
+        fnSlots
+    };
+
+    nvtxExtModuleInfo_t module = {
+        NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
+        NVTX_EXT_PAYLOAD_MODULEID, NVTX_EXT_PAYLOAD_COMPATID,
+        1, &segment, /* number of segments, segments */
+        NULL, /* no export function needed */
+        /* bake type sizes and alignment information into program binary */
+        &(NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo))
+    };
+
+    NVTX_INFO( "%s\n", __FUNCTION__  );
+
+    NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
+        NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)); /* array base is passed as the module state pointer */
+}
+
+#define NVTX_EXT_PAYLOAD_IMPL_FN_V1(ret_type, fn_name, signature, arg_names) \
+typedef ret_type (*fn_name##_impl_fntype)signature; \
+    NVTX_DECLSPEC ret_type NVTX_API fn_name signature { \
+    intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+    if (slot != NVTX_EXTENSION_DISABLED) { \
+        if (slot != NVTX_EXTENSION_FRESH) { \
+            return (*(fn_name##_impl_fntype)slot) arg_names; \
+        } else { \
+            NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
+            /* Re-read function slot after extension initialization. */ \
+            slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+            if (slot != NVTX_EXTENSION_DISABLED && slot != NVTX_EXTENSION_FRESH) { \
+                return (*(fn_name##_impl_fntype)slot) arg_names; \
+            } \
+        } \
+    } \
+    NVTX_EXT_FN_RETURN_INVALID(ret_type) \
+}
+
+#endif /*NVTX_DISABLE*/
+
+/* Non-void functions. */
+#define NVTX_EXT_FN_RETURN_INVALID(rtype) return ((rtype)(intptr_t)-1);
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadSchemaRegister,
+    (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr),
+    (domain, attr))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxPayloadEnumRegister,
+    (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr),
+    (domain, attr))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePushPayload,
+    (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
+    (domain, payloadData, count))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(int, nvtxRangePopPayload,
+    (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
+    (domain, payloadData, count))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(nvtxRangeId_t, nvtxRangeStartPayload,
+    (nvtxDomainHandle_t domain, const nvtxPayloadData_t* payloadData, size_t count),
+    (domain, payloadData, count))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint8_t, nvtxDomainIsEnabled, (nvtxDomainHandle_t domain), (domain))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(uint64_t, nvtxScopeRegister, (nvtxDomainHandle_t domain,
+    const nvtxScopeAttr_t* attr), (domain, attr))
+
+#undef NVTX_EXT_FN_RETURN_INVALID
+/* END: Non-void functions. */
+
+/* void functions. `return` is defined to nothing (until the matching #undef) so that every `return` in the shared NVTX_EXT_PAYLOAD_IMPL_FN_V1 body disappears and the body stays valid for a void return type. */
+#define NVTX_EXT_FN_RETURN_INVALID(rtype)
+#define return
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxMarkPayload, (nvtxDomainHandle_t domain,
+    const nvtxPayloadData_t* payloadData, size_t count), (domain, payloadData, count))
+
+NVTX_EXT_PAYLOAD_IMPL_FN_V1(void, nvtxRangeEndPayload, (nvtxDomainHandle_t domain,
+    nvtxRangeId_t id, const nvtxPayloadData_t* payloadData, size_t count),
+    (domain, id, payloadData, count))
+
+#undef return
+#undef NVTX_EXT_FN_RETURN_INVALID
+/* END: void functions. */
+
+/* Keep NVTX_EXT_PAYLOAD_IMPL_FN_V1 defined for a future version of this extension. */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* NVTX_EXT_IMPL_PAYLOAD_V1 */
+
@@ -1,5 +1,5 @@
 /*
-* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+* Copyright 2009-2023  NVIDIA Corporation.  All rights reserved.
 *
 * Licensed under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
@@ -22,7 +22,7 @@ extern "C" {
 #define NVTX_PATHCHAR   wchar_t
 #define NVTX_STR(x)     L##x
 #define NVTX_GETENV     _wgetenv
-#define NVTX_BUFSIZE    MAX_PATH
+#define NVTX_BUFSIZE    16384
 #define NVTX_DLLHANDLE  HMODULE
 #define NVTX_DLLOPEN(x) LoadLibraryW(x)
 #define NVTX_DLLFUNC    GetProcAddress
@@ -39,14 +39,14 @@ extern "C" {
 #define NVTX_PATHCHAR   char
 #define NVTX_STR(x)     x
 #define NVTX_GETENV     getenv
-#define NVTX_BUFSIZE    PATH_MAX
+#define NVTX_BUFSIZE    16384
 #define NVTX_DLLHANDLE  void*
 #define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
 #define NVTX_DLLFUNC    dlsym
 #define NVTX_DLLCLOSE   dlclose
 #define NVTX_YIELD()    sched_yield()
 #define NVTX_MEMBAR()   __sync_synchronize()
-/* Ensure full memory barrier for atomics, to match Windows functions */
+/* Ensure full memory barrier for atomics, to match Windows functions. */
 #define NVTX_ATOMIC_WRITE_32(address, value)                  __sync_synchronize();       __sync_lock_test_and_set(address, value)
 #define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, exchange, comparand)
 #define NVTX_ATOMIC_WRITE_PTR(address, value)                  __sync_synchronize();       __sync_lock_test_and_set(address, value)
@@ -63,7 +63,7 @@ extern "C" {
 #define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
 #endif

-/* Define this to 1 for platforms that support environment variables */
+/* Define this to 1 for platforms that support environment variables. */
 /* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
 /* Try:  #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
 #define NVTX_SUPPORT_ENV_VARS 1
@@ -72,16 +72,16 @@ extern "C" {
 #define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1

 /* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked,
-*  and this will override any dynamic injection.  Useful for platforms where dynamic
-*  injection is not available.  Since weak symbols not explicitly marked extern are
-*  guaranteed to be initialized to zero if no definitions are found by the linker, the
-*  dynamic injection process proceeds normally if pfnInitializeInjectionNvtx2 is 0. */
+ * which will override any dynamic injection. This is useful for platforms where dynamic
+ * injection is not available. Since weak symbols not explicitly marked extern are
+ * guaranteed to be initialized to zero if no definitions are found by the linker, the
+ * dynamic injection process proceeds normally if InitializeInjectionNvtxExtension_fnptr is 0. */
 #if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
 #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
 /* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal
-*  symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension (which
-*  does not need to be named "InitializeInjectionNvtxExtension" as is necessary in a dynamic
-*  injection library. */
+ * symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension. That
+ * implementation does not need to be named "InitializeInjectionNvtxExtension", as is necessary
+ * in a dynamic injection library. */
 __attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr;
 #else
 #define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
@@ -89,35 +89,37 @@ __attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxEx



-/* This function tries to find or load an NVTX injection library and get the
-*  address of its InitializeInjectionExtension function.  If such a function pointer
-*  is found, it is called, and passed the address of this NVTX instance's
-*  nvtxGetExportTable function, so the injection can attach to this instance.
-*  If the initialization fails for any reason, any dynamic library loaded will
-*  be freed, and all NVTX implementation functions will be set to no-ops.  If
-*  initialization succeeds, NVTX functions not attached to the tool will be set
-*  to no-ops.  This is implemented as one function instead of several small
-*  functions to minimize the number of weak symbols the linker must resolve.
-*  Order of search is:
-*  - Pre-injected library exporting InitializeInjectionNvtxExtension
-*  - Loadable library exporting InitializeInjectionNvtxExtension
-*      - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
-*      - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
-*  - Statically-linked injection library defining InitializeInjectionNvtx2_fnptr
-*/
-NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr);
-NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr)
+/* This function tries to find or load an NVTX injection library and get the address of its
+ * `InitializeInjectionNvtxExtension` function. If such a function pointer is found, it is called and
+ * passed the address of this NVTX instance's `nvtxGetExportTable` function, so that the injection
+ * can attach to this instance.
+ * If the initialization fails for any reason, any dynamic library loaded will be freed, and all
+ * NVTX implementation functions will be set to no-ops. If the initialization succeeds, NVTX
+ * functions that are not attached to the tool will be set to no-ops. This is implemented as one
+ * function instead of several small functions to minimize the number of weak symbols the linker
+ * must resolve. The order of search is:
+ *  1) Pre-injected library exporting InitializeInjectionNvtxExtension
+ *  2) Loadable library exporting InitializeInjectionNvtxExtension
+ *      - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
+ *      - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
+ *  3) Statically-linked injection library defining InitializeInjectionNvtxExtension_fnptr
+ */
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(
+    NvtxExtInitializeInjectionFunc_t* out_init_fnptr);
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(
+    NvtxExtInitializeInjectionFunc_t* out_init_fnptr)
 {
    const char* const initFuncName = "InitializeInjectionNvtxExtension";
    NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
    NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;

-    if(out_init_fnptr){
+    if (out_init_fnptr)
+    {
        *out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
    }

 #if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
-    /* Use POSIX global symbol chain to query for init function from any module */
+    /* Use POSIX global symbol chain to query for init function from any module. */
    init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName);
 #endif

@@ -127,7 +129,7 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
    {
 #if NVTX_SUPPORT_ENV_VARS
        /* If env var NVTX_INJECTION64_PATH is set, it should contain the path
-        *  to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
+           to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
        const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
            ? NVTX_STR("NVTX_INJECTION32_PATH")
            : NVTX_STR("NVTX_INJECTION64_PATH");
@@ -135,12 +137,12 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
        NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
        const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;

-        /* Refer to this variable explicitly in case all references to it are #if'ed out */
+        /* Refer to this variable explicitly in case all references to it are #if'ed out. */
        (void)injectionLibraryPathBuf;

 #if NVTX_SUPPORT_ENV_VARS
        /* Disable the warning for getenv & _wgetenv -- this usage is safe because
-        *  these functions are not called again before using the returned value. */
+           these functions are not called again before using the returned value. */
 #if defined(_MSC_VER)
 #pragma warning( push )
 #pragma warning( disable : 4996 )
@@ -188,7 +190,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection

            pkgName[bytesRead] = 0;

-            /* String can contain colon as a process separator. In this case the package name is before the colon. */
+            /* String can contain colon as a process separator. In this case the
+               package name is before the colon. */
            pos = 0;
            while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
            {
@@ -223,8 +226,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
        }
 #endif

-        /* At this point, injectionLibraryPath is specified if a dynamic
-        *  injection library was specified by a tool. */
+        /* At this point, `injectionLibraryPath` is specified if a dynamic
+           injection library was specified by a tool. */
        if (injectionLibraryPath)
        {
            /* Load the injection library */
@@ -236,7 +239,7 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
            }
            else
            {
-                /* Attempt to get the injection library's entry-point */
+                /* Attempt to get the injection library's entry-point. */
                init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
                if (!init_fnptr)
                {
@@ -252,8 +255,8 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
 #if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
    if (!init_fnptr)
    {
-        /* Check weakly-defined function pointer.  A statically-linked injection can define this as
-        *  a normal symbol and it will take precedence over a dynamic injection. */
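+        /* Check weakly-defined function pointer. A statically-linked injection can define this as a
+           normal symbol. NOTE(review): the earlier wording said it takes precedence over a dynamic injection, but this branch only runs while init_fnptr is still unset -- confirm. */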
        if (InitializeInjectionNvtxExtension_fnptr)
        {
            init_fnptr = InitializeInjectionNvtxExtension_fnptr;
@@ -261,13 +264,13 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
    }
 #endif

-    if(out_init_fnptr){
+    if (out_init_fnptr)
+    {
        *out_init_fnptr = init_fnptr;
    }

-    /* At this point, if init_fnptr is not set, then no tool has specified
-    *  an NVTX injection library -- return non-success result so all NVTX
-    *  API functions will be set to no-ops. */
+    /* At this point, if `init_fnptr` is not set, no tool has specified an NVTX injection library.
+       Non-success result is returned, so that all NVTX API functions will be set to no-ops. */
    if (!init_fnptr)
    {
        return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
@@ -276,16 +279,19 @@ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjection
    return NVTX_SUCCESS;
 }

+/* Avoid warnings about missing prototypes. */
+NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
+    nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState);
 NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
-    nvtxExtModuleInfo_t* moduleInfo,
-    intptr_t* moduleState
-    )
+    nvtxExtModuleInfo_t* moduleInfo, intptr_t* moduleState)
 {
    intptr_t old;

    NVTX_INFO( "%s\n", __FUNCTION__ );

-    if( *moduleState == NVTX_EXTENSION_LOADED) {
+    if (*moduleState == NVTX_EXTENSION_LOADED)
+    {
+        NVTX_INFO("Module loaded\n");
        return;
    }

@@ -296,45 +302,55 @@ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
        NVTX_EXTENSION_FRESH);
    if (old == NVTX_EXTENSION_FRESH)
    {
-        NvtxExtInitializeInjectionFunc_t init_fnptr = NVTX_VERSIONED_IDENTIFIER(injectionFnPtr);
+        NvtxExtInitializeInjectionFunc_t init_fnptr =
+            NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr;
        int entryPointStatus = 0;
        int forceAllToNoops = 0;
+        size_t s;

-        /* Load & initialize injection library -- it will assign the function pointers */
-        if(init_fnptr == 0){
+        /* Load and initialize injection library, which will assign the function pointers. */
+        if (init_fnptr == 0)
+        {
            int result = 0;

-            /* try to load vanilla NVTX first*/
+            /* Try to load vanilla NVTX first. */
            nvtxInitialize(0);

            result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr);
-            /*at this point init_fnptr will be either 0 or a real function*/
+            /* At this point `init_fnptr` will be either 0 or a real function. */

-            if(result == NVTX_SUCCESS) {
-                NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = init_fnptr;
+            if (result == NVTX_SUCCESS)
+            {
+                NVTX_VERSIONED_IDENTIFIER(nvtxExtGlobals1).injectionFnPtr = init_fnptr;
            }
-            else {
+            else
+            {
                NVTX_ERR("Failed to load injection library\n");
            }
        }

-        if(init_fnptr != 0) {
-            /* Invoke injection library's initialization function.  If it returns
-            *  0 (failure) and a dynamic injection was loaded, unload it. */
+        if (init_fnptr != 0)
+        {
+            /* Invoke injection library's initialization function. If it returns
+               0 (failure) and a dynamic injection was loaded, unload it. */
            entryPointStatus = init_fnptr(moduleInfo);
-            if (entryPointStatus == 0) {
+            if (entryPointStatus == 0)
+            {
                NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
            }
        }

-        /* Clean up any functions that are still uninitialized so that they are skipped.
-         * Set all to null if injection init function failed as well.
-        */
+        /* Clean up any functions that are still uninitialized so that they are
+           skipped. Set all to null if injection init function failed as well. */
        forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0);
-        for(size_t s = 0; s < moduleInfo->segmentsCount; ++s){
-            nvtxExtModuleSegment_t* segment = moduleInfo->segments+s;
-            for(size_t i = 0; i < segment->slotCount; ++i){
-                if(forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)){
+        for (s = 0; s < moduleInfo->segmentsCount; ++s)
+        {
+            nvtxExtModuleSegment_t* segment = moduleInfo->segments + s;
+            size_t i;
+            for (i = 0; i < segment->slotCount; ++i)
+            {
+                if (forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH))
+                {
                    segment->functionSlots[i] = NVTX_EXTENSION_DISABLED;
                }
            }
@@ -342,12 +358,11 @@ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (

        NVTX_MEMBAR();

-        /* Signal that initialization has finished, so now the assigned function pointers will be used */
-        NVTX_ATOMIC_WRITE_PTR(
-            moduleState,
-            NVTX_EXTENSION_LOADED);
+        /* Signal that initialization has finished and the assigned function
+           pointers will be used. */
+        NVTX_ATOMIC_WRITE_PTR(moduleState, NVTX_EXTENSION_LOADED);
    }
-    else /* Spin-wait until initialization has finished */
+    else /* Spin-wait until initialization has finished. */
    {
        NVTX_MEMBAR();
        while (*moduleState != NVTX_EXTENSION_LOADED)
@@ -0,0 +1,272 @@
+/*
+* Copyright 2023  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H
+#define NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H
+
+/* General helper macros */
+#include "nvtxExtHelperMacros.h"
+
+/* Get variable name with line number (almost unique per file). */
+#define _NVTX_PAYLOAD_DATA_VAR NVTX_EXT_CONCAT(nvtxDFDB,__LINE__)
+
+/* Create real arguments from just pasting tokens next to each other. */
+#define _NVTX_PAYLOAD_PASS_THROUGH(...) __VA_ARGS__
+
+/* Avoid prefixing `NVTX_PAYLOAD_ENTRY_` for nested payloads. */
+#define NVTX_PAYLOAD_ENTRY_THROWAWAY
+#define _NVTX_PAYLOAD_NESTED(id) THROWAWAY id
+
+/*
+ * Create the NVTX binary payloads schema attributes.
+ *
+ * @param struct_id The name of the struct.
+ * @param schema_name The name of the schema.
+ * @param schema_flags Additional schema flags.
+ * @param mask_add Fields to be added to the mask.
+ * @param num_entries The number of schema entries.
+ */
+#define NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, mask_add, num_entries) \
+    nvtxPayloadSchemaAttr_t struct_id##Attr = { \
+        /*.fieldMask = */NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | mask_add \
+            NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | \
+            NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | \
+            NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE, \
+        /*.name = */schema_name, \
+        /*.type = */NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, \
+        /*.flags = */schema_flags, \
+        /*.entries = */struct_id##Schema, /*.numEntries = */num_entries, \
+        /*.payloadStaticSize = */sizeof(struct_id), \
+        /*.packAlign = */0, /*.schemaId = */schema_id};
+
+
+/*****************************************************************/
+/*** Helper for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]` ***/
+
+/* First part of schema entry for different number of arguments. */
+#define _NVTX_PAYLOAD_SCHEMA_EF2(member, etype) \
+    0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0,
+#define _NVTX_PAYLOAD_SCHEMA_EF3(member, etype, name) \
+    0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0,
+#define _NVTX_PAYLOAD_SCHEMA_EF4(member, etype, name, desc) \
+    0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0,
+#define _NVTX_PAYLOAD_SCHEMA_EF5(member, etype, name, desc, arraylen) \
+    0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
+#define _NVTX_PAYLOAD_SCHEMA_EF6(member, etype, name, desc, arraylen, flags) \
+    NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
+
+#define _NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT(...) \
+    NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SCHEMA_EF, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+/* Second part of schema entry (append struct member).
+   At least two arguments are passed (`member` and `etype`). */
+#define _NVTX_PAYLOAD_SCHEMA_ENTRY_END(member, ...) member
+
+/* Resolve to schema entry. `entry` is `(member, etype, ...)`. */
+#define _NVTX_PAYLOAD_SCHEMA_ENTRY(struct_id, entry) \
+    {_NVTX_PAYLOAD_SCHEMA_ENTRY_FRONT entry \
+    offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_ENTRY_END entry)},
+
+/* Handle up to 16 schema entries. */
+#define _NVTX_PAYLOAD_SME1(s,e1,...)  _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1)
+#define _NVTX_PAYLOAD_SME2(s,e1,...)  _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME1(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME3(s,e1,...)  _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME2(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME4(s,e1,...)  _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME3(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME5(s,e1,...)  _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME4(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME6(s,e1,...)  _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME5(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME7(s,e1,...)  _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME6(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME8(s,e1,...)  _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME7(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME9(s,e1,...)  _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME8(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME10(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME9(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME11(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME10(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME12(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME11(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME13(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME12(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME14(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME13(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME15(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME14(s,__VA_ARGS__)
+#define _NVTX_PAYLOAD_SME16(s,e1,...) _NVTX_PAYLOAD_SCHEMA_ENTRY(s,e1) _NVTX_PAYLOAD_SME15(s,__VA_ARGS__)
+
+#define _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, ...) \
+  nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \
+    NVTX_EXT_CONCAT(_NVTX_PAYLOAD_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \
+    {0, 0} \
+  };
+
+/*
+ * Handle optional parameters for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]`.
+ */
+#define _NVTX_DEFINE_S4S_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \
+    prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+    prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \
+        NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |,\
+        NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
+#define _NVTX_DEFINE_S4S_5(struct_id, schema_name, prefix, schema_flags, entries) \
+    prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+    prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \
+        NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \
+        NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
+#define _NVTX_DEFINE_S4S_4(struct_id, schema_name, prefix, entries) \
+    prefix _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+    prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \
+        NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \
+        NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
+#define _NVTX_DEFINE_S4S_3(struct_id, schema_name, entries) \
+    _NVTX_DEFINE_S4S_4(struct_id, schema_name, /*prefix*/, entries)
+#define _NVTX_DEFINE_S4S_2(struct_id, entries) \
+    _NVTX_PAYLOAD_SCHEMA_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+    NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, ,\
+        NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
+
+#define _NVTX_DEFINE_SCHEMA_FOR_STRUCT(struct_id, ...) \
+    NVTX_EXT_CONCAT(_NVTX_DEFINE_S4S_, \
+        NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__)
+
+/*** END: Helper for `NVTX_DEFINE_SCHEMA_FOR_STRUCT[_AND_REGISTER]` ***/
+
+
+/******************************************************************/
+/*** Helper for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]` ***/
+
+/* Extract struct member for fixed-size arrays. */
+#define _NVTX_PAYLOAD_STRUCT_ARR_MEM1(name) name
+#define _NVTX_PAYLOAD_STRUCT_ARR_MEM2(name, count) name[count]
+
+/* Extract type and member name and handle special case of fixed-size array. */
+#define _NVTX_PAYLOAD_STRUCT_E2(type, member) type member;
+#define _NVTX_PAYLOAD_STRUCT_E3(type, member, etype) type member;
+#define _NVTX_PAYLOAD_STRUCT_E4(type, member, etype, name) type member;
+#define _NVTX_PAYLOAD_STRUCT_E5(type, member, etype, name, desc) type member;
+#define _NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen) \
+    type NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_ARR_MEM, NVTX_EXT_NUM_ARGS member) member;
+#define _NVTX_PAYLOAD_STRUCT_E7(type, member, etype, name, desc, arraylen, flags) \
+    _NVTX_PAYLOAD_STRUCT_E6(type, member, etype, name, desc, arraylen)
+
+/* Handle different number of arguments per struct entry. */
+#define _NVTX_PAYLOAD_STRUCT_ENTRY_(...) \
+    NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT_E, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+/* Handle up to 16 struct members. */
+#define _NVTX_PAYLOAD_STRUCT_ENTRY(entry) _NVTX_PAYLOAD_STRUCT_ENTRY_ entry
+#define _NVTX_PAYLOAD_STRUCT1(e1, ...)  _NVTX_PAYLOAD_STRUCT_ENTRY(e1)
+#define _NVTX_PAYLOAD_STRUCT2(e1, ...)  _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT1(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT3(e1, ...)  _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT2(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT4(e1, ...)  _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT3(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT5(e1, ...)  _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT4(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT6(e1, ...)  _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT5(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT7(e1, ...)  _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT6(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT8(e1, ...)  _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT7(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT9(e1, ...)  _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT8(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT10(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT9(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT11(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT10(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT12(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT11(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT13(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT12(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT14(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT13(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT15(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT14(__VA_ARGS__)
+#define _NVTX_PAYLOAD_STRUCT16(e1, ...) _NVTX_PAYLOAD_STRUCT_ENTRY(e1) _NVTX_PAYLOAD_STRUCT15(__VA_ARGS__)
+
+/* Generate the typedef. */
+#define _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, ...) \
+  typedef struct { \
+      NVTX_EXT_CONCAT(_NVTX_PAYLOAD_STRUCT, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) \
+  } struct_id;
+
+/* Generate first part of the schema entry. */
+#define _NVTX_PAYLOAD_INIT_SCHEMA_N3(type, memberId, etype) \
+    0, NVTX_PAYLOAD_ENTRY_##etype, NULL, NULL, 0,
+#define _NVTX_PAYLOAD_INIT_SCHEMA_N4(type, memberId, etype, name) \
+    0, NVTX_PAYLOAD_ENTRY_##etype, name, NULL, 0,
+#define _NVTX_PAYLOAD_INIT_SCHEMA_N5(type, memberId, etype, name, desc) \
+    0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, 0,
+#define _NVTX_PAYLOAD_INIT_SCHEMA_N6(type, memberId, etype, name, desc, arraylen) \
+    0, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
+#define _NVTX_PAYLOAD_INIT_SCHEMA_N7(type, memberId, etype, name, desc, arraylen, flags) \
+    NVTX_PAYLOAD_ENTRY_FLAG_##flags, NVTX_PAYLOAD_ENTRY_##etype, name, desc, arraylen,
+
+#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT(...) \
+    NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_N, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+#define _NVTX_PAYLOAD_ARRAY_MEMBER1(name) name
+#define _NVTX_PAYLOAD_ARRAY_MEMBER2(name, count) name
+
+/* Resolve to last part of schema entry (append struct member). */
+#define _NVTX_PAYLOAD_INIT_SCHEMA_NX3(type, memberId, ...) memberId
+#define _NVTX_PAYLOAD_INIT_SCHEMA_NX4(type, memberId, ...) memberId
+#define _NVTX_PAYLOAD_INIT_SCHEMA_NX5(type, memberId, ...) memberId
+#define _NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, ...) \
+    NVTX_EXT_CONCAT(_NVTX_PAYLOAD_ARRAY_MEMBER, NVTX_EXT_NUM_ARGS memberId) memberId
+#define _NVTX_PAYLOAD_INIT_SCHEMA_NX7(type, memberId, ...) \
+    _NVTX_PAYLOAD_INIT_SCHEMA_NX6(type, memberId, __VA_ARGS__)
+
+#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END(...) \
+    NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SCHEMA_NX, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+/* Resolve to schema entry. `entry` is `(type, memberId, etype, ...)`. */
+#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(struct_id, entry) \
+    {_NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_FRONT entry \
+    offsetof(struct_id, _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY_END entry)},
+
+/* Handle up to 16 schema entries. */
+#define _NVTX_PAYLOAD_INIT_SME1(s, e1, ...)  _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1)
+#define _NVTX_PAYLOAD_INIT_SME2(s, e1, ...)  _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME1(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME3(s, e1, ...)  _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME2(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME4(s, e1, ...)  _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME3(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME5(s, e1, ...)  _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME4(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME6(s, e1, ...)  _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME5(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME7(s, e1, ...)  _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME6(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME8(s, e1, ...)  _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME7(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME9(s, e1, ...)  _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME8(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME10(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME9(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME11(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME10(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME12(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME11(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME13(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME12(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME14(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME13(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME15(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME14(s, __VA_ARGS__)
+#define _NVTX_PAYLOAD_INIT_SME16(s, e1, ...) _NVTX_PAYLOAD_SCHEMA_INIT_ENTRY(s, e1) _NVTX_PAYLOAD_INIT_SME15(s, __VA_ARGS__)
+
+#define _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, ...) \
+  nvtxPayloadSchemaEntry_t struct_id##Schema[] = { \
+    NVTX_EXT_CONCAT(_NVTX_PAYLOAD_INIT_SME, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(struct_id, __VA_ARGS__) \
+    {0, 0} \
+  };
+
+/*
+ * Handle optional parameters for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]`.
+ */
+#define _NVTX_DEFINE_SWS_6(struct_id, schema_name, prefix, schema_flags, schema_id, entries) \
+  _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+  prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+  prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, schema_id, \
+      NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS | \
+      NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID |, \
+      NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
+#define _NVTX_DEFINE_SWS_5(struct_id, schema_name, prefix, schema_flags, entries) \
+  _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+  prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+  prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, schema_flags, 0, \
+      NVTX_PAYLOAD_SCHEMA_ATTR_NAME | NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS |, \
+      NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
+#define _NVTX_DEFINE_SWS_4(struct_id, schema_name, prefix, entries) \
+  _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+  prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+  prefix NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, schema_name, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, \
+      NVTX_PAYLOAD_SCHEMA_ATTR_NAME |, \
+      NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
+#define _NVTX_DEFINE_SWS_3(struct_id, schema_name, entries) \
+  _NVTX_DEFINE_SWS_4(struct_id, schema_name, /* no prefix */, entries)
+#define _NVTX_DEFINE_SWS_2(struct_id, entries) \
+  _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+  _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \
+  NVTX_PAYLOAD_SCHEMA_ATTR(struct_id, NULL, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, 0, , \
+      NVTX_EXT_NUM_ARGS(_NVTX_PAYLOAD_PASS_THROUGH entries))
+
+#define _NVTX_DEFINE_STRUCT_WITH_SCHEMA(struct_id, ...) \
+    NVTX_EXT_CONCAT(_NVTX_DEFINE_SWS_, \
+        NVTX_EXT_NUM_ARGS(struct_id, __VA_ARGS__))(struct_id, __VA_ARGS__)
+
+/*** END: Helper for `NVTX_DEFINE_STRUCT_WITH_SCHEMA[_AND_REGISTER]` ***/
+
+#endif /* NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H */
@@ -10,14 +10,14 @@
 #error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
 #endif

-typedef void* pointer_type;
+typedef void* nvtx_payload_pointer_type;

 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
 #include <uchar.h>
 #include <stdalign.h>
 #endif

-/* `alignof` is available as of C11 or C++11 */
+/* `alignof` is available as of C11 or C++11. */
 #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || (defined(__cplusplus) && __cplusplus >= 201103L)

 #define nvtx_alignof(type) alignof(type)
@@ -54,7 +54,7 @@ MKTYPEDEF(double);
 MKTYPEDEF2(long double, longdouble);

 MKTYPEDEF(size_t);
-MKTYPEDEF(pointer_type);
+MKTYPEDEF(nvtx_payload_pointer_type);

 MKTYPEDEF(wchar_t);

@@ -85,8 +85,16 @@ MKTYPEDEF(wchar_t);
 /*
 * Helper array to get the alignment for each predefined C/C++ language type.
 * The order of entries must match the values in`enum nvtxPayloadSchemaEntryType`.
+ *
+ * In C++, `const` variables use internal linkage by default, but we need it to
+ * be public (extern) since weak declarations must be public.
 */
-const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
+NVTX_LINKONCE_DEFINE_GLOBAL
+#ifdef __cplusplus
+extern
+#endif
+const nvtxPayloadEntryTypeInfo_t
+NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
 {
    /* The first entry contains this array's length and the size of each entry in this array. */
    {NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)},
@@ -119,7 +127,7 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
    /* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)},

    /* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */    {sizeof(size_t),       nvtx_alignof(size_t)},
-    /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(pointer_type), nvtx_alignof(pointer_type)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(nvtx_payload_pointer_type), nvtx_alignof(nvtx_payload_pointer_type)},

    /*** Special character types ***/
    /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
@@ -140,4 +148,4 @@ const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_
 };

 #undef nvtx_alignof
-#undef nvtx_alignof2
+#undef nvtx_alignof2
@@ -10,37 +10,34 @@
 #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
 #endif

+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+
 /* ---- Include required platform headers ---- */

-#if defined(_WIN32) 
+#if defined(_WIN32)

-#include <Windows.h>
+#include <windows.h>

 #else
 #include <unistd.h>

 #if defined(__ANDROID__)
-#include <android/api-level.h> 
+#include <android/api-level.h>
 #endif

 #if defined(__linux__) || defined(__CYGWIN__)
 #include <sched.h>
 #endif

+#include <sys/types.h>
 #include <limits.h>
 #include <dlfcn.h>
 #include <fcntl.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <unistd.h>
 #include <errno.h>
-
-#include <string.h>
-#include <sys/types.h>
 #include <pthread.h>
-#include <stdlib.h>
-#include <wchar.h>

 #endif

@@ -14,11 +14,11 @@

 /* Prefer macros over inline functions to reduce symbol resolution at link time */

-#if defined(_WIN32) 
+#if defined(_WIN32)
 #define NVTX_PATHCHAR   wchar_t
 #define NVTX_STR(x)     L##x
 #define NVTX_GETENV     _wgetenv
-#define NVTX_BUFSIZE    MAX_PATH
+#define NVTX_BUFSIZE    16384
 #define NVTX_DLLHANDLE  HMODULE
 #define NVTX_DLLOPEN(x) LoadLibraryW(x)
 #define NVTX_DLLFUNC    GetProcAddress
@@ -31,7 +31,7 @@
 #define NVTX_PATHCHAR   char
 #define NVTX_STR(x)     x
 #define NVTX_GETENV     getenv
-#define NVTX_BUFSIZE    PATH_MAX
+#define NVTX_BUFSIZE    16384
 #define NVTX_DLLHANDLE  void*
 #define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
 #define NVTX_DLLFUNC    dlsym
@@ -23,7 +23,7 @@
 * In some situations it is desirable to declare a variable without initializing
 * it, refer to it in code or other variables' initializers, and then initialize
 * it later.  Similarly, functions can be prototyped, have their address taken,
- * and then have their body defined later.  In such cases, use the FWDDECL macros 
+ * and then have their body defined later.  In such cases, use the FWDDECL macros
 * when forward-declaring LINKONCE global variables without initializers and
 * function prototypes, and then use the DEFINE macros when later defining them.
 * Although in many cases the FWDDECL macro is equivalent to the DEFINE macro,
@@ -1,86 +0,0 @@
-/*
-* Copyright 2021  NVIDIA Corporation.  All rights reserved.
-*
-* Licensed under the Apache License v2.0 with LLVM Exceptions.
-* See https://llvm.org/LICENSE.txt for license information.
-* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-*/
-
-#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
-#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
-#endif
-
-#define NVTX_EXT_IMPL_GUARD
-#include "nvtxExtImpl.h"
-#undef NVTX_EXT_IMPL_GUARD
-
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
-#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
-    NAME##_v##VERSION##_mem##COMPATID
-#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
-    NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID)
-#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \
-    NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD)
-
-/*
- * Function slots for the binary payload extension. First entry is the module
- * state, initialized to `0` (`NVTX_EXTENSION_FRESH`).
- */
-NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
-NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1]
-    = {0};
-
-NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)()
-{
-    intptr_t* fnSlots = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1;
-    nvtxExtModuleSegment_t segment = {
-        0, // unused (only one segment)
-        NVTX3EXT_CBID_PAYLOAD_FN_NUM,
-        fnSlots
-    };
-
-    nvtxExtModuleInfo_t module = {
-        NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
-        NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD,
-        1, &segment, // number of segments, segments
-        NULL, // no export function needed
-        // bake type sizes and alignment information into program binary
-        &nvtxExtPayloadTypeInfo
-    };
-
-    NVTX_INFO( "%s\n", __FUNCTION__  );
-
-    NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
-        NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots));
-}
-
-#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
-typedef ret_val ( * fn_name##_impl_fntype )signature; \
-NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \
-    intptr_t slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
-    if (slot != NVTX_EXTENSION_DISABLED) { \
-        if (slot) { \
-            return (*(fn_name##_impl_fntype)slot) arg_names; \
-        } else { \
-            NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
-            slot = NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
-            if (slot != NVTX_EXTENSION_DISABLED && slot) { \
-                return (*(fn_name##_impl_fntype)slot) arg_names; \
-            } \
-        } \
-    } \
-    return ((ret_val)(intptr_t)-1); \
-}
-
-NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr))
-
-NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr))
-
-#undef NVTX_EXT_FN_IMPL
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif /* __cplusplus */
@@ -10,6 +10,9 @@
 #define NCCL_P2P_H_

 #include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "core.h"

 #if CUDART_VERSION < 12030
 // MNNVL: FABRIC handle support lifted from CUDA 12.3
@@ -16,13 +16,29 @@
 #include "shm.h"
 #include "p2p.h"

+typedef enum : uint8_t {
+  ncclPatternRing,
+  ncclPatternRingTwice,
+  ncclPatternPipelineFrom,
+  ncclPatternPipelineTo,
+  ncclPatternTreeUp,
+  ncclPatternTreeDown,
+  ncclPatternTreeUpDown,
+  ncclPatternCollnetChain,
+  ncclPatternCollnetDirect,
+  ncclPatternNvls,
+  ncclPatternNvlsTree,
+  ncclPatternSend,
+  ncclPatternRecv
+} ncclPattern_t;
+
 enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };

 struct ncclProxyArgs;
 typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*);

 #define NCCL_PROXY_MAX_SUBS MAXCHANNELS
-static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
+static_assert(2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH <= MAXCHANNELS, "Not enough sub space for max work elements");

 union ncclProxyOpSpecifics {
  struct {
@@ -124,8 +140,9 @@ struct ncclProxyArgs {

 // ProxyOps are used to communicate between main thread and service thread
 // Make sure we have enough to store two full rounds of operations on all channels.
-// Otherwise we'd be unable to post half of them to free new elements.
-#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P)
+// Otherwise we'd be unable to post half of them to free new elements. Each
+// p2p work contains a send and recv proxy op hence the 2x before it.
+#define MAX_OPS_PER_PEER (2*MAXCHANNELS*2*NCCL_MAX_DEV_WORK_P2P_PER_BATCH)

 struct ncclProxyOpsPool {
  struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS];
@@ -243,7 +260,7 @@ struct ncclProxyState {
  bool dmaBufSupport;
  ncclNet_t* ncclNet;
  ncclCollNet_t* ncclCollNet;
-  volatile uint32_t* abortFlag;
+  uint32_t* abortFlag;
  // Service threads
  pthread_t thread;
  pthread_t threadUDS;
@@ -301,7 +318,6 @@ enum proxyMode {
 };

 ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
-ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp, int reg);
 ncclResult_t ncclProxyStart(struct ncclComm* comm);
 ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS);
 ncclResult_t ncclProxyCreate(struct ncclComm* comm);
@@ -1,6 +1,11 @@
 #ifndef NCCL_REGISTER_H_
 #define NCCL_REGISTER_H_

+#include "device.h"
+
+#include <cuda.h>
+#include <stdint.h>
+
 enum {
  NET_REG_COMPLETE = 0x01,
  NVLS_REG_COMPLETE = 0x02,
@@ -13,12 +13,14 @@
 #include "core.h"

 #define NTRANSPORTS 4
+#define TRANSPORT_UNDEFINED -1
 #define TRANSPORT_P2P 0
 #define TRANSPORT_SHM 1
 #define TRANSPORT_NET 2
 #define TRANSPORT_COLLNET 3

 #include "proxy.h"
+#include "comm.h"

 extern struct ncclTransport p2pTransport;
 extern struct ncclTransport shmTransport;
@@ -45,6 +47,7 @@ struct ncclPeerInfo {
  int cudaCompCap;
  // MNNVL support
  nvmlGpuFabricInfoV_t fabricInfo;
+  int cuMemSupport;
 };

 #define CONNECT_SIZE 128
@@ -57,17 +60,21 @@ struct ncclConnect {
 #define NVLS_HANDLE_SIZE 64
 struct ncclNvlsSharedRes {
  int refCount;
-  CUmulticastObjectProp properties;
+  bool inited;
+  CUmulticastObjectProp bufProp;
+  CUmulticastObjectProp signalProp;
  CUmemAccessDesc accessDesc;
  int dev;
-  size_t size;
-  size_t granularity;
-  CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer
+  size_t buffSize;
+  size_t creditSize;
+  CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer
+  CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer
  char* mcBuff; // Multicast NVLS buffer address
-  CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer
+  char* mcCredit; // Multicast NVLS credit address
+  CUmemGenericAllocationHandle ucBuffHandle; // Unicast Handle for NVLS buffer
+  CUmemGenericAllocationHandle ucCreditHandle; // Unicast Handle for NVLS credit buffer
  char* ucBuff; // Unicast NVLS buffer address
-  char shareableHandle[NVLS_HANDLE_SIZE];
-  size_t ucGran;
+  char* ucCredit; // Unicast NVLS credit address
  int nChannels;
  struct ncclShmemCollBuff nvlsShmem;
  void *nvlsShmemHandle;
@@ -84,6 +91,7 @@ struct ncclCollNetSharedRes {
  void* resources;
  int nChannels;
  size_t buffSize;
+  int intraHighestTransportType;
 };

 struct ncclTransportComm {
@@ -111,7 +119,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*

 ncclResult_t ncclNvlsInit(struct ncclComm* comm);
 ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
-ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
+ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm);
+ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm);
+ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
 ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv);
 ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size);
 ncclResult_t ncclNvlsFree(struct ncclComm* comm);
@@ -121,6 +131,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
 ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail);
 ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm);
 ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufUsed, void** outHandle);
-ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, struct ncclKernelPlan *plan, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle);
+ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue<struct ncclCommCallback, &ncclCommCallback::next>* cleanupQueue, int* nCleanupQueueElts);
 ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle);
+
+ncclResult_t ncclTransportRingConnect(struct ncclComm* comm);
+ncclResult_t ncclTransportTreeConnect(struct ncclComm* comm);
+
+ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTopoGraph* graphs[]);
+ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm);
+ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm);
+
 #endif
@@ -9,14 +9,15 @@
 #define NCCL_INT_TUNER_H_

 #include "nccl_tuner.h"
+#include "comm.h"

 // Tuning plugin to override NCCL's default algorithm/protocol tuning.

 // Attempts to load NCCL tuner from environmental variable.
 // Returns ncclSuccess if the correct tuner symbol has been found and
 // successully loaded.  Otherwise returns an error and also logs the error.
-ncclResult_t ncclTunerPluginLoad(ncclTuner_t** tuner);
+ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm);

 // Cleans up NCCL tuner plugin.
-ncclResult_t ncclTunerPluginUnload(ncclTuner_t** tuner);
+ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm);
 #endif
@@ -9,12 +9,14 @@

 #include "nccl.h"
 #include "alloc.h"
+#include "bitops.h"
 #include "checks.h"
 #include <stdint.h>
 #include <time.h>
 #include <sched.h>
 #include <algorithm>
 #include <new>
+#include <type_traits>

 int ncclCudaCompCap();

@@ -30,11 +32,6 @@ uint64_t getHostHash();
 uint64_t getPidHash();
 ncclResult_t getRandomData(void* buffer, size_t bytes);

-const char* ncclOpToString(ncclRedOp_t op);
-const char* ncclDatatypeToString(ncclDataType_t type);
-const char* ncclAlgoToString(int algo);
-const char* ncclProtoToString(int proto);
-
 struct netIf {
  char prefix[64];
  int port;
@@ -44,9 +41,7 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList);
 bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);

 static long log2i(long n) {
- long l = 0;
- while (n>>=1) l++;
- return l;
+  return log2Down(n);
 }

 inline uint64_t clockNano() {
@@ -96,8 +91,11 @@ void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
 void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
 void ncclMemoryStackPush(struct ncclMemoryStack* me);
 void ncclMemoryStackPop(struct ncclMemoryStack* me);
+void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align);
 template<typename T>
 T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1);
+template<typename Header, typename Element>
+inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt);

 ////////////////////////////////////////////////////////////////////////////////
 /* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for
@@ -140,11 +138,14 @@ T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me);
 template<typename T, T *T::*next>
 void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x);
 template<typename T, T *T::*next>
+void ncclIntruQueueEnqueueFront(ncclIntruQueue<T,next> *me, T *x);
+template<typename T, T *T::*next>
 T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me);
 template<typename T, T *T::*next>
 T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me);
 template<typename T, T *T::*next>
-void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *memPool);
+void ncclIntruQueueTransfer(ncclIntruQueue<T,next> *dst, ncclIntruQueue<T,next> *src);
+

 ////////////////////////////////////////////////////////////////////////////////
 /* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
@@ -233,6 +234,12 @@ inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size,
  return obj;
 }

+inline void* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t size, size_t align) { // untyped overload: caller passes the byte size and alignment explicitly
+  void *obj = ncclMemoryStack::allocate(me, size, align);
+  memset(obj, 0, size); // the returned region is always zero-filled
+  return obj;
+}
+
 template<typename T>
 inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
  void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T));
@@ -240,6 +247,17 @@ inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
  return (T*)obj;
 }

+template<typename Header, typename Element>
+inline Header* ncclMemoryStackAllocInlineArray(struct ncclMemoryStack* me, size_t nElt) { // single zeroed allocation sized for one Header followed by nElt Elements
+  size_t size = sizeof(Header);
+  size = (size + alignof(Element)-1) & -alignof(Element); // round the header size up to a multiple of Element's alignment so the trailing array starts aligned
+  size += nElt*sizeof(Element);
+  size_t align = alignof(Header) < alignof(Element) ? alignof(Element) : alignof(Header); // use the stricter of the two alignments for the whole allocation
+  void *obj = ncclMemoryStack::allocate(me, size, align);
+  memset(obj, 0, size); // zero both the header and the element storage
+  return (Header*)obj; // caller is returned the Header; the elements live directly after it (past any alignment padding)
+}
+
 inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
  using Frame = ncclMemoryStack::Frame;
  Frame tmp = me->topFrame;
@@ -343,6 +361,13 @@ inline void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x) {
  me->tail = x;
 }

+template<typename T, T *T::*next>
+inline void ncclIntruQueueEnqueueFront(ncclIntruQueue<T,next> *me, T *x) { // insert x at the head of the intrusive queue
+  if (me->head == nullptr) me->tail = x; // queue was empty, so x becomes the tail as well
+  x->*next = me->head; // link x in front of the previous head (nullptr when the queue was empty)
+  me->head = x;
+}
+
 template<typename T, T *T::*next>
 inline T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me) {
  T *ans = me->head;
@@ -388,45 +413,11 @@ inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me) {
 }

 template<typename T, T *T::*next>
-void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *pool) {
-  T *head = me->head;
-  me->head = nullptr;
-  me->tail = nullptr;
-  while (head != nullptr) {
-    T *tmp = head->*next;
-    ncclMemoryPoolFree(pool, tmp);
-    head = tmp;
-  }
-}
-
-/* cmp function determines the sequence of objects in the queue. If cmp returns value >= 0, it means a > b,
- * and we should put a before b; otherwise, b should be put ahead of a. */
-template<typename T, T *T::*next>
-inline void ncclIntruQueueSortEnqueue(ncclIntruQueue<T,next> *me, T *x, int (*cmp)(T *a, T *b)) {
-  T *cur = me->head;
-  T *prev = NULL;
-
-  if (cur == NULL) {
-    x->*next = nullptr;
-    me->tail = me->head = x;
-  } else {
-    while (cur) {
-      if (cmp(cur, x) > 0) {
-        prev = cur;
-        cur = cur->next;
-      } else {
-        break;
-      }
-    }
-
-    x->*next = cur;
-    if (prev) {
-      prev->*next = x;
-      if (cur == NULL) me->tail = x;
-    } else {
-      me->head = x;
-    }
-  }
+void ncclIntruQueueTransfer(ncclIntruQueue<T,next> *dst, ncclIntruQueue<T,next> *src) { // append every element of src to the end of dst and leave src empty
+  (dst->tail ? dst->tail->*next : dst->head) = src->head; // link via the `next` member-pointer template parameter (as the other queue helpers do), not a field literally named "next"
+  if (src->tail) dst->tail = src->tail; // an empty src leaves dst's tail unchanged
+  src->head = nullptr;
+  src->tail = nullptr;
 }

 ////////////////////////////////////////////////////////////////////////////////
@@ -2,11 +2,11 @@
 #include "nvtx.h"

 static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = {
-  {"Sum", ncclSum},
-  {"Product", ncclProd},
-  {"Max", ncclMax},
-  {"Min", ncclMin},
-  {"Avg", ncclAvg}
+  {"Sum", ncclSum, 0},
+  {"Product", ncclProd, 0},
+  {"Max", ncclMax, 0},
+  {"Min", ncclMin, 0},
+  {"Avg", ncclAvg, 0}
 };

 // Must be called before the first call to any reduction operation.
@@ -19,7 +19,8 @@ void initNvtxRegisteredEnums() {
    .entries = NvtxEnumRedSchema,
    .numEntries = std::extent<decltype(NvtxEnumRedSchema)>::value,
    .sizeOfEnum = sizeof(ncclRedOp_t),
-    .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP
+    .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP,
+    .extension = nullptr
  };

  nvtxPayloadEnumRegister(nvtx3::domain::get<nccl_domain>(), &eAttr);
@@ -52,8 +52,6 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
    WARN("%s : invalid type %d", info->opName, info->datatype);
    return ncclInvalidArgument;
  }
-  // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
-  NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks));

  if (info->op < 0 || ncclMaxRedOp < info->op) {
    WARN("%s : invalid reduction operation %d", info->opName, info->op);
@@ -59,6 +59,10 @@ DECLARE_CUDA_PFN(cuGetErrorString);
 DECLARE_CUDA_PFN(cuGetErrorName);
 /* enqueue.cc */
 DECLARE_CUDA_PFN(cuMemGetAddressRange);
+DECLARE_CUDA_PFN(cuLaunchKernel);
+#if CUDA_VERSION >= 11080
+DECLARE_CUDA_PFN(cuLaunchKernelEx);
+#endif
 /* proxy.cc */
 DECLARE_CUDA_PFN(cuCtxCreate);
 DECLARE_CUDA_PFN(cuCtxDestroy);
@@ -137,6 +141,10 @@ static ncclResult_t cudaPfnFuncLoader(void) {
  LOAD_SYM(cuCtxGetCurrent, 1);
  LOAD_SYM(cuCtxSetCurrent, 1);
  LOAD_SYM(cuCtxGetDevice, 1);
+  LOAD_SYM(cuLaunchKernel, 1);
+#if CUDA_VERSION >= 11080
+  LOAD_SYM(cuLaunchKernelEx, 1);
+#endif
 /* cuMem API support */
  LOAD_SYM(cuMemAddressReserve, 1);
  LOAD_SYM(cuMemAddressFree, 1);
@@ -130,7 +130,7 @@ ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint6
  int ret;
  GDRLOCKCALL(gdr_internal_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret);
  if (ret != 0) {
-    WARN("gdr_pin_buffer(addr %lx, size %zi) failed: %d", addr, size, ret);
+    WARN("gdr_pin_buffer(addr %lx, size %zu) failed: %d", addr, size, ret);
    return ncclSystemError;
  }
  return ncclSuccess;
@@ -172,7 +172,7 @@ ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
  int ret;
  GDRLOCKCALL(gdr_internal_map(g, handle, va, size), ret);
  if (ret != 0) {
-    WARN("gdr_map(handle %lx, size %zi) failed: %d", handle.h, size, ret);
+    WARN("gdr_map(handle %lx, size %zu) failed: %d", handle.h, size, ret);
    return ncclSystemError;
  }
  return ncclSuccess;
@@ -186,7 +186,7 @@ ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
  int ret;
  GDRLOCKCALL(gdr_internal_unmap(g, handle, va, size), ret);
  if (ret != 0) {
-    WARN("gdr_unmap(handle %lx, va %p, size %zi) failed: %d", handle.h, va, size, ret);
+    WARN("gdr_unmap(handle %lx, va %p, size %zu) failed: %d", handle.h, va, size, ret);
    return ncclSystemError;
  }
  return ncclSuccess;
@@ -218,7 +218,7 @@ ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const vo
  int ret;
  GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size), ret);
  if (ret != 0) {
-    WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zi) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret);
+    WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zu) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret);
    return ncclSystemError;
  }
  return ncclSuccess;
@@ -232,7 +232,7 @@ ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void
  int ret;
  GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size), ret);
  if (ret != 0) {
-    WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zi) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret);
+    WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zu) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret);
    return ncclSystemError;
  }
  return ncclSuccess;
@@ -132,7 +132,7 @@ ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
      WARN("UDS: Receiving data over socket failed : %d", errno);
      return ncclSystemError;
    }
-    if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
+    if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
  }

  if (recvFd != NULL) {
@@ -221,7 +221,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
      WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno);
      return ncclSystemError;
    }
-    if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_RELAXED)) return ncclInternalError;
+    if (handle->abortFlag && __atomic_load_n(handle->abortFlag, __ATOMIC_ACQUIRE)) return ncclInternalError;
  }

  return ncclSuccess;
@@ -41,11 +41,19 @@ namespace {
  NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values))
  // MNNVL support
  NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo))
+  // CC support
+  NCCL_NVML_FN(nvmlSystemGetConfComputeState, nvmlReturn_t, (nvmlConfComputeSystemState_t *state));
+  NCCL_NVML_FN(nvmlSystemGetConfComputeSettings, nvmlReturn_t, (nvmlSystemConfComputeSettings_t *setting));

  std::mutex lock; // NVML has had some thread safety bugs
  bool initialized = false;
  thread_local bool threadInitialized = false;
  ncclResult_t initResult;
+
+  union nvmlCCInfoInternal {
+    nvmlConfComputeSystemState_t settingV12020;
+    nvmlSystemConfComputeSettings_t settingV12040;
+  };
 }

 ncclResult_t ncclNvmlEnsureInitialized() {
@@ -87,6 +95,9 @@ ncclResult_t ncclNvmlEnsureInitialized() {
      {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"},
      // MNNVL support
      {(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"},
+      // CC support
+      {(void**)&pfn_nvmlSystemGetConfComputeState, "nvmlSystemGetConfComputeState"},
+      {(void**)&pfn_nvmlSystemGetConfComputeSettings, "nvmlSystemGetConfComputeSettings"}
    };
    for(Symbol sym: symbols) {
      *sym.ppfn = dlsym(libhandle, sym.name);
@@ -282,3 +293,33 @@ ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricI
  NVMLTRY(nvmlDeviceGetGpuFabricInfoV, device, gpuFabricInfo);
  return ncclSuccess;
 }
+
+ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { // fills status->CCEnabled and status->multiGpuCCEnabled from NVML
+  NCCLCHECK(ncclNvmlEnsureInitialized());
+  std::lock_guard<std::mutex> locked(lock); // hold the file-level NVML mutex for the rest of the function
+  nvmlCCInfoInternal ccInfo; // union: only the member matching the chosen NVML entry point is used
+  if (pfn_nvmlSystemGetConfComputeSettings != NULL) { // preferred path: the "Settings" symbol was found by dlsym
+    ccInfo.settingV12040.version = nvmlSystemConfComputeSettings_v1;
+    NVMLTRY(nvmlSystemGetConfComputeSettings, &ccInfo.settingV12040); // NOTE(review): NVMLTRY is defined outside this hunk; presumably returns an error on failure - confirm
+    if (ccInfo.settingV12040.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED)
+      status->CCEnabled = true;
+    else
+      status->CCEnabled = false;
+
+    if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE)
+      status->multiGpuCCEnabled = true;
+    else
+      status->multiGpuCCEnabled = false;
+  } else if (pfn_nvmlSystemGetConfComputeState != NULL) { // fallback: only the "State" symbol is available
+    NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020);
+    if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED)
+      status->CCEnabled = true;
+    else
+      status->CCEnabled = false;
+    status->multiGpuCCEnabled = false; // this path never reports multi-GPU CC
+  } else { // neither symbol was loaded: report both flags as disabled
+    status->CCEnabled = false;
+    status->multiGpuCCEnabled = false;
+  }
+  return ncclSuccess;
+}
@@ -84,4 +84,4 @@ const char *ncclGetEnv(const char *name) {
  static pthread_once_t once = PTHREAD_ONCE_INIT;
  pthread_once(&once, initEnv);
  return getenv(name);
-}
+}
@@ -63,13 +63,28 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
     * goes down to 0, unlink should be called in order to delete shared memory file. */
    if (shmPath[0] == '\0') {
      sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
+    retry_mkstemp:
      fd = mkstemp(shmPath);
+      if (fd < 0) { // mkstemp failed: retry on EINTR, otherwise report and bail out
+        if (errno == EINTR) {
+          INFO(NCCL_ALL, "mkstemp: Failed to create %s, error: %s (%d) - retrying", shmPath, strerror(errno), errno);
+          goto retry_mkstemp;
+        }
+        WARN("Error: failed to create shared memory file %s, error %s (%d)", shmPath, strerror(errno), errno); // print the path itself (%s), not its address
+        ret = ncclSystemError;
+        goto fail;
+      }
    } else {
      SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
    }

+  retry_fallocate:
     if (fallocate(fd, 0, 0, realShmSize) != 0) {
-      WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize);
+      if (errno == EINTR) { // interrupted by a signal: try the allocation again
+        INFO(NCCL_ALL, "fallocate: Failed to extend %s to %zu bytes, error: %s (%d) - retrying", shmPath, realShmSize, strerror(errno), errno);
+        goto retry_fallocate;
+      }
+      WARN("Error: failed to extend %s to %zu bytes, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno); // %zu matches how realShmSize is printed in the mmap failure message
       ret = ncclSystemError;
       goto fail;
     }
@@ -80,7 +95,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de

  hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (hptr == MAP_FAILED) {
-    WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno));
+    WARN("Error: Could not map %s size %zu, error: %s (%d)", shmPath, realShmSize, strerror(errno), errno);
    ret = ncclSystemError;
    hptr = NULL;
    goto fail;
@@ -93,7 +108,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de
    if (remref == 0) {
      /* the last peer has completed attachment, it should unlink the shm mem file. */
      if (unlink(shmPath) != 0) {
-        WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno));
+        INFO(NCCL_ALLOC, "unlink shared memory %s failed, error: %s (%d)", shmPath, strerror(errno), errno);
      }
    }
  }
@@ -110,7 +125,8 @@ exit:
  *handle = (ncclShmHandle_t)tmphandle;
  return ret;
 fail:
-  WARN("Error while %s shared memory segment %s (size %ld)", create ? "creating" : "attaching to", shmPath, shmSize);
+  WARN("Error while %s shared memory segment %s (size %ld), error: %s (%d)", create ? "creating" : "attaching to",
+       shmPath, shmSize, strerror(errno), errno);
  if (tmphandle) {
    shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
    ncclShmClose((ncclShmHandle_t)tmphandle);
@@ -129,7 +145,7 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
      close(tmphandle->fd);
      if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) {
        if (unlink(tmphandle->shmPath) != 0) {
-          WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
+          WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno);
          ret = ncclSystemError;
        }
      }
@@ -139,7 +155,7 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
    if (tmphandle->shmPtr) {
      if (tmphandle->devShmPtr) CUDACHECK(cudaHostUnregister(tmphandle->shmPtr));
      if (munmap(tmphandle->shmPtr, tmphandle->realShmSize) != 0) {
-        WARN("munmap of shared memory %p size %ld failed, error: %s", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno));
+        WARN("munmap of shared memory %p size %ld failed, error: %s (%d)", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno), errno);
        ret = ncclSystemError;
      }
    }
@@ -152,9 +168,9 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
  ncclResult_t ret = ncclSuccess;
  struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle;
  if (tmphandle) {
-    if (tmphandle->shmPath != NULL) {
+    if (tmphandle->shmPath != NULL && tmphandle->refcount != NULL && *tmphandle->refcount > 0) {
      if (unlink(tmphandle->shmPath) != 0) {
-        WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
+        WARN("unlink shared memory %s failed, error: %s (%d)", tmphandle->shmPath, strerror(errno), errno);
        ret = ncclSystemError;
      }
      free(tmphandle->shmPath);
@@ -184,7 +200,7 @@ ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff
    uint64_t t0 = clockNano();
    while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) {
      if (clockNano() - t0 >= 5 * 1000) sched_yield();
-      if (__atomic_load_n(comm->abortFlag, __ATOMIC_RELAXED) == 1) {
+      if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE) == 1) {
        ret = ncclInternalError;
        goto exit;
      }
--- a/Zobrazit více
+++ b/Zobrazit více