Merge remote-tracking branch 'nccl/master' into HEAD

[ROCm/rccl commit: e1cb45ff22]
2023-02-04 01:43:38 +00:00
@@ -4,6 +4,13 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:

 ## Unreleased
 ### Changed
+- Compatibility with NCCL 2.16.2
+### Added
+### Fixed
+### Removed
+
+## Unreleased - RCCL 2.15.5 for ROCm 5.5.0
+### Changed
 - Compatibility with NCCL 2.15.5
 - Unit test executable renamed to rccl-UnitTests
 ### Added
@@ -205,12 +205,15 @@ set(HEADER_SOURCES
  src/include/param.h
  src/include/channel.h
  src/include/nvtx_stub.h
+  src/include/nvtx3.hpp
  src/include/core.h
  src/include/info.h
  src/include/git_version.h
  src/include/npkit/npkit_event.h
  src/include/npkit/npkit.h
  src/include/npkit/npkit_struct.h
+  src/include/nvtx3/nvToolsExtPayload.h
+  src/include/nvtx3/nvToolsExt.h
  src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
  src/include/nvtx3/nvtxDetail/nvtxTypes.h
  src/include/nvtx3/nvtxDetail/nvtxImpl.h
@@ -223,6 +226,11 @@ set(HEADER_SOURCES
  src/include/nvtx3/nvtxDetail/nvtxInit.h
  src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h
  src/include/nvtx3/nvToolsExtSync.h
+  src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
+  src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h
+  src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h
+  src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h
+  src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h
  src/include/nvtx3/nvToolsExtCudaRt.h
  src/include/nvtx3/nvToolsExtCuda.h
  src/include/nvtx3/nvToolsExtOpenCL.h
@@ -372,6 +380,8 @@ configure_file(src/nccl.h.in ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
 # Execute git_version_check whenever rccl library is built
 add_dependencies(rccl git_version_check)

+add_definitions(-DNVTX_NO_IMPL)
+
 if(TRACE)
  add_definitions(-DENABLE_TRACE)
 endif()
@@ -537,7 +547,7 @@ Source: https://github.com/ROCmSoftwarePlatform/rccl

 Files: *
 Copyright: (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
-Modifications Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+Modifications Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
 License: See LICENSE.txt for license information\n")
  install(FILES "${CMAKE_BINARY_DIR}/copyright" DESTINATION ${CMAKE_INSTALL_DATADIR}/rccl)
  # Write changelog file
@@ -5,7 +5,6 @@
 #ifndef NCCL_NET_V3_H_
 #define NCCL_NET_V3_H_

-#define NCCL_NET_HANDLE_MAXSIZE_V3 64
 #define NCCL_NET_MAX_REQUESTS_V3 16

 typedef ncclNetProperties_v4_t ncclNetProperties_v3_t;
@@ -5,6 +5,8 @@
 #ifndef NCCL_NET_V4_H_
 #define NCCL_NET_V4_H_

+#define NCCL_NET_HANDLE_MAXSIZE_V4 64
+
 typedef struct {
  char* name;     // Used mostly for logging.
  char* pciPath;  // Path to the PCI device in /sys.
@@ -10,7 +10,7 @@ VERBOSE ?= 0
 KEEP ?= 0
 DEBUG ?= 0
 TRACE ?= 0
-PROFAPI ?= 0
+PROFAPI ?= 1
 NVTX ?= 1

 NVCC = $(CUDA_HOME)/bin/nvcc
@@ -25,22 +25,26 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)

 # You should define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
-                -gencode=arch=compute_50,code=sm_50 \
+CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \
                -gencode=arch=compute_60,code=sm_60 \
                -gencode=arch=compute_61,code=sm_61
+ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
+# SM35 is no longer supported from CUDA 12.0 onwards
+CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
+endif
 CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
-CUDA11_8_GENCODE = -gencode=arch=compute_90,code=sm_90
+CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90

 CUDA8_PTX     = -gencode=arch=compute_61,code=compute_61
 CUDA9_PTX     = -gencode=arch=compute_70,code=compute_70
 CUDA11_PTX    = -gencode=arch=compute_80,code=compute_80
-CUDA11_8_PTX  = -gencode=arch=compute_90,code=compute_90
+CUDA12_PTX    = -gencode=arch=compute_90,code=compute_90
+

 ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
 # Include Hopper support if we're using CUDA11.8 or above
-  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_8_GENCODE) $(CUDA11_8_PTX)
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
 else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX)
 # Include Volta support if we're using CUDA9 or above
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 15
-NCCL_PATCH   := 5
+NCCL_MINOR   := 16
+NCCL_PATCH   := 2
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
@@ -9,7 +9,7 @@ include ../makefiles/version.mk

 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
+LIBSRCFILES := init.cc init_nvtx.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc net.cc \
 		misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
 		misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
 		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
@@ -14,6 +14,11 @@
 #include "proxy.h"
 #include "signals.h" // [RCCL]

+struct bootstrapRootArgs {
+  struct ncclSocket* listenSock;
+  uint64_t magic;
+};
+
 /* Init functions */
 static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
 static union ncclSocketAddress bootstrapNetIfAddr;
@@ -27,7 +32,7 @@ ncclResult_t bootstrapNetInit() {
      char* env = getenv("NCCL_COMM_ID");
      if (env) {
        union ncclSocketAddress remoteAddr;
-        if (ncclGetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
+        if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) {
          WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
          return ncclInvalidArgument;
        }
@@ -90,8 +95,10 @@ static ncclResult_t setFilesLimit() {
  return ncclSuccess;
 }

-static void *bootstrapRoot(void* args) {
-  struct ncclSocket* listenSock = (struct ncclSocket*)args;
+static void *bootstrapRoot(void* rargs) {
+  struct bootstrapRootArgs* args = (struct bootstrapRootArgs*)rargs;
+  struct ncclSocket* listenSock = args->listenSock;
+  uint64_t magic = args->magic;
  ncclResult_t res = ncclSuccess;
  int nranks = 0, c = 0;
  struct extInfo info;
@@ -105,11 +112,10 @@ static void *bootstrapRoot(void* args) {
  /* Receive addresses from all ranks */
  do {
    struct ncclSocket sock;
-    /* bootstrap root thread always uses blocking ncclSocketAccept. */
-    NCCLCHECKGOTO(ncclSocketInit(&sock, NULL, NULL, 0), res, out);
+    NCCLCHECKGOTO(ncclSocketInit(&sock), res, out);
    NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
    NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
-    close(sock.fd);
+    NCCLCHECKGOTO(ncclSocketClose(&sock), res, out);

    if (c == 0) {
      nranks = info.nranks;
@@ -140,54 +146,60 @@ static void *bootstrapRoot(void* args) {
  for (int r=0; r<nranks; ++r) {
    int next = (r+1) % nranks;
    struct ncclSocket sock;
-    sock.abortFlag = NULL;
-    sock.asyncFlag = 0;
-    memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress));
+    NCCLCHECKGOTO(ncclSocketInit(&sock, rankAddressesRoot+r, magic, ncclSocketTypeBootstrap), res, out);
    NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
    NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
-    close(sock.fd);
+    NCCLCHECKGOTO(ncclSocketClose(&sock), res, out);
  }
  TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);

 out:
-  close(listenSock->fd);
-  free(listenSock);
+  if (listenSock != NULL) {
+    ncclSocketClose(listenSock);
+    free(listenSock);
+  }
  if (rankAddresses) free(rankAddresses);
  if (rankAddressesRoot) free(rankAddressesRoot);
  if (zero) free(zero);
+  free(rargs);

  TRACE(NCCL_INIT, "DONE");
  return NULL;
 }

-ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
+ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv) {
  struct ncclSocket* listenSock;
-  NCCLCHECK(ncclCalloc(&listenSock, 1));
-  memcpy(&listenSock->addr, id, sizeof(union ncclSocketAddress));
-  NCCLCHECK(ncclSocketListen(listenSock));
-  memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress));
+  struct bootstrapRootArgs* args;
  pthread_t thread;
-  pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock);
+
+  NCCLCHECK(ncclCalloc(&listenSock, 1));
+  NCCLCHECK(ncclSocketInit(listenSock, &handle->addr, handle->magic, ncclSocketTypeBootstrap, NULL, 0));
+  NCCLCHECK(ncclSocketListen(listenSock));
+  NCCLCHECK(ncclSocketGetAddr(listenSock, &handle->addr));
+
+  NCCLCHECK(ncclCalloc(&args, 1));
+  args->listenSock = listenSock;
+  args->magic = handle->magic;
+  NEQCHECK(pthread_create(&thread, NULL, bootstrapRoot, (void*)args), 0);
  ncclSetThreadName(thread, "NCCL BootstrapR");
-  pthread_detach(thread); // will not be pthread_join()'d
+  NEQCHECK(pthread_detach(thread), 0); // will not be pthread_join()'d
  return ncclSuccess;
 }

-ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
-  static_assert(sizeof(union ncclSocketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
-  memset(id, 0, sizeof(ncclUniqueId));
-  union ncclSocketAddress* connectAddr = (union ncclSocketAddress*) id;
+ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
+  memset(handle, 0, sizeof(ncclBootstrapHandle));
+  NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));

  char* env = getenv("NCCL_COMM_ID");
  if (env) {
    INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
-    if (ncclGetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
+    if (ncclSocketGetAddrFromString(&handle->addr, env) != ncclSuccess) {
      WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
      return ncclInvalidArgument;
    }
  } else {
-    memcpy(id, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
-    NCCLCHECK(bootstrapCreateRoot(id, false));
+    memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+    NCCLCHECK(bootstrapCreateRoot(handle, false));
  }

  return ncclSuccess;
@@ -211,20 +223,27 @@ struct bootstrapState {
  int rank;
  int nranks;
  int virtualId;
+  uint64_t magic;
  volatile uint32_t *abortFlag;
 };

-ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
+ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm) {
  int rank = comm->rank;
  int nranks = comm->nRanks;
  int virtualId = comm->virtualId;
  struct bootstrapState* state;
+  struct ncclSocket* proxySocket;
+  ncclSocketAddress nextAddr;
+  struct ncclSocket sock, listenSockRoot;
+  struct extInfo info = { 0 };
+
  NCCLCHECK(ncclCalloc(&state, 1));
  state->rank = rank;
  state->nranks = nranks;
  state->abortFlag = comm->abortFlag;
  state->virtualId = virtualId;
  comm->bootstrap = state;
+  comm->magic = state->magic = handle->magic;

  TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId);

@@ -232,23 +251,17 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
  RegisterSignalHandlers();
  // [/RCCL]

-  struct extInfo info = { 0 };
  info.rank = rank;
  info.nranks = nranks;
-  struct ncclSocket sock, listenSockRoot;
-
-  NCCLCHECK(ncclSocketInit(&sock, (union ncclSocketAddress*) id, comm->abortFlag, 0));
-  NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->abortFlag, 0));
-  NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->abortFlag, 0));
-  NCCLCHECK(ncclSocketInit(&state->ringSendSocket, NULL, comm->abortFlag, 0));
-  NCCLCHECK(ncclSocketInit(&state->ringRecvSocket, NULL, comm->abortFlag, 0));
  // Create socket for other ranks to contact me
+  NCCLCHECK(ncclSocketInit(&state->listenSock, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
  NCCLCHECK(ncclSocketListen(&state->listenSock));
-  memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketGetAddr(&state->listenSock, &info.extAddressListen));

  // Create socket for root to contact me
+  NCCLCHECK(ncclSocketInit(&listenSockRoot, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
  NCCLCHECK(ncclSocketListen(&listenSockRoot));
-  memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketGetAddr(&listenSockRoot, &info.extAddressListenRoot));

  // stagger connection times to avoid an overload of the root
  if (nranks > 128) {
@@ -261,32 +274,37 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
  }

  // send info on my listening socket to root
+  NCCLCHECK(ncclSocketInit(&sock, &handle->addr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
  NCCLCHECK(ncclSocketConnect(&sock));
  NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
-  close(sock.fd);
+  NCCLCHECK(ncclSocketClose(&sock));

  // get info on my "next" rank in the bootstrap ring from root
+  NCCLCHECK(ncclSocketInit(&sock));
  NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));
-  NCCLCHECK(bootstrapNetRecv(&sock, &state->ringSendSocket.addr, sizeof(union ncclSocketAddress)));
-  close(sock.fd);
-  close(listenSockRoot.fd);
+  NCCLCHECK(bootstrapNetRecv(&sock, &nextAddr, sizeof(union ncclSocketAddress)));
+  NCCLCHECK(ncclSocketClose(&sock));
+  NCCLCHECK(ncclSocketClose(&listenSockRoot));

+  NCCLCHECK(ncclSocketInit(&state->ringSendSocket, &nextAddr, comm->magic, ncclSocketTypeBootstrap, comm->abortFlag));
  NCCLCHECK(ncclSocketConnect(&state->ringSendSocket));
  // Accept the connect request from the previous rank in the AllGather ring
+  NCCLCHECK(ncclSocketInit(&state->ringRecvSocket));
  NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock));

  // AllGather all listen handlers
  NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
-  memcpy(state->peerCommAddresses+rank, &state->listenSock.addr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketGetAddr(&state->listenSock, state->peerCommAddresses+rank));
  NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)));

  // Create the service proxy
  NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
-  struct ncclSocket* proxySocket;
+
+  // proxy is aborted through a message; don't set abortFlag
  NCCLCHECK(ncclCalloc(&proxySocket, 1));
-  NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, NULL, 0));
+  NCCLCHECK(ncclSocketInit(proxySocket, &bootstrapNetIfAddr, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
  NCCLCHECK(ncclSocketListen(proxySocket));
-  memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
  NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
  NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));

@@ -322,16 +340,21 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
 }

 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
+  ncclResult_t ret = ncclSuccess;
  struct bootstrapState* state = (struct bootstrapState*)commState;
  struct ncclSocket sock;

-  NCCLCHECK(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->abortFlag, 1));
-  NCCLCHECK(ncclSocketConnect(&sock));
-  NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
-  NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int)));
-  NCCLCHECK(bootstrapNetSend(&sock, data, size));
-  close(sock.fd);
-  return ncclSuccess;
+  NCCLCHECKGOTO(ncclSocketInit(&sock, state->peerCommAddresses+peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail);
+  NCCLCHECKGOTO(ncclSocketConnect(&sock), ret, fail);
+  NCCLCHECKGOTO(bootstrapNetSend(&sock, &state->rank, sizeof(int)), ret, fail);
+  NCCLCHECKGOTO(bootstrapNetSend(&sock, &tag, sizeof(int)), ret, fail);
+  NCCLCHECKGOTO(bootstrapNetSend(&sock, data, size), ret, fail);
+
+exit:
+  NCCLCHECK(ncclSocketClose(&sock));
+  return ret;
+fail:
+  goto exit;
 }

 ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag) {
@@ -390,9 +413,10 @@ ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag,
  return ncclSuccess;
 }

-ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
+ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock, int* found) {
  struct unexConn* elem = state->unexpectedConnections;
  struct unexConn* prev = NULL;
+  *found = 0;
  while (elem) {
    if (elem->peer == peer && elem->tag == tag) {
      if (prev == NULL) {
@@ -402,54 +426,75 @@ ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag,
      }
      memcpy(sock, &elem->sock, sizeof(struct ncclSocket));
      free(elem);
+      *found = 1;
      return ncclSuccess;
    }
    prev = elem;
    elem = elem->next;
  }
-  sock->fd = -1;
  return ncclSuccess;
 }

+static void unexpectedFree(struct bootstrapState* state) {
+  struct unexConn* elem = state->unexpectedConnections;
+  struct unexConn* prev = NULL;
+
+  while (elem) {
+    prev = elem;
+    elem = elem->next;
+    free(prev);
+  }
+  return;
+}
+
 // We can't know who we'll receive from, so we need to receive everything at once
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
+  ncclResult_t ret = ncclSuccess;
  struct bootstrapState* state = (struct bootstrapState*)commState;
  struct ncclSocket sock;
+  int newPeer, newTag;

  // Search unexpected connections first
-  NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock));
-  if (sock.fd != -1) {
-    NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
-    close(sock.fd);
-    return ncclSuccess;
+  int found;
+  NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock, &found));
+  if (found) {
+    NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
+    goto exit;
  }

  // Then look for new connections
-  NCCLCHECK(ncclSocketInit(&sock, NULL, state->listenSock.abortFlag, 0));
  while (1) {
-    NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock));
-    int newPeer, newTag;
-    NCCLCHECK(bootstrapNetRecv(&sock, &newPeer, sizeof(int)));
-    NCCLCHECK(bootstrapNetRecv(&sock, &newTag, sizeof(int)));
+    NCCLCHECKGOTO(ncclSocketInit(&sock), ret, fail);
+    NCCLCHECKGOTO(ncclSocketAccept(&sock, &state->listenSock), ret, fail);
+    NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newPeer, sizeof(int)), ret, fail);
+    NCCLCHECKGOTO(bootstrapNetRecv(&sock, &newTag, sizeof(int)), ret, fail);
    if (newPeer == peer && newTag == tag) {
-      NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
-      close(sock.fd);
-      return ncclSuccess;
+      NCCLCHECKGOTO(bootstrapNetRecv(&sock, ((char*)data), size), ret, fail);
+      goto exit;
    }
    // Unexpected connection. Save for later.
-    NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, &sock));
+    NCCLCHECKGOTO(unexpectedEnqueue(state, newPeer, newTag, &sock), ret, fail);
  }
+exit:
+  NCCLCHECK(ncclSocketClose(&sock));
+  return ret;
+fail:
+  goto exit;
 }

 ncclResult_t bootstrapClose(void* commState) {
  struct bootstrapState* state = (struct bootstrapState*)commState;
  if (state->unexpectedConnections != NULL) {
-    WARN("Unexpected connections are not empty");
-    return ncclInternalError;
+    unexpectedFree(state);
+    if (*state->abortFlag == 0) {
+      WARN("Unexpected connections are not empty");
+      return ncclInternalError;
+    }
  }
-  if (state->listenSock.fd >= 0) close(state->listenSock.fd);
-  if (state->ringSendSocket.fd >= 0) close(state->ringSendSocket.fd);
-  if (state->ringRecvSocket.fd >= 0) close(state->ringRecvSocket.fd);
+
+  NCCLCHECK(ncclSocketClose(&state->listenSock));
+  NCCLCHECK(ncclSocketClose(&state->ringSendSocket));
+  NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));

  free(state->peerCommAddresses);
  free(state);
@@ -460,9 +505,9 @@ ncclResult_t bootstrapClose(void* commState) {
 ncclResult_t bootstrapAbort(void* commState) {
  struct bootstrapState* state = (struct bootstrapState*)commState;
  if (commState == NULL) return ncclSuccess;
-  if (state->listenSock.fd) close(state->listenSock.fd);
-  if (state->ringSendSocket.fd) close(state->ringSendSocket.fd);
-  if (state->ringRecvSocket.fd) close(state->ringRecvSocket.fd);
+  NCCLCHECK(ncclSocketClose(&state->listenSock));
+  NCCLCHECK(ncclSocketClose(&state->ringSendSocket));
+  NCCLCHECK(ncclSocketClose(&state->ringRecvSocket));
  free(state->peerCommAddresses);
  free(state->peerProxyAddresses);
  free(state);
@@ -14,7 +14,12 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  // Just pass the size of one message and not the total bytes sent/received.
+  constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
+  };
+  size_t msgsize = sendcount * ncclTypeSize(datatype);
+  NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)

  if (mscclAvailable() && !mscclIsCaller()) {
    return mscclEnqueueCheck(
@@ -6,6 +6,7 @@
 ************************************************************************/

 #include "enqueue.h"
+#include "nccl.h"

 #include "msccl/msccl_lifecycle.h"

@@ -13,7 +14,18 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct NvtxParamsAllReduce {
+    size_t bytes;
+    ncclRedOp_t op;
+  };
+  // Just pass the size of one message and not the total bytes sent/received.
+  static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
+    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
+      offsetof(NvtxParamsAllReduce, op)}
+  };
+  NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
+  NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)

  if (mscclAvailable() && !mscclIsCaller()) {
    return mscclEnqueueCheck(
@@ -14,7 +14,17 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
    ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct NvtxParamsBroadcast {
+    size_t bytes;
+    int root;
+  };
+  constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
+  };
+  NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
+  NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
+
  if (mscclAvailable() && !mscclIsCaller()) {
    return mscclEnqueueCheck(
      sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -490,6 +490,7 @@ struct ncclShmemData {
  };
  uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
  int channelId;
+  int aborted;
  alignas(16) struct ncclDevComm comm;
  alignas(16) struct ncclDevChannel channel;
  alignas(16) struct ncclWork work;
@@ -499,6 +500,8 @@ struct ncclShmemData {
 };
 static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "ncclShmem.work needs to be 16B aligned");

+extern __shared__ ncclShmemData ncclShmem;
+
 #ifdef ENABLE_PROFILING
 #define __insert_timestamp(line_num) do { \
      if (ncclShmem.prof.count < PROFILE_NUM_ITEMS) { \
@@ -569,8 +572,6 @@ static __forceinline__ __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we
  }
 }

-extern __shared__ ncclShmemData ncclShmem;
-
 template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE, bool USING_LL128>
 __forceinline__ __device__ void ncclKernel(
    struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
@@ -600,6 +601,8 @@ __forceinline__ __device__ void ncclKernel(
  }
  __synclds(); // publish ncclShmem.channelId
  int channelId = ncclShmem.channelId;
+  /* reset ncclShmem.aborted (not comm.abortFlag) to 0 */
+  if (tid == 0) ncclShmem.aborted = 0;

  if (true) {
    void *dst, *src;
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
@@ -93,7 +93,10 @@ private:
  inline __device__ bool checkAbort(int &spins) {
    spins++;
    if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
-      flags |= atomicAdd_system((unsigned int *)ncclShmem.comm.abortFlag, 0) ? Aborted : 0;
+      if (atomicAdd_system((unsigned int *)ncclShmem.comm.abortFlag, 0)) {
+        flags |= Aborted;
+        ncclShmem.aborted = 1;
+      }
      spins = 0;
    }
    return flags & Aborted;
@@ -207,6 +210,9 @@ private:
          ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
        waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(dstIx, remoteIx, offset, sliceSize);
        subBarrier();
+        /* If the user aborts the kernel, we don't need to actually perform the copy/reduce; just set
+         * the size to 0 to avoid unnecessary work. */
+        size_t workSize = ncclShmem.aborted ? 0 : sliceSize;
        if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
          // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
          if (Send) {
@@ -229,7 +235,7 @@ private:
              (tid, nworkers, nullptr, false,
               1, (T const**)ncclShmem.groups[group].srcs,
               fan.nsend(), (T**)ncclShmem.groups[group].dsts+1,
-               sliceSize);
+               workSize);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
            if (tid == 0) {
@@ -266,7 +272,7 @@ private:
            (tid, nworkers, ncclShmem.redOpArgs, postOp,
             Recv, (T const**)ncclShmem.groups[group].srcs,
             Dst, (T**)ncclShmem.groups[group].dsts,
-             sliceSize);
+             workSize);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
          if (tid == 0) {
@@ -303,7 +309,7 @@ private:
            (tid, nworkers, ncclShmem.redOpArgs, postOp,
             Recv*fan.nrecv()+Src, (T const**)ncclShmem.groups[group].srcs,
             Send*fan.nsend()+Dst, (T**)ncclShmem.groups[group].dsts,
-             sliceSize);
+             workSize);

 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
          if (tid == 0) {
@@ -7,6 +7,7 @@

 #include "enqueue.h"
 #include "collectives.h"
+#include "nccl.h"

 #include "msccl/msccl_lifecycle.h"

@@ -14,7 +15,19 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct NvtxParamsReduce {
+    size_t bytes;
+    int root;
+    ncclRedOp_t op;
+  };
+  constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
+    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
+      offsetof(NvtxParamsReduce, op)}
+  };
+  NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
+  NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)

  if (mscclAvailable() && !mscclIsCaller()) {
    return mscclEnqueueCheck(
@@ -7,6 +7,7 @@

 #include "enqueue.h"
 #include "collectives.h"
+#include "nccl.h"

 #include "msccl/msccl_lifecycle.h"

@@ -14,7 +15,17 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  struct NvtxParamsReduceScatter {
+    size_t bytes;
+    ncclRedOp_t op;
+  };
+  constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
+    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
+      offsetof(NvtxParamsReduceScatter, op)}
+  };
+  NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
+  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)

  if (mscclAvailable() && !mscclIsCaller()) {
    return mscclEnqueueCheck(
@@ -11,11 +11,21 @@

 #include "msccl/msccl_lifecycle.h"

+struct NvtxParamsSendRecv {
+    size_t bytes;
+    int peer;
+};
+constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
+    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
+};
+
 NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
+  NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)

  if (mscclAvailable() && !mscclIsCaller()) {
    return mscclEnqueueCheck(
@@ -37,7 +47,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
    ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream) {
-  NVTX3_FUNC_RANGE_IN(nccl_domain);
+  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
+  NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)

  if (mscclAvailable() && !mscclIsCaller()) {
    return mscclEnqueueCheck(
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -577,8 +577,7 @@ static ncclResult_t scheduleP2pTasksToPlan(

  // Compute how much to split operations
  // Natural step size matching buffer steps.
-  ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
-  if (comm->nNodes > 1) stepSize = comm->p2pNetChunkSize;
+  ssize_t stepSize = comm->p2pChunkSize;
  // Try to use all channels
  int nChannelsMax = comm->p2pnChannelsPerPeer;
  int nChannelsMin = nChannelsMax;
@@ -991,69 +990,87 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru

 #if CUDART_VERSION >= 11080
 #define NCCL_MAX_CGA_CLUSTER_SIZE 8
-NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", 0);
+#define NCCL_CGA_CLUSTER_SIZE_SM90 4 // cluster size ncclLaunchKernel uses by default when compCap == 90
+NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", -2); // default -2 is the "not set" sentinel tested in ncclLaunchKernel
+#endif
+
+#if CUDART_VERSION >= 12000
+// NCCL uses the "Remote" Mem Sync domain by default
+NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
 #endif

 ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  struct ncclTasks* tasks = &comm->tasks;
  void *fn = plan->kernelFn;
-  hipStream_t launchStream = tasks->streams->stream;
+  cudaStream_t launchStream = tasks->streams->stream;
  dim3 grid = {(unsigned)plan->channelCount, 1, 1};
  dim3 block = {(unsigned)plan->threadPerBlock, 1, 1};
  void *args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead};
  if (tasks->numStreams == 1) {
    CUDACHECK(hipExtLaunchKernel(plan->kernelFn, grid, block, args, 0, tasks->streams->stream, NULL, comm->doneEvent, 0));
    comm->lastStream = tasks->streams->stream;
-  } else {
-    #if CUDART_VERSION >= 11080
-    int driverVersion;
-    NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
+    return ncclSuccess;
+  }

-    unsigned int clusterSize = 0;
-    clusterSize = ncclParamCGAClusterSize();
-    if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
-      static bool warned = false;
-      if (warned == false) {
-        WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.",
-             clusterSize, NCCL_MAX_CGA_CLUSTER_SIZE);
-        warned = true;
+  #if CUDART_VERSION >= 11080
+  int driverVersion;
+  NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
+  if (driverVersion >= 11080) { // older drivers skip this branch and use the standard launch at the end
+    int compCap = comm->compCap;
+    unsigned int clusterSize = (compCap == 90) ? NCCL_CGA_CLUSTER_SIZE_SM90 : 0; // 0 means no cluster attributes are added below
+    if (ncclParamCGAClusterSize() != -2) { // -2 is the parameter's default, i.e. no user override
+      clusterSize = ncclParamCGAClusterSize();
+      if (clusterSize > NCCL_MAX_CGA_CLUSTER_SIZE) {
+        static bool warned = false;
+        if (warned == false) {
+          WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.",
+               clusterSize, NCCL_MAX_CGA_CLUSTER_SIZE);
+          warned = true;
+        }
+        clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
      }
-      clusterSize = NCCL_MAX_CGA_CLUSTER_SIZE;
    }

-    if (clusterSize && driverVersion >= 11080) {
-      cudaLaunchConfig_t launchConfig = {0};
-      cudaLaunchAttribute launchAttrs[2];
-      /* Cooperative Group Array (CGA)
-       * On sm90 and later we have an extra level of hierarchy where we
-       * can group together several blocks within the Grid, called
-       * Thread Block Clusters.
-       * Clusters enable multiple thread blocks running concurrently
-       * across multiple SMs to synchronize and collaboratively fetch
-       * and exchange data. A cluster of blocks are guaranteed to be
-       * concurrently scheduled onto a group of SMs.
-       * The maximum value is 8 and it must be divisible into the grid dimensions
-       */
+    cudaLaunchConfig_t launchConfig = {0};
+    cudaLaunchAttribute launchAttrs[3]; // room for: cluster dimension, cluster scheduling policy, mem sync domain
+    int attrs = 0; // number of launchAttrs entries filled in so far
+    /* Cooperative Group Array (CGA)
+     * On sm90 and later we have an extra level of hierarchy where we
+     * can group together several blocks within the Grid, called
+     * Thread Block Clusters.
+     * Clusters enable multiple thread blocks running concurrently
+     * across multiple SMs to synchronize and collaboratively fetch
+     * and exchange data. A cluster of blocks are guaranteed to be
+     * concurrently scheduled onto a group of SMs.
+     * The maximum value is 8 and it must be divisible into the grid dimensions
+     */
+    if (clusterSize) {
      // Grid dimension must be divisible by clusterSize
      if (grid.x % clusterSize) clusterSize = 1;
-      launchAttrs[0].id = cudaLaunchAttributeClusterDimension;
-      launchAttrs[0].val.clusterDim = {clusterSize, 1, 1};
-      launchAttrs[1].id = cudaLaunchAttributeClusterSchedulingPolicyPreference;
-      launchAttrs[1].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread;
-
-      launchConfig.gridDim = grid;
-      launchConfig.blockDim = block;
-      launchConfig.attrs = launchAttrs;
-      launchConfig.numAttrs = sizeof(launchAttrs)/sizeof(launchAttrs[0]);
-      launchConfig.stream = launchStream;
-
-      CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args));
-      return ncclSuccess;
+      launchAttrs[attrs].id = cudaLaunchAttributeClusterDimension;
+      launchAttrs[attrs++].val.clusterDim = {clusterSize, 1, 1};
+      launchAttrs[attrs].id = cudaLaunchAttributeClusterSchedulingPolicyPreference;
+      launchAttrs[attrs++].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread;
+    }
+    #if CUDART_VERSION >= 12000
+    if (compCap >= 90 && driverVersion >= 12000) {
+      // Set the NCCL Mem Sync domain on CUDA 12.0 and later (sm90)
+      launchAttrs[attrs].id = cudaLaunchAttributeMemSyncDomain;
+      launchAttrs[attrs++].val.memSyncDomain = (cudaLaunchMemSyncDomain) ncclParamMemSyncDomain();
    }
    #endif
-    // Standard kernel launch
-    CUDACHECK(cudaLaunchKernel(fn, grid, block, args, 0, launchStream));
+    launchConfig.gridDim = grid;
+    launchConfig.blockDim = block;
+    launchConfig.attrs = launchAttrs;
+    launchConfig.numAttrs = attrs; // only the entries filled in above are passed on
+    launchConfig.stream = launchStream;
+
+    CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args));
+    return ncclSuccess;
  }
+  #endif
+  // Standard kernel launch
+  CUDACHECK(cudaLaunchKernel(fn, grid, block, args, 0, launchStream));
  return ncclSuccess;
 }

@@ -1080,14 +1097,18 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
    // back to us for reclaiming via callbackQueue.
    ncclIntruQueueConstruct(&comm->planQueue);
    cudaStream_t launchStream = tasks->streams->stream; // First user stream gets launch
-    // Create dependency for deviceStream on launchStream.
-    if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream), result, resume1);
+    // Create dependency for deviceStream on launchStream. We know that deviceStream
+    // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare),
+    // so we can say that launchStream subsumes it.
+    if (tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
  resume1:
-    // Create dependency for other user streams (skip launch stream).
+    // Create dependency for other user streams (skip launch stream) on deviceStream.
+    // Again, the user streams haven't been touched since deviceStream waited on them
+    // so we can say they are subsumed by deviceStream.
    struct ncclCudaStreamList* sl = tasks->streams->next;
    tasks->streams = nullptr; // Reset comm->tasks.streams to empty.
    while (sl != nullptr && tasks->numStreams != 1) {
-      NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream), result, resume2);
+      NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->deviceStream, /*b_subsumes_a=*/true), result, resume2);
    resume2:
      sl = sl->next;
    }
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -866,3 +866,15 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
  *nranks = nvbGpus;
  return ncclSuccess;
 }
+
+int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) {  // returns 1 only if every GPU-to-GPU path type is below PATH_PIX, else 0
+  int maxPath = 0;  // largest path type seen so far; with no GPU pairs it stays 0 and the result is 1
+  for (int i=0; i<system->nodes[GPU].count; i++) {
+    struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
+    for (int j=0; j<system->nodes[GPU].count; j++) {
+      if (i == j) continue;  // a GPU's path to itself is not considered
+      maxPath = std::max(maxPath, paths[j].type);  // "all" needs the worst pair; the minimum would only prove that some pair qualifies
+    }
+  }
+  return maxPath >= PATH_PIX ? 0 : 1;
+}
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -78,6 +78,9 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) {
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
    *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW;
  }
+  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
+    *bw = AMD_BW;
+  }
  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
    *bw = cpu->cpu.model ==  NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW;
  }
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -19,6 +19,7 @@
 #define PCI_BW 12.0           // PCI Gen3 x16
 #define QPI_BW 6.0
 #define SKL_QPI_BW 9.0
+#define AMD_BW 16.0
 #define ZPI_BW 6.0
 #define YONGFENG_ZPI_BW 9.0
 #define P9_BW 32.0
@@ -10,10 +10,16 @@
 #include "nccl.h"
 #include "comm.h"

+struct ncclBootstrapHandle {  // handle type taken by bootstrapCreateRoot/bootstrapGetUniqueId/bootstrapInit
+  uint64_t magic;
+  union ncclSocketAddress addr;
+};
+static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");  // compile-time size check against ncclUniqueId
+
 ncclResult_t bootstrapNetInit();
-ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
-ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
-ncclResult_t bootstrapInit(ncclUniqueId* id, struct ncclComm* comm);
+ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
+ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
+ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -177,9 +177,12 @@ struct ncclComm {
  uint64_t* connectSend;
  uint64_t* connectRecv;

+  uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
+
  int rank;    // my rank in the communicator
  int nRanks;  // number of GPUs in communicator
  int cudaDev; // my cuda device index
+  int compCap; // compute capability of the GPU
  int64_t busId;   // my PCI bus ID in int format
  cpu_set_t cpuAffinity; // CPU affinity of the GPU
  int WarpSize;
@@ -216,7 +219,7 @@ struct ncclComm {

  // Buffer sizes
  int buffSizes[NCCL_NUM_PROTOCOLS];
-  int p2pNetChunkSize;
+  int p2pChunkSize;

  // Algorithm/Protocols thresholds
  ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
@@ -253,7 +256,6 @@ struct ncclComm {
  // Intra-process sync
  struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
  struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head
-  int intraRefs; // reference count from intra-process comms (zero if not leader else intraRanks)
  int intraRank;
  int intraRanks;
  uint32_t intraBarrierPhase;
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -159,20 +159,55 @@ typedef struct gdr_mem_desc {

 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 static gdr_t ncclGdrInit() {
-  return NULL;
+  INFO(NCCL_INIT, "Enabled GDRCopy equivalent memory allocation");
+  return (gdr_t)0x12345678L;  // arbitrary non-NULL placeholder (was NULL); NOTE(review): presumably callers only test the handle against NULL - confirm
 }

 template <typename T>
-static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
+static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle, hipStream_t stream) {
+  // Allocates a device buffer for nelem elements of T via ncclCudaCalloc on `stream`, plus a host descriptor returned through *gdrHandle; *ptr (and *devPtr, if non-null) receive the device pointer.
+  char *devMem;
+  size_t mapSize = sizeof(T)*nelem;
+
+  // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
+  ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
+  // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
+  NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, true));
+
+  gdr_mem_desc_t* md;
+  ncclResult_t mdStatus = ncclCalloc(&md, 1);
+  if (mdStatus != ncclSuccess) {
+    (void)hipFree(devMem); // do not leak the device buffer when the descriptor allocation fails
+    return mdStatus;
+  }
+  md->gdrDevMem = devMem;
+  md->gdrMap = NULL;
+  md->gdrMapSize = mapSize;
+  md->gdrOffset = 0;
+  md->gdrMh.h = 0;
+  *gdrHandle = md;
+
+  *ptr = (T *)(devMem);
+  if (devPtr) *devPtr = (T *)(devMem);
+
+  TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
+       md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
+
  return ncclSuccess;
 }

 template <typename T>
 static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
+  (void)gdrHandle; // unused here: the copy below writes nelem elements straight to dst without consulting the descriptor
+  memcpy(dst, src, nelem*sizeof(T));
  return ncclSuccess;
 }

 static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
+  gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; // descriptor created by ncclGdrCudaCalloc
+  hipError_t freeStatus = hipFree(md->gdrDevMem);
+  free(md); // release the descriptor even when hipFree fails, so the error return below does not leak it
+  CUDACHECK(freeStatus);
  return ncclSuccess;
 }
 #else
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -29,6 +29,7 @@ void ncclTopoFree(struct ncclTopoSystem* system);
 ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
 ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
 ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
+int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);

 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
@@ -9,6 +9,77 @@

 #include "nvtx3.hpp"

+#if !defined(NVTX3_RELAXED_CONSTEXPR) && __cpp_constexpr >= 201304L // define only when no definition is in effect yet
+#define NVTX3_RELAXED_CONSTEXPR constexpr
+#elif !defined(NVTX3_RELAXED_CONSTEXPR) // fallback must not replace an existing definition with an empty one
+#define NVTX3_RELAXED_CONSTEXPR
+#endif
+
+// Define all NCCL-provided static schema IDs here (avoid duplicates).
+#define NVTX_SID_CommInitRank  0
+#define NVTX_SID_CommInitAll   1
+#define NVTX_SID_CommDestroy   2 // same schema as NVTX_SID_CommInitRank
+#define NVTX_SID_CommAbort     3 // same schema as NVTX_SID_CommInitRank
+#define NVTX_SID_AllGather     4
+#define NVTX_SID_AllReduce     5
+#define NVTX_SID_Broadcast     6
+#define NVTX_SID_ReduceScatter 7
+#define NVTX_SID_Reduce        8
+#define NVTX_SID_Send          9
+#define NVTX_SID_Recv          10
+
+// Define static schema ID for the reduction operation (11 is the first value after the NVTX_SID_* list); parenthesized so it expands safely inside larger expressions.
+#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP (11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START)
+
+extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
+
 struct nccl_domain{static constexpr char const* name{"NCCL"};};

+class payload_schema {  // registers an NVTX payload schema with the nccl_domain NVTX domain when constructed
+ public:
+  NVTX3_RELAXED_CONSTEXPR explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept
+  {
+    schema_attr.name = schemaName;
+    schema_attr.entries = entries;
+    schema_attr.numEntries = numEntries;
+    schema_attr.schemaId = schemaId;
+    nvtxPayloadSchemaRegister(nvtx3::domain::get<nccl_domain>(), &schema_attr);  // registration is a side effect of construction
+  }
+
+  payload_schema() = delete;  // entries, entry count and schema ID must always be supplied
+  ~payload_schema() = default;
+  payload_schema(payload_schema const&) = default;
+  payload_schema& operator=(payload_schema const&) = default;
+  payload_schema(payload_schema&&) = default;
+  payload_schema& operator=(payload_schema&&) = default;
+
+ private:
+  nvtxPayloadSchemaAttr_t schema_attr{  // positional defaults; name, entries, numEntries and schemaId are assigned by the constructor
+    NVTX_PAYLOAD_SCHEMA_ATTR_TYPE |
+    NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES |
+    NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES |
+    NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE |
+    NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID,
+    nullptr,
+    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC,
+    NVTX_PAYLOAD_SCHEMA_FLAG_NONE,
+    nullptr, 0, 0, 0};
+};
+
+// Create NVTX push/pop range with parameters. Expands to several declarations, so use it at most once per scope.
+// @param ID name of the operation (see `NVTX_SID_*`); its spelling is also used as the schema name
+// @param S  schema (array of entries; its length is taken with std::extent)
+// @param P  payload (struct; passed by address together with sizeof(P))
+// The range is named after the enclosing function (__func__) and lasts until the end of the enclosing scope.
+#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
+  static const payload_schema schema{S, std::extent<decltype(S)>::value, \
+    NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \
+  static ::nvtx3::v1::registered_string<nccl_domain> const nvtx3_func_name__{__func__}; \
+  nvtxPayloadData_t nvtx3_bpl__[] = { \
+    {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
+  ::nvtx3::v1::event_attributes nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \
+  ::nvtx3::v1::domain_thread_range<nccl_domain> const nvtx3_range__{nvtx3_func_attr__};
+
+extern void initNvtxRegisteredEnums();
+
 #endif
@@ -92,6 +92,7 @@
 /* clang-format on */

 #include <nvtx3/nvToolsExt.h>
+#include <nvtx3/nvToolsExtPayload.h>

 #include <memory>
 #include <string>
@@ -1732,6 +1733,22 @@ class event_attributes {
    attributes_.messageType = m.get_type();
  }

+  /**
+   * @brief Variadic constructor where the first argument is a binary payload.
+   *
+   * Sets the `EventAttribute`s payload to the binary payload `bpl` and forwards
+   * the remaining variadic parameter pack to the next constructor.
+   *
+   */
+  template <typename... Args>
+  NVTX3_RELAXED_CONSTEXPR explicit event_attributes(nvtxPayloadData_t const* bpl, Args const&... args) noexcept
+    : event_attributes(args...)
+  {
+    attributes_.payloadType = NVTX_PAYLOAD_TYPE_BINARY;
+    attributes_.reserved0 = 1; // NCCL uses only a single binary payload per event.
+    attributes_.payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(bpl);
+  }
+
  ~event_attributes()                       = default;
  event_attributes(event_attributes const&) = default;
  event_attributes& operator=(event_attributes const&) = default;
@@ -8,7 +8,7 @@

 #include "nvToolsExt.h"

-#include "hip/hip_runtime.h"
+#include "cuda.h"

 #ifndef NVTOOLSEXT_CUDA_V3
 #define NVTOOLSEXT_CUDA_V3
@@ -42,10 +42,10 @@ extern "C" {
 */
 typedef enum nvtxResourceCUDAType_t
 {
-    NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* hipDevice_t */
-    NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* hipCtx_t */
-    NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* hipStream_t */
-    NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* hipEvent_t */
+    NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
+    NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
+    NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
+    NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
 } nvtxResourceCUDAType_t;


@@ -59,8 +59,8 @@ typedef enum nvtxResourceCUDAType_t
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
 /** @} */

 /* ------------------------------------------------------------------------- */
@@ -73,16 +73,16 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t*
 *
 * \par Example:
 * \code
- * hipError_t status = hipCtxCreate( &cuContext, 0, cuDevice );
- * if ( hipSuccess != status )
+ * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
+ * if ( CUDA_SUCCESS != status )
 *     goto Error;
 * nvtxNameCuContext(cuContext, "CTX_NAME");
 * \endcode
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
 /** @} */

 /* ------------------------------------------------------------------------- */
@@ -95,8 +95,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t*
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
 /** @} */

 /* ------------------------------------------------------------------------- */
@@ -109,8 +109,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t*
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(hipEvent_t event, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
 /** @} */

 /** @} */ /* END RESOURCE_NAMING */
@@ -8,8 +8,8 @@

 #include "nvToolsExt.h"

-#include "hip/hip_runtime.h"
-#include "hip/driver_types.h"
+#include "cuda.h"
+#include "driver_types.h"

 #ifndef NVTOOLSEXT_CUDART_V3
 #define NVTOOLSEXT_CUDART_V3
@@ -44,8 +44,8 @@ extern "C" {
 typedef enum nvtxResourceCUDARTType_t
 {
    NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
-    NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* hipStream_t */
-    NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* hipEvent_t */
+    NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
+    NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
 } nvtxResourceCUDARTType_t;


@@ -73,8 +73,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
 /** @} */

 /* ------------------------------------------------------------------------- */
@@ -87,8 +87,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_
 *
 * \version \NVTX_VERSION_1
 * @{ */
-NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* name);
-NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(hipEvent_t event, const wchar_t* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
 /** @} */

 /** @} */ /* END RESOURCE_NAMING */
@@ -0,0 +1,776 @@
+/*
+* Copyright 2021  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#include "nvtx3/nvToolsExt.h"
+
+#ifndef NVTOOLSEXT_PAYLOAD_H
+#define NVTOOLSEXT_PAYLOAD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \brief A compatibility ID value used in initialization to identify version
+ * differences.
+ */
+#define NVTX_EXT_COMPATID_PAYLOAD 0x0103
+
+/**
+ * \brief This module ID identifies the payload extension. It has to be unique
+ * among the extension modules.
+ */
+#define NVTX_EXT_MODULEID_PAYLOAD 2
+
+/**
+ * \brief Additional values for the enum @ref nvtxPayloadType_t
+ */
+#define NVTX_PAYLOAD_TYPE_BINARY ((int32_t)0xDFBD0009)
+
+
+/** ---------------------------------------------------------------------------
+ * Payload schema entry flags.
+ * ------------------------------------------------------------------------- */
+#define NVTX_PAYLOAD_ENTRY_FLAG_UNUSED 0
+
+/**
+ * Absolute pointer into a payload (entry) of the same event.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_POINTER          (1 << 1)
+
+/**
+ * Offset from base address of the payload.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_BASE (1 << 2)
+
+/**
+ * Offset from the end of this payload entry.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_OFFSET_FROM_HERE (1 << 3)
+
+/**
+ * The value is an array with fixed length, set with the field `arrayLength`.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE      (1 << 4)
+
+/**
+ * The value is a zero-/null-terminated array.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED (2 << 4)
+
+/**
+ * \brief A single or multi-dimensional array of variable length.
+ *
+ * The field `arrayLength` contains the index of the schema entry that holds the
+ * length(s). If the other field points to a scalar entry then this will be a
+ * 1D array. If the other field points to a FIXED_SIZE array, then the number of
+ * dimensions is defined with the registration of the schema. If the other field
+ * is ZERO_TERMINATED, the array dimensions can be determined at runtime.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX    (3 << 4)
+
+/**
+ * A tool may not support deep copy and just ignore this flag.
+ * See @ref NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY for more details.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_DEEP_COPY             (1 << 9)
+
+/**
+ * The entry specifies the message in a deferred event. The entry type can be
+ * any string type. The flag is ignored for schemas that are not flagged with
+ * `NVTX_PAYLOAD_SCHEMA_FLAG_RANGE*` or `NVTX_PAYLOAD_SCHEMA_FLAG_MARK`.
+ */
+#define NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE         (1 << 10)
+
+/**
+ * @note The ‘array’ flags assume that the array is embedded. Otherwise,
+ * @ref NVTX_PAYLOAD_ENTRY_FLAG_POINTER has to be additionally specified. Some
+ * combinations may be invalid based on the `NVTX_PAYLOAD_SCHEMA_TYPE_*` this
+ * entry is enclosed. For instance, variable length embedded arrays are valid
+ * within @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC but invalid with
+ * @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC. See `NVTX_PAYLOAD_SCHEMA_TYPE_*` for
+ * additional details.
+ */
+
+/* Helper macro to check if an entry represents an array. */
+#define NVTX_PAYLOAD_ENTRY_FLAG_IS_ARRAY (\
+    NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE | \
+    NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED | \
+    NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX)
+
+/** ---------------------------------------------------------------------------
+ * Types of entries in a payload schema.
+ * ------------------------------------------------------------------------- */
+
+/**
+ * @note Several of the predefined types contain the size (in bits) in their
+ * names. For some data types the size (in bytes) is not fixed and may differ
+ * for different platforms/operating systems/compilers. To provide portability,
+ * an array of sizes (in bytes) for type 1 to 28 ( @ref
+ * NVTX_PAYLOAD_ENTRY_TYPE_CHAR to @ref NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE)
+ * is passed to the NVTX extension initialization function
+ * @ref InitializeInjectionNvtxExtension via the `extInfo` field of
+ * @ref nvtxExtModuleInfo_t.
+ */
+
+#define NVTX_PAYLOAD_ENTRY_TYPE_INVALID 0
+
+/**
+ * Basic integer types.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR        1
+#define NVTX_PAYLOAD_ENTRY_TYPE_UCHAR       2
+#define NVTX_PAYLOAD_ENTRY_TYPE_SHORT       3
+#define NVTX_PAYLOAD_ENTRY_TYPE_USHORT      4
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT         5
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT        6
+#define NVTX_PAYLOAD_ENTRY_TYPE_LONG        7
+#define NVTX_PAYLOAD_ENTRY_TYPE_ULONG       8
+#define NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG    9
+#define NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG  10
+
+/**
+ * Integer types with explicit size.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT8       11
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT8      12
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT16      13
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT16     14
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT32      15
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT32     16
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT64      17
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT64     18
+
+/**
+ * C floating point types
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT      19
+#define NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE     20
+#define NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE 21
+
+/**
+ * Size type (`size_t`)
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_SIZE       22
+
+/**
+ * Any address, e.g. `void*`. If the pointer type matters, use the flag @ref
+ * NVTX_PAYLOAD_ENTRY_FLAG_POINTER and the respective type instead.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS    23
+
+/**
+ * Special character types.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_WCHAR      24 /* wide character (since C90) */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR8      25 /* since C2x and C++20 */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR16     26
+#define NVTX_PAYLOAD_ENTRY_TYPE_CHAR32     27
+
+/**
+ * There is type size and alignment information for all previous types.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE (NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 + 1)
+
+/**
+ * Store raw 8-bit binary data. As with `char`, 1-byte alignment is assumed.
+ * Typically a tool will display this as hex or binary.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_BYTE       32
+
+/**
+ * These types do not have standardized equivalents. It is assumed that the
+ * number at the end corresponds to the bits used to store the value and that
+ * the alignment corresponds to standardized types of the same size.
+ * A tool may not support these types.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_INT128     33
+#define NVTX_PAYLOAD_ENTRY_TYPE_UINT128    34
+
+#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT16    42
+#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT32    43
+#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT64    44
+#define NVTX_PAYLOAD_ENTRY_TYPE_FLOAT128   45
+
+#define NVTX_PAYLOAD_ENTRY_TYPE_BF16       50
+#define NVTX_PAYLOAD_ENTRY_TYPE_TF32       52
+
+/**
+ * These types are normalized numbers stored in integers. UNORMs represent 0.0
+ * to 1.0 and SNORMs represent -1.0 to 1.0. The number after represents the
+ * number of integer bits. Alignment is taken from equivalent types INT# matching
+ * to SNORM# and UINT# matching to UNORM#.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM8     61
+#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM8     62
+#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM16    63
+#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM16    64
+#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM32    65
+#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM32    66
+#define NVTX_PAYLOAD_ENTRY_TYPE_SNORM64    67
+#define NVTX_PAYLOAD_ENTRY_TYPE_UNORM64    68
+
+/**
+ * String types.
+ *
+ * If `arrayOrUnionDetail` is greater than `0`, the entry is a fixed-size string
+ * with the provided length.
+ *
+ * `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_FIXED_SIZE` is ignored for string types. It
+ * just specifies once more that the entry is a fixed-size string.
+ *
+ * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_ZERO_TERMINATED` indicates a
+ * zero-terminated string. If `arrayOrUnionDetail` is greater than `0`, a zero-
+ * terminated array of fixed-size strings is assumed.
+ *
+ * Setting the flag `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_LENGTH_INDEX` specifies the
+ * entry index of the entry which contains the string length. It is not possible
+ * to describe a variable length array of strings.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING       75 /* `char*`, system LOCALE */
+#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF8  76
+#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF16 77
+#define NVTX_PAYLOAD_ENTRY_TYPE_CSTRING_UTF32 78
+
+/**
+ * @ref nvtxStringHandle_t returned by @ref nvtxDomainRegisterString
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_REGISTERED_STRING_HANDLE 80
+
+/**
+ * Entry types to be used in deferred events. Data types are as defined by
+ * NVTXv3 core: category -> uint32_t, color -> uint32_t, color type -> int32_t.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_CATEGORY    90
+#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLORTYPE   91
+#define NVTX_PAYLOAD_ENTRY_TYPE_NVTX_COLOR       92
+
+/**
+ * This type marks the union selector member (entry index) in schemas used by
+ * a union with an internal selector.
+ * See @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_UNION_SELECTOR 100
+
+/**
+ * Timestamp types occupy the range from 128 to 255
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP64 128 /* data type is uint64_t */
+
+/**
+ * CPU timestamp sources.
+ * \todo All 64 bits?
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC                              129
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_TSC_NONVIRTUALIZED               130
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME           131
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_REALTIME_COARSE    132
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC          133
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_RAW      134
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_MONOTONIC_COARSE   135
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_BOOTTIME           136
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_PROCESS_CPUTIME_ID 137
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPU_CLOCK_GETTIME_THREAD_CPUTIME_ID  138
+
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_QPC     160
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFT  161
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_WIN_GSTAFTP 162
+
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIME         163
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_CLOCK        164
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_C_TIMESPEC_GET 165
+
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_STEADY_CLOCK          166
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_HIGH_RESOLUTION_CLOCK 167
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_SYSTEM_CLOCK          168
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_UTC_CLOCK             169
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_TAI_CLOCK             170
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_GPS_CLOCK             171
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_CPP_FILE_CLOCK            172
+
+/**
+ * \brief GPU timestamp sources.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_GLOBALTIMER 192
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK    193
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_SM_CLOCK64  194
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_GPU_CUPTI       195
+
+/**
+ * The timestamp was provided by the NVTX handler’s timestamp routine.
+ */
+#define NVTX_PAYLOAD_ENTRY_TYPE_TIMESTAMP_TOOL_PROVIDED 224
+
+/**
+ * This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that
+ * the payload is a blob of memory which other payload entries may point into.
+ * A tool will not expose this payload directly.
+ */
+#define NVTX_TYPE_PAYLOAD_SCHEMA_REFERENCED 1022
+
+/**
+ * This predefined schema ID can be used in `nvtxPayloadData_t` to indicate that
+ * the payload is a blob which can be shown with an arbitrary data viewer.
+ */
+#define NVTX_TYPE_PAYLOAD_SCHEMA_RAW        1023
+
+/* Custom (static) schema IDs. */
+#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START  (1 << 24)
+
+/* Dynamic schema IDs (generated by the tool) start here. */
+#define NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START 4294967296  /* == 2^32, i.e. (uint64_t)1 << 32 */
+
+
+/**
+ * \brief Size and alignment information for predefined payload entry types.
+ *
+ * The struct contains the size and the alignment size in bytes. A respective
+ * array for the predefined types is passed via nvtxExtModuleInfo_t to the NVTX
+ * client/handler. The type (ID) is used as index into this array.
+ */
+typedef struct nvtxPayloadEntryTypeInfo_t
+{
+    uint16_t size;
+    uint16_t align;
+} nvtxPayloadEntryTypeInfo_t;
+
+/**
+ * \brief Entry in a schema.
+ *
+ * A payload schema consists of an array of payload schema entries. It is
+ * registered with @ref nvtxPayloadSchemaRegister. `flags` can be set to `0` for
+ * simple values; `type` is the only required field. If not set explicitly,
+ * all other fields are zero-initialized, which means that the entry has no name
+ * and the offset is determined based on self-alignment rules.
+ *
+ * Example schema:
+ *  nvtxPayloadSchemaEntry_t desc[] = {
+ *      {0, NVTX_PAYLOAD_ENTRY_TYPE_UINT8, "one byte"},
+ *      {0, NVTX_PAYLOAD_ENTRY_TYPE_INT32, "four bytes"}
+ *  };
+ */
+typedef struct nvtxPayloadSchemaEntry_t
+{
+    /**
+     * \brief Flags to augment the basic type.
+     *
+     * This field allows additional properties of the payload entry to be
+     * specified. Valid values are `NVTX_PAYLOAD_ENTRY_FLAG_*`.
+     */
+    uint64_t       flags;
+
+    /**
+     * \brief Predefined payload schema entry type or ID of a registered payload
+     * schema.
+     */
+    uint64_t       type;
+
+    /**
+     * \brief Name of the payload entry. (Optional)
+     *
+     * Providing a name is useful to give a meaning to the associated value.
+     */
+    const char*    name;
+
+    /**
+     * \brief Description of the payload entry. (Optional)
+     */
+    const char*    description;
+
+    /**
+     * \brief String or array length or union selector for union types.
+     *
+     * If @ref type is a C string type, this defines the length of the string.
+     *
+     * If @ref flags specify that the entry is an array, this field defines the
+     * length of the array. See `NVTX_PAYLOAD_ENTRY_FLAG_ARRAY_*` for more
+     * details.
+     *
+     * If @ref type implies that the entry is a union with schema type
+     * @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION (external selection of the union
+     * member), this field contains the index (starting with 0) to an entry of
+     * integer type in the same schema. The associated field contains the
+     * selected union member.
+     *
+     * @note An array of schema type @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION is not
+     * supported. @ref NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR can
+     * be used instead.
+     */
+    uint64_t       arrayOrUnionDetail;
+
+    /**
+     * \brief Offset in the binary payload data (in bytes).
+     *
+     * This field specifies the byte offset from the base address of the actual
+     * binary data (blob) to the data of this entry.
+     *
+     * This is an optional field, but it is recommended to specify this field to
+     * avoid issues in the automatic detection of the offset by a tool/handler.
+     */
+    uint64_t       offset;
+
+    /**
+     * Semantics are not yet defined.
+     */
+    void*          semantics;
+
+    /**
+     * Reserved for future use. Do not use it!
+     */
+    void*          reserved;
+} nvtxPayloadSchemaEntry_t;
+
+/**
+ * \brief Binary payload data, size and decoding information.
+ *
+ * An array of nvtxPayloadData_t is passed to the NVTX event attribute payload
+ * member. To attach a single payload the macro @ref NVTX_PAYLOAD_EVTATTR_SET
+ * can be used.
+ */
+typedef struct nvtxPayloadData_t
+{
+    /**
+     * The schema ID, which defines the layout of the binary data.
+     */
+    uint64_t    schemaId;
+
+    /**
+     * Size of the binary payload (blob) in bytes.
+     */
+    size_t      size;
+
+    /**
+     * Pointer to the binary payload data.
+     */
+    const void* payload;
+} nvtxPayloadData_t;
+
+/* Helper macros for safe double-cast of a pointer (any expression) to uint64_t */
+#ifndef NVTX_POINTER_AS_PAYLOAD_ULLVALUE
+# ifdef __cplusplus
+# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) \
+    static_cast<uint64_t>(reinterpret_cast<uintptr_t>(p))
+# else
+# define NVTX_POINTER_AS_PAYLOAD_ULLVALUE(p) ((uint64_t)(uintptr_t)(p))
+# endif
+#endif
+
+
+#define NVTX_PAYLOAD_CONCAT2(a,b) a##b
+#define NVTX_PAYLOAD_CONCAT(a,b) NVTX_PAYLOAD_CONCAT2(a,b)
+#define NVTX_DATA_VAR NVTX_PAYLOAD_CONCAT(nvtxDFDB,__LINE__)
+
+/**
+ * \brief Helper macro to attach a single payload to an NVTX event attribute.
+ *
+ * @note The NVTX push, start or mark operation must be in the same or a nested
+ * scope, because the attribute then points at a local array declared here.
+ */
+#define NVTX_PAYLOAD_EVTATTR_SET(EVTATTR, SCHEMA_ID, PAYLOAD_ADDR, SIZE) \
+    nvtxPayloadData_t NVTX_DATA_VAR[] = {{SCHEMA_ID, SIZE, PAYLOAD_ADDR}}; \
+    (EVTATTR).payload.ullValue = \
+        NVTX_POINTER_AS_PAYLOAD_ULLVALUE(NVTX_DATA_VAR); \
+    (EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \
+    (EVTATTR).reserved0 = 1;
+
+/**
+ * \brief Helper macro to attach multiple payloads to an NVTX event attribute.
+ *
+ * The payload data array (a true `nvtxPayloadData_t[]`, since `sizeof` is used
+ * to count its elements) is passed as the second argument to this macro.
+ */
+#define NVTX_PAYLOAD_EVTATTR_SET_MULTIPLE(EVTATTR, PAYLOADS) \
+    (EVTATTR).payloadType = NVTX_PAYLOAD_TYPE_BINARY; \
+    (EVTATTR).reserved0 = sizeof(PAYLOADS)/sizeof(nvtxPayloadData_t); \
+    (EVTATTR).payload.ullValue = NVTX_POINTER_AS_PAYLOAD_ULLVALUE(PAYLOADS);
+
+
+/**
+ * \brief The payload schema type.
+ *
+ * A schema can be either of these types.
+ */
+enum nvtxPayloadSchemaType
+{
+    NVTX_PAYLOAD_SCHEMA_TYPE_INVALID = 0,
+
+    NVTX_PAYLOAD_SCHEMA_TYPE_STATIC  = 1,
+    NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC = 2,
+
+    NVTX_PAYLOAD_SCHEMA_TYPE_UNION   = 3,
+    NVTX_PAYLOAD_SCHEMA_TYPE_UNION_WITH_INTERNAL_SELECTOR = 4
+};
+
+/**
+ * \brief Flags for static and dynamic schemas.
+ */
+enum nvtxPayloadSchemaFlags
+{
+    NVTX_PAYLOAD_SCHEMA_FLAG_NONE = 0,
+
+    /**
+     * This flag indicates that a schema and the corresponding payloads can
+     * contain fields which require a deep copy.
+     */
+    NVTX_PAYLOAD_SCHEMA_FLAG_DEEP_COPY  = (1 << 1),
+
+    /**
+     * This flag indicates that a schema and the corresponding payloads can
+     * be referenced by another payload of the same event.
+     */
+    NVTX_PAYLOAD_SCHEMA_FLAG_REFERENCED = (1 << 2),
+
+    /**
+     * The schema describes a deferred event/marker. Such a schema requires one
+     * timestamp entry and one string entry with the flag
+     * `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be
+     * optionally specified with the respective entry types. The deferred event
+     * can contain a binary payload itself by using a custom schema ID as type
+     * in its schema description. Multiple occurrences of the same event can be
+     * described by specifying an array of timestamps.
+     */
+    NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_EVENT = (1 << 3),
+    /**
+     * The schema describes a deferred range. Such a schema requires
+     * one start timestamp, one end timestamp and one string entry with the flag
+     * `NVTX_PAYLOAD_ENTRY_FLAG_EVENT_MESSAGE`. Category and color can be
+     * optionally specified with the respective entry types. The deferred range
+     * can contain a binary payload itself by using a custom schema ID as type
+     * in its schema description.
+     *
+     * Timestamps can be provided in different ways:
+     *  - A single range has two timestamp entries with the first (smaller entry
+     *    index) being used as the start/push timestamp.
+     *  - If the range schema contains one array of timestamps, the tool assumes
+     *    that the array contains alternating start and end timestamps.
+     *  - If two timestamp arrays are specified the first entry (with the
+     *    smaller entry index) is assumed to contain the start timestamps. Both
+     *    arrays have to be of the same size.
+     */
+    NVTX_PAYLOAD_SCHEMA_FLAG_DEFERRED_RANGE = (2 << 3)
+};
+
+/**
+ * The values allow the valid fields in @ref nvtxPayloadSchemaAttr_t to be
+ * specified via setting the field `fieldMask`.
+ */
+#define NVTX_PAYLOAD_SCHEMA_ATTR_NAME        (1 << 1)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_TYPE        (1 << 2)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_FLAGS       (1 << 3)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES     (1 << 4)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES (1 << 5)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE (1 << 6)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_ALIGNMENT   (1 << 7)
+#define NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID   (1 << 8)
+
+/**
+ * NVTX payload schema attributes.
+ */
+typedef struct nvtxPayloadSchemaAttr_t
+{
+    /**
+     * \brief Mask of valid fields in this structure.
+     *
+     * The `NVTX_PAYLOAD_SCHEMA_ATTR_*` values have to be used.
+     */
+    uint64_t                        fieldMask;
+
+    /**
+     * \brief Name of the payload schema. (Optional)
+     */
+    const char*                     name;
+
+    /**
+     * \brief Payload schema type. (Mandatory) \anchor PAYLOAD_TYPE_FIELD
+     *
+     * A value from `enum nvtxPayloadSchemaType` has to be used.
+     */
+    uint64_t                        type;
+
+    /**
+     * \brief Payload schema flags. (Optional)
+     *
+     * Flags defined in `enum nvtxPayloadSchemaFlags` can be used to set
+     * additional properties of the schema.
+     */
+    uint64_t                        flags;
+
+    /**
+     * \brief Entries of a payload schema. (Mandatory) \anchor ENTRIES_FIELD
+     *
+     * This field is a pointer to an array of schema entries, each describing a
+     * field in a data structure, e.g. in a C struct or union.
+     */
+    const nvtxPayloadSchemaEntry_t* entries;
+
+    /**
+     * \brief Number of entries in the payload schema. (Mandatory)
+     *
+     * Number of entries in the array of payload entries \ref ENTRIES_FIELD.
+     */
+    size_t                          numEntries;
+
+    /**
+     * \brief The binary payload size in bytes for static payload schemas.
+     *
+     * If \ref PAYLOAD_TYPE_FIELD is @ref NVTX_PAYLOAD_SCHEMA_TYPE_DYNAMIC this
+     * value is ignored. If this field is not specified for a schema of type
+     * @ref NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, the size can be automatically
+     * determined by a tool.
+     */
+    size_t                          payloadStaticSize;
+
+    /**
+     * \brief The byte alignment for packed structures.
+     *
+     * If not specified, this field defaults to `0`, which means that the fields
+     * in the data structure are not packed and natural alignment rules can be
+     * applied.
+     */
+    size_t                          packAlign;
+
+    /* Static/custom schema ID must be
+       >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and
+       < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START */
+    uint64_t                        schemaId;
+} nvtxPayloadSchemaAttr_t;
+
+/**
+ * \brief Register a payload schema.
+ *
+ * @param domain NVTX domain handle.
+ * @param attr NVTX payload schema attributes.
+ */
+NVTX_DECLSPEC uint64_t NVTX_API nvtxPayloadSchemaRegister(
+    nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr);
+
+/**
+ * \brief Enumeration entry.
+ *
+ * Since the value of an enum entry might not be meaningful for the analysis,
+ * a tool can show the name of the enum entry instead.
+ *
+ * @note EXPERIMENTAL
+ */
+typedef struct nvtxPayloadEnum_t
+{
+    /**
+     * Name of the enum value.
+     */
+    const char* name;
+
+    /**
+     * Value of the enum entry.
+     */
+    uint64_t    value;
+
+    /**
+     * Indicates that this entry sets a specific set of bits, which can be used
+     * to easily define bitsets.
+     */
+    int8_t      isFlag;
+} nvtxPayloadEnum_t;
+
+/**
+ * The values are used to set the field `fieldMask` and specify which fields in
+ * `nvtxPayloadEnumAttr_t` are set.
+ */
+#define NVTX_PAYLOAD_ENUM_ATTR_NAME        (1 << 1)
+#define NVTX_PAYLOAD_ENUM_ATTR_ENTRIES     (1 << 2)
+#define NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES (1 << 3)
+#define NVTX_PAYLOAD_ENUM_ATTR_SIZE        (1 << 4)
+#define NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID   (1 << 5)
+
+/**
+ * NVTX payload enumeration type attributes.
+ */
+typedef struct nvtxPayloadEnumAttr_t {
+    /**
+     * Mask of valid fields in this struct.
+     * The `NVTX_PAYLOAD_ENUM_ATTR_*` values have to be used.
+     */
+    uint64_t                 fieldMask;
+
+    /**
+     * Name of the enum. (Optional)
+     */
+    const char*              name;
+
+    /**
+     * Entries of the enum. (Mandatory)
+     */
+    const nvtxPayloadEnum_t* entries;
+
+    /**
+     * Number of entries in the enum. (Mandatory)
+     */
+    size_t                   numEntries;
+
+    /**
+     * Size of enumeration type in bytes
+     */
+    size_t                   sizeOfEnum;
+
+    /**
+     * Static/custom schema ID must be
+     * >= NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START and
+     *  < NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_DYNAMIC_START
+     */
+    uint64_t                 schemaId;
+} nvtxPayloadEnumAttr_t;
+
+/**
+ * \brief Register an enumeration type with the payload extension.
+ *
+ * @param domain NVTX domain handle
+ * @param attr NVTX payload enumeration type attributes.
+ */
+NVTX_DECLSPEC uint64_t nvtxPayloadEnumRegister(nvtxDomainHandle_t domain,
+    const nvtxPayloadEnumAttr_t* attr);
+
+/**
+ * \brief Callback Ids of API functions in the payload extension.
+ *
+ * The NVTX handler can use these values to register a handler function. When
+ * InitializeInjectionNvtxExtension(nvtxExtModuleInfo_t* moduleInfo) is
+ * executed, a handler routine 'handlenvtxPayloadRegisterSchema' can be
+ * registered as follows:
+ *      moduleInfo->segments->slots[NVTX3EXT_CBID_nvtxPayloadSchemaRegister] =
+ *          (intptr_t)handlenvtxPayloadRegisterSchema;
+ */
+typedef enum NvtxExtPayloadCallbackId
+{
+    NVTX3EXT_CBID_nvtxPayloadSchemaRegister = 0,
+    NVTX3EXT_CBID_nvtxPayloadEnumRegister   = 1,
+    NVTX3EXT_CBID_PAYLOAD_FN_NUM            = 2
+} NvtxExtPayloadCallbackId;
+
+#ifdef __GNUC__
+#pragma GCC visibility push(internal)
+#endif
+
+#define NVTX_EXT_TYPES_GUARD /* Ensure other headers cannot include directly */
+#include "nvtxExtDetail/nvtxExtTypes.h"
+#undef NVTX_EXT_TYPES_GUARD
+
+#ifndef NVTX_NO_IMPL
+#define NVTX_EXT_IMPL_PAYLOAD_GUARD /* Ensure other headers cannot include directly */
+#include "nvtxExtDetail/nvtxExtPayloadTypeInfo.h"
+#include "nvtxExtDetail/nvtxExtImplPayload_v1.h"
+#undef NVTX_EXT_IMPL_PAYLOAD_GUARD
+#endif /*NVTX_NO_IMPL*/
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* NVTOOLSEXT_PAYLOAD_H */
@@ -16,10 +16,10 @@ extern "C" {

 typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
 typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
-typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(hipStream_t stream, const char* name);
-typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(hipStream_t stream, const wchar_t* name);
-typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(hipEvent_t event, const char* name);
-typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(hipEvent_t event, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
+typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
+typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);

 NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
 {
@@ -39,7 +39,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
@@ -48,7 +48,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char*
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
@@ -57,7 +57,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
@@ -66,7 +66,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* nam
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(hipEvent_t event, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
@@ -15,16 +15,16 @@
 extern "C" {
 #endif /* __cplusplus */

-typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(hipDevice_t device, const char* name);
-typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(hipDevice_t device, const wchar_t* name);
-typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(hipCtx_t context, const char* name);
-typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(hipCtx_t context, const wchar_t* name);
-typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(hipStream_t stream, const char* name);
-typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(hipStream_t stream, const wchar_t* name);
-typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(hipEvent_t event, const char* name);
-typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(hipEvent_t event, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
+typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
+typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
+typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
+typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
+typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);

-NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
@@ -33,7 +33,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* na
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
@@ -42,7 +42,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t*
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
@@ -51,7 +51,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* nam
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
@@ -60,7 +60,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t*
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
@@ -69,7 +69,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* na
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
@@ -78,7 +78,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t*
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
@@ -87,7 +87,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name)
 #endif /*NVTX_DISABLE*/
 }

-NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(hipEvent_t event, const wchar_t* name)
+NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
 {
 #ifndef NVTX_DISABLE
    nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
@@ -18,7 +18,7 @@
 /* ------ Dependency-free types binary-compatible with real types ------- */

 /* In order to avoid having the NVTX core API headers depend on non-NVTX
-*  headers like hip/hip_runtime.h, NVTX defines binary-compatible types to use for
+*  headers like cuda.h, NVTX defines binary-compatible types to use for
 *  safely making the initialization versions of all NVTX functions without
 *  needing to have definitions for the real types. */

@@ -0,0 +1,93 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_IMPL_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#ifndef NVTX_EXT_IMPL_H
+#define NVTX_EXT_IMPL_H
+/* ---- Include required platform headers ---- */
+
+#if defined(_WIN32) 
+
+#include <Windows.h>
+
+#else
+#include <unistd.h>
+
+#if defined(__ANDROID__)
+#include <android/api-level.h> 
+#endif
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#include <sched.h>
+#endif
+
+#include <limits.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <string.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+#endif
+
+/* ---- Define macros used in this file ---- */
+
+#ifdef NVTX_DEBUG_PRINT
+#ifdef __ANDROID__
+#include <android/log.h>
+#define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
+#define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
+#else
+#include <stdio.h>
+#define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
+#define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
+#endif
+#else /* !defined(NVTX_DEBUG_PRINT) */
+#define NVTX_ERR(...)
+#define NVTX_INFO(...)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+// #ifdef __GNUC__
+// #pragma GCC visibility push(hidden)
+// #endif
+
+#define NVTX_EXTENSION_FRESH 0
+#define NVTX_EXTENSION_DISABLED 1
+#define NVTX_EXTENSION_STARTING 2
+#define NVTX_EXTENSION_LOADED 3
+
+NVTX_LINKONCE_DEFINE_GLOBAL NvtxExtInitializeInjectionFunc_t NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = (NvtxExtInitializeInjectionFunc_t)0;
+
+#define NVTX_EXT_INIT_GUARD
+#include "nvtxExtInit.h"
+#undef NVTX_EXT_INIT_GUARD
+
+// #ifdef __GNUC__
+// #pragma GCC visibility pop
+// #endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* NVTX_EXT_IMPL_H */
@@ -0,0 +1,85 @@
+/*
+* Copyright 2021  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#define NVTX_EXT_IMPL_GUARD
+#include "nvtxExtImpl.h"
+#undef NVTX_EXT_IMPL_GUARD
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) \
+    NAME##_v##VERSION##_mem##COMPATID /* pastes NAME_v<VERSION>_mem<COMPATID> */
+#define NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, VERSION, COMPATID) \
+    NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L3(NAME, VERSION, COMPATID) /* extra level so arguments expand before pasting */
+#define NVTX_EXT_PAYLOAD_VERSIONED_ID(NAME) \
+    NVTX_EXT_PAYLOAD_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION, NVTX_EXT_COMPATID_PAYLOAD)
+
+/*
+ * Function slots for the binary payload extension. Entry 0 is the module
+ * state, zero-initialized (`NVTX_EXTENSION_FRESH`); function slots follow it.
+ */
+NVTX_LINKONCE_DEFINE_GLOBAL intptr_t
+NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_PAYLOAD_FN_NUM + 1]
+    = {0};
+
+/* Describes the payload slot table (one segment) and hands it to the shared nvtxExtInitOnce. */
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(void)
+{
+    nvtxExtModuleSegment_t segment = {
+        0, /* unused (only one segment) */
+        NVTX3EXT_CBID_PAYLOAD_FN_NUM,
+        NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots) + 1 /* skip the module-state entry */
+    };
+
+    nvtxExtModuleInfo_t module = {
+        NVTX_VERSION, sizeof(nvtxExtModuleInfo_t),
+        NVTX_EXT_MODULEID_PAYLOAD, NVTX_EXT_COMPATID_PAYLOAD,
+        1, &segment, /* number of segments, segments */
+        NULL, /* no export function needed */
+        /* bake type sizes and alignment information into program binary */
+        &nvtxExtPayloadTypeInfo
+    };
+
+    NVTX_INFO( "%s\n", __FUNCTION__  );
+    NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce)(&module,
+        NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots));
+}
+
+#define NVTX_EXT_FN_IMPL(ret_val, fn_name, signature, arg_names) \
+typedef ret_val ( * fn_name##_impl_fntype )signature; \
+NVTX_LINKONCE_DEFINE_FUNCTION ret_val fn_name signature { \
+    /* Entry 0 holds the module state, so function slots are offset by one. */ \
+    intptr_t* const entry = \
+        &NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadSlots)[NVTX3EXT_CBID_##fn_name + 1]; \
+    intptr_t fn = *entry; \
+    if (!fn) { \
+        /* Untouched slot: run the one-time initialization, then re-read it. */ \
+        NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadInitOnce)(); \
+        fn = *entry; \
+    } \
+    if (fn && fn != NVTX_EXTENSION_DISABLED) { \
+        return (*(fn_name##_impl_fntype)fn) arg_names; \
+    } \
+    return ((ret_val)(intptr_t)-1); \
+}
+
+NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadSchemaRegister, (nvtxDomainHandle_t domain, const nvtxPayloadSchemaAttr_t* attr), (domain, attr))
+
+NVTX_EXT_FN_IMPL(uint64_t, nvtxPayloadEnumRegister, (nvtxDomainHandle_t domain, const nvtxPayloadEnumAttr_t* attr), (domain, attr))
+
+#undef NVTX_EXT_FN_IMPL
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
@@ -0,0 +1,363 @@
+/*
+* Copyright 2009-2020  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+#ifndef NVTX_EXT_INIT_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/* ---- Platform-independent helper definitions and functions ---- */
+
+/* Prefer macros over inline functions to reduce symbol resolution at link time */
+
+#if defined(_WIN32)
+#define NVTX_PATHCHAR   wchar_t
+#define NVTX_STR(x)     L##x
+#define NVTX_GETENV     _wgetenv
+#define NVTX_BUFSIZE    MAX_PATH
+#define NVTX_DLLHANDLE  HMODULE
+#define NVTX_DLLOPEN(x) LoadLibraryW(x)
+#define NVTX_DLLFUNC    GetProcAddress
+#define NVTX_DLLCLOSE   FreeLibrary
+#define NVTX_YIELD()    SwitchToThread()
+#define NVTX_MEMBAR()   MemoryBarrier()
+#define NVTX_ATOMIC_WRITE_32(address, value)                        InterlockedExchange((volatile LONG*)address, value)
+#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) old = InterlockedCompareExchange((volatile LONG*)address, exchange, comparand)
+#define NVTX_ATOMIC_WRITE_PTR(address, value)                        InterlockedExchangePointer((volatile PVOID*)address, (PVOID)value)
+#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) old = (intptr_t)InterlockedCompareExchangePointer((volatile PVOID*)address, (PVOID)exchange, (PVOID)comparand)
+
+
+#elif defined(__GNUC__)
+#define NVTX_PATHCHAR   char
+#define NVTX_STR(x)     x
+#define NVTX_GETENV     getenv
+#define NVTX_BUFSIZE    PATH_MAX
+#define NVTX_DLLHANDLE  void*
+#define NVTX_DLLOPEN(x) dlopen(x, RTLD_LAZY)
+#define NVTX_DLLFUNC    dlsym
+#define NVTX_DLLCLOSE   dlclose
+#define NVTX_YIELD()    sched_yield()
+#define NVTX_MEMBAR()   __sync_synchronize()
+/* Full barrier before each atomic, to match the Windows functions.  GCC's CAS builtin takes (ptr, expected, desired). */
+#define NVTX_ATOMIC_WRITE_32(address, value)                  __sync_synchronize();       __sync_lock_test_and_set(address, value)
+#define NVTX_ATOMIC_CAS_32(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, comparand, exchange)
+#define NVTX_ATOMIC_WRITE_PTR(address, value)                  __sync_synchronize();       __sync_lock_test_and_set(address, value)
+#define NVTX_ATOMIC_CAS_PTR(old, address, exchange, comparand) __sync_synchronize(); old = __sync_val_compare_and_swap(address, comparand, exchange)
+#else
+#error The library does not support your configuration!
+#endif
+
+/* Define this to 1 for platforms where pre-injected libraries can be discovered. */
+#if defined(_WIN32)
+/* TODO */
+#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
+#else
+#define NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY 0
+#endif
+
+/* Define this to 1 for platforms that support environment variables */
+/* TODO: Detect UWP, a.k.a. Windows Store app, and set this to 0. */
+/* Try:  #if defined(WINAPI_FAMILY_PARTITION) && WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) */
+#define NVTX_SUPPORT_ENV_VARS 1
+
+/* Define this to 1 for platforms that support dynamic/shared libraries */
+#define NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY 1
+
+/* Injection libraries implementing InitializeInjectionNvtxExtension may be statically linked;
+*  the static entry point is used when no dynamic injection is found.  Useful for platforms
+*  where dynamic injection is not available.  Since weak symbols not explicitly marked extern
+*  are guaranteed to be initialized to zero if no definitions are found by the linker, the
+*  loader simply finds no static injection if InitializeInjectionNvtxExtension_fnptr is 0. */
+#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__)
+#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 1
+/* To statically inject an NVTX library, define InitializeInjectionNvtxExtension_fnptr as a normal
+*  symbol (not weak) pointing to the implementation of InitializeInjectionNvtxExtension (which
+*  does not need to be named "InitializeInjectionNvtxExtension" as is necessary in a dynamic
+*  injection library). */
+__attribute__((weak)) NvtxExtInitializeInjectionFunc_t InitializeInjectionNvtxExtension_fnptr;
+#else
+#define NVTX_SUPPORT_STATIC_INJECTION_LIBRARY 0
+#endif
+
+
+
+/* This function tries to find or load an NVTX injection library and get the
+*  address of its InitializeInjectionNvtxExtension function.  The function
+*  pointer is not called here; it is returned through out_init_fnptr (when
+*  that pointer is non-null) so the caller can invoke it.  If looking up the
+*  entry point in a freshly loaded dynamic library fails, that library is
+*  closed again before returning.  Returns NVTX_SUCCESS if an entry point
+*  was found and an NVTX_ERR_* code otherwise.  This is implemented as one
+*  function instead of several small functions to minimize the number of
+*  weak symbols the linker must resolve.
+*  Order of search is:
+*  - Pre-injected library exporting InitializeInjectionNvtxExtension
+*  - Loadable library exporting InitializeInjectionNvtxExtension
+*      - Path specified by env var NVTX_INJECTION??_PATH (?? is 32 or 64)
+*      - On Android, libNvtxInjection??.so within the package (?? is 32 or 64)
+*  - Statically-linked injection library defining InitializeInjectionNvtxExtension_fnptr
+*/
+NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr);
+NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(NvtxExtInitializeInjectionFunc_t* out_init_fnptr)
+{
+    const char* const initFuncName = "InitializeInjectionNvtxExtension";
+    NvtxExtInitializeInjectionFunc_t init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
+    NVTX_DLLHANDLE injectionLibraryHandle = (NVTX_DLLHANDLE)0;
+
+    if(out_init_fnptr){
+        *out_init_fnptr = (NvtxExtInitializeInjectionFunc_t)0;
+    }
+
+#if NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY
+    /* Use POSIX global symbol chain to query for init function from any module */
+    init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(0, initFuncName);
+#endif
+
+#if NVTX_SUPPORT_DYNAMIC_INJECTION_LIBRARY
+    /* Try discovering dynamic injection library to load */
+    if (!init_fnptr)
+    {
+#if NVTX_SUPPORT_ENV_VARS
+        /* If env var NVTX_INJECTION64_PATH is set, it should contain the path
+        *  to a 64-bit dynamic NVTX injection library (and similar for 32-bit). */
+        const NVTX_PATHCHAR* const nvtxEnvVarName = (sizeof(void*) == 4)
+            ? NVTX_STR("NVTX_INJECTION32_PATH")
+            : NVTX_STR("NVTX_INJECTION64_PATH");
+#endif /* NVTX_SUPPORT_ENV_VARS */
+        NVTX_PATHCHAR injectionLibraryPathBuf[NVTX_BUFSIZE];
+        const NVTX_PATHCHAR* injectionLibraryPath = (const NVTX_PATHCHAR*)0;
+
+        /* Refer to this variable explicitly in case all references to it are #if'ed out */
+        (void)injectionLibraryPathBuf;
+
+#if NVTX_SUPPORT_ENV_VARS
+        /* Disable the warning for getenv & _wgetenv -- this usage is safe because
+        *  these functions are not called again before using the returned value. */
+#if defined(_MSC_VER)
+#pragma warning( push )
+#pragma warning( disable : 4996 )
+#endif
+        injectionLibraryPath = NVTX_GETENV(nvtxEnvVarName);
+#if defined(_MSC_VER)
+#pragma warning( pop )
+#endif
+#endif
+
+#if defined(__ANDROID__)
+        if (!injectionLibraryPath)
+        {
+            const char *bits = (sizeof(void*) == 4) ? "32" : "64";
+            char cmdlineBuf[32];
+            char pkgName[PATH_MAX];
+            int count;
+            int pid;
+            FILE *fp;
+            size_t bytesRead;
+            size_t pos;
+
+            pid = (int)getpid();
+            count = snprintf(cmdlineBuf, sizeof(cmdlineBuf), "/proc/%d/cmdline", pid);
+            if (count <= 0 || count >= (int)sizeof(cmdlineBuf))
+            {
+                NVTX_ERR("Path buffer too small for: /proc/%d/cmdline\n", pid);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            fp = fopen(cmdlineBuf, "r");
+            if (!fp)
+            {
+                NVTX_ERR("File couldn't be opened: %s\n", cmdlineBuf);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            bytesRead = fread(pkgName, 1, sizeof(pkgName) - 1, fp);
+            fclose(fp);
+            if (bytesRead == 0)
+            {
+                NVTX_ERR("Package name couldn't be read from file: %s\n", cmdlineBuf);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            pkgName[bytesRead] = 0;
+
+            /* String can contain colon as a process separator. In this case the package name is before the colon. */
+            pos = 0;
+            while (pos < bytesRead && pkgName[pos] != ':' && pkgName[pos] != '\0')
+            {
+                ++pos;
+            }
+            pkgName[pos] = 0;
+
+            count = snprintf(injectionLibraryPathBuf, NVTX_BUFSIZE, "/data/data/%s/files/libNvtxInjection%s.so", pkgName, bits);
+            if (count <= 0 || count >= NVTX_BUFSIZE)
+            {
+                NVTX_ERR("Path buffer too small for: /data/data/%s/files/libNvtxInjection%s.so\n", pkgName, bits);
+                return NVTX_ERR_INIT_ACCESS_LIBRARY;
+            }
+
+            /* On Android, verify path is accessible due to aggressive file access restrictions. */
+            /* For dlopen, if the filename contains a leading slash, then it is interpreted as a */
+            /* relative or absolute pathname; otherwise it will follow the rules in ld.so. */
+            if (injectionLibraryPathBuf[0] == '/')
+            {
+#if (__ANDROID_API__ < 21)
+                int access_err = access(injectionLibraryPathBuf, F_OK | R_OK);
+#else
+                int access_err = faccessat(AT_FDCWD, injectionLibraryPathBuf, F_OK | R_OK, 0);
+#endif
+                if (access_err != 0)
+                {
+                    NVTX_ERR("Injection library path wasn't accessible [code=%s] [path=%s]\n", strerror(errno), injectionLibraryPathBuf);
+                    return NVTX_ERR_INIT_ACCESS_LIBRARY;
+                }
+            }
+            injectionLibraryPath = injectionLibraryPathBuf;
+        }
+#endif
+
+        /* At this point, injectionLibraryPath is specified if a dynamic
+        *  injection library was specified by a tool. */
+        if (injectionLibraryPath)
+        {
+            /* Load the injection library */
+            injectionLibraryHandle = NVTX_DLLOPEN(injectionLibraryPath);
+            if (!injectionLibraryHandle)
+            {
+                NVTX_ERR("Failed to load injection library\n");
+                return NVTX_ERR_INIT_LOAD_LIBRARY;
+            }
+            else
+            {
+                /* Attempt to get the injection library's entry-point */
+                init_fnptr = (NvtxExtInitializeInjectionFunc_t)NVTX_DLLFUNC(injectionLibraryHandle, initFuncName);
+                if (!init_fnptr)
+                {
+                    NVTX_DLLCLOSE(injectionLibraryHandle);
+                    NVTX_ERR("Failed to get address of function %s from injection library\n", initFuncName);
+                    return NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT;
+                }
+            }
+        }
+    }
+#endif
+
+#if NVTX_SUPPORT_STATIC_INJECTION_LIBRARY
+    if (!init_fnptr)
+    {
+        /* Check weakly-defined function pointer.  A statically-linked injection can define this as
+        *  a normal symbol; it is only consulted when no dynamic injection was found above. */
+        if (InitializeInjectionNvtxExtension_fnptr)
+        {
+            init_fnptr = InitializeInjectionNvtxExtension_fnptr;
+        }
+    }
+#endif
+
+    if(out_init_fnptr){
+        *out_init_fnptr = init_fnptr;
+    }
+
+    /* At this point, if init_fnptr is not set, then no tool has specified
+    *  an NVTX injection library -- return non-success result so all NVTX
+    *  API functions will be set to no-ops. */
+    if (!init_fnptr)
+    {
+        return NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE;
+    }
+
+    return NVTX_SUCCESS;
+}
+
+NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxExtInitOnce) (
+    nvtxExtModuleInfo_t* moduleInfo, /* module description passed on to the injection's init function */
+    intptr_t* moduleState /* state word, moved from NVTX_EXTENSION_FRESH towards NVTX_EXTENSION_LOADED */
+    )
+{
+    intptr_t old; /* state observed by the compare-and-swap below */
+
+    NVTX_INFO( "%s\n", __FUNCTION__ );
+
+    if( *moduleState == NVTX_EXTENSION_LOADED) {
+        return; /* initialization already finished: nothing to do */
+    }
+
+    NVTX_ATOMIC_CAS_PTR(
+        old,
+        moduleState,
+        NVTX_EXTENSION_STARTING,
+        NVTX_EXTENSION_FRESH);
+    if (old == NVTX_EXTENSION_FRESH)
+    {
+        NvtxExtInitializeInjectionFunc_t init_fnptr = NVTX_VERSIONED_IDENTIFIER(injectionFnPtr);
+        int entryPointStatus = 0;
+        int forceAllToNoops = 0;
+
+        /* Load & initialize injection library -- it will assign the function pointers */
+        if(init_fnptr == 0){
+            int result = 0;
+
+            /* try to load vanilla NVTX first */
+            nvtxInitialize(0);
+
+            result = NVTX_VERSIONED_IDENTIFIER(nvtxExtLoadInjectionLibrary)(&init_fnptr);
+            /* at this point init_fnptr will be either 0 or a real function */
+
+            if(result == NVTX_SUCCESS) {
+                NVTX_VERSIONED_IDENTIFIER(injectionFnPtr) = init_fnptr; /* cache the entry point in the shared global */
+            }
+            else {
+                NVTX_ERR("Failed to load injection library\n");
+            }
+        }
+
+        if(init_fnptr != 0) {
+            /* Invoke injection library's initialization function.  A return value of 0 means
+            *  failure; nothing is unloaded here, but every slot is disabled below. */
+            entryPointStatus = init_fnptr(moduleInfo);
+            if (entryPointStatus == 0) {
+                NVTX_ERR("Failed to initialize injection library -- initialization function returned 0\n");
+            }
+        }
+
+        /* Clean up any functions that are still uninitialized so that they are skipped.
+         * Disable every slot if no init function was found or it reported failure.
+        */
+        forceAllToNoops = (init_fnptr == 0) || (entryPointStatus == 0);
+        for(size_t s = 0; s < moduleInfo->segmentsCount; ++s){
+            nvtxExtModuleSegment_t* segment = moduleInfo->segments+s;
+            for(size_t i = 0; i < segment->slotCount; ++i){
+                if(forceAllToNoops || (segment->functionSlots[i] == NVTX_EXTENSION_FRESH)){
+                    segment->functionSlots[i] = NVTX_EXTENSION_DISABLED;
+                }
+            }
+        }
+
+        NVTX_MEMBAR();
+
+        /* Signal that initialization has finished, so now the assigned function pointers will be used */
+        NVTX_ATOMIC_WRITE_PTR(
+            moduleState,
+            NVTX_EXTENSION_LOADED);
+    }
+    else /* State was not FRESH: spin-wait until the state reads NVTX_EXTENSION_LOADED */
+    {
+        NVTX_MEMBAR();
+        while (*moduleState != NVTX_EXTENSION_LOADED)
+        {
+            NVTX_YIELD();
+            NVTX_MEMBAR();
+        }
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
@@ -0,0 +1,128 @@
+#ifndef NVTX_EXT_IMPL_PAYLOAD_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExtPayload.h (except when NVTX_NO_IMPL is defined).
+#endif
+
+/*
+ * Helper array to get the alignment for each predefined C language type.
+ */
+
+typedef void* pointer_type;
+
+#if __STDC_VERSION__ >= 201112L /* or CPP11 */
+#include <stdalign.h>
+#define nvtx_alignof(type) alignof(type)
+#define nvtx_alignof2(type,tname) alignof(type)
+#else /*  __STDC_VERSION__ >= 201112L */
+#ifndef __cplusplus
+
+#include <stddef.h>
+#define nvtx_alignof(type) offsetof(struct {char c; type d;}, d)
+#define nvtx_alignof2(type,tname) nvtx_alignof(type)
+
+#else /* __cplusplus */
+
+#define MKTYPEDEF(TYPE) typedef struct {char c; TYPE d;} _nvtx_##TYPE
+#define MKTYPEDEF2(TYPE,TNAME) typedef struct {char c; TYPE d;} _nvtx_##TNAME
+#define nvtx_alignof(TNAME) offsetof(_nvtx_##TNAME, d)
+#define nvtx_alignof2(type,tname) offsetof(_nvtx_##tname, d)
+
+MKTYPEDEF(char);
+MKTYPEDEF2(unsigned char, uchar);
+MKTYPEDEF(short);
+MKTYPEDEF2(unsigned short, ushort);
+MKTYPEDEF(int);
+MKTYPEDEF2(unsigned int, uint);
+MKTYPEDEF(long);
+MKTYPEDEF2(unsigned long, ulong);
+MKTYPEDEF2(long long, longlong);
+MKTYPEDEF2(unsigned long long, ulonglong);
+
+MKTYPEDEF(int8_t);
+MKTYPEDEF(uint8_t);
+MKTYPEDEF(int16_t);
+MKTYPEDEF(uint16_t);
+MKTYPEDEF(int32_t);
+MKTYPEDEF(uint32_t);
+MKTYPEDEF(int64_t);
+MKTYPEDEF(uint64_t);
+
+MKTYPEDEF(float);
+MKTYPEDEF(double);
+MKTYPEDEF2(long double, longdouble);
+
+MKTYPEDEF(size_t);
+MKTYPEDEF(pointer_type);
+
+MKTYPEDEF(wchar_t);
+#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
+    /* Helper struct for char8_t, only declared when the language version provides the type. */
+    MKTYPEDEF(char8_t);
+#endif
+#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
+    MKTYPEDEF(char16_t);
+    MKTYPEDEF(char32_t);
+#endif
+
+#undef MKTYPEDEF
+#undef MKTYPEDEF2
+
+#endif /* __cplusplus */
+#endif /*  __STDC_VERSION__ >= 201112L */
+
+/*
+ * The order of entries must match the values in `enum nvtxPayloadSchemaEntryType`.
+ */
+const nvtxPayloadEntryTypeInfo_t nvtxExtPayloadTypeInfo[NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE] =
+{
+    /* The first entry contains this array's length and the size of each entry in this array. */
+    {NVTX_PAYLOAD_ENTRY_TYPE_INFO_ARRAY_SIZE, sizeof(nvtxPayloadEntryTypeInfo_t)},
+
+    /*** C integer types ***/
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR */   {sizeof(char), nvtx_alignof(char)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UCHAR */  {sizeof(unsigned char), nvtx_alignof2(unsigned char, uchar)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_SHORT */  {sizeof(short), nvtx_alignof(short)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_USHORT */ {sizeof(unsigned short), nvtx_alignof2(unsigned short, ushort)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_INT */    {sizeof(int), nvtx_alignof(int)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UINT */   {sizeof(unsigned int), nvtx_alignof2(unsigned int, uint)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_LONG */   {sizeof(long), nvtx_alignof(long)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_ULONG */  {sizeof(unsigned long), nvtx_alignof2(unsigned long, ulong)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_LONGLONG */  {sizeof(long long), nvtx_alignof2(long long, longlong)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_ULONGLONG */ {sizeof(unsigned long long), nvtx_alignof2(unsigned long long,ulonglong)},
+
+    /*** Integer types with explicit size ***/
+    /* NVTX_PAYLOAD_ENTRY_TYPE_INT8 */   {sizeof(int8_t),   nvtx_alignof(int8_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UINT8 */  {sizeof(uint8_t),  nvtx_alignof(uint8_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_INT16 */  {sizeof(int16_t),  nvtx_alignof(int16_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UINT16 */ {sizeof(uint16_t), nvtx_alignof(uint16_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_INT32 */  {sizeof(int32_t),  nvtx_alignof(int32_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UINT32 */ {sizeof(uint32_t), nvtx_alignof(uint32_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_INT64 */  {sizeof(int64_t),  nvtx_alignof(int64_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_UINT64 */ {sizeof(uint64_t), nvtx_alignof(uint64_t)},
+
+    /*** C floating point types ***/
+    /* NVTX_PAYLOAD_ENTRY_TYPE_FLOAT */      {sizeof(float),       nvtx_alignof(float)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_DOUBLE */     {sizeof(double),      nvtx_alignof(double)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_LONGDOUBLE */ {sizeof(long double), nvtx_alignof2(long double, longdouble)},
+
+    /* NVTX_PAYLOAD_ENTRY_TYPE_SIZE */    {sizeof(size_t),       nvtx_alignof(size_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_ADDRESS */ {sizeof(pointer_type), nvtx_alignof(pointer_type)},
+
+    /*** Special character types ***/
+    /* NVTX_PAYLOAD_ENTRY_TYPE_WCHAR */ {sizeof(wchar_t), nvtx_alignof(wchar_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR8 */
+#if (__STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
+    {sizeof(char8_t), nvtx_alignof(char8_t)},
+#else
+    {0, 0},
+#endif
+#if (__STDC_VERSION__ >= 201112L) || (defined(__cplusplus) && __cplusplus >= 201103L)
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {sizeof(char16_t), nvtx_alignof(char16_t)},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {sizeof(char32_t), nvtx_alignof(char32_t)}
+#else
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR16 */ {0, 0},
+    /* NVTX_PAYLOAD_ENTRY_TYPE_CHAR32 */ {0, 0}
+#endif
+};
+
+#undef nvtx_alignof
+#undef nvtx_alignof2
@@ -0,0 +1,44 @@
+/*
+* Copyright 2021  NVIDIA Corporation.  All rights reserved.
+*
+* Licensed under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*/
+
+/* This header defines types which are used by the internal implementation
+*  of NVTX and callback subscribers.  API clients do not use these types,
+*  so they are defined here instead of in nvToolsExt.h to clarify they are
+*  not part of the NVTX client API. */
+
+#ifndef NVTXEXTTYPES_H
+#define NVTXEXTTYPES_H
+
+#ifndef NVTX_EXT_TYPES_GUARD
+#error Never include this file directly -- it is automatically included by nvToolsExt[EXTENSION].h.
+#endif
+
+typedef intptr_t (NVTX_API * NvtxExtGetExportFunction_t)(uint32_t exportFunctionId);
+
+typedef struct nvtxExtModuleSegment_t
+{
+    size_t segmentId; /* identifier of this segment within its module */
+    size_t slotCount; /* number of entries in functionSlots */
+    intptr_t* functionSlots; /* array of slotCount function slots, one intptr_t each */
+} nvtxExtModuleSegment_t;
+
+typedef struct nvtxExtModuleInfo_t
+{
+    uint16_t nvtxVer; /* NVTX version of the module */
+    uint16_t structSize; /* size of this struct in bytes */
+    uint16_t moduleId; /* identifier of the extension module */
+    uint16_t compatId; /* compatibility identifier of the extension module */
+    size_t segmentsCount; /* number of entries in segments */
+    nvtxExtModuleSegment_t* segments; /* array of segmentsCount segments */
+    NvtxExtGetExportFunction_t getExportFunction; /* optional export lookup callback */
+    const void* extInfo; /* opaque extension-specific data */
+} nvtxExtModuleInfo_t;
+
+typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo);
+
+#endif /* NVTXEXTTYPES_H */
@@ -7,9 +7,14 @@
 #ifndef NCCL_NVTX_STUB_H_
 #define NCCL_NVTX_STUB_H_

+#include <nvtx3/nvToolsExtPayload.h>
+
 struct nccl_domain{static constexpr char const* name{"NCCL"};};

 #define NVTX3_FUNC_RANGE_IN(domain)
 #define nvtxNameOsThreadA(syscall, thread)
+#define NVTX3_FUNC_WITH_PARAMS(ID, S, P)
+
+#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11

 #endif
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -30,32 +30,16 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
 #define RCCL_PARAM_DECLARE(name) \
 int64_t rcclParam##name()

-#define RCCL_PARAM(name, env, default_value) \
+#define RCCL_PARAM(name, env, deftVal) \
 pthread_mutex_t rcclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
 int64_t rcclParam##name() { \
-  static_assert(default_value != -1LL, "default value cannot be -1"); \
-  static int64_t value = -1LL; \
-  int64_t localValue; \
-  pthread_mutex_lock(&rcclParamMutex##name); \
-  localValue = value; \
-  char* en = getenv("RCCL_TEST_ENV_VARS"); \
-  if (value == -1LL || (en && (strcmp(en, "ENABLE") == 0))){  \
-    value = default_value; \
-    char* str = getenv("RCCL_" env); \
-    if (str && strlen(str) > 0) { \
-      errno = 0; \
-      int64_t v = strtoll(str, NULL, 0); \
-      if (errno) { \
-        INFO(NCCL_ALL,"Invalid value %s for %s, using default %lu.", str, "RCCL_" env, value); \
-      } else { \
-        value = v; \
-        INFO(NCCL_ALL,"%s set by environment to %lu.", "RCCL_" env, value);  \
-      } \
+    constexpr int64_t uninitialized = INT64_MIN; \
+    static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \
+    static int64_t cache = uninitialized; \
+    if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \
+      ncclLoadParam("RCCL_" env, deftVal, uninitialized, &cache); \
    } \
-    localValue = value; \
-  } \
-  pthread_mutex_unlock(&rcclParamMutex##name); \
-  return localValue; \
-}
+    return cache; \
+  }

 #endif
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
@@ -13,6 +13,7 @@
 #include "info.h"
 #include "socket.h"
 #include <pthread.h>
+#include "shm.h"

 enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };

@@ -116,6 +117,7 @@ struct ncclProxyOpsPool {

 struct ncclProxyOps {
  ncclProxyOpsPool* pool;
+  ncclShmHandle_t handle;
  int count;
  int freeOp;
  int nextOps;
@@ -155,6 +157,7 @@ struct ncclProxyPool;
 struct ncclProxyProgressState {
  // Used by main threads to send work to progress thread
  struct ncclProxyOpsPool* opsPool;
+  ncclShmHandle_t handle;
  char opsPoolShmSuffix[6];

  pthread_t thread;
@@ -174,7 +177,6 @@ struct ncclProxyState {
  struct ncclSocket* listenSock;
  int stop;
  CUcontext cudaCtx;
-  int safeAbortFlag;

  // Used by main thread
  union ncclSocketAddress* peerAddresses;
@@ -186,6 +188,15 @@ struct ncclProxyState {
  struct ncclProxyProgressState progressState;
 };

+enum proxyConnectState {
+  connUninitialized     = 0,
+  connInitialized       = 1,
+  connSharedInitialized = 2,
+  connSetupDone         = 3,
+  connConnected         = 4,
+  numConnStates         = 5
+};
+
 struct ncclProxyConnection {
  int send, transport, shared;
  int localRank;
@@ -194,7 +205,7 @@ struct ncclProxyConnection {
  struct ncclProxyArgs *proxyAppend;
  struct ncclProxyArgs **proxyAppendPtr;
  void* transportResources;
-  bool initFlag;
+  proxyConnectState state;
 };

 typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@@ -9,7 +9,9 @@

 #include "nccl.h"

-ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create);
-ncclResult_t ncclShmUnlink(const char* shmname);
-ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize);
+typedef void* ncclShmHandle_t;
+ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle);
+ncclResult_t ncclShmClose(ncclShmHandle_t handle);
+ncclResult_t ncclShmUnlink(ncclShmHandle_t handle);
+
 #endif
@@ -21,6 +21,7 @@
 #define RETRY_REFUSED_TIMES   2e4 // connection refused retry times before reporting a timeout (20 sec)
 #define RETRY_TIMEDOUT_TIMES    3 // connection timed out retry times (each one can take 20s)
 #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
+#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL

 /* Common socket address storage structure for IPv4/IPv6 */
 union ncclSocketAddress {
@@ -30,32 +31,59 @@ union ncclSocketAddress {
 };

 enum ncclSocketState {
-  ncclSocketConnecting = 0,
-  ncclSocketConnected = 1,
-  ncclSocketError = 2,
-  ncclSocketStateNum = 3
-} ;
+  ncclSocketStateNone = 0,
+  ncclSocketStateInitialized = 1,
+  ncclSocketStateAccepting = 2,
+  ncclSocketStateAccepted = 3,
+  ncclSocketStateConnecting = 4,
+  ncclSocketStateConnectPolling = 5,
+  ncclSocketStateConnected = 6,
+  ncclSocketStateReady = 7,
+  ncclSocketStateClosed = 8,
+  ncclSocketStateError = 9,
+  ncclSocketStateNum = 10
+};
+
+enum ncclSocketType {
+  ncclSocketTypeUnknown = 0,
+  ncclSocketTypeBootstrap = 1,
+  ncclSocketTypeProxy = 2,
+  ncclSocketTypeNetSocket = 3,
+  ncclSocketTypeNetIb = 4
+};

 struct ncclSocket {
  int fd;
+  int acceptFd;
+  int timedOutRetries;
+  int refusedRetries;
  union ncclSocketAddress addr;
  volatile uint32_t* abortFlag;
  int asyncFlag;
  enum ncclSocketState state;
+  int salen;
+  uint64_t magic;
+  enum ncclSocketType type;
 };

 const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
-ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
+ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
 int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
 int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
+
+// Initialize a socket
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
 // Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
 ncclResult_t ncclSocketListen(struct ncclSocket* sock);
+ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr);
 // Connect to sock->addr. sock->fd is set after a successful call.
 ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse = 0);
 // Return socket connection state.
-ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state);
-// Accept an incoming connection from listenSocket->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
-ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket);
+ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running);
+// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
+ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock);
+ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd);
+ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock);

 #define NCCL_SOCKET_SEND 0
 #define NCCL_SOCKET_RECV 1
@@ -65,6 +93,5 @@ ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size
 ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed);
-/* initialize a socket. */
-ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
+ncclResult_t ncclSocketClose(struct ncclSocket* sock);
 #endif
@@ -98,16 +98,19 @@ ncclResult_t ncclStrongStreamLaunchKernel(
 );

 // Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
+// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus
+// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the
+// implementation to induce few graph dependencies.
 ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
+  struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false
 );
 // `b` must be capturing within `graph`.
 ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b
+  struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false
 );
 // `a` must be capturing within `graph`.
 ncclResult_t ncclStrongStreamWaitStream(
-  struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b
+  struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false
 );

 // Synchronization does not need the strong stream to be acquired.
@@ -27,6 +27,7 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
 uint64_t getHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();
+ncclResult_t getRandomData(void* buffer, size_t bytes);

 struct netIf {
  char prefix[64];
@@ -48,6 +49,19 @@ inline uint64_t clockNano() {
  return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec;
 }

+/* Fill `buffer` with `bytes` bytes read from /dev/urandom.
+ * Returns ncclSuccess when `bytes` is 0 or the whole read succeeds, and
+ * ncclSystemError when `buffer` is NULL, the device cannot be opened, or the
+ * read comes up short. */
+inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
+  // Nothing requested: succeed without touching the device.
+  if (bytes == 0) return ncclSuccess;
+
+  FILE* urandom = fopen("/dev/urandom", "r");
+  // The request is read as one item of `bytes` bytes, so anything short of a
+  // complete read is reported by fread as zero items.
+  const bool filled = buffer != NULL && urandom != NULL && fread(buffer, bytes, 1UL, urandom) == 1UL;
+  if (urandom != NULL) fclose(urandom);
+  return filled ? ncclSuccess : ncclSystemError;
+}
+
 ////////////////////////////////////////////////////////////////////////////////

 template<typename Int>
@@ -0,0 +1,26 @@
+#include "nccl.h"
+#include "nvtx.h"
+
+static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = {  // display name paired with each ncclRedOp_t value; passed as .entries in initNvtxRegisteredEnums
+  {"Sum", ncclSum},
+  {"Product", ncclProd},
+  {"Max", ncclMax},
+  {"Min", ncclMin},
+  {"Avg", ncclAvg}
+};
+
+// Must be called before the first call to any reduction operation. Registers NvtxEnumRedSchema with NVTX under schema id NVTX_PAYLOAD_ENTRY_NCCL_REDOP.
+void initNvtxRegisteredEnums() {
+  // Register schemas and strings
+  constexpr const nvtxPayloadEnumAttr_t eAttr {
+    .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES |  // NOTE(review): presumably flags which of the fields below are populated - confirm against the NVTX payload header
+      NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID,
+    .name = NULL,
+    .entries = NvtxEnumRedSchema,
+    .numEntries = std::extent<decltype(NvtxEnumRedSchema)>::value,  // element count of the array, computed at compile time
+    .sizeOfEnum = sizeof(ncclRedOp_t),
+    .schemaId = NVTX_PAYLOAD_ENTRY_NCCL_REDOP
+  };
+
+  nvtxPayloadEnumRegister(nvtx3::domain::get<nccl_domain>(), &eAttr);  // registration is made against the nccl_domain NVTX domain
+}
@@ -87,7 +87,7 @@ static void initOnceFunc() {

  cudaLib = dlopen(path, RTLD_LAZY);
  if (cudaLib == NULL) {
-    WARN("Failed to find CUDA library in %s (NCCL_CUDA_PATH=%s)", ncclCudaPath, ncclCudaPath);
+    WARN("Failed to find CUDA library (NCCL_CUDA_PATH='%s') : %s", ncclCudaPath ? ncclCudaPath : "", dlerror());
    goto error;
  }

@@ -15,79 +15,152 @@
 #include <stdlib.h>
 #include <unistd.h>

-// Change functions behavior to match other SYS functions
-static int shm_allocate(int fd, const int shmSize) {
-  int err = posix_fallocate(fd, 0, shmSize);
-  if (err) { errno = err; return -1; }
-  return 0;
-}
-static int shm_map(int fd, const int shmSize, void** ptr) {
-  *ptr = mmap(NULL, shmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-  return (*ptr == MAP_FAILED) ? -1 : 0;
+struct shmHandleInternal {  // private state behind an ncclShmHandle_t (ncclShmOpen casts a pointer to this struct into the handle)
+  int fd;              // descriptor of the backing file as handed to shmHandleInit; closed in ncclShmClose when >= 0
+  char* shmPath;       // heap copy of the path when the handle was initialised with create=true, NULL otherwise
+  char* shmPtr;        // host address of the mmap'ed segment
+  void* devShmPtr;     // device pointer obtained for the mapping; stays NULL when the caller did not ask for one
+  size_t shmSize;      // size requested by the caller
+  size_t realShmSize;  // size actually mapped: shmSize plus sizeof(int) for the reference counter
+  int* refcount;       // reference counter located at shmPtr + shmSize
+};
+
+/* Fill `handle` from the pieces gathered by ncclShmOpen.
+ *
+ * fd/shmSize/realShmSize/dptr are stored as given. `hptr` is the host mapping;
+ * ncclShmOpen also calls this on its failure path, where `hptr` can be NULL or
+ * mmap's MAP_FAILED sentinel, so both are recorded as "no mapping" (shmPtr and
+ * refcount left NULL) instead of being dereferenced.
+ * When `create` is true the path is duplicated into the handle and the user
+ * visible part of the mapping (the first shmSize bytes, not the trailing
+ * counter) is zeroed; otherwise shmPath is NULL. */
+static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmSize, char* hptr, void* dptr, bool create, struct shmHandleInternal* handle) {
+  // A failed mmap() yields MAP_FAILED, which is non-NULL: normalise it so the
+  // checks below (and in ncclShmClose) treat it as an absent mapping.
+  if (hptr == (char*)MAP_FAILED) hptr = NULL;
+
+  handle->fd = fd;
+  handle->shmPtr = hptr;
+  handle->devShmPtr = dptr;
+  handle->shmSize = shmSize;
+  handle->realShmSize = realShmSize;
+  // The counter sits right behind the caller's bytes; without a mapping there is none.
+  handle->refcount = (hptr != NULL) ? (int*)(hptr + shmSize) : NULL;
+  handle->shmPath = NULL;
+  if (create) {
+    size_t slen = strlen(shmPath);
+    handle->shmPath = (char*)malloc(slen + 1);
+    // If the copy cannot be allocated the path simply stays NULL rather than
+    // writing through a NULL pointer.
+    if (handle->shmPath != NULL) memcpy(handle->shmPath, shmPath, slen + 1);
+    if (hptr != NULL) memset(hptr, 0, shmSize);
+  }
+  return;
 }

-static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void** ptr, int create) {
+ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) {
+  int fd = -1;
+  char* hptr = NULL;
+  void* dptr = NULL;
+  ncclResult_t ret = ncclSuccess;
+  struct shmHandleInternal* tmphandle;
+  bool create = refcount > 0 ? true : false;
+  const size_t refSize = sizeof(int); /* extra sizeof(int) bytes for reference count */
+  const size_t realShmSize = shmSize + refSize;
+
+  *handle = *shmPtr = NULL; /* assume shmPtr and handle always set correctly by users. */
+  EQCHECKGOTO(tmphandle = (struct shmHandleInternal*)malloc(sizeof(struct shmHandleInternal)), NULL, ret, fail);
  if (create) {
+    /* refcount > 0 means the caller tries to allocate a shared memory. This shared memory segment will have
+     * refcount references; when the peer attaches, it should pass -1 to reduce one reference count. When it
+     * goes down to 0, unlink should be called in order to delete shared memory file. */
    if (shmPath[0] == '\0') {
      sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
-      *fd = mkstemp(shmPath);
+      fd = mkstemp(shmPath);
    } else {
-      SYSCHECKVAL(open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
+      SYSCHECKGOTO(fd = open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
    }
-    if (ftruncate(*fd, shmSize) != 0) {
-      WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize);
-      return ncclSystemError;
+
+    if (ftruncate(fd, realShmSize) != 0) {
+      WARN("Error: failed to extend %s to %ld bytes", shmPath, realShmSize);
+      ret = ncclSystemError;
+      goto fail;
    }
-    INFO(NCCL_ALLOC, "Allocated %d bytes of shared memory in %s\n", shmSize, shmPath);
+    INFO(NCCL_ALLOC, "Allocated %ld bytes of shared memory in %s", realShmSize, shmPath);
  } else {
-    SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
-  }
-  *ptr = (char*)mmap(NULL, shmSize, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0);
-  if (*ptr == NULL) {
-    WARN("Could not map %s\n", shmPath);
-    return ncclSystemError;
-  }
-  close(*fd);
-  *fd = -1;
-  if (create) memset(*ptr, 0, shmSize);
-  return ncclSuccess;
-}
-
-ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create) {
-  int fd = -1;
-  void* ptr = MAP_FAILED;
-  ncclResult_t res = ncclSuccess;
-
-  NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError);
-  if (devShmPtr) {
-    CUDACHECKGOTO(cudaHostRegister(ptr, shmSize, cudaHostRegisterMapped), res, cudaError);
-    CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
+    SYSCHECKGOTO(fd = open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), ret, fail);
  }

-  *shmPtr = ptr;
-  return ncclSuccess;
-sysError:
-  WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmPath, shmSize);
-cudaError:
-  if (fd != -1) close(fd);
-  if (create) shm_unlink(shmPath);
-  if (ptr != MAP_FAILED) munmap(ptr, shmSize);
-  *shmPtr = NULL;
-  return res;
-}
+  hptr = (char*)mmap(NULL, realShmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  if (hptr == MAP_FAILED) {
+    WARN("Could not map %s size %zi, error: %s", shmPath, realShmSize, strerror(errno));
+    ret = ncclSystemError;
+    goto fail;
+  }

-ncclResult_t ncclShmUnlink(const char* shmPath) {
-  if (shmPath != NULL) SYSCHECK(unlink(shmPath), "unlink");
-  return ncclSuccess;
-}
+  if (create) {
+    *(int*)(hptr + shmSize) = refcount;
+  } else {
+    int remref = __atomic_sub_fetch((int*)(hptr + shmSize), 1, __ATOMIC_RELAXED);
+    if (remref == 0) {
+      /* the last peer has completed attachment, it should unlink the shm mem file. */
+      if (unlink(shmPath) != 0) {
+        WARN("unlink shared memory %s failed, error: %s", shmPath, strerror(errno));
+      }
+    }

-ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) {
-  if (shmPtr) {
-    if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr));
-    if (munmap(shmPtr, shmSize) != 0) {
-      WARN("munmap of shared memory failed");
-      return ncclSystemError;
+    if (refcount != -1) {
+      WARN("attaching memory should only reduce refcount by 1 but %d is passed", refcount);
    }
  }
-  return ncclSuccess;
+
+  if (devShmPtr) {
+    CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterMapped), ret, fail);
+    CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail);
+  }
+
+  shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
+exit:
+  *shmPtr = hptr;
+  if (devShmPtr) *devShmPtr = dptr;
+  *handle = (ncclShmHandle_t)tmphandle;
+  return ret;
+fail:
+  WARN("Error while %s shared memory segment %s (size %ld)", create ? "creating" : "attaching to", shmPath, shmSize);
+  if (tmphandle) {
+    shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle);
+    ncclShmClose((ncclShmHandle_t)tmphandle);
+    tmphandle = NULL;
+  }
+  hptr = NULL;
+  dptr = NULL;
+  goto exit;
+}
+
+/* Release everything owned by `handle`: the file descriptor, the path copy,
+ * the CUDA host registration, the mapping and the handle itself.
+ *
+ * The backing file is unlinked here only when this handle holds a path and the
+ * in-segment reference counter is still positive. A NULL handle is a no-op.
+ * Returns ncclSuccess, ncclSystemError when unlink/munmap fails, or
+ * ncclUnhandledCudaError when the host registration cannot be removed; in all
+ * cases the remaining resources are still released. */
+ncclResult_t ncclShmClose(ncclShmHandle_t handle) {
+  ncclResult_t ret = ncclSuccess;
+  struct shmHandleInternal* tmphandle = (struct shmHandleInternal*)handle;
+  if (tmphandle == NULL) return ret;
+
+  // ncclShmOpen closes a partially built handle on failure; such a handle may
+  // carry no usable mapping (NULL, or mmap's MAP_FAILED sentinel).
+  const bool mapped = tmphandle->shmPtr != NULL && tmphandle->shmPtr != (char*)MAP_FAILED;
+
+  if (tmphandle->fd >= 0) {
+    close(tmphandle->fd);
+    // The counter lives inside the mapping, so it may only be read while mapped.
+    if (tmphandle->shmPath != NULL && mapped && tmphandle->refcount != NULL && *tmphandle->refcount > 0) {
+      if (unlink(tmphandle->shmPath) != 0) {
+        WARN("unlink shared memory %s failed, error: %s", tmphandle->shmPath, strerror(errno));
+        ret = ncclSystemError;
+      }
+    }
+  }
+  // The path copy is owned by the handle regardless of whether the file was
+  // unlinked above; free(NULL) is harmless.
+  free(tmphandle->shmPath);
+  tmphandle->shmPath = NULL;
+
+  if (mapped) {
+    if (tmphandle->devShmPtr) {
+      // Report the failure but keep going, so the mapping and handle are not leaked.
+      cudaError_t cudaRes = cudaHostUnregister(tmphandle->shmPtr);
+      if (cudaRes != cudaSuccess) {
+        WARN("Cuda failure '%s'", cudaGetErrorString(cudaRes));
+        ret = ncclUnhandledCudaError;
+      }
+    }
+    if (munmap(tmphandle->shmPtr, tmphandle->realShmSize) != 0) {
+      WARN("munmap of shared memory %p size %ld failed, error: %s", tmphandle->shmPtr, tmphandle->realShmSize, strerror(errno));
+      ret = ncclSystemError;
+    }
+  }
+  free(tmphandle);
+  return ret;
+}
+
+/* Remove the file backing `handle` from the filesystem.
+ * Does nothing (and succeeds) for a NULL handle or a handle that holds no
+ * path. Returns ncclSystemError when unlink() fails; the stored path is freed
+ * and cleared either way. */
+ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) {
+  struct shmHandleInternal* shm = (struct shmHandleInternal*)handle;
+  if (shm == NULL || shm->shmPath == NULL) return ncclSuccess;
+
+  ncclResult_t status = ncclSuccess;
+  if (unlink(shm->shmPath) != 0) {
+    WARN("unlink shared memory %s failed, error: %s", shm->shmPath, strerror(errno));
+    status = ncclSystemError;
+  }
+  // Drop the path even after a failed unlink so it is not used again.
+  free(shm->shmPath);
+  shm->shmPath = NULL;
+  return status;
 }
@@ -20,6 +20,52 @@

 static std::vector<std::pair<int, std::unordered_set<std::string>>> clientPortPool;

+/* Move as many bytes as currently possible for one send or receive.
+ *
+ * op      NCCL_SOCKET_RECV or NCCL_SOCKET_SEND.
+ * ptr     buffer of `size` bytes; *offset bytes of it are already done and
+ *         *offset is advanced by whatever this call transfers.
+ * block   0 adds MSG_DONTWAIT to the call, non-zero omits it.
+ * closed  set to 1 (with ncclSuccess) when recv() returns 0, otherwise 0.
+ *
+ * EINTR/EAGAIN/EWOULDBLOCK count as "no progress". Any other error returns
+ * ncclRemoteError; a raised sock->abortFlag returns ncclInternalError. */
+static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
+  int bytes = 0;
+  *closed = 0;
+  char* data = (char*)ptr;
+  char line[SOCKET_NAME_MAXLEN+1];
+  do {
+    if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+    if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
+    if (op == NCCL_SOCKET_RECV && bytes == 0) {
+      *closed = 1;
+      return ncclSuccess;
+    }
+    if (bytes == -1) {
+      // Snapshot errno before formatting the address, which may overwrite it.
+      const int err = errno;
+      if (err != EINTR && err != EWOULDBLOCK && err != EAGAIN) {
+        // Name the call that actually failed instead of always blaming recv.
+        WARN("socketProgressOpt: Call to %s %s failed : %s", op == NCCL_SOCKET_SEND ? "send to" : "recv from", ncclSocketToString(&sock->addr, line), strerror(err));
+        return ncclRemoteError;
+      } else {
+        bytes = 0;
+      }
+    }
+    (*offset) += bytes;
+    if (sock->abortFlag && *sock->abortFlag != 0) {
+      INFO(NCCL_NET, "socketProgressOpt: abort called");
+      return ncclInternalError;
+    }
+  } while (bytes > 0 && (*offset) < size);
+  return ncclSuccess;
+}
+
+/* Non-blocking single step of a send/receive: forwards to socketProgressOpt
+ * with block=0 and turns "peer closed the connection" into ncclRemoteError. */
+static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+  int peerClosed;
+  NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0, &peerClosed));
+  if (!peerClosed) return ncclSuccess;
+
+  char peerName[SOCKET_NAME_MAXLEN+1];
+  WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, peerName, 0));
+  return ncclRemoteError;
+}
+
+/* Keep calling socketProgress until all `size` bytes have been transferred
+ * (i.e. *offset reaches size) or one of the steps reports an error. */
+static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+  for (;;) {
+    if (*offset >= size) return ncclSuccess;
+    NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
+  }
+}
+
 /* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
 *
 * Output: "IPv4/IPv6 address<port>"
@@ -202,7 +248,7 @@ int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAd
  return found;
 }

-ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
+ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
  if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
    WARN("Net : string is null");
    return ncclInvalidArgument;
@@ -304,7 +350,7 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
 	INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
 	// Try to find interface that is in the same subnet as the IP in comm id
        union ncclSocketAddress idAddr;
-        ncclGetSocketAddrFromString(&idAddr, commId);
+        ncclSocketGetAddrFromString(&idAddr, commId);
        nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
      }
    }
@@ -318,39 +364,31 @@ int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNa
 }

 ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
-  /* IPv4/IPv6 support */
-  int family = sock->addr.sa.sa_family;
-  int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
-  int flags;
-
-  /* Create socket and bind it to a port */
-  int fd = socket(family, SOCK_STREAM, 0);
-  if (fd == -1) {
-    WARN("Net : Socket creation failed : %s", strerror(errno));
-    return ncclSystemError;
+  if (sock == NULL) {
+    WARN("ncclSocketListen: pass NULL socket");
+    return ncclInvalidArgument;
+  }
+  if (sock->fd == -1) {
+    WARN("ncclSocketListen: file descriptor is -1");
+    return ncclInvalidArgument;
  }

  if (socketToPort(&sock->addr)) {
    // Port is forced by env. Make sure we get the port.
    int opt = 1;
 #if defined(SO_REUSEPORT)
-    SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
+    SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
 #else
-    SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
+    SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
 #endif
  }

-  /* The socket is set non-blocking for OS level, but asyncFlag is used to control
-   * blocking and non-blocking behavior in user level. */
-  EQCHECK(flags = fcntl(fd, F_GETFL), -1);
-  SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
-
  // addr port should be 0 (Any port)
-  SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
+  SYSCHECK(bind(sock->fd, &sock->addr.sa, sock->salen), "bind");

  /* Get the assigned Port */
-  socklen_t size = salen;
-  SYSCHECK(getsockname(fd, &sock->addr.sa, &size), "getsockname");
+  socklen_t size = sock->salen;
+  SYSCHECK(getsockname(sock->fd, &sock->addr.sa, &size), "getsockname");

 #ifdef ENABLE_TRACE
  char line[SOCKET_NAME_MAXLEN+1];
@@ -360,76 +398,226 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
  /* Put the socket in listen mode
   * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
   */
-  SYSCHECK(listen(fd, 16384), "listen");
-  sock->fd = fd;
+  SYSCHECK(listen(sock->fd, 16384), "listen");
+  sock->state = ncclSocketStateReady;
  return ncclSuccess;
 }

-static ncclResult_t getFdState(int fd, enum ncclSocketState* state) {
-    struct pollfd pfd;
-    int timeout = 1, ret;
-    socklen_t rlen = sizeof(int);
-
-    memset(&pfd, 0, sizeof(struct pollfd));
-    pfd.fd = fd;
-    pfd.events = POLLOUT;
-    SYSCHECK(ret = poll(&pfd, 1, timeout), "poll");
-    if (ret == 0) {
-      ret = EINPROGRESS;
-    } else {
-      /* check socket status */
-      EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
-      SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
-    }
-
-    if (ret == EINPROGRESS || ret == ECONNREFUSED)
-      *state = ncclSocketConnecting;
-    else if (ret == 0)
-      *state = ncclSocketConnected;
-    else
-      *state = ncclSocketError;
-    return ncclSuccess;
+/* Copy the address stored in `sock` into `addr`.
+ * Returns ncclInvalidArgument when either pointer is NULL and
+ * ncclInternalError when the socket is not in ncclSocketStateReady. */
+ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr) {
+  if (sock == NULL) {
+    WARN("ncclSocketGetAddr: pass NULL socket");
+    return ncclInvalidArgument;
+  }
+  // The memcpy below writes through `addr`; reject a missing destination up front.
+  if (addr == NULL) {
+    WARN("ncclSocketGetAddr: pass NULL address");
+    return ncclInvalidArgument;
+  }
+  if (sock->state != ncclSocketStateReady) return ncclInternalError;
+  memcpy(addr, &sock->addr, sizeof(union ncclSocketAddress));
+  return ncclSuccess;
 }

-ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state) {
-    NCCLCHECK(getFdState(sock->fd, state));
-    sock->state = *state;
+/* Try once to accept a connection on sock->acceptFd.
+ * On success the new descriptor and peer address are stored in `sock` and the
+ * state becomes ncclSocketStateAccepted. EAGAIN/EWOULDBLOCK leave the state
+ * untouched and still return ncclSuccess; any other errno is a system error. */
+static ncclResult_t socketTryAccept(struct ncclSocket* sock) {
+  socklen_t addrLen = sizeof(union ncclSocketAddress);
+  sock->fd = accept(sock->acceptFd, &sock->addr.sa, &addrLen);
+  if (sock->fd != -1) {
+    sock->state = ncclSocketStateAccepted;
+    return ncclSuccess;
+  }
+  if (errno == EAGAIN || errno == EWOULDBLOCK) return ncclSuccess;
+
+  WARN("socketTryAccept: get errno %d that is not EAGAIN or EWOULDBLOCK", errno);
+  return ncclSystemError;
+}
+
+/* Complete an accepted connection by reading the peer's magic and socket type.
+ *
+ * If no byte of the magic has arrived yet, returns ncclSuccess and leaves the
+ * state alone. A wrong magic closes the descriptor and puts the socket back
+ * into ncclSocketStateAccepting (still ncclSuccess). A wrong type closes the
+ * descriptor, sets ncclSocketStateError and returns ncclInternalError.
+ * Otherwise the socket becomes ncclSocketStateReady. */
+static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) {
+  uint64_t peerMagic;
+  enum ncclSocketType peerType;
+  int got = 0;
+
+  // First poke without waiting; once anything has arrived, wait for the rest.
+  NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &peerMagic, sizeof(peerMagic), &got));
+  if (got == 0) return ncclSuccess;
+  NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &peerMagic, sizeof(peerMagic), &got));
+
+  if (peerMagic != sock->magic) {
+    WARN("socketFinalizeAccept: wrong magic %lx != %lx", peerMagic, sock->magic);
+    close(sock->fd);
+    sock->fd = -1;
+    // Ignore spurious connection and accept again
+    sock->state = ncclSocketStateAccepting;
    return ncclSuccess;
+  }
+
+  got = 0;
+  NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &peerType, sizeof(peerType), &got));
+  if (peerType != sock->type) {
+    WARN("socketFinalizeAccept: wrong type %d != %d", peerType, sock->type);
+    sock->state = ncclSocketStateError;
+    close(sock->fd);
+    sock->fd = -1;
+    return ncclInternalError;
+  }
+
+  sock->state = ncclSocketStateReady;
+  return ncclSuccess;
+}
+
+/* Issue connect() on sock->fd and translate the outcome into a socket state.
+ *
+ *   success        -> ncclSocketStateConnected
+ *   EINPROGRESS    -> ncclSocketStateConnectPolling
+ *   ECONNREFUSED / ETIMEDOUT -> state unchanged so the caller retries, until
+ *                     RETRY_REFUSED_TIMES / RETRY_TIMEDOUT_TIMES is reached,
+ *                     which sets ncclSocketStateError and returns ncclRemoteError
+ *   anything else  -> ncclSocketStateError, ncclSystemError */
+static ncclResult_t socketStartConnect(struct ncclSocket* sock) {
+  /* blocking/non-blocking connect() is determined by asyncFlag. */
+  int ret = connect(sock->fd, &sock->addr.sa, sock->salen);
+  // Snapshot errno right away: usleep() and the logging helpers called below
+  // may overwrite it before it is inspected or printed.
+  const int err = errno;
+
+  if (ret == 0) {
+    sock->state = ncclSocketStateConnected;
+    return ncclSuccess;
+  } else if (err == EINPROGRESS) {
+    sock->state = ncclSocketStateConnectPolling;
+    return ncclSuccess;
+  } else if (err == ECONNREFUSED) {
+    if (++sock->refusedRetries == RETRY_REFUSED_TIMES) {
+      sock->state = ncclSocketStateError;
+      WARN("socketStartConnect: exceeded retries (%d)", sock->refusedRetries);
+      return ncclRemoteError;
+    }
+    usleep(SLEEP_INT);
+    if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(err));
+    return ncclSuccess;
+  } else if (err == ETIMEDOUT) {
+    if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
+      sock->state = ncclSocketStateError;
+      WARN("socketStartConnect: exceeded timeouts (%d)", sock->timedOutRetries);
+      return ncclRemoteError;
+    }
+    usleep(SLEEP_INT);
+    return ncclSuccess;
+  } else {
+    char line[SOCKET_NAME_MAXLEN+1];
+    sock->state = ncclSocketStateError;
+    WARN("socketStartConnect: Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(err));
+    return ncclSystemError;
+  }
+}
+
+/* Check on a connect() that previously reported EINPROGRESS.
+ *
+ * Polls sock->fd for writability with a 1 ms timeout; if nothing is ready the
+ * state is left alone. Otherwise the pending error is fetched with SO_ERROR:
+ *   0                         -> ncclSocketStateConnected
+ *   ECONNREFUSED / ETIMEDOUT  -> back to ncclSocketStateConnecting for another
+ *                                attempt, until the retry budget is exhausted
+ *                                (ncclSocketStateError, ncclRemoteError)
+ *   EINPROGRESS               -> state unchanged
+ *   anything else             -> ncclSocketStateError, ncclSystemError */
+static ncclResult_t socketPollConnect(struct ncclSocket* sock) {
+  struct pollfd pfd;
+  int timeout = 1, ret;
+  socklen_t rlen = sizeof(int);
+
+  memset(&pfd, 0, sizeof(struct pollfd));
+  pfd.fd = sock->fd;
+  pfd.events = POLLOUT;
+  SYSCHECK(ret = poll(&pfd, 1, timeout), "poll");
+  if (ret == 0) return ncclSuccess;
+
+  /* check socket status */
+  EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
+  // From here on `ret` holds the socket's pending error code, not a return value.
+  SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
+
+  if (ret == 0) {
+    sock->state = ncclSocketStateConnected;
+  } else if (ret == ECONNREFUSED) {
+    if (++sock->refusedRetries == RETRY_REFUSED_TIMES) {
+      sock->state = ncclSocketStateError;
+      WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries);
+      return ncclRemoteError;
+    }
+    // The failure is reported through SO_ERROR, so describe `ret`; errno is
+    // unrelated to it at this point.
+    if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(ret));
+    usleep(SLEEP_INT);
+    sock->state = ncclSocketStateConnecting;
+  } else if (ret == ETIMEDOUT) {
+    if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) {
+      sock->state = ncclSocketStateError;
+      WARN("socketPollConnect: exceeded timeouts (%d)", sock->timedOutRetries);
+      return ncclRemoteError;
+    }
+    usleep(SLEEP_INT);
+    sock->state = ncclSocketStateConnecting;
+  } else if (ret != EINPROGRESS) {
+    char line[SOCKET_NAME_MAXLEN+1];
+    sock->state = ncclSocketStateError;
+    // Say why the connection is being abandoned instead of failing silently.
+    WARN("socketPollConnect: Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(ret));
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
+/* Public wrapper around socketPollConnect: rejects a NULL socket with
+ * ncclInvalidArgument, otherwise propagates the result of one poll step. */
+ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) {
+  if (sock != NULL) {
+    NCCLCHECK(socketPollConnect(sock));
+    return ncclSuccess;
+  }
+  WARN("ncclSocketPollConnect: pass NULL socket");
+  return ncclInvalidArgument;
+}
+
+/* Complete an established connection by sending this side's magic followed by
+ * its socket type. If not even one byte of the magic can be sent yet, returns
+ * ncclSuccess without changing state; once both values are out the socket
+ * becomes ncclSocketStateReady. */
+static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) {
+  int pushed = 0;
+
+  // Single non-blocking attempt first; commit to waiting only after it made progress.
+  NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &pushed));
+  if (pushed == 0) return ncclSuccess;
+  NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &pushed));
+
+  pushed = 0;
+  NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &pushed));
+
+  sock->state = ncclSocketStateReady;
+  return ncclSuccess;
+}
+
+static ncclResult_t socketProgressState(struct ncclSocket* sock) {  // run the handler matching sock->state; any handler error is returned to the caller
+  if (sock->state == ncclSocketStateAccepting) {
+    NCCLCHECK(socketTryAccept(sock));  // moves to ncclSocketStateAccepted when accept() yields a descriptor
+  }
+  if (sock->state == ncclSocketStateAccepted) {  // separate `if`, not `else if`: a state set by the step above is handled in this same call
+    NCCLCHECK(socketFinalizeAccept(sock));
+  }
+  if (sock->state == ncclSocketStateConnecting) {
+    NCCLCHECK(socketStartConnect(sock));  // may move to ncclSocketStateConnected or ncclSocketStateConnectPolling
+  }
+  if (sock->state == ncclSocketStateConnectPolling) {
+    NCCLCHECK(socketPollConnect(sock));
+  }
+  if (sock->state == ncclSocketStateConnected) {
+    NCCLCHECK(socketFinalizeConnect(sock));  // sets ncclSocketStateReady once magic and type have been sent
+  }
+  return ncclSuccess;
+}
+
+/* Report through *running whether `sock` has reached ncclSocketStateReady.
+ * A socket that is not ready yet is advanced by one socketProgressState call
+ * before the answer is given. A NULL socket yields *running = 0; a socket in
+ * the Error or Closed state yields ncclRemoteError. */
+ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running) {
+  if (sock == NULL) {
+    *running = 0;
+    return ncclSuccess;
+  }
+
+  const bool unusable = sock->state == ncclSocketStateError || sock->state == ncclSocketStateClosed;
+  if (unusable) {
+    WARN("ncclSocketReady: unexpected socket state %d", sock->state);
+    return ncclRemoteError;
+  }
+
+  if (sock->state == ncclSocketStateReady) {
+    *running = 1;
+    return ncclSuccess;
+  }
+
+  // Not ready yet: publish that first, so the value is defined even if the
+  // progress step below bails out with an error.
+  *running = 0;
+  NCCLCHECK(socketProgressState(sock));
+  if (sock->state == ncclSocketStateReady) *running = 1;
+  return ncclSuccess;
 }

 ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {
  char line[SOCKET_NAME_MAXLEN+1];
-  /* IPv4/IPv6 support */
-  int family = sock->addr.sa.sa_family;
-  if (family != AF_INET && family != AF_INET6) {
-    WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
-         ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
+  const int one = 1;
+
+  if (sock == NULL) {
+    WARN("ncclSocketConnect: pass NULL socket");
+    return ncclInvalidArgument;
+  }
+  if (sock->fd == -1) {
+    WARN("ncclSocketConnect: file descriptor is -1");
+    return ncclInvalidArgument;
+  }
+
+  if (sock->state != ncclSocketStateInitialized) {
+    WARN("ncclSocketConnect: wrong socket state %d", sock->state);
+    if (sock->state == ncclSocketStateError) return ncclRemoteError;
    return ncclInternalError;
  }
-  int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
-  int flags;
+  TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));

-  /* Connect to a hostname / port */
-  int fd = socket(family, SOCK_STREAM, 0);
-  if (fd == -1) {
-    WARN("Net : Socket creation failed : %s", strerror(errno));
-    return ncclSystemError;
-  }
-
-  const int one = 1;
-  SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
-  
-  /* The socket is set non-blocking for OS level, but asyncFlag is used to control
-   * blocking and non-blocking behavior in user level. */
-  EQCHECK(flags = fcntl(fd, F_GETFL), -1);
-  SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
-
-  /*  const int bufsize = 128*1024;
-    SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
-    SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
+  SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");

  if (portReuse) {
-    // pre-define ports according to tid, to avoid extra lock for race condition
+    int family = sock->addr.sa.sa_family;
+    if (family != AF_INET && family != AF_INET6) {
+      WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
+           ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
+      return ncclInternalError;
+    }
+    int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);    // pre-define ports according to tid, to avoid extra lock for race condition
+
    if (clientPortPool.size() == 0) {
      for (int tid = syscall(SYS_gettid), i = 1; i < 5; i++) {
        clientPortPool.push_back(std::make_pair(60000 + i * 1000 + tid % 1000, std::unordered_set<std::string>()));
@@ -448,161 +636,227 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {
    // bind the port in fd for connect system call
    if (reused_port != -1) {
      int opt = 1;
-      SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
+      SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
      struct sockaddr_in sin;
      sin.sin_family = family;
      sin.sin_addr.s_addr = htonl(INADDR_ANY);
      sin.sin_port = htons(reused_port);
-      SYSCHECK(bind(fd, (struct sockaddr *)&sin, salen), "bind_client_port");
+      SYSCHECK(bind(sock->fd, (struct sockaddr *)&sin, salen), "bind_client_port");
    }
  }

-  TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
+  sock->state = ncclSocketStateConnecting;
+  do {
+    NCCLCHECK(socketProgressState(sock));
+  } while (sock->asyncFlag == 0 &&
+      (sock->abortFlag == NULL || *sock->abortFlag == 0) &&
+      (sock->state == ncclSocketStateConnecting ||
+       sock->state == ncclSocketStateConnectPolling ||
+       sock->state == ncclSocketStateConnected));

-  int ret;
-  int timedout_retries = 0;
-  int refused_retries = 0;
-retry:
-  /* blocking/non-blocking connect() is determined by asyncFlag. */
-  ret = connect(fd, &sock->addr.sa, salen);
+  if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;

-  if (!sock->asyncFlag) {
-    /* blocking socket, need retry if connect fails. */
-    if (errno == EINPROGRESS || errno == EAGAIN || errno == EALREADY ||
-    (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
-    (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
-      /* check abortFlag as long as we have chance to retry. */
-      if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
-      if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
-      usleep(SLEEP_INT);
-      goto retry;
-    }
-
-    /* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again.
-     * However, it can return EISCONN instead of success which indicates connection is built up in
-     * background already. No need to call connect() again. */
-    if (ret == 0 || errno == EISCONN) {
-      sock->fd = fd;
+  switch (sock->state) {
+    case ncclSocketStateConnecting:
+    case ncclSocketStateConnectPolling:
+    case ncclSocketStateConnected:
+    case ncclSocketStateReady:
      return ncclSuccess;
-    }
-  } else {
-    sock->fd = fd;
-    return ncclSuccess;
+    case ncclSocketStateError:
+      return ncclSystemError;
+    default:
+      WARN("ncclSocketConnect: wrong socket state %d", sock->state);
+      return ncclInternalError;
  }
-
-  WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
-  return ncclRemoteError;
 }

-ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSock) {
-  socklen_t socklen = sizeof(union ncclSocketAddress);
-  struct pollfd pollfd;
-  int tmpFd = sock->fd = -1;
-  int pollret;
+/* Accept a connection arriving on `listenSock` into `sock`.
+ *
+ * Returns ncclInvalidArgument if either pointer is NULL. `listenSock` must be
+ * in ncclSocketStateReady; otherwise ncclSystemError is returned when it is in
+ * the error state and ncclInternalError for any other state.
+ *
+ * The accept itself is driven by socketProgressState(). For a socket whose
+ * asyncFlag is 0 the function keeps calling it until the state leaves
+ * Accepting/Accepted or the abort flag is raised; otherwise it is called once
+ * and the function returns with whatever state was reached, so Accepting and
+ * Accepted are reported as ncclSuccess. A raised abort flag yields
+ * ncclInternalError. */
+ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSock) {
+  ncclResult_t ret = ncclSuccess;

-  pollfd.fd = listenSocket->fd;
-  pollfd.events = POLLIN;
-retry:
-  if ((pollret = poll(&pollfd, 1, listenSocket->asyncFlag ? 0 : 100)) < 0) {
-    return ncclSystemError;
-  } else {
-    tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen);
+  if (listenSock == NULL || sock == NULL) {
+    WARN("ncclSocketAccept: pass NULL socket");
+    ret = ncclInvalidArgument;
+    goto exit;
+  }
+  if (listenSock->state != ncclSocketStateReady) {
+    WARN("ncclSocketAccept: wrong socket state %d", listenSock->state);
+    if (listenSock->state == ncclSocketStateError)
+      ret = ncclSystemError;
+    else
+      ret = ncclInternalError;
+    goto exit;
  }

-  if (!listenSocket->asyncFlag) {
-    /* blocking socket, if tmpFd is still -1, we need to retry */
-    if (tmpFd == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
-      if (listenSocket->abortFlag && *listenSocket->abortFlag != 0) return ncclInternalError;
-      goto retry;
-    }
-    EQCHECK(tmpFd, -1);
+  /* acceptFd == -1 marks a socket on which no accept has been started yet:
+   * seed it with a copy of the listening socket, remember the listening fd
+   * and enter the Accepting state. A later call on the same socket skips
+   * this and just resumes progressing. */
+  if (sock->acceptFd == -1) {
+    memcpy(sock, listenSock, sizeof(struct ncclSocket));
+    sock->acceptFd = listenSock->fd;
+    sock->state = ncclSocketStateAccepting;
  }

-  sock->fd = tmpFd;
-  return ncclSuccess;
+  /* Always make at least one progress step; keep looping only for sockets
+   * with asyncFlag == 0 that are still mid-accept and not aborted. */
+  do {
+    NCCLCHECKGOTO(socketProgressState(sock), ret, exit);
+  } while (sock->asyncFlag == 0 &&
+      (sock->abortFlag == NULL || *sock->abortFlag == 0) &&
+      (sock->state == ncclSocketStateAccepting ||
+       sock->state == ncclSocketStateAccepted));
+
+  if (sock->abortFlag && *sock->abortFlag != 0) return ncclInternalError;
+
+  /* Translate the state reached so far into a result code. */
+  switch (sock->state) {
+    case ncclSocketStateAccepting:
+    case ncclSocketStateAccepted:
+    case ncclSocketStateReady:
+      ret = ncclSuccess;
+      break;
+    case ncclSocketStateError:
+      ret = ncclSystemError;
+      break;
+    default:
+      WARN("ncclSocketAccept: wrong socket state %d", sock->state);
+      ret = ncclInternalError;
+      break;
+  }
+
+exit:
+  return ret;
 }

-ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, volatile uint32_t* abortFlag, int asyncFlag) {
-  if (sock == NULL)
-    return ncclSuccess;
+/* Reset `sock` to the Initialized state and, when an address is given, open
+ * the underlying TCP socket.
+ *
+ *   sock      - socket to initialize; NULL is accepted and is a no-op.
+ *   addr      - optional peer/local address. When non-NULL it is copied into
+ *               sock->addr, must be AF_INET or AF_INET6, and a SOCK_STREAM
+ *               descriptor of that family is created. When NULL, sock->addr
+ *               is zeroed and no descriptor is opened (fd stays -1).
+ *   magic     - stored in sock->magic.
+ *   type      - stored in sock->type.
+ *   abortFlag - stored in sock->abortFlag; may be NULL.
+ *   asyncFlag - stored in sock->asyncFlag.
+ *
+ * Returns ncclSuccess, ncclInternalError for an unsupported address family,
+ * ncclSystemError if socket() fails, or the error set by the fcntl checks.
+ * On any failure the descriptor opened here is closed again and sock->fd is
+ * reset to -1, so the caller is not left with a leaked or stale fd. */
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag) {
+  ncclResult_t ret = ncclSuccess;

+  if (sock == NULL) goto exit;
+  sock->timedOutRetries = 0;
+  sock->refusedRetries = 0;
+  sock->abortFlag = abortFlag;
+  sock->asyncFlag = asyncFlag;
+  sock->state = ncclSocketStateInitialized;
+  sock->magic = magic;
+  sock->type = type;
  sock->fd = -1;
+  sock->acceptFd = -1;
+
  if (addr) {
+    /* IPv4/IPv6 support */
+    int family;
    memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
+    family = sock->addr.sa.sa_family;
+    if (family != AF_INET && family != AF_INET6) {
+      char line[SOCKET_NAME_MAXLEN+1];
+      WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
+          ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
+      ret = ncclInternalError;
+      goto fail;
+    }
+    sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+
+    /* Connect to a hostname / port */
+    sock->fd = socket(family, SOCK_STREAM, 0);
+    if (sock->fd == -1) {
+      WARN("ncclSocketInit: Socket creation failed : %s", strerror(errno));
+      ret = ncclSystemError;
+      goto fail;
+    }
  } else {
    memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
  }
-  sock->abortFlag = abortFlag;
-  sock->asyncFlag = asyncFlag;
-  sock->state = ncclSocketStateNum;
-  return ncclSuccess;
-}

-static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
-  int bytes = 0;
-  *closed = 0;
-  char* data = (char*)ptr;
-  char line[SOCKET_NAME_MAXLEN+1];
-  do {
-    if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
-    if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
-    if (op == NCCL_SOCKET_RECV && bytes == 0) {
-      *closed = 1;
-      return ncclSuccess;
-    }
-    if (bytes == -1) {
-      if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
-        WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
-        return ncclRemoteError;
-      } else {
-        bytes = 0;
-      }
-    }
-    (*offset) += bytes;
-    if (sock->abortFlag && *sock->abortFlag != 0) {
-      INFO(NCCL_NET, "Socket progress: abort called");
-      return ncclInternalError;
-    }
-  } while (bytes > 0 && (*offset) < size);
-  return ncclSuccess;
+  /* Set socket as non-blocking if async or if we need to be able to abort */
+  if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) {
+    int flags;
+    EQCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), -1, ret, fail);
+    SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), ret, fail);
+  }
+
+exit:
+  return ret;
+fail:
+  /* A failure after socket() succeeded (e.g. fcntl) would otherwise leak the
+   * descriptor: release it and leave the socket without an fd. */
+  if (sock->fd >= 0) {
+    close(sock->fd);
+    sock->fd = -1;
+  }
+  goto exit;
 }

 ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
-  int closed;
-  NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
-  if (closed) {
-    char line[SOCKET_NAME_MAXLEN+1];
-    WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
-    return ncclRemoteError;
+  /* Public entry point for one progress step of a send/recv: rejects a NULL
+   * socket with ncclInvalidArgument, otherwise forwards all arguments
+   * unchanged to socketProgress() and returns ncclSuccess once that call has
+   * passed the NCCLCHECK. */
+  if (sock == NULL) {
+    WARN("ncclSocketProgress: pass NULL socket");
+    return ncclInvalidArgument;
  }
+  NCCLCHECK(socketProgress(op, sock, ptr, size, offset));
  return ncclSuccess;
 }

 ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
-  while (*offset < size)
-    NCCLCHECK(ncclSocketProgress(op, sock, ptr, size, offset));
+  /* Public entry point for a waiting send/recv: rejects a NULL socket with
+   * ncclInvalidArgument, otherwise forwards all arguments unchanged to
+   * socketWait(). The loop that used to live here is no longer part of this
+   * function. */
+  if (sock == NULL) {
+    WARN("ncclSocketWait: pass NULL socket");
+    return ncclInvalidArgument;
+  }
+  NCCLCHECK(socketWait(op, sock, ptr, size, offset));
  return ncclSuccess;
 }

 ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) {
  int offset = 0;
-  NCCLCHECK(ncclSocketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset));
+  /* Send `size` bytes from `ptr`, starting at offset 0, through socketWait().
+   * A NULL socket yields ncclInvalidArgument; a socket that is not in
+   * ncclSocketStateReady yields ncclInternalError before any data is sent. */
+  if (sock == NULL) {
+    WARN("ncclSocketSend: pass NULL socket");
+    return ncclInvalidArgument;
+  }
+  if (sock->state != ncclSocketStateReady) {
+    WARN("ncclSocketSend: socket state (%d) is not ready", sock->state);
+    return ncclInternalError;
+  }
+  NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset));
  return ncclSuccess;
 }

 ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
  int offset = 0;
-  NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset));
+  /* Receive `size` bytes into `ptr`, starting at offset 0, through
+   * socketWait(). A NULL socket yields ncclInvalidArgument; a socket that is
+   * not in ncclSocketStateReady yields ncclInternalError before any data is
+   * read. */
+  if (sock == NULL) {
+    WARN("ncclSocketRecv: pass NULL socket");
+    return ncclInvalidArgument;
+  }
+  if (sock->state != ncclSocketStateReady) {
+    WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state);
+    return ncclInternalError;
+  }
+  NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset));
  return ncclSuccess;
 }

 // Receive or detect connection closed
 ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) {
  int offset = 0;
+  /* Keeps calling socketProgressOpt() for a receive until `size` bytes have
+   * arrived or the call reports the connection as closed through `*closed`;
+   * both outcomes return ncclSuccess, so the caller must inspect `*closed`.
+   * A NULL socket yields ncclInvalidArgument.
+   * NOTE(review): `closed` is dereferenced without a NULL check, and unlike
+   * ncclSocketSend/ncclSocketRecv the socket state is not checked here -
+   * confirm both are intended. */
+  if (sock == NULL) {
+    WARN("ncclSocketTryRecv: pass NULL socket");
+    return ncclInvalidArgument;
+  }
  *closed = 0;
  while (offset < size) {
-    NCCLCHECK(ncclSocketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
+    NCCLCHECK(socketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
    if (*closed) return ncclSuccess;
  }
  return ncclSuccess;
 }
+
+/* Close the descriptor held by `sock` (if it has one), mark the socket as
+ * ncclSocketStateClosed and reset its fd to -1. A NULL socket is ignored.
+ * Always returns ncclSuccess. */
+ncclResult_t ncclSocketClose(struct ncclSocket* sock) {
+  if (sock == NULL) return ncclSuccess;
+
+  if (sock->fd >= 0) close(sock->fd);
+  sock->fd = -1;
+  sock->state = ncclSocketStateClosed;
+  return ncclSuccess;
+}
+
+/* Copy the descriptor held by `sock` into `*fd`. `fd` may be NULL, in which
+ * case nothing is written. Returns ncclInvalidArgument for a NULL socket and
+ * ncclSuccess otherwise. */
+ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd) {
+  if (sock != NULL) {
+    if (fd != NULL) *fd = sock->fd;
+    return ncclSuccess;
+  }
+  WARN("ncclSocketGetFd: pass NULL socket");
+  return ncclInvalidArgument;
+}
+
+/* Store `fd` as the descriptor of `sock`; no other field of the socket is
+ * touched. Returns ncclInvalidArgument for a NULL socket and ncclSuccess
+ * otherwise. */
+ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock) {
+  if (sock == NULL) {
+    /* Report this function's own name so the log points at the real caller. */
+    WARN("ncclSocketSetFd: pass NULL socket");
+    return ncclInvalidArgument;
+  }
+  sock->fd = fd;
+  return ncclSuccess;
+}
@@ -305,7 +305,8 @@ static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bN
 }

 ncclResult_t ncclStrongStreamWaitStream(
-    struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
+    struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b,
+    bool b_subsumes_a
  ) {
  #if CUDART_VERSION >= 11030
    if (graph.graph == nullptr) {
@@ -319,6 +320,7 @@ ncclResult_t ncclStrongStreamWaitStream(
      NCCLCHECK(checkGraphId(ag, graph.graphId));
      struct ncclStrongStreamGraph* bg = b->graphHead;
      NCCLCHECK(checkGraphId(bg, graph.graphId));
+      if (b_subsumes_a) ag->tipCount = 0;
      mergeTips(ag, bg->tipNodes, bg->tipCount);
    }
    a->serialEventNeedsRecord = true;
@@ -330,7 +332,8 @@ ncclResult_t ncclStrongStreamWaitStream(
 }

 ncclResult_t ncclStrongStreamWaitStream(
-    struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b
+    struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b,
+    bool b_subsumes_a
  ) {
  #if CUDART_VERSION >= 11030
    if (graph.graph == nullptr) {
@@ -351,6 +354,7 @@ ncclResult_t ncclStrongStreamWaitStream(
      }
      struct ncclStrongStreamGraph* ag = a->graphHead;
      NCCLCHECK(checkGraphId(ag, graph.graphId));
+      if (b_subsumes_a) ag->tipCount = 0;
      mergeTips(ag, bNodes, bCount);
    }
    a->serialEventNeedsRecord = true;
@@ -362,7 +366,8 @@ ncclResult_t ncclStrongStreamWaitStream(
 }

 ncclResult_t ncclStrongStreamWaitStream(
-    struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b
+    struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b,
+    bool b_subsumes_a
  ) {
  #if CUDART_VERSION >= 11030
    if (graph.graph == nullptr) {
@@ -374,7 +379,9 @@ ncclResult_t ncclStrongStreamWaitStream(
    } else {
      struct ncclStrongStreamGraph* bg = b->graphHead;
      NCCLCHECK(checkGraphId(bg, graph.graphId));
-      CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount, cudaStreamAddCaptureDependencies));
+      CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount,
+        b_subsumes_a ? cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies
+      ));
    }
  #else
    CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream));
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
@@ -189,7 +189,7 @@ const char* pncclGetErrorString(ncclResult_t result);
 */
 const char*  ncclGetLastError(ncclComm_t comm);
 /// @cond include_hidden
-const char* pncclGetError(ncclComm_t comm);
+const char* pncclGetLastError(ncclComm_t comm);
 /// @endcond

 /* Checks whether the comm has encountered any asynchronous errors */
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
@@ -442,8 +442,7 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)

  int stepSize = info->comm->buffSizes[op->protocol]/NCCL_STEPS;

-  // If nNodes > 1 and we're using Simple, reduce the stepSize to increase shared buffer utilization
-  if (info->comm->nNodes > 1 && op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pNetChunkSize;
+  if (op->protocol == NCCL_PROTO_SIMPLE) stepSize = info->comm->p2pChunkSize;
  info->chunkSize = stepSize;
  op->root = info->root;

@@ -537,6 +536,8 @@ static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressS
  return ncclSuccess;
 }

+NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16);
+
 static ncclResult_t ncclProxyGetPostedOps(struct ncclComm* comm, int* added) {
  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
  if (state->opsPool == NULL) return ncclInternalError;
@@ -575,9 +576,16 @@ process_nextops:
  int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
  for (int i=0; i<comm->localRanks; i++) freeOp[i] = -1;

+  uint64_t lastOpCount = 0;
+  int lastPeer = -1;
+  int count = 0;
  for (int opIndex = state->nextOps; opIndex != -1;) {
    struct ncclProxyOp* peerOp = pool->ops+opIndex;
    int peer = opIndex / MAX_OPS_PER_PEER;
+    if ((lastOpCount && peerOp->opCount != lastOpCount) || ((lastPeer != -1) && peer != lastPeer)) count++;
+    if (count == ncclParamProxyAppendBatchSize()+1) break;
+    lastOpCount = peerOp->opCount;
+    lastPeer = peer;
    if (peerOp->connection == NULL) return ncclInternalError;
    if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next);
    NCCLCHECK(ProxyAppend(state, peerOp));
@@ -681,7 +689,7 @@ void* ncclProxyProgress(void *comm_) {

  int lastIdle = 0;
  struct ncclProxyArgs profArgs; // Only used for profiling purposes
-  while (state->stop == 0 && *comm->abortFlag == 0) {
+  while ((state->stop == false || (state->stop == true && state->active)) && *comm->abortFlag == 0) {
    int idle = 1;
    ncclResult_t ret = progressOps(comm, state, state->active, &idle);
    if (ret != ncclSuccess) {
@@ -691,18 +699,17 @@ void* ncclProxyProgress(void *comm_) {
    }
    if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
    if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
-    if (idle) {
-      int added = 0;
-      TIME_START(3);
+    int added = 0;
+    TIME_START(3);
+    if (state->stop == false)
      ret = ncclProxyGetPostedOps(comm, &added);
-      if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
-      if (ret != ncclSuccess) {
-        (void) ncclCommSetAsyncError(comm, ret);
-        INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
-      }
-      if (added == 0) {
-        sched_yield(); // No request progressed. Let others run.
-      }
+    if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
+    if (ret != ncclSuccess) {
+      (void) ncclCommSetAsyncError(comm, ret);
+      INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+    }
+    if (added == 0) {
+      sched_yield(); // No request progressed. Let others run.
    }
    lastIdle = idle;
  }
@@ -819,7 +826,7 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo
    int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE;
    for (int i=0; i<max; i++) {
      ncclProxyConnection *connection = pool->pools[b]+i;
-      if (connection->initFlag == true) {
+      if (connection->state != connUninitialized) {
        NCCLCHECK(proxyFree(connection, comm));
      }
    }
@@ -832,6 +839,10 @@ static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* poo
 #include "transport.h"

 ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) {
+  struct ncclSocket* sock;
+  int ready;
+  int type = ncclProxyMsgInit;
+
  // Keep one connection per mlocal rank
  proxyConn->connection = NULL;
  proxyConn->rank = rank;
@@ -839,17 +850,18 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
    NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks));
    NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks));
    NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks));
-    for (int r=0; r<comm->localRanks; r++) {
-      NCCLCHECK(ncclSocketInit(&comm->proxyState.peerSocks[r], NULL, comm->abortFlag, 0));
+    for (int i = 0; i < comm->localRanks; ++i) {
+      NCCLCHECK(ncclSocketSetFd(-1, &comm->proxyState.peerSocks[i]));
    }
  }
+
  NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank));
-  struct ncclSocket* sock = comm->proxyState.peerSocks+proxyConn->localRank;
-  if (sock->fd == -1) {
-    memcpy(&sock->addr, comm->proxyState.peerAddresses+rank, sizeof(union ncclSocketAddress));
+  sock = comm->proxyState.peerSocks + proxyConn->localRank;
+  NCCLCHECK(ncclSocketReady(sock, &ready));
+  if (!ready) {
+    NCCLCHECK(ncclSocketInit(sock, comm->proxyState.peerAddresses+rank, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
    NCCLCHECK(ncclSocketConnect(sock));
  }
-  int type = ncclProxyMsgInit;
  NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int)));
  NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int)));
  NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
@@ -862,7 +874,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
    NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1));
    struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank;
    if (proxyOps->pool == NULL) {
-      NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0));
+      NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle));
      proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
    }
  }
@@ -873,11 +885,12 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in

 const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" };
 ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
-  if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError;
-  struct ncclSocket* sock = proxyConn->comm->proxyState.peerSocks+proxyConn->localRank;
-  if (sock->fd == -1) return ncclInternalError;
-  ncclResult_t ret;
+  struct ncclSocket* sock;
+  ncclResult_t ret = ncclSuccess;

+  if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError;
+  sock = proxyConn->comm->proxyState.peerSocks + proxyConn->localRank;
+  if (sock == NULL) return ncclInternalError;
  NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
  NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
  NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
@@ -887,7 +900,6 @@ ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void*
  return ncclSuccess;
 error:
  WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
-  sock->fd = -1;
  return ret;
 }

@@ -897,16 +909,15 @@ static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
    int size = sizeof(struct ncclProxyOpsPool);
    struct ncclProxyOpsPool* pool = NULL;

-    char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
-    shmPath[0] = '\0';
-    NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, 1));
-
-    // Init pool
-    pool->nextOps = -1;
-
    // The service thread may be launched already but localRanks may not be set yet.
    while (comm->localRanks == 0) sched_yield();

+    char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
+    shmPath[0] = '\0';
+    NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, comm->localRanks + 1, &state->handle));
+    // Init pool
+    pool->nextOps = -1;
+
    for (int r=0; r<comm->localRanks; r++) {
      pool->freeOps[r] = r*MAX_OPS_PER_PEER;
      for (int i=0; i<MAX_OPS_PER_PEER-1; i++) pool->ops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1;
@@ -933,7 +944,7 @@ static ncclResult_t proxyProgressInit(struct ncclComm* comm) {

 static void proxyOpsFree(struct ncclComm* comm) {
  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
-  if (ncclShmClose(state->opsPool, NULL, sizeof(struct ncclProxyOpsPool)) != ncclSuccess) {
+  /* Close the shared-memory segment through the handle kept in the progress
+   * state. A failure is only logged; nothing is returned to the caller. */
+  if (ncclShmClose(state->handle) != ncclSuccess) {
    WARN("[Service thread] shm close failed");
  }
 }
@@ -942,10 +953,8 @@ ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) {
  struct ncclProxyProgressState* state = &comm->proxyState.progressState;
  if (state->opsPool == NULL) return ncclSuccess;

-  char shmPath[] = "/dev/shm/nccl-XXXXXX";
-  memcpy(shmPath+sizeof("/dev/shm/nccl-")-1, state->opsPoolShmSuffix, sizeof("XXXXXX")-1);
-  if (ncclShmUnlink(shmPath) != ncclSuccess) {
-    WARN("[Service thread] shm unlink failed");
+  if (ncclShmUnlink(state->handle) != ncclSuccess) {
+    WARN("[Service thread] proxy ops shm unlink failed");
  }
  return ncclSuccess;
 }
@@ -970,7 +979,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
    NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
  }
  INFO(NCCL_NET, "New proxy %s connection %d from local rank %d, transport %d", connection->send ? "send":"recv", id, connection->localRank, connection->transport);
-  __atomic_store_n(&connection->initFlag, true, __ATOMIC_RELEASE);
+  __atomic_store_n(&connection->state, connInitialized, __ATOMIC_RELEASE);
  return ncclSuccess;
 }

@@ -985,6 +994,7 @@ static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct
  int nChannels;
  NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
  if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
+  __atomic_store_n(&connection->state, connSharedInitialized, __ATOMIC_RELEASE);
  return ncclSuccess;
 }

@@ -996,14 +1006,29 @@ static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclC
    NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
  } else return ncclInternalError;
  if (done) {
-    if (op->respSize) NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize));
-    if (op->reqBuff) free(op->reqBuff);
-    if (op->respBuff) free(op->respBuff);
-    op->reqBuff = NULL;
-    op->respBuff = NULL;
+    if (op->type == ncclProxyMsgSetup)
+      __atomic_store_n(&op->connection->state, connSetupDone, __ATOMIC_RELEASE);
+    else if (op->type == ncclProxyMsgConnect)
+      __atomic_store_n(&op->connection->state, connConnected, __ATOMIC_RELEASE);
+    /* if setup or connect is done, we should not return any error at this point since
+     * ncclSocketSend might already send the respBuff to the requester. If we still choose
+     * to abort and close the connection, it can cause segfault if the requester is using
+     * the respBuff. */
+    if (op->respSize) ncclSocketSend(op->connection->sock, op->respBuff, op->respSize);
+    if (op->reqBuff) {
+      free(op->reqBuff);
+      op->reqBuff = NULL;
+    }
+    if (op->respBuff) {
+      free(op->respBuff);
+      op->respBuff = NULL;
+    }
    op->type = 0;
    (*asyncOpCount)--;
+  } else if (*comm->abortFlag != 0) {
+    return ncclInternalError;
  }
+
  return ncclSuccess;
 }

@@ -1047,36 +1072,52 @@ void* ncclProxyService(void* _args) {
  struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
  memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS);
  for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
-    ncclSocketInit(&peers[s].sock, NULL, comm->abortFlag, 0);
    pollfds[s].fd = -1;
    pollfds[s].events = POLLHUP|POLLIN;
  }
-  pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
+  if (ncclSocketGetFd(comm->proxyState.listenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) {
+    WARN("[Proxy Service] Get listenSock fd fails\n");
+    return NULL;
+  };
  pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;

  int maxnpeers = 0;
  int npeers = 0;
  int stop = 0;
  int asyncOpCount = 0;
-  while ((stop == 0 || (stop == 1 && npeers > 0)) && *comm->abortFlag == 0) {
+  while (stop == 0 || (stop == 1 && npeers > 0)) {
+    /* Even if local comm aborts, we cannot let proxy thread exit if we still have peer
+     * connections. Need to wait until all other related comms call abort and safely exit
+     * together, or we could face segmentation fault. */
+    if (*comm->abortFlag != 0) stop = 1;
    /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */
-    if (poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500) < 0) {
-      WARN("[Proxy Service] Poll failed: %s\n", strerror(errno));
+    int ret;
+    do {
+      ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500);
+    } while (ret < 0 && errno == EINTR);
+    if (ret < 0) {
+      WARN("[Proxy Service] Poll failed: %s", strerror(errno));
      return NULL;
    }
    if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) {
      int s = 0;
-      while (s < NCCL_MAX_LOCAL_RANKS && peers[s].sock.fd != -1) s++;
+      while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++;
      if (s == NCCL_MAX_LOCAL_RANKS) {
        WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS);
        return NULL;
      }
      if (maxnpeers < s+1) maxnpeers = s+1;
-      struct ncclSocket* sock = &peers[s].sock;
-      if (ncclSocketAccept(sock, comm->proxyState.listenSock) != ncclSuccess) {
+      if (ncclSocketInit(&peers[s].sock) != ncclSuccess) {
+        WARN("[Service thread] Initialize peers[%d].sock fails\n", s);
+        return NULL;
+      }
+      if (ncclSocketAccept(&peers[s].sock, comm->proxyState.listenSock) != ncclSuccess) {
        WARN("[Service thread] Accept failed %s", strerror(errno));
      } else {
-        pollfds[s].fd = sock->fd;
+        if (ncclSocketGetFd(&peers[s].sock, &pollfds[s].fd) != ncclSuccess) {
+          WARN("[Service thread] Get peers[%d].sock fd fails\n", s);
+          return NULL;
+        }
        npeers++;
        peers[s].localRank = -1;
      }
@@ -1088,10 +1129,12 @@ void* ncclProxyService(void* _args) {
      int closeConn = 0;
      int type = 0;
      ncclResult_t res = ncclSuccess;
+
+      if (pollfds[s].fd == -1) continue;
      if (op->type != 0) {
        res = proxyProgressAsync(op, comm, &asyncOpCount);
        type = op->type;
-        if (res != ncclSuccess) op->type = 0;
+        if (res != ncclSuccess) closeConn = 1;
      } else if (pollfds[s].revents & POLLIN) {
        int closed;
        if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) {
@@ -1125,26 +1168,31 @@ void* ncclProxyService(void* _args) {
        closeConn = 1;
      }
      if (closeConn) {
-        close(sock->fd);
-        sock->fd = pollfds[s].fd = -1;
+        ncclSocketClose(sock);
+        if (op->reqBuff) {
+          free(op->reqBuff);
+          op->reqBuff = NULL;
+        }
+        if (op->respBuff) {
+          free(op->respBuff);
+          op->respBuff = NULL;
+        }
+        op->type = 0;
+        pollfds[s].fd = -1;
        npeers--;
      }
    }
  }
-  /* wait until main thread flush all NCCL operations. */
-  while (*comm->abortFlag != 0 && __atomic_load_n(&comm->proxyState.safeAbortFlag, __ATOMIC_ACQUIRE) == 0)
-    usleep(1000);

  // Wait for all operations to complete and stop progress thread before freeing any resource
  if (ncclProxyProgressDestroy(comm) != ncclSuccess) {
    WARN("[Proxy Service] proxyDestroy failed");
  }
  for (int s=0; s<maxnpeers; s++) {
-    if (peers[s].sock.fd != -1) close(peers[s].sock.fd);
+    ncclSocketClose(&peers[s].sock);
  }
  ncclProxyFreeConnections(&connectionPool, comm);
-  close(comm->proxyState.listenSock->fd);
-  free(comm->proxyState.listenSock);
+  ncclSocketClose(comm->proxyState.listenSock);
  proxyOpsFree(comm);
  return NULL;
 }
@@ -1169,32 +1217,29 @@ ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
  if (state->peerAddresses) {
    if (*comm->abortFlag == 0) {
      struct ncclSocket sock;
-      sock.abortFlag = NULL;
-      sock.asyncFlag = 0;
-      memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress));
-      NCCLCHECK(ncclSocketConnect(&sock));
      int type = ncclProxyMsgStop;
+      NCCLCHECK(ncclSocketInit(&sock, comm->proxyState.peerAddresses + comm->rank, comm->magic, ncclSocketTypeProxy, comm->abortFlag));
+      NCCLCHECK(ncclSocketConnect(&sock));
      NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
-      close(sock.fd);
-    } else {
-      /* when abortFlag is set, all socket related communications are no longer reliable. We need to
-       * set a flag to let proxy thread exit. */
-      __atomic_store_n(&state->safeAbortFlag, 1, __ATOMIC_RELEASE);
+      NCCLCHECK(ncclSocketClose(&sock));
    }
    free(state->peerAddresses);
  }
+
  if (state->peerSocks) {
    for (int i=0; i<comm->localRanks; i++) {
-      if (state->peerSocks[i].fd != -1) {
+      int fd;
+      NCCLCHECK(ncclSocketGetFd(state->peerSocks + i, &fd));
+      if (fd >= 0) {
        if (state->proxyOps[i].pool) {
-          NCCLCHECK(ncclShmClose(state->proxyOps[i].pool, NULL, sizeof(struct ncclProxyOpsPool)));
+          NCCLCHECK(ncclShmClose(state->proxyOps[i].handle));
        }
        if (state->sharedDevMems[i]) {
          CUDACHECK(cudaIpcCloseMemHandle(state->sharedDevMems[i]));
        }
        int type = ncclProxyMsgClose;
-        if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks+i, &type, sizeof(int)));
-        close(state->peerSocks[i].fd);
+        if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks + i, &type, sizeof(int)));
+        NCCLCHECK(ncclSocketClose(state->peerSocks + i));
      }
    }
    free(state->peerSocks);
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -78,9 +78,12 @@ void dumpData(struct ncclConnect* data, int ndata) {
 }

 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
+  // Stream used during transport setup; needed for P2P pre-connect + CUDA Graph
+  ncclResult_t ret = ncclSuccess;
  int highestType = TRANSPORT_P2P;  // track highest transport type
-
  struct ncclConnect data[2*MAXCHANNELS];
+
+  NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
  for (int i=1; i<comm->nRanks; i++) {
    int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
    int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
@@ -94,7 +97,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
    TIME_START(0);
    for (int c=0; c<MAXCHANNELS; c++) {
      if (recvMask & (1UL<<c)) {
-        NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type));
+        NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
        if (type > highestType) highestType = type;
      }
    }
@@ -103,7 +106,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
    struct ncclConnect* sendData = recvData+recvChannels;
    for (int c=0; c<MAXCHANNELS; c++) {
      if (sendMask & (1UL<<c)) {
-        NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type));
+        NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
        if (type > highestType) highestType = type;
      }
    }
@@ -112,16 +115,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
    TIME_START(2);
    if (sendPeer == recvPeer) {
      if (recvChannels+sendChannels) {
-         NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
-         NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
+         NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
+         NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
         sendData = data;
         recvData = data+sendChannels;
      }
    } else {
-      if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
-      if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
-      if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
-      if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
+      if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
+      if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
+      if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
+      if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
    }
    TIME_STOP(2);

@@ -129,10 +132,10 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
    for (int c=0; c<MAXCHANNELS; c++) {
      if (sendMask & (1UL<<c)) {
        struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
-        NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
+        NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail);
        conn->connected = 1;
-        CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sideStream));
-        CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sideStream));
+        CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
+        CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
      }
    }
    TIME_STOP(3);
@@ -140,18 +143,23 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
    for (int c=0; c<MAXCHANNELS; c++) {
      if (recvMask & (1UL<<c)) {
        struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
-        NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
+        NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail);
        conn->connected = 1;
-        CUDACHECK(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sideStream));
+        CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
      }
    }
    TIME_STOP(4);
    comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL;
  }
-  CUDACHECK(cudaStreamSynchronize(comm->sideStream));
+
  if (highestTransportType != NULL) *highestTransportType = highestType;
  TIME_PRINT("P2P Setup/Connect");
-  return ncclSuccess;
+exit:
+  NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream));
+  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream));
+  return ret;
+fail:
+  goto exit;
 }

 extern struct ncclTransport collNetTransport;
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -445,7 +445,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
  map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
  if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) {
    uint64_t *cpuPtr, *gpuPtr;
-    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, comm->sideStream));

    resources->gdcSync = cpuPtr;
    struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
@@ -513,7 +513,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
  map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
  if (ncclGdrCopy) {
    uint64_t *cpuPtr, *gpuPtr;
-    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, comm->sideStream));

    if (ncclParamGdrCopySyncEnable()) {
      resources->gdcSync = cpuPtr;
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
@@ -71,6 +71,7 @@ struct connectMapMem{
    char shmPath[PATH_MAX];
    cudaIpcMemHandle_t ipc;
  };
+  ncclShmHandle_t handle;
 };

 struct connectMap {
@@ -247,13 +248,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
 }

 static ncclResult_t netMapShm(struct connectMapMem* mem) {
-  NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, 0));
-  NCCLCHECK(ncclShmUnlink(mem->shmPath));
+  NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, -1, &mem->handle));
  return ncclSuccess;
 }
 static ncclResult_t netCreateShm(struct connectMapMem* mem) {
  mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
-  NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1));
+  NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1, &mem->handle));
  return ncclSuccess;
 }

@@ -362,7 +362,7 @@ static ncclResult_t sendFree(struct ncclConnector* send) {
  struct connectMap* map = (struct connectMap*)(send->transportResources);
  if (map) {
    if (map->sameProcess == 0) {
-      NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+      NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].handle));
      if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
        CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
      }
@@ -395,7 +395,7 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local
  struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
  state->refcount++;
  if (state->size == 0) {
-    state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pNetChunkSize;
+    state->size = nChannels*NCCL_SHARED_STEPS*comm->p2pChunkSize;
  }

  if (size) *size = state->size;
@@ -422,7 +422,7 @@ static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int local
 static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
  // Use different pools for different channels and also separate send/recv.
  int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
-  *offset = comm->p2pNetChunkSize * globalSlot;
+  *offset = comm->p2pChunkSize * globalSlot;
  return ncclSuccess;
 }

@@ -547,6 +547,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
    NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
    connection->proxyAppendPtr = &connection->proxyAppend;
  }
+
  if (resources->netSendComm == NULL) {
    *done = 0;
    return ncclSuccess;
@@ -605,7 +606,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
  }
  if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
    uint64_t *cpuPtr, *gpuPtr;
-    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc, comm->sideStream));

    resources->gdcSync = cpuPtr;
    struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
@@ -693,6 +694,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
    NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
    connection->proxyAppendPtr = &connection->proxyAppend;
  }
+
  if (resources->netRecvComm == NULL) {
    *done = 0;
    return ncclSuccess;
@@ -741,7 +743,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
  map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
  if (ncclGdrCopy && map->sameProcess) {
    uint64_t *cpuPtr, *gpuPtr;
-    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
+    NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc, comm->sideStream));

    if (ncclParamGdrCopySyncEnable()) {
      resources->gdcSync = cpuPtr;
@@ -794,67 +796,75 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str

 static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  struct sendResources* resources = (struct sendResources*)(connection->transportResources);
-  if (resources == NULL) { // NVB Preconnect
+  if (connection->state == connSharedInitialized) { // NVB Preconnect
    NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0));
    return ncclSuccess;
  }
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    if (resources->buffers[p]) {
-      NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
+
+  if (connection->state == connConnected) {
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      if (resources->buffers[p]) {
+        NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
+      }
    }
-  }
-  struct connectMapMem* mems = resources->map.mems;
-  if (resources->map.sameProcess) {
-    NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
-  } else {
-    NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, NULL, mems[NCCL_NET_MAP_HOSTMEM].size));
-  }
-  CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
-  if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
-  if (resources->shared) {
-    NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0));
-    if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
-      struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
-      comms->sendRefCount[resources->channelId]--;
-      if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
+    struct connectMapMem* mems = resources->map.mems;
+    if (resources->map.sameProcess) {
+      NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+    } else {
+      NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].handle));
+    }
+    CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+    if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+    if (resources->shared) {
+      NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0));
+      if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+        struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
+        comms->sendRefCount[resources->channelId]--;
+        if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
+      } else {
+        NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
+      }
    } else {
      NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
    }
-  } else {
-    NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
  }
-  free(resources);
+
+  free(resources); // also release after the connConnected teardown above; free(NULL) is a no-op
  return ncclSuccess;
 }

 static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  struct recvResources* resources = (struct recvResources*)(connection->transportResources);
-  if (resources == NULL) { // NVB Preconnect
+  if (connection->state == connSharedInitialized) { // NVB Preconnect
    NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1));
    return ncclSuccess;
  }
-  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
-    if (resources->buffers[p]) {
-      NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
+
+  if (connection->state == connConnected) {
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      if (resources->buffers[p]) {
+        NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
+      }
    }
-  }
-  struct connectMapMem* mems = resources->map.mems;
-  NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
-  CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
-  if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
-  if (resources->shared) {
-    NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1));
-    if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
-      struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
-      comms->recvRefCount[resources->channelId]--;
-      if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
+    struct connectMapMem* mems = resources->map.mems;
+    NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+    CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+    if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+    if (resources->shared) {
+      NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1));
+      if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+        struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
+        comms->recvRefCount[resources->channelId]--;
+        if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
+      } else {
+        NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
+      }
    } else {
      NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
    }
-  } else {
-    NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
  }
-  free(resources);
+  
+  free(resources); // also release after the connConnected teardown above; free(NULL) is a no-op
  return ncclSuccess;
 }

@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -59,6 +59,7 @@ struct alignas(64) ncclIbDev {
  int realPort;
  int maxQp;
  struct ncclIbMrCache mrCache;
+  int ar; // ADAPTIVE_ROUTING
 };

 #define MAX_IB_PORT 15
@@ -82,6 +83,7 @@ NCCL_PARAM(IbSl, "IB_SL", 0);
 NCCL_PARAM(IbTc, "IB_TC", 0);
 NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
 NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
+NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2);

 NCCL_PARAM(IbSockClientPortReuse, "IB_SOCK_CLIENT_PORT_REUSE", 0);
 NCCL_PARAM(IbSockServerPortReuse, "IB_SOCK_SERVER_PORT_REUSE", 0);
@@ -228,6 +230,11 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
          ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
          ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;

+          // Enable ADAPTIVE_ROUTING by default on IB networks
+          // But allow it to be overridden by an env parameter
+          ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 1 : 0;
+          if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting();
+
          pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
          ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
          pthread_detach(ncclIbAsyncThread); // will not be pthread_join()'d
@@ -320,11 +327,6 @@ failure:
  return ncclSystemError;
 }

-static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
-  memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
-  return ncclSuccess;
-}
-
 #define NCCL_NET_IB_MAX_RECVS 8

 ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
@@ -386,6 +388,7 @@ struct ncclIbCommStage {

 struct ncclIbHandle {
  union ncclSocketAddress connectAddr; // Filled by the target
+  uint64_t magic; // random number to help debugging
  struct ncclIbCommStage stage; // Used by the other side when connecting
 };

@@ -398,7 +401,7 @@ struct ncclIbRequest {
  struct ncclIbVerbs* verbs;
  int type;
  int events;
-  union ncclSocketAddress *addr;
+  struct ncclSocket* sock;
  int nreqs;
  union {
    struct {
@@ -449,6 +452,7 @@ struct ncclIbSendComm {
  struct ibv_qp* qps[NCCL_IB_MAX_QPS];
  int nqps;
  struct ibv_mr* fifoMr;
+  int ar;
 };
 // The SendFifo needs to be 32-byte aligned and each element needs
 // to be a 32-byte multiple, so that an entry does not get split and
@@ -593,8 +597,8 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
  static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
  memset(handle, 0, sizeof(struct ncclIbHandle));
  comm->dev = dev;
-  comm->sock.asyncFlag = 1; /* nonblocking socket is required by network communication. */
-  NCCLCHECK(GetSocketAddr(&comm->sock.addr));
+  handle->magic = NCCL_SOCKET_MAGIC;
+  NCCLCHECK(ncclSocketInit(&comm->sock, &ncclIbIfAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
  if (ncclParamIbSockServerPortReuse()) {
    // reuse the socket address and fd for listen system call
    if (reusedSockfd == -1) {
@@ -608,16 +612,16 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
  } else {
    NCCLCHECK(ncclSocketListen(&comm->sock));
  }
-  memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
+  NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
  *listenComm = comm;
  return ncclSuccess;
 }

 ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
  struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
-  enum ncclSocketState conState;
  struct ncclIbCommStage* stage = &handle->stage;
  struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
+  int ready;
  *sendComm = NULL;

  if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
@@ -628,20 +632,15 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
  }

  NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
-  NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, NULL, 1));
+  NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetIb, NULL, 1));
  stage->comm = comm;
  stage->state = ncclIbCommStateConnect;
  NCCLCHECK(ncclSocketConnect(&comm->sock, ncclParamIbSockClientPortReuse()));

 ib_connect_check:
  /* since ncclSocketConnect is async, we must check if connection is complete */
-  NCCLCHECK(ncclGetSocketState(&comm->sock, &conState));
-  if (conState == ncclSocketConnecting) {
-    /* expect user to call again */
-    return ncclSuccess;
-  } else if (conState == ncclSocketError) {
-    return ncclRemoteError;
-  }
+  NCCLCHECK(ncclSocketReady(&comm->sock, &ready));
+  if (!ready) return ncclSuccess;

  // IB Setup
  struct ibv_context* ctx;
@@ -653,6 +652,7 @@ ib_connect_check:
  for (int q=0; q<comm->nqps; q++) {
    NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q));
  }
+  comm->ar = ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING

  // Send my QP Info to receiver through the socket. Hope this won't block.
  struct ibv_port_attr portAttr;
@@ -704,9 +704,10 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
  struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
  struct ncclIbCommStage* stage = &lComm->stage;
  struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
+  int ready;
  *recvComm = NULL;

-  if (stage->state == ncclIbCommStateAccept) goto ib_accept;
+  if (stage->state == ncclIbCommStateAccept) goto ib_accept_check;
  if (stage->state == ncclIbCommStateRecv) goto ib_recv;
  if (stage->state == ncclIbCommStateSend) goto ib_send;
  if (stage->state != ncclIbCommStateStart) {
@@ -717,12 +718,12 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
  NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
  stage->comm = rComm;
  stage->state = ncclIbCommStateAccept;
-  NCCLCHECK(ncclSocketInit(&rComm->sock, NULL, lComm->sock.abortFlag, 1));
-
-ib_accept:
+  NCCLCHECK(ncclSocketInit(&rComm->sock));
  NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
-  if (rComm->sock.fd == -1)
-    return ncclSuccess;
+
+ib_accept_check:
+  NCCLCHECK(ncclSocketReady(&rComm->sock, &ready));
+  if (!ready) return ncclSuccess;

  struct ncclIbQpInfo remQpInfo;
  stage->state = ncclIbCommStateRecv;
@@ -825,7 +826,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest**
    if (r->type == NCCL_NET_IB_REQ_UNUSED) {
      r->verbs = verbs;
      r->events = 1;
-      r->addr = NULL;
+      r->sock = NULL;
      *req = r;
      return ncclSuccess;
    }
@@ -1000,8 +1001,8 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
  }

  struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
-  if (nreqs > 1 || reqs[0]->send.size > ncclParamIbArThreshold()) {
-    // When using adaptive routing, send the bulk of the data first as an
+  if (nreqs > 1 || (comm->ar && reqs[0]->send.size > ncclParamIbArThreshold())) {
+    // When using ADAPTIVE_ROUTING, send the bulk of the data first as an
    // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
    // completion.
    lastWr++;
@@ -1067,28 +1068,31 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh

    // Sanity checks to catch user collective call count/size mismatches
    if (size > slots[r].size) {
-      char line[SOCKET_NAME_MAXLEN+1];
+      char line[SOCKET_NAME_MAXLEN + 1];
+      union ncclSocketAddress addr;
+      ncclSocketGetAddr(&comm->sock, &addr);
      WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error, local size %d remote size %d",
-           r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size);
+        r, nreqs, tag, ncclSocketToString(&addr, line), size, slots[r].size);
      return ncclInvalidUsage;
    } // plus any potential programming errors
    else if (slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
-     char line[SOCKET_NAME_MAXLEN+1];
-     WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x",
-          r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), slots[r].size, slots[r].addr, slots[r].rkey);
+      char line[SOCKET_NAME_MAXLEN + 1];
+      union ncclSocketAddress addr;
+      ncclSocketGetAddr(&comm->sock, &addr);
+      WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkey %x",
+        r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkey);
      return ncclInternalError;
    }
    struct ncclIbRequest* req;
    NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
    req->type = NCCL_NET_IB_REQ_SEND;
-    req->addr = &comm->sock.addr;
+    req->sock = &comm->sock;
    req->verbs = &comm->verbs;
    req->nreqs = nreqs;
    req->send.size = size;
    req->send.data = data;
    req->send.lkey = mr->lkey;
    req->send.offset = 0;
-    req->addr = &comm->sock.addr;
    req->events = comm->nqps;
    *request = reqs[r] = req;

@@ -1181,7 +1185,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta
  struct ncclIbRequest* req;
  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
  req->type = NCCL_NET_IB_REQ_RECV;
-  req->addr = &comm->sock.addr;
+  req->sock = &comm->sock;
  req->nreqs = n;
  for (int i=0; i<n; i++) req->recv.sizes[i] = 0;

@@ -1220,7 +1224,7 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
  struct ncclIbRequest* req;
  NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
  req->type = NCCL_NET_IB_REQ_FLUSH;
-  req->addr = &comm->sock.addr;
+  req->sock = &comm->sock;
  struct ibv_mr* mr = (struct ibv_mr*)mhandles[last];

  struct ibv_send_wr wr;
@@ -1268,8 +1272,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
      struct ibv_wc *wc = wcs+w;
      if (wc->status != IBV_WC_SUCCESS) {
        char line[SOCKET_NAME_MAXLEN+1];
+        union ncclSocketAddress addr;
+        ncclSocketGetAddr(r->sock, &addr);
        WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
-             ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
+             ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
        return ncclRemoteError;
      }

@@ -1301,7 +1307,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
 ncclResult_t ncclIbCloseSend(void* sendComm) {
  struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
  if (comm) {
-    close(comm->sock.fd);
+    NCCLCHECK(ncclSocketClose(&comm->sock));
    for (int q=0; q<comm->nqps; q++)
      if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
    if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
@@ -1315,7 +1321,7 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
 ncclResult_t ncclIbCloseRecv(void* recvComm) {
  struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
  if (comm) {
-    close(comm->sock.fd);
+    NCCLCHECK(ncclSocketClose(&comm->sock));  // accepted socket is owned by this comm: always close it
    for (int q=0; q<comm->nqps; q++)
      if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
    if (comm->gpuFlush.enabled) {
@@ -1332,7 +1338,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
 ncclResult_t ncclIbCloseListen(void* listenComm) {
  struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm;
  if (comm) {
-    if (!ncclParamIbSockServerPortReuse() || reusedSockfd != comm->sock.fd) close(comm->sock.fd);
+    if (!ncclParamIbSockServerPortReuse() || reusedSockfd != comm->sock.fd) NCCLCHECK(ncclSocketClose(&comm->sock));  // keep the reused server socket open
    free(comm);
  }
  return ncclSuccess;
@@ -18,16 +18,16 @@

 /* Init functions */
 static int ncclNetIfs = -1;
-struct ncclSocketDev {
+struct ncclNetSocketDev {
  union ncclSocketAddress addr;
  char devName[MAX_IF_NAME_SIZE];
  char* pciPath;
 };
-static struct ncclSocketDev ncclSocketDevs[MAX_IFS];
+static struct ncclNetSocketDev ncclNetSocketDevs[MAX_IFS];

-pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
+pthread_mutex_t ncclNetSocketLock = PTHREAD_MUTEX_INITIALIZER;

-static ncclResult_t ncclSocketGetPciPath(char* devName, char** pciPath) {
+static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) {
  char devicePath[PATH_MAX];
  snprintf(devicePath, PATH_MAX, "/sys/class/net/%s/device", devName);
  // May return NULL if the file doesn't exist.
@@ -35,9 +35,9 @@ static ncclResult_t ncclSocketGetPciPath(char* devName, char** pciPath) {
  return ncclSuccess;
 }

-ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
+ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) {
  if (ncclNetIfs == -1) {
-    pthread_mutex_lock(&ncclSocketLock);
+    pthread_mutex_lock(&ncclNetSocketLock);
    if (ncclNetIfs == -1) {
      char names[MAX_IF_NAME_SIZE*MAX_IFS];
      union ncclSocketAddress addrs[MAX_IFS];
@@ -52,9 +52,9 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
        line[0] = '\0';
        addrline[SOCKET_NAME_MAXLEN] = '\0';
        for (int i=0; i<ncclNetIfs; i++) {
-          strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
-          memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union ncclSocketAddress));
-          NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
+          strcpy(ncclNetSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
+          memcpy(&ncclNetSocketDevs[i].addr, addrs+i, sizeof(union ncclSocketAddress));
+          NCCLCHECK(ncclNetSocketGetPciPath(ncclNetSocketDevs[i].devName, &ncclNetSocketDevs[i].pciPath));
          snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
              ncclSocketToString(&addrs[i], addrline));
        }
@@ -62,17 +62,17 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
        INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
      }
    }
-    pthread_mutex_unlock(&ncclSocketLock);
+    pthread_mutex_unlock(&ncclNetSocketLock);
  }
  return ncclSuccess;
 }

-ncclResult_t ncclSocketDevices(int* ndev) {
+ncclResult_t ncclNetSocketDevices(int* ndev) {
  *ndev = ncclNetIfs;
  return ncclSuccess;
 }

-static ncclResult_t ncclSocketGetSpeed(char* devName, int* speed) {
+static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) {
  *speed = 0;
  char speedPath[PATH_MAX];
  sprintf(speedPath, "/sys/class/net/%s/speed", devName);
@@ -91,12 +91,12 @@ static ncclResult_t ncclSocketGetSpeed(char* devName, int* speed) {
  return ncclSuccess;
 }

-ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
-  props->name = ncclSocketDevs[dev].devName;
-  props->pciPath = ncclSocketDevs[dev].pciPath;
+ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) {
+  props->name = ncclNetSocketDevs[dev].devName;
+  props->pciPath = ncclNetSocketDevs[dev].pciPath;
  props->guid = dev;
  props->ptrSupport = NCCL_PTR_HOST;
-  NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed));
+  NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed));
  props->latency = 0; // Not set
  props->port = 0;
  props->maxComms = 65536;
@@ -104,12 +104,6 @@ ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
  return ncclSuccess;
 }

-ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) {
-  if (dev >= ncclNetIfs) return ncclInternalError;
-  memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr));
-  return ncclSuccess;
-}
-
 /* Communication functions */

 #define MAX_SOCKETS 64
@@ -120,29 +114,30 @@ ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) {
 NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
 NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);

-enum ncclSocketCommState {
-  ncclSocketCommStateStart = 0,
-  ncclSocketCommStateConnect = 1,
-  ncclSocketCommStateAccept = 3,
-  ncclSocketCommStateSend = 4,
-  ncclSocketCommStateRecv = 5,
+enum ncclNetSocketCommState {
+  ncclNetSocketCommStateStart = 0,
+  ncclNetSocketCommStateConnect = 1,
+  ncclNetSocketCommStateAccept = 3,
+  ncclNetSocketCommStateSend = 4,
+  ncclNetSocketCommStateRecv = 5,
 };

-struct ncclSocketCommStage {
-  enum ncclSocketCommState state;
+struct ncclNetSocketCommStage {
+  enum ncclNetSocketCommState state;
  uint8_t iteration;
  struct ncclSocket* sock;
-  struct ncclSocketComm* comm;
+  struct ncclNetSocketComm* comm;
 };

-struct ncclSocketHandle {
+struct ncclNetSocketHandle {
  union ncclSocketAddress connectAddr;
+  uint64_t magic; // random number to help debugging
  int nSocks;
  int nThreads;
-  struct ncclSocketCommStage stage;
+  struct ncclNetSocketCommStage stage;
 };

-struct ncclSocketTask {
+struct ncclNetSocketTask {
  int op;
  void* data;
  int size;
@@ -152,41 +147,41 @@ struct ncclSocketTask {
  ncclResult_t result;
 };

-struct ncclSocketRequest {
+struct ncclNetSocketRequest {
  int op;
  void* data;
  int size;
  struct ncclSocket* ctrlSock;
  int offset;
  int used;
-  struct ncclSocketComm* comm;
-  struct ncclSocketTask* tasks[MAX_SOCKETS];
+  struct ncclNetSocketComm* comm;
+  struct ncclNetSocketTask* tasks[MAX_SOCKETS];
  int nSubs;
 };

-struct ncclSocketTaskQueue {
+struct ncclNetSocketTaskQueue {
  int next;
  int len;
-  struct ncclSocketTask* tasks;
+  struct ncclNetSocketTask* tasks;
 };

-struct ncclSocketThreadResources {
-  struct ncclSocketTaskQueue threadTaskQueue;
+struct ncclNetSocketThreadResources {
+  struct ncclNetSocketTaskQueue threadTaskQueue;
  int stop;
-  struct ncclSocketComm* comm;
+  struct ncclNetSocketComm* comm;
  pthread_mutex_t threadLock;
  pthread_cond_t  threadCond;
 };

-struct ncclSocketListenComm {
+struct ncclNetSocketListenComm {
  struct ncclSocket sock;
-  struct ncclSocketCommStage stage;
+  struct ncclNetSocketCommStage stage;
  int nSocks;
  int nThreads;
  int dev;
 };

-struct ncclSocketComm {
+struct ncclNetSocketComm {
  struct ncclSocket ctrlSock;
  struct ncclSocket socks[MAX_SOCKETS];
  int dev;
@@ -194,15 +189,15 @@ struct ncclSocketComm {
  int nSocks;
  int nThreads;
  int nextSock;
-  struct ncclSocketRequest requests[MAX_REQUESTS];
+  struct ncclNetSocketRequest requests[MAX_REQUESTS];
  pthread_t helperThread[MAX_THREADS];
-  struct ncclSocketThreadResources threadResources[MAX_THREADS];
+  struct ncclNetSocketThreadResources threadResources[MAX_THREADS];
 };

 void* persistentSocketThread(void *args_) {
-  struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_;
-  struct ncclSocketComm* comm = resource->comm;
-  struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue;
+  struct ncclNetSocketThreadResources* resource = (struct ncclNetSocketThreadResources*)args_;
+  struct ncclNetSocketComm* comm = resource->comm;
+  struct ncclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue;
  int nSocksPerThread = comm->nSocks / comm->nThreads;
  while (1) {
    int idle = 1;
@@ -212,7 +207,7 @@ void* persistentSocketThread(void *args_) {
      do {
        repeat = 0;
        for (int j=0; j<nSocksPerThread; j++) {
-          struct ncclSocketTask* r = myQueue->tasks+i+j;
+          struct ncclNetSocketTask* r = myQueue->tasks+i+j;
          if (r != NULL && r->used == 1 && r->offset < r->size) {
            r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
            if (r->result != ncclSuccess) {
@@ -236,7 +231,7 @@ void* persistentSocketThread(void *args_) {
  }
 }

-ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
+ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) {
  int nSocksPerThread = ncclParamSocketNsocksPerThread();
  int nThreads = ncclParamSocketNthreads();
  if (nThreads > MAX_THREADS) {
@@ -247,7 +242,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
    // Auto-detection
    int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
    char vendorPath[PATH_MAX];
-    snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclSocketDevs[dev].devName);
+    snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetSocketDevs[dev].devName);
    char* rPath = realpath(vendorPath, NULL);
    int fd = open(rPath, O_RDONLY);
    free(rPath);
@@ -285,36 +280,20 @@ end:
  return ncclSuccess;
 }

-ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) {
-  NCCLCHECK(ncclCalloc(comm, 1));
-  (*comm)->sock.fd = -1;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
-  NCCLCHECK(ncclCalloc(comm, 1));
-  (*comm)->ctrlSock.fd = -1;
-  for (int i=0; i < MAX_SOCKETS; i++) {
-    (*comm)->socks[i].fd = -1;
-  }
-  (*comm)->nextSock = 0;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
-  if (dev < 0) { // data transfer socket is based on specified dev
+ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) {
+  if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
    return ncclInternalError;
  }
-  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
-  memset(handle, 0, sizeof(struct ncclSocketHandle));
-  static_assert(sizeof(struct ncclSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
-  struct ncclSocketListenComm* comm;
-  NCCLCHECK(ncclSocketNewListenComm(&comm));
-  NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr));
-  comm->sock.asyncFlag = 1;
+  struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle;
+  memset(handle, 0, sizeof(struct ncclNetSocketHandle));
+  static_assert(sizeof(struct ncclNetSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclNetSocketHandle size too large");
+  struct ncclNetSocketListenComm* comm;
+  NCCLCHECK(ncclCalloc(&comm, 1));
+  handle->magic = NCCL_SOCKET_MAGIC;
+  NCCLCHECK(ncclSocketInit(&comm->sock, &ncclNetSocketDevs[dev].addr, handle->magic, ncclSocketTypeNetSocket, NULL, 1));
  NCCLCHECK(ncclSocketListen(&comm->sock));
-  memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
-  NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
+  NCCLCHECK(ncclSocketGetAddr(&comm->sock, &handle->connectAddr));
+  NCCLCHECK(ncclNetSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
  handle->nSocks = comm->nSocks;
  handle->nThreads = comm->nThreads;
  comm->dev = dev;
@@ -322,46 +301,41 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
  return ncclSuccess;
 }

-ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
-  if (dev < 0) { // data transfer socket is based on specified dev
+ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
+  if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev
    return ncclInternalError;
  }

-  enum ncclSocketState conState;
-  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
-  struct ncclSocketCommStage* stage = &handle->stage;
-  struct ncclSocketComm* comm = stage->comm;
+  int ready;
+  struct ncclNetSocketHandle* handle = (struct ncclNetSocketHandle*) opaqueHandle;
+  struct ncclNetSocketCommStage* stage = &handle->stage;
+  struct ncclNetSocketComm* comm = stage->comm;
  uint8_t i = stage->iteration;
  struct ncclSocket* sock = stage->sock;
  *sendComm = NULL;

-  if (stage->state == ncclSocketCommStateConnect) goto socket_connect_check;
-  if (stage->state == ncclSocketCommStateSend) goto socket_send;
+  if (stage->state == ncclNetSocketCommStateConnect) goto socket_connect_check;
+  if (stage->state == ncclNetSocketCommStateSend) goto socket_send;

-  NCCLCHECK(ncclSocketNewComm(&comm));
+  NCCLCHECK(ncclCalloc(&comm, 1));
  stage->comm = comm;
  comm->nSocks = handle->nSocks;
  comm->nThreads = handle->nThreads;
  comm->dev = dev;
  CUDACHECK(cudaGetDevice(&comm->cudaDev));
  for (; i<comm->nSocks+1; i++) {
-    sock = i == comm->nSocks ? &comm->ctrlSock : comm->socks+i;
-    NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, NULL, 1));
+    sock = (i == comm->nSocks) ? &comm->ctrlSock : comm->socks+i;
+    NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, handle->magic, ncclSocketTypeNetSocket, NULL, 1));

    stage->sock = sock;
-    stage->state = ncclSocketCommStateConnect;
+    stage->state = ncclNetSocketCommStateConnect;
    stage->iteration = i;
    NCCLCHECK(ncclSocketConnect(sock));

 socket_connect_check:
-    NCCLCHECK(ncclGetSocketState(sock, &conState));
-    if (conState == ncclSocketConnecting) {
-      /* expect user to call again */
-      return ncclSuccess;
-    } else if (conState == ncclSocketError) {
-      return ncclRemoteError;
-    }
-    stage->state = ncclSocketCommStateSend;
+    NCCLCHECK(ncclSocketReady(sock, &ready));
+    if (! ready) return ncclSuccess;
+    stage->state = ncclNetSocketCommStateSend;

 socket_send:
    int done = 0;
@@ -372,59 +346,63 @@ socket_send:
  return ncclSuccess;
 }

-ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
-  struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm;
-  struct ncclSocketCommStage* stage = &lComm->stage;
-  struct ncclSocketComm* rComm = stage->comm;
+ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm) {
+  struct ncclNetSocketListenComm* lComm = (struct ncclNetSocketListenComm*)listenComm;
+  struct ncclNetSocketCommStage* stage = &lComm->stage;
+  struct ncclNetSocketComm* rComm = stage->comm;
  uint8_t i = stage->iteration;
  struct ncclSocket* sock = stage->sock;
+  int ready;

  *recvComm = NULL;
-  if (stage->state == ncclSocketCommStateAccept) goto socket_accept;
-  if (stage->state == ncclSocketCommStateRecv) goto socket_recv;
+  if (stage->state == ncclNetSocketCommStateAccept) goto socket_accept_check;
+  if (stage->state == ncclNetSocketCommStateRecv) goto socket_recv;

-  NCCLCHECK(ncclSocketNewComm(&rComm));
+  NCCLCHECK(ncclCalloc(&rComm, 1));
  stage->comm = rComm;
  rComm->nSocks = lComm->nSocks;
  rComm->nThreads = lComm->nThreads;
  rComm->dev = lComm->dev;
  CUDACHECK(cudaGetDevice(&rComm->cudaDev));
-  lComm->sock.asyncFlag = 1;
  for (; i<rComm->nSocks+1; i++) {
    uint8_t sendSockIdx;
-    ncclCalloc(&sock, 1);
-    NCCLCHECK(ncclSocketInit(sock, NULL, lComm->sock.abortFlag, 1));
-    stage->sock = sock;
-    stage->state = ncclSocketCommStateAccept;
-    stage->iteration = i;
-socket_accept:
-    NCCLCHECK(ncclSocketAccept(sock, &lComm->sock));
-    if (sock->fd == -1) return ncclSuccess;

-    stage->state = ncclSocketCommStateRecv;
+    NCCLCHECK(ncclCalloc(&sock, 1));
+    NCCLCHECK(ncclSocketInit(sock));
+    stage->sock = sock;
+    stage->state = ncclNetSocketCommStateAccept;
+    stage->iteration = i;
+    NCCLCHECK(ncclSocketAccept(sock, &lComm->sock));
+
+socket_accept_check:
+    NCCLCHECK(ncclSocketReady(sock, &ready));
+    if (!ready) return ncclSuccess;
+
+    stage->state = ncclNetSocketCommStateRecv;
 socket_recv:
    int done = 0;
    NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done));
    if (done == 0) return ncclSuccess;

-    if (sendSockIdx == rComm->nSocks) memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket));
-    else memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));
-
+    if (sendSockIdx == rComm->nSocks)
+      memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket));
+    else
+      memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));
    free(sock);
  }
  *recvComm = rComm;

  /* reset lComm state */
-  stage->state = ncclSocketCommStateStart;
+  stage->state = ncclNetSocketCommStateStart;
  stage->iteration = 0;
  stage->sock = NULL;
  stage->comm = NULL;
  return ncclSuccess;
 }

-ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketRequest** req) {
+ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketRequest** req) {
  for (int i=0; i<MAX_REQUESTS; i++) {
-    struct ncclSocketRequest* r = comm->requests+i;
+    struct ncclNetSocketRequest* r = comm->requests+i;
    if (r->used == 0) {
      r->op = op;
      r->data = data;
@@ -441,10 +419,10 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat
  return ncclInternalError;
 }

-ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) {
+ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketTask** req) {
  int tid = comm->nextSock % comm->nThreads;
-  struct ncclSocketThreadResources* res = comm->threadResources+tid;
-  struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
+  struct ncclNetSocketThreadResources* res = comm->threadResources+tid;
+  struct ncclNetSocketTaskQueue* queue = &res->threadTaskQueue;
  // create helper threads and prepare per-thread task queue
  if (queue->tasks == NULL) {
    // each request can be divided up to nSocks tasks, and
@@ -459,12 +437,12 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
    pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
    ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
  }
-  struct ncclSocketTask* r = queue->tasks+queue->next;
+  struct ncclNetSocketTask* r = queue->tasks+queue->next;
  if (r->used == 0) {
    r->op = op;
    r->data = data;
    r->size = size;
-    r->sock = comm->socks+comm->nextSock;
+    r->sock = comm->socks + comm->nextSock;
    r->offset = 0;
    r->result = ncclSuccess;
    comm->nextSock = (comm->nextSock + 1) % comm->nSocks;
@@ -480,9 +458,9 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
  return ncclInternalError;
 }

-ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
+ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) {
  *done = 0;
-  struct ncclSocketRequest *r = (struct ncclSocketRequest*)request;
+  struct ncclNetSocketRequest *r = (struct ncclNetSocketRequest*)request;
  if (r == NULL) {
    WARN("NET/Socket : test called with NULL request");
    return ncclInternalError;
@@ -500,9 +478,11 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
    // Check size is less or equal to the size provided by the user
    if (r->op == NCCL_SOCKET_RECV && data > r->size) {
      char line[SOCKET_NAME_MAXLEN+1];
+      union ncclSocketAddress addr = {};  // zeroed: the lookup result below is unchecked, so never format stack garbage
+      ncclSocketGetAddr(r->ctrlSock, &addr);
      WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \
          there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks",
-          ncclSocketToString(&r->ctrlSock->addr, line), data, r->size);
+          ncclSocketToString(&addr, line), data, r->size);
      return ncclInvalidUsage;
    }
    r->size = data;
@@ -515,7 +495,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
      int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
      while (chunkOffset < r->size) {
        int chunkSize = std::min(taskSize, r->size-chunkOffset);
-        NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
+        NCCLCHECK(ncclNetSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
        chunkOffset += chunkSize;
      }
    }
@@ -525,7 +505,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
    if (r->nSubs > 0) {
      int nCompleted = 0;
      for (int i=0; i<r->nSubs; i++) {
-        struct ncclSocketTask* sub = r->tasks[i];
+        struct ncclNetSocketTask* sub = r->tasks[i];
        if (sub->result != ncclSuccess) return sub->result;
        if (sub->offset == sub->size) nCompleted++;
      }
@@ -534,7 +514,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
        *done = 1;
        r->used = 0;
        for (int i=0; i<r->nSubs; i++) {
-          struct ncclSocketTask* sub = r->tasks[i];
+          struct ncclNetSocketTask* sub = r->tasks[i];
          sub->used = 0;
        }
      }
@@ -552,43 +532,45 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
  return ncclSuccess;
 }

-ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+ncclResult_t ncclNetSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
  return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
 }
-ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
+ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }

-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
-  struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
-  NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request));
+ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+  struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm;
+  NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclNetSocketRequest**)request));
  return ncclSuccess;
 }

-ncclResult_t ncclSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
-  struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
+ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+  struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm;
  if (n != 1) return ncclInternalError;
-  NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclSocketRequest**)request));
+  NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclNetSocketRequest**)request));
  return ncclSuccess;
 }

-ncclResult_t ncclSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
+ncclResult_t ncclNetSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
  // We don't support CUDA pointers, so we don't need a flush operation
  return ncclInternalError;
 }

-ncclResult_t ncclSocketCloseListen(void* opaqueComm) {
-  struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm;
+ncclResult_t ncclNetSocketCloseListen(void* opaqueComm) {
+  struct ncclNetSocketListenComm* comm = (struct ncclNetSocketListenComm*)opaqueComm;
  if (comm) {
-    if (comm->sock.fd != -1) close(comm->sock.fd);
+    int ready;
+    NCCLCHECK(ncclSocketReady(&comm->sock, &ready));
+    if (ready) NCCLCHECK(ncclSocketClose(&comm->sock));
    free(comm);
  }
  return ncclSuccess;
 }

-ncclResult_t ncclSocketClose(void* opaqueComm) {
-  struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm;
+ncclResult_t ncclNetSocketClose(void* opaqueComm) {
+  struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)opaqueComm;
  if (comm) {
    for (int i=0; i<comm->nThreads; i++) {
-      struct ncclSocketThreadResources* res = comm->threadResources+i;
+      struct ncclNetSocketThreadResources* res = comm->threadResources+i;
      if (comm->helperThread[i]) {
        pthread_mutex_lock(&res->threadLock);
        res->stop = 1;
@@ -598,9 +580,12 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
      }
      free(res->threadTaskQueue.tasks);
    }
-    if (comm->ctrlSock.fd != -1) close(comm->ctrlSock.fd);
+    int ready;
+    NCCLCHECK(ncclSocketReady(&comm->ctrlSock, &ready));
+    if (ready) NCCLCHECK(ncclSocketClose(&comm->ctrlSock));
    for (int i=0; i<comm->nSocks; i++) {
-      if (comm->socks[i].fd != -1) close(comm->socks[i].fd);
+      NCCLCHECK(ncclSocketReady(&comm->socks[i], &ready));
+      if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i]));
    }
    free(comm);
  }
@@ -609,20 +594,20 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {

 ncclNet_t ncclNetSocket = {
  "Socket",
-  ncclSocketInit,
-  ncclSocketDevices,
-  ncclSocketGetProperties,
-  ncclSocketListen,
-  ncclSocketConnect,
-  ncclSocketAccept,
-  ncclSocketRegMr,
+  ncclNetSocketInit,
+  ncclNetSocketDevices,
+  ncclNetSocketGetProperties,
+  ncclNetSocketListen,
+  ncclNetSocketConnect,
+  ncclNetSocketAccept,
+  ncclNetSocketRegMr,
  NULL, // No DMA-BUF support
-  ncclSocketDeregMr,
-  ncclSocketIsend,
-  ncclSocketIrecv,
-  ncclSocketIflush,
-  ncclSocketTest,
-  ncclSocketClose,
-  ncclSocketClose,
-  ncclSocketCloseListen
+  ncclNetSocketDeregMr,
+  ncclNetSocketIsend,
+  ncclNetSocketIrecv,
+  ncclNetSocketIflush,
+  ncclNetSocketTest,
+  ncclNetSocketClose,
+  ncclNetSocketClose,
+  ncclNetSocketCloseListen
 };
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -37,6 +37,7 @@ struct p2pProxyInfo {
  struct p2pShm* devShm;
  char shmName[7];
  int shmSize;
+  ncclShmHandle_t handle;

  // Intermediate step for sender
  struct ncclRecvMem* ceRecvMem;
@@ -67,6 +68,7 @@ struct p2pRecvResources {
  struct p2pShm* shm;
  struct p2pShm* devShm;
  int shmSize;
+  ncclShmHandle_t handle;
 };

 #include <sys/types.h>
@@ -379,9 +381,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
    sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
    TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
    resources->shmSize = info->shmSize;
-    NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, 0));
-    // Remove the file to ensure proper clean-up
-    NCCLCHECK(ncclShmUnlink(shmPath));
+    NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, -1, &resources->handle));

    recv->conn.tail = &resources->devShm->recvMem.tail;
    recv->conn.head = &resources->devShm->sendMem.head;
@@ -424,7 +424,7 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
    if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
    if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
    if (useMemcpy) {
-      NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize));
+      NCCLCHECK(ncclShmClose(resources->handle));
    }
    free(resources);
  }
@@ -442,7 +442,7 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
    char shmPath[PATH_MAX];
    shmPath[0] = '\0';
    proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
-    NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1));
+    NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1, &proxyInfo->handle));
    TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
    memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));

@@ -505,7 +505,7 @@ static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, str
  if (useMemcpy) {
    struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
    if (proxyInfo) {
-      NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize));
+      NCCLCHECK(ncclShmClose(proxyInfo->handle));
      NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
      CUDACHECK(cudaFree(proxyInfo->ceDevBuff));
      CUDACHECK(cudaStreamDestroy(proxyInfo->stream));
@@ -17,18 +17,22 @@ struct shmSendResources {
  int remShmSize;
  struct ncclRecvMem* remHostMem;
  struct ncclRecvMem* devRemHostMem;
+  ncclShmHandle_t remHandle;
  int shmSize;
  struct ncclSendMem* hostMem;
  struct ncclSendMem* devHostMem;
+  ncclShmHandle_t hostHandle;
 };

 struct shmRecvResources {
  int remShmSize;
  struct ncclSendMem* remHostMem;
  struct ncclSendMem* devRemHostMem;
+  ncclShmHandle_t remHandle;
  int shmSize;
  struct ncclRecvMem* hostMem;
  struct ncclRecvMem* devHostMem;
+  ncclShmHandle_t hostHandle;
 };

 #define SHM_SEND_SIDE 1
@@ -84,7 +88,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
  }
  info->shmSize = resources->shmSize = shmSize;
-  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
  memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));

@@ -107,7 +111,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr
    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
  }
  info->shmSize = resources->shmSize = shmSize;
-  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+  NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1, &resources->hostHandle));
  TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
  memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));

@@ -137,9 +141,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co
  sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
  resources->remShmSize = info->shmSize;
  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
-  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
-  // Remove the file to ensure proper clean-up
-  NCCLCHECK(ncclShmUnlink(shmPath));
+  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));

  char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -172,8 +174,7 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
  sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
  resources->remShmSize = info->shmSize;
  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
-  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
-  NCCLCHECK(ncclShmUnlink(shmPath));
+  NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, -1, &resources->remHandle));

  char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
@@ -196,8 +197,8 @@ static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* co
 static ncclResult_t shmSendFree(struct ncclConnector* send) {
  struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
  if (resources) {
-    NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
-    NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+    NCCLCHECK(ncclShmClose(resources->hostHandle));
+    NCCLCHECK(ncclShmClose(resources->remHandle));
    free(resources);
  }
  return ncclSuccess;
@@ -206,8 +207,8 @@ static ncclResult_t shmSendFree(struct ncclConnector* send) {
 static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
  if (resources) {
-    NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
-    NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+    NCCLCHECK(ncclShmClose(resources->hostHandle));
+    NCCLCHECK(ncclShmClose(resources->remHandle));
    free(resources);
  }
  return ncclSuccess;
@@ -6,7 +6,7 @@ endif
 HIPCC = $(HIP_PATH)/bin/hipcc

 EXE = topo_expl
-CXXFLAGS = -g -O3 -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE
+CXXFLAGS = -g -O3 -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL

 files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
 	hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc
@@ -17,10 +17,9 @@ $(EXE): $(files)
 	$(HIPCC) $(CXXFLAGS) $^ -o $@

+# hipify: stage a private copy of the RCCL sources under hipify_rccl/ and convert it in place.
+# cp -a copies the complete include/ and graph/ trees (sub-directories included), whereas the
+# hipify-perl globs below only match entries directly inside hipify_rccl/include (*.h) and
+# hipify_rccl/graph; files in nested directories are copied but not passed to hipify-perl.
 hipify:
-	mkdir -p hipify_rccl/include
-	mkdir -p hipify_rccl/graph
-	cp ../../src/include/*.h hipify_rccl/include/
-	cp ../../src/graph/* hipify_rccl/graph/
+	mkdir -p hipify_rccl
+	cp -a ../../src/include/ hipify_rccl/
+	cp -a ../../src/graph/ hipify_rccl/
 	hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
 	hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*

@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -45,4 +45,12 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
 ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph);

+// Empty definitions of the TIME_* macros: each one expands to nothing, so any
+// TIME_START/TIME_STOP/TIME_CANCEL/TIME_PRINT call site compiles to an empty statement.
+// NOTE(review): presumably stand-ins for timing instrumentation defined elsewhere - confirm.
+#define TIME_START(index)
+
+#define TIME_STOP(index)
+
+#define TIME_CANCEL(index)
+
+#define TIME_PRINT(name)
+
 #endif
@@ -1,6 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -245,7 +245,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
  struct ncclChannel* channel = &comm->channels[channelId];
-  uint32_t mask = 1 << channelId;
+  uint64_t mask = 1UL << channel->id;
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
@@ -270,72 +270,87 @@ void dumpData(struct ncclConnect* data, int ndata) {

+// Processes the pending per-peer channel masks recorded in comm->connectRecv / comm->connectSend.
+//
+// For every ring distance i in [1, nRanks) the receive peer is (rank - i) and the send peer is
+// (rank + i), both modulo nRanks. selectTransport<0>/<1> is called for each channel whose bit is
+// set in the corresponding mask, the connectors are flagged as connected, and both masks are
+// reset to 0. In this version the bootstrap exchange, the transport connect() calls and the
+// device copies are commented out, so the ncclConnect entries gathered in `data` are not
+// exchanged with peers.
+//
+// graph                 - topology graph forwarded to selectTransport; may be NULL (tag offset 0).
+// connIndex             - connector slot to set up; NCCL_CONN_IDX_P2P_NET selects its own mask range.
+// highestTransportType  - optional out parameter receiving the highest transport type selected.
+// Returns ncclSuccess, or the error stored in `ret` by a failing selectTransport call.
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
  // Stream used during transport setup; need for P2P pre-connect + CUDA Graph
+  ncclResult_t ret = ncclSuccess;
  int highestType = TRANSPORT_P2P;  // track highest transport type
-
-  //hipStream_t transportSetupStream;
-  //CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
-
  struct ncclConnect data[2*MAXCHANNELS];
+
+  //NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->hostStream), ret, fail);
+  // Walk the peers by ring distance i: receive from rank-i, send to rank+i (mod nRanks).
  for (int i=1; i<comm->nRanks; i++) {
    int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
    int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
    int sendPeer = (comm->rank + i) % comm->nRanks;
-    uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
-    uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
+    // Bit c of each mask selects channel c for setup with this peer.
+    uint64_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
+    uint64_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];

    struct ncclConnect* recvData = data;
    int sendChannels = 0, recvChannels = 0;
    int type;
+    TIME_START(0);
    for (int c=0; c<MAXCHANNELS; c++) {
-      if (recvMask & (1<<c)) {
-        NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type));
+      if (recvMask & (1UL<<c)) {
+        NCCLCHECKGOTO(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type), ret, fail);
        if (type > highestType) highestType = type;
      }
    }
+    TIME_STOP(0);
+    TIME_START(1);
+    // Send entries are stored in `data` right after the receive entries.
    struct ncclConnect* sendData = recvData+recvChannels;
    for (int c=0; c<MAXCHANNELS; c++) {
-      if (sendMask & (1<<c)) {
-        NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type));
+      if (sendMask & (1UL<<c)) {
+        NCCLCHECKGOTO(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex, &type), ret, fail);
        if (type > highestType) highestType = type;
      }
    }
+    TIME_STOP(1);

+    TIME_START(2);
+    // The bootstrap send/recv calls are commented out; only the pointer swap for the
+    // same-peer case is still executed.
    if (sendPeer == recvPeer) {
      if (recvChannels+sendChannels) {
-         //NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
-         //NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
+         //NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
+         //NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)), ret, fail);
         sendData = data;
         recvData = data+sendChannels;
      }
    } else {
-      //if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
-      //if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
-      //if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
-      //if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
+      //if (recvChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
+      //if (sendChannels) NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
+      //if (sendChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels), ret, fail);
+      //if (recvChannels) NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels), ret, fail);
    }
+    TIME_STOP(2);

+    TIME_START(3);
+    // connect() and the device copy are commented out; the connector is only flagged as connected.
    for (int c=0; c<MAXCHANNELS; c++) {
-      if (sendMask & (1<<c)) {
+      if (sendMask & (1UL<<c)) {
        struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
-        //NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
+        //NCCLCHECKGOTO(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn), ret, fail);
        conn->connected = 1;
-        //CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
+        //CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
      }
    }
+    TIME_STOP(3);
+    TIME_START(4);
    for (int c=0; c<MAXCHANNELS; c++) {
-      if (recvMask & (1<<c)) {
+      if (recvMask & (1UL<<c)) {
        struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
-        //NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
+        //NCCLCHECKGOTO(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn), ret, fail);
        conn->connected = 1;
-        //CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
+        //CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->hostStream.cudaStream), ret, fail);
      }
    }
-    comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0;
+    TIME_STOP(4);
+    // Both masks have been handled for this peer pair; clear them.
+    comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL;
  }
-  //CUDACHECK(hipStreamSynchronize(transportSetupStream));
-  //CUDACHECK(hipStreamDestroy(transportSetupStream));
+
  if (highestTransportType != NULL) *highestTransportType = highestType;
-  return ncclSuccess;
+  TIME_PRINT("P2P Setup/Connect");
+exit:
+  //NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->deviceStream, &comm->hostStream));
+  //NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->hostStream));
+  return ret;
+fail:
+  // No extra cleanup is performed on failure; control just joins the common exit path.
+  goto exit;
 }

 extern struct ncclTransport collNetTransport;
@@ -466,32 +481,120 @@ RCCL_PARAM(PivotAlltoallEnable, "PIVOT_ALLTOALL_ENABLE", 0);
 NCCL_PARAM(AllocP2pNetLLBuffers, "NCCL_ALLOC_P2P_NET_LL_BUFFERS", 0);
 RCCL_PARAM(LL128ForceEnable, "LL128_FORCE_ENABLE", 0);

+// Tries to set up CollNet for `comm` using the channels described by collNetGraph.
+//
+// Steps, in order:
+//   1. Collect one head rank per CollNet channel (entry 0 of each channel's intra list).
+//   2. For every communicator channel and every head, set up the collNetRecv connector and,
+//      if nothing has failed so far, the collNetSend connector. The accumulated failure flag
+//      is passed to ncclTransportCollNetCheck after the first channel and after all channels.
+//   3. Log the per-channel chain, then connect the collnetChain and collnetDirect peers through
+//      ncclTransportP2pConnect / ncclTransportP2pSetup for connection indices 0 and 1.
+//
+// Returns ncclSuccess, or the error stored in `ret` by the failing NCCLCHECKGOTO call. On
+// failure ncclTransportCollNetFree(comm) is called and comm->collNetSupport is reset to 0.
+// `heads` is freed on every path.
+static ncclResult_t collNetTrySetup(ncclComm_t comm, struct ncclTopoGraph* collNetGraph) {
+  ncclResult_t ret = ncclSuccess;
+  int* heads = NULL;
+  int rank = comm->rank;
+  int collNetSetupFail = 0;
+  // Only consumed by the disabled (#if 0) transport-type exchange further down.
+  int highestTypes[NCCL_MAX_LOCAL_RANKS] = { TRANSPORT_P2P };
+  // Find all head ranks
+  int nHeads = collNetGraph->nChannels;
+  int highestTransportType0, highestTransportType1;
+  char line[1024];
+
+  NCCLCHECKGOTO(ncclCalloc(&heads, nHeads), ret, fail);
+  // Head GPU index is always 0
+  for (int c = 0; c < nHeads; c++) {
+    heads[c] = collNetGraph->intra[c * comm->localRanks + 0];
+  }
+
+  for (int c = 0; c < comm->nChannels; c++) {
+    struct ncclChannel* channel = comm->channels + c;
+    for (int h = 0; h < nHeads; h++) {
+      const int head = heads[h];
+      // Accumulate failures: a plain assignment would let a later head (or channel) that
+      // succeeds hide an earlier failure from ncclTransportCollNetCheck.
+      collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv);
+      if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend);
+    }
+    // Verify CollNet setup across ranks after trying the first channel
+    if (c == 0) {
+      NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail);
+    }
+  }
+  // Verify CollNet setup across ranks after trying all channels
+  NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, fail);
+  TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
+
+  // Build a one-line summary of every channel's chain (down -> rank -> up) for the log.
+  line[0] = '\0';
+  for (int c = 0; c < comm->nChannels; c++) {
+    struct ncclTree* chain = &comm->channels[c].collnetChain;
+    snprintf(line + strlen(line), 1023 - strlen(line), " [%d] %d->%d->%d",
+      c, chain->down[0], rank, chain->up);
+  }
+  line[1023] = '\0';
+
+  INFO(NCCL_INIT, "Collnet Chains %s", line);
+  // Connect Collnet + chain
+  for (int c = 0; c < comm->nChannels; c++) {
+    struct ncclChannel* channel = comm->channels + c;
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, fail);
+  }
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0), ret, fail);
+  for (int c = 0; c < comm->nChannels; c++) {
+    struct ncclChannel* channel = comm->channels + c;
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, fail);
+  }
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1), ret, fail);
+  INFO(NCCL_INIT, "Connected collnet + chain");
+
+  // Connect intra-node CollNet + Direct
+  for (int c = 0; c < comm->nChannels; c++) {
+    struct ncclChannel* channelRecv = comm->channels + c;
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail);
+  }
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 0, &highestTransportType0), ret, fail);
+
+  for (int c = 0; c < comm->nChannels; c++) {
+    struct ncclChannel* channelSend = comm->channels + c;
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail);
+  }
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, collNetGraph, 1, &highestTransportType1), ret, fail);
+
+#if 0
+  // Exchange highest intra-node transport type among ranks
+  // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
+  comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
+  NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)), ret, fail);
+  for (int i = 0; i < comm->localRanks; i++) {
+    if (highestTypes[i] > comm->intraHighestTransportType)
+      comm->intraHighestTransportType = highestTypes[i];
+  }
+#endif
+  INFO(NCCL_INIT, "rank %d Connected CollNet", rank);
+
+exit:
+  free(heads);
+  return ret;
+fail:
+  // Undo any partial CollNet setup and mark CollNet as unsupported for this communicator.
+  ncclTransportCollNetFree(comm);
+  comm->collNetSupport = 0;
+  goto exit;
+}
+
 ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
  struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
  // We use 2 AllGathers
  // 1. { peerInfo, comm, compCap}
  // 2. { nChannels, graphInfo, topoRanks }
-
+  ncclResult_t ret = ncclSuccess;
  int rank = comm->rank;
  int nranks = comm->nRanks;
  //uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
+  cpu_set_t affinitySave;
  //TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
-  // [RCCL] Collect the PID of the root
-  int rootPid;
-  //NCCLCHECK(bootstrapInit(commId, comm));
-  // [/RCCL]
+  //NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)commId, comm), ret, fail);

  // AllGather1 - begin
-  //NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
-  //NCCLCHECK(fillInfo(comm, comm->peerInfo+rank, comm->rank));
-  //NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)));
+  //NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root
+  //NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, commHash), ret, fail);
+  //NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);

  //If virtualId == -1 multiRank support has not been requested by user, using original interface
  if (comm->virtualId == -1) {
    for (int i = 0; i < nranks; i++) {
      if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
-	WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
-	return ncclInvalidUsage;
+        WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
+        ret = ncclInvalidUsage;
+        goto fail;
      }
    }
  }
@@ -512,8 +615,43 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
  }
  // AllGather1 - end

+  do {
+    // Compute intra-process ranks
+    int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+    for (int i = 0; i < nranks; i++) {
+      if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
+          && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
+        // Rank is in same process
+        if (intraProcRanks == 0) intraProcRank0 = i;
+        if (i == rank) intraProcRank = intraProcRanks;
+        intraProcRanks++;
+        if (intraProcRank0 == rank && rank != i) {
+          comm->peerInfo[i].comm->intraNext = comm->intraNext;
+          comm->intraNext = comm->peerInfo[i].comm;
+        }
+      }
+    }
+    TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+        rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
+    if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
+      WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+          rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+          intraProcRank, intraProcRanks, intraProcRank0);
+      ret = ncclInternalError;
+      goto fail;
+    }
+    struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
+    assert(intraProcRank==0 ? comm==comm0 : true);
+    comm->intraComm0 = comm0;
+    comm->intraRank = intraProcRank;
+    comm->intraRanks = intraProcRanks;
+    comm->intraBarrierPhase = 0;
+    comm->intraBarrierCounter = 0;
+    comm->intraBarrierGate = 0;
+  } while(0);
+
  // Topo detection / System graph creation
-  //NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
+  //NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail);
  // save nRanks to ncclTopoSystem as indicator of multi-node
  comm->topo->nRanks = comm->nRanks;
  // init netGdrLevel
@@ -524,57 +662,54 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
  // LL128
  comm->topo->ll128Enabled = false;
  // Compute paths between GPUs and NICs
-  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
+  NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
  // Remove inaccessible GPUs and unused NICs
-  NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
+  NCCLCHECKGOTO(ncclTopoTrimSystem(comm->topo, comm), ret, fail);
  // Recompute paths after trimming
-  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
+  NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
  // Init search
-  NCCLCHECK(ncclTopoSearchInit(comm->topo));
+  NCCLCHECKGOTO(ncclTopoSearchInit(comm->topo), ret, fail);
  // Print final topology
-  NCCLCHECK(ncclTopoPrint(comm->topo));
+  NCCLCHECKGOTO(ncclTopoPrint(comm->topo), ret, fail);

  // Set Affinity to a CPU local the our GPU, so that all memory we allocate
  // on the host is local.
-  //NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity));
-  //cpu_set_t affinitySave;
- // if (CPU_COUNT(&comm->cpuAffinity)) {
-    //sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-    //sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+  //NCCLCHECKGOTO(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity), ret, fail);
+  //if (CPU_COUNT(&comm->cpuAffinity)) {
+  //  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  //  sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
  //}
-  ncclResult_t ret;

  // Launch proxy service thread
-  //NCCLCHECK(ncclProxyCreate(comm));
+  //NCCLCHECKGOTO(ncclProxyCreate(comm), ret, fail);

  // Get rings and trees
-  //struct ncclTopoGraph ringGraph;
  ringGraph.id = 0;
  ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
  ringGraph.collNet = 0;
  ringGraph.minChannels = 1;
  ringGraph.maxChannels = MAXCHANNELS/2;
-  NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
-  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));
+  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &ringGraph), ret, fail);

-  //struct ncclTopoGraph treeGraph;
  treeGraph.id = 1;
  treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
  treeGraph.collNet = 0;
  treeGraph.minChannels = comm->topo->nodes[NET].count != 0 ? 1 : ringGraph.nChannels;
  treeGraph.maxChannels = ringGraph.nChannels;
-  NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
-  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
+  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &treeGraph), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &treeGraph), ret, fail);

-  //struct ncclTopoGraph collNetGraph;
  collNetGraph.id = 2;
  collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
  collNetGraph.collNet = 1;
  collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
-  NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
-  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
+  NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);

-  bool allXgmi = true, hasPeerAccess = true;
+  bool allXgmi, hasPeerAccess;
+  allXgmi = true;
+  hasPeerAccess = true;
  // Check that all the GPUs have peer access to one another and are XGMI connected
  for (int i = 0; i < nranks && hasPeerAccess; i++) {
    int cudaDev1 = comm->peerInfo[i].cudaDev;
@@ -599,7 +734,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t

  if (comm->rank == ncclParamGraphDumpFileRank()) {
    struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
-    NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
+    NCCLCHECKGOTO(ncclTopoDumpGraphs(comm->topo, 3, graphs), ret, fail);
  }

  // Determine local CollNet support before all-gather
@@ -623,31 +758,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
      INFO(NCCL_INIT, "RCCL force disabled same node P2P over network");
  }
  // AllGather3 - begin
-#if 0
-  struct ncclGraphInfo {
-    int pattern;
-    int nChannels;
-    int sameChannels;
-    float bwIntra;
-    float bwInter;
-    int typeIntra;
-    int typeInter;
-  };
-
-  struct {
-    int netDev;
-    int collNetSupport;
-    int nc;
-    struct ncclGraphInfo tree;
-    struct ncclGraphInfo ring;
-    struct ncclGraphInfo collNet;
-    struct ncclTopoRanks topoRanks;
-    bool pivotA2AEnabled;
-    bool ll128Enabled;
-  } *allGather3Data;
-
-  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
-#endif
+  //NCCLCHECKGOTO(ncclCalloc(&allGather3Data, nranks), ret, fail);
  int idx;
  NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, comm->busId, &idx));
  allGather3Data[rank].nc = 2;
@@ -669,7 +780,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
    allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels);
  if (ringGraph.nChannels > MAXCHANNELS/2)
    allGather3Data[rank].nc = 1;
-  NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev));
+  NCCLCHECKGOTO(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev), ret, fail);
  allGather3Data[rank].tree.pattern = treeGraph.pattern;
  allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
@@ -698,8 +809,9 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t

  comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count)
    ? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels;
-  NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));
-  return ncclSuccess;
+  NCCLCHECKGOTO(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks), ret, fail);
+fail:
+  return ret;
 }

 ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
@@ -707,13 +819,20 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
  int rank = comm->rank;
  int nranks = comm->nRanks;
  ncclResult_t ret;
-  //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
+  int nChannelsOrig;
+  struct ncclTopoRanks** allTopoRanks = NULL;
+  int *nodesFirstRank = NULL, *nodesTreePatterns = NULL;
+  int *rings = NULL;
+  int* nvbPeers = NULL;
+  struct ncclProxyConnector proxyConn;
+  int* pxnPeers = NULL;
+
+  //NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);

  // Determine nNodes, firstRanks, ...
-  int *nodesFirstRank, *nodesTreePatterns;
-  NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
-  NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks));
-  NCCLCHECK(ncclCalloc(&comm->rankToNode, comm->nRanks));
+  NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
  for (int r=0; r<nranks; r++) {
    int node;
    int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
@@ -727,8 +846,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
    comm->rankToNode[r] = node;
  }
  // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
-  NCCLCHECK(ncclCalloc(&comm->nodeRanks, comm->nNodes));
-  NCCLCHECK(ncclCalloc(&comm->rankToLocalRank, comm->nRanks));
+  NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail);
  for (int r=0; r<comm->nRanks; r++) {
    int node = comm->rankToNode[r];
    comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
@@ -736,7 +855,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
  }
  // Allocate ranks arrays for each node
  for (int n=0; n<comm->nNodes; n++) {
-    NCCLCHECK(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks));
+    NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail);
    comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
    comm->nodeRanks[n].localRanks = 0;
  }
@@ -756,13 +875,14 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
    WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
         rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
         comm->localRank, comm->localRanks, comm->localRankToRank[0]);
-    return ncclInternalError;
+    ret = ncclInternalError;
+    goto fail;
  }

-  int nChannelsOrig = comm->nChannels;
-  struct ncclTopoRanks** allTopoRanks;
-  NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
-  int nc = allGather3Data[0].nc;
+  nChannelsOrig = comm->nChannels;
+  NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
+  int nc;
+  nc = allGather3Data[0].nc;
  for (int i=0; i<nranks; i++) {
    comm->peerInfo[i].netDev = allGather3Data[i].netDev;
    allTopoRanks[i] = &allGather3Data[i].topoRanks;
@@ -816,19 +936,14 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
    }
  }

-  int *rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc));
+  NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
+  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc), ret, fail);

  if (comm->topo->pivotA2ANumBiRings == 3) {
    NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph));
    NCCLCHECK(ncclBinaryTreePostset(comm, &treeGraph));
  }

-  free(allTopoRanks);
-  free(nodesTreePatterns);
-  free(nodesFirstRank);
-  //free(allGather3Data);

  // AllGather3 - end

@@ -854,125 +969,41 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
    INFO(NCCL_INIT, "BinTrees%s", binline);
  }

-  //NCCLCHECK(computeBuffSizes(comm));
+  //NCCLCHECKGOTO(computeBuffSizes(comm), ret, fail);

  // Connect with prev/next for each ring
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
-    NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
+    NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail);
    if (comm->nRanks == 1) continue;
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail);
  }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, fail);
  if (ringGraph.nIntraChannels && rcclParamP2pNetDisable() == 0) {
    comm->useIntraNet = 1;
    // Connect NET for intranode use
    for (int c=0; c<comm->nChannels; c++) {
      struct ncclChannel* channel = comm->channels+c;
      if (comm->nRanks == 1) continue;
-      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
+      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, fail);
    }
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
+    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, fail);
  }
-  free(rings);
  INFO(NCCL_INIT, "Connected all rings");

  // Connect Trees
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    if (comm->nRanks == 1) continue;
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
-    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
-    // RCCL: need to connect binTree as well
-    if (comm->topo->pivotA2ANumBiRings == 3) {
-      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->binTree.down, 1, &channel->binTree.up, 0), ret, affinity_restore);
-      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->binTree.up, NCCL_MAX_TREE_ARITY, channel->binTree.down, 0), ret, affinity_restore);
-    }
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail);
+    NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail);
  }
-  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
+  NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail);
  INFO(NCCL_INIT, "Connected all trees");

  // Check if we can setup CollNet
-  if (comm->collNetSupport > 0) {
-    int collNetSetupFail = 0;
-    int highestTypes[NCCL_MAX_LOCAL_RANKS] = {TRANSPORT_P2P};
-    // Find all head ranks
-    int nHeads = collNetGraph.nChannels;
-    int *heads;
-    NCCLCHECK(ncclCalloc(&heads, nHeads));
-    // Head GPU index is always 0
-    for (int c=0; c<nHeads; c++) {
-      heads[c] = collNetGraph.intra[c*comm->localRanks+0];
-    }
-    for (int c=0; c<comm->nChannels; c++) {
-      struct ncclChannel* channel = comm->channels+c;
-      for (int h=0; h<nHeads; h++) {
-        const int head = heads[h];
-        collNetSetupFail = ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetRecv);
-        collNetSetupFail += ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend);
-      }
-      // Verify CollNet setup across ranks after trying the first channel
-      if (c == 0) {
-        NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
-      }
-    }
-    // Verify CollNet setup across ranks after trying all channels
-    NCCLCHECKGOTO(ncclTransportCollNetCheck(comm, collNetSetupFail), ret, collnet_cleanup);
-    TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank);
+  if (comm->collNetSupport > 0) collNetTrySetup(comm, &collNetGraph);

-    char line[1024];
-    line[0]='\0';
-    for (int c=0; c<comm->nChannels; c++) {
-      struct ncclTree* chain = &comm->channels[c].collnetChain;
-      snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d",
-          c, chain->down[0], rank, chain->up);
-    }
-    line[1023] = '\0';
-    INFO(NCCL_INIT, "Collnet Chains %s", line);
-    // Connect Collnet + chain
-    for (int c=0; c<comm->nChannels; c++) {
-      struct ncclChannel* channel = comm->channels+c;
-      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->collnetChain.up, 1, channel->collnetChain.down, 0), ret, collnet_cleanup);
-    }
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0), ret, collnet_cleanup);
-    for (int c=0; c<comm->nChannels; c++) {
-      struct ncclChannel* channel = comm->channels+c;
-      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, channel->collnetChain.down, 1, &channel->collnetChain.up, 1), ret, collnet_cleanup);
-    }
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1), ret, collnet_cleanup);
-    INFO(NCCL_INIT, "Connected collnet + chain");
-
-    // Connect intra-node CollNet + Direct
-    int highestTransportType0, highestTransportType1;
-    for (int c=0; c<comm->nChannels; c++) {
-      struct ncclChannel* channelRecv = comm->channels+c;
-      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, collnet_cleanup);
-    }
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup);
-    for (int c=0; c<comm->nChannels; c++) {
-      struct ncclChannel* channelSend = comm->channels+c;
-      NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, collnet_cleanup);
-    }
-    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup);
-
-    // Exchange highest intra-node transport type among ranks
-    // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
-    comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
-    //NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)));
-    for (int i=0; i<comm->localRanks; i++) {
-      if (highestTypes[i] > comm->intraHighestTransportType)
-        comm->intraHighestTransportType = highestTypes[i];
-    }
-    INFO(NCCL_INIT, "rank %d Connected CollNet comm %p nRanks %02d", rank, comm, comm->nRanks);
-
-collnet_cleanup:
-    free(heads);
-    if (ret != ncclSuccess) {
-      NCCLCHECK(ncclTransportCollNetFree(comm));
-      comm->collNetSupport = 0;
-      ret = ncclSuccess;
-    }
-  }
  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);

  // Compute time models for algorithm and protocol combinations
@@ -983,11 +1014,11 @@ collnet_cleanup:
      minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
      maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
    }
-    NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+    NCCLCHECKGOTO(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph), ret, fail);
  } while(0);

  // Compute nChannels per peer for p2p
-  NCCLCHECK(ncclTopoComputeP2pChannels(comm));
+  NCCLCHECKGOTO(ncclTopoComputeP2pChannels(comm), ret, fail);
 #if 0
  do { // Setup p2p structures in comm->tasks
    struct ncclTasks* tasks = &comm->tasks;
@@ -1038,79 +1069,40 @@ collnet_cleanup:
  if (ncclParamNvbPreconnect()) {
    // Connect p2p when using NVB path
    int nvbNpeers;
-    int* nvbPeers;
-    NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
+    NCCLCHECKGOTO(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers), ret, fail);
    for (int r=0; r<nvbNpeers; r++) {
      int peer = nvbPeers[r];
      int channelId;
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-        NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId));
+        NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId), ret, fail);
        if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
-          comm->connectSend[peer] |= (1<<channelId);
+          comm->connectSend[peer] |= (1UL<<channelId);
        }
      }
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
-        NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId));
+        NCCLCHECKGOTO(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId), ret, fail);
        if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
-          comm->connectRecv[peer] |= (1<<channelId);
+          comm->connectRecv[peer] |= (1UL<<channelId);
        }
      }
    }
-    NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
-    free(nvbPeers);
+    NCCLCHECKGOTO(ncclTransportP2pSetup(comm, NULL, 1), ret, fail);
  }
 #endif
  // Connect to local net proxy
-  struct ncclProxyConnector proxyConn;
-  //NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn));
-  //NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
+  //NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn), ret, fail);
+  //NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);

  // Then to remote ones when using PXN
  if (ncclPxnDisable(comm) == 0) {
    int nranks;
-    int* pxnPeers;
-    NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks));
+    NCCLCHECKGOTO(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks), ret, fail);
    for (int r=0; r<nranks; r++) {
-      //NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn));
-     // NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
+      //NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn), ret, fail);
+      //NCCLCHECKGOTO(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0), ret, fail);
    }
-    free(pxnPeers);
  }

-  do {
-    // Compute intra-process ranks
-    int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
-    for (int i = 0; i < nranks; i++) {
-      if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
-          && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
-        // Rank is in same process
-        if (intraProcRanks == 0) intraProcRank0 = i;
-        if (i == rank) intraProcRank = intraProcRanks;
-        intraProcRanks++;
-        if (intraProcRank0 == rank && rank != i) {
-          comm->peerInfo[i].comm->intraNext = comm->intraNext;
-          comm->intraNext = comm->peerInfo[i].comm;
-        }
-      }
-    }
-    TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
-        rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
-    if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
-      WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
-          rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
-          intraProcRank, intraProcRanks, intraProcRank0);
-      return ncclInternalError;
-    }
-    struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
-    assert(intraProcRank==0 ? comm==comm0 : true);
-    comm->intraComm0 = comm0;
-    comm->intraRefs = intraProcRank==0 ? intraProcRanks : 0;
-    comm->intraRank = intraProcRank;
-    comm->intraRanks = intraProcRanks;
-    comm->intraBarrierPhase = 0;
-    comm->intraBarrierCounter = 0;
-    comm->intraBarrierGate = 0;
-  } while(0);

 #if 0
  if (comm->intraRank == 0) { // Load ncclParamLaunchMode
@@ -1129,23 +1121,31 @@ collnet_cleanup:
    }
  }

-  NCCLCHECKGOTO(devCommSetup(comm), ret, affinity_restore);
+  // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to
+  // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock.
+  NCCLCHECKGOTO(devCommSetup(comm), ret, fail);

  /* Local intra-node barrier */
-  //NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
-
-  // Unlink proxy shm to make sure it will be properly cleaned up.
-  NCCLCHECK(ncclProxyShmUnlink(comm));
+  NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
 #endif
-
  // We should have allocated all buffers, collective fifos, ... we can
  // restore the affinity.
-affinity_restore:
-  //if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  if (ret != ncclSuccess) return ret;
-
  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
-  return ncclSuccess;
+
+exit:
+  //if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  // Unlink proxy shm to make sure it will be properly cleaned up.
+  //ncclProxyShmUnlink(comm);
+  free(allTopoRanks);
+  free(nodesTreePatterns);
+  free(nodesFirstRank);
+  //free(allGather3Data);
+  free(rings);
+  free(nvbPeers);
+  free(pxnPeers);
+  return ret;
+fail:
+  goto exit;
 }

 ncclResult_t rocm_smi_init() {