Select sendrecv path based on collective data size (#391)

* Select sendrecv path based on collective data size * Add comments on packing and unpacking group field * Toggling RCCL_P2P_NET_DISABLE in combined calls unit tests
2021-06-10 17:51:04 -07:00
@@ -31,7 +31,7 @@ class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T
      T * __restrict__ thisOutput = (T*)args->recvbuff;

      ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
-        prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
+        prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));

      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
@@ -37,7 +37,7 @@ class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T
    T * __restrict__ thisOutput = (T*)args->recvbuff;

    ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
-      prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
+      prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));

    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
      ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
@@ -33,7 +33,7 @@ class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T
      T * __restrict__ thisOutput = (T*)args->recvbuff;

      ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
-        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
+        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));

      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
@@ -49,6 +49,13 @@
 #define ROLE_POST_SEND 0x10
 #define ROLE_POST_RECV 0x20

+// Connection index is used to select P2P and NET and needs to be passed into ncclPrimitives constructor.
+// To avoid adding another parameter which requires changes to every places ncclPrimitives are constructed,
+// we pack group (max 7) and connection index (max 2) to original group which is 32-bit.
+#define PACK_GROUP(gr, idx) (gr | (idx<<16))
+#define TO_GR(group) (group&0xffff)
+#define TO_IDX(group) (group>>16)
+
 // Implementation of primitive types
 template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
 class ncclPrimitives {
@@ -75,7 +82,7 @@ class ncclPrimitives {
  T* direct = NULL;
  T* buff;
  struct ncclDevComm* comm;
-  const int p2pNet;
+  const int connIndex;

  const T** srcs;
  T** dsts;
@@ -247,8 +254,6 @@ class ncclPrimitives {
  __device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
    if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
      // For oneshot: groups 0,1 use conn 0, groups 2,3 use conn 1
-      const int connIndex = (NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? group/2 :
-        ((p2pNet && (NSEND+NRECV) == 1 ? NCCL_CONN_IDX_P2P_NET : ((NSEND+NRECV) == 1 ? 0 : group)));
      conn = &channel->devPeers[peer].recv[connIndex].conn;
      step = conn->step;
      step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
@@ -273,8 +278,6 @@ class ncclPrimitives {
  __device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
    if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) {
      // For oneshot: groups 0,1 use conn 0, groups 2,3 use conn 1
-      const int connIndex = (NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? group/2 :
-        ((p2pNet && (NSEND+NRECV) == 1 ? NCCL_CONN_IDX_P2P_NET : ((NSEND+NRECV) == 1 ? 0 : group)));
      conn = &channel->devPeers[peer].send[connIndex].conn;
      step = conn->step;
      step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
@@ -308,7 +311,9 @@ class ncclPrimitives {
 public:
  __device__ __forceinline__
  ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
-    : comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next), p2pNet(*comm->p2pNet) {
+    : comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[TO_GR(group)].srcs), dsts((T**)ptrs[TO_GR(group)].dsts),
+    group(TO_GR(group)), barriers(&ptrs[TO_GR(group)].barrier), barrier_next(ptrs[TO_GR(group)].barrier_next),
+    connIndex((NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? TO_GR(group)/2 : TO_IDX(group)) {
    nthreads = nworkers;
    // For send operations, we need an extra warp to overlap the threadfence and the copy
    // int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
@@ -34,7 +34,7 @@ class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, U
      T * __restrict__ thisOutput = (T*)args->recvbuff;

      ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
-        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
+        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));

      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
@@ -31,7 +31,7 @@ class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUN
      T * __restrict__ thisOutput = (T*)args->recvbuff;

      ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
-        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, args->coll.connIndex);
+        prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(0, args->coll.connIndex));

      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
@@ -57,7 +57,7 @@ class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T,
              int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
              int nt = nThreadsSplit;
              ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
-                prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
+                prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(groupRecv, args->p2p.recvIdx));

              if (recvCount == 0) {
                prims.recv(recvbuff, 0);
@@ -73,7 +73,7 @@ class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T,
              int peer = (comm->rank+delta)%comm->nRanks;
              int nt = nThreads-nThreadsSplit;
              ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
-                prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
+                prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, PACK_GROUP(groupSend, args->p2p.sendIdx));

              if (sendCount == 0) {
                prims.send(sendbuff, 0);
@@ -701,9 +701,13 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
      int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-        if (comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P].connected == 0) { // P2P uses only 1 connector
+        if (comm->channels[channelId].peers[peer].send[0].connected == 0) {
          comm->connectSend[peer] |= (1<<channelId);
-          comm->connect = 1;
+          comm->connect[0] = 1;
+        }
+        if (comm->p2pNet && comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
+          comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1<<channelId);
+          comm->connect[NCCL_CONN_IDX_P2P_NET] = 1;
        }
      }
    }
@@ -714,9 +718,13 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
      int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
      for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
        int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
-        if (comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P].connected == 0) { // P2P uses only 1 connector
+        if (comm->channels[channelId].peers[peer].recv[0].connected == 0) {
          comm->connectRecv[peer] |= (1<<channelId);
-          comm->connect = 1;
+          comm->connect[0] = 1;
+        }
+        if (comm->p2pNet && comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
+          comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1<<channelId);
+          comm->connect[NCCL_CONN_IDX_P2P_NET] = 1;
        }
      }
    }
@@ -792,10 +800,16 @@ ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
  // Compute cuda kernel arg and proxy arg templates
  struct ncclQueueElem* eqElem;
  NCCLCHECK(ncclAddQueueElem(comm->enqueueInfo, &eqElem));
+
  // The proxy code will set and tune the send/recv chunk size, make sure to run it first.
  NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyArgs));
  NCCLCHECK(computeP2pWorkElem(info, &eqElem->work));

+  eqElem->proxyArgs.sendIdx = info->sendIdx;
+  eqElem->proxyArgs.recvIdx = info->recvIdx;
+  eqElem->work.p2p.sendIdx = info->sendIdx;
+  eqElem->work.p2p.recvIdx = info->recvIdx;
+
  int channelId = info->channelId;
  hipLaunchParams* params = comm->myParams;
  params->gridDim.x = std::max<unsigned>(params->gridDim.x, channelId+1);
@@ -36,6 +36,7 @@ struct ncclInitArgs {
 };
 struct ncclCollArgs {
  ncclComm_t comm;
+  uint16_t connIndex;
 };

 enum ncclAsyncFuncType {
@@ -118,7 +119,8 @@ ncclResult_t ncclGroupStart() {
  return ncclSuccess;
 }

-static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
+static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes,
+  void* recvbuff, ssize_t sendbytes, const void* sendbuff, uint16_t sendIdx, uint16_t recvIdx) {
  struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
    sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
    1, 1 };
@@ -126,6 +128,8 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann
  info.channelId = channelId;
  info.sendbytes = sendbytes;
  info.recvbytes = recvbytes;
+  info.sendIdx = sendIdx;
+  info.recvIdx = recvIdx;
  if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
  NCCLCHECK(ncclSetupP2pKernel(&info));
  return ncclSuccess;
@@ -135,7 +139,7 @@ void* ncclAsyncThreadPreconnect(void* args_) {
  struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
  struct ncclComm* comm = args->coll.comm;
  CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
-  NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, NCCL_CONN_IDX_P2P));
+  NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, args->coll.connIndex));
  return args;
 }

@@ -150,6 +154,8 @@ static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels
  return size;
 }

+RCCL_PARAM(P2pNetThreshold, "RCCL_P2P_NET_THRESHOLD", 131072);
+
 NCCL_API(ncclResult_t, ncclGroupEnd);
 ncclResult_t ncclGroupEnd() {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
@@ -195,14 +201,15 @@ ncclResult_t ncclGroupEnd() {

  for (int i=0; i<ncclGroupIndex; i++) {
    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
+    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[0]) {
+      args->coll.connIndex = 0;
      pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
    }
  }

  for (int i=0; i<ncclGroupIndex; i++) {
    struct ncclAsyncArgs* args = ncclGroupArgs+i;
-    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
+    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[0]) {
      int err = pthread_join(ncclGroupThreads[i], NULL);
      if (err != 0) {
        WARN("Error waiting for pthread_join : %s", strerror(errno));
@@ -210,7 +217,29 @@ ncclResult_t ncclGroupEnd() {
      }
      INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
      NCCLCHECKGOTO(args->ret, ret, end);
-      args->coll.comm->connect = 0;
+      args->coll.comm->connect[0] = 0;
+    }
+  }
+
+  for (int i=0; i<ncclGroupIndex; i++) {
+    struct ncclAsyncArgs* args = ncclGroupArgs+i;
+    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
+      args->coll.connIndex = NCCL_CONN_IDX_P2P_NET;
+      pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
+    }
+  }
+
+  for (int i=0; i<ncclGroupIndex; i++) {
+    struct ncclAsyncArgs* args = ncclGroupArgs+i;
+    if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
+      int err = pthread_join(ncclGroupThreads[i], NULL);
+      if (err != 0) {
+        WARN("Error waiting for pthread_join : %s", strerror(errno));
+        return ncclSystemError;
+      }
+      INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P NET preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
+      NCCLCHECKGOTO(args->ret, ret, end);
+      args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET] = 0;
    }
  }

@@ -253,6 +282,12 @@ sched_delta:
            ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
            ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);

+            uint16_t sendIdx = 0, recvIdx = 0;
+            if(comm->p2pNet && totSendBytes > rcclParamP2pNetThreshold())
+              sendIdx = NCCL_CONN_IDX_P2P_NET;
+            if(comm->p2pNet && totRecvBytes > rcclParamP2pNetThreshold())
+              recvIdx = NCCL_CONN_IDX_P2P_NET;
+
            ssize_t sendOffset = 0;
            ssize_t recvOffset = 0;
            int sendRemaining = 1, recvRemaining = 1;
@@ -270,7 +305,7 @@ sched_delta:
              if (sendbytes >= 0 || recvbytes >= 0) {
                NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
                      recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
-                      sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
+                      sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL, sendIdx, recvIdx), ret, group_cleanup);
              }
              recvOffset += recvChunkSize;
              sendOffset += sendChunkSize;
@@ -74,7 +74,7 @@ struct ncclComm {

  void* bootstrap;
  // Bitmasks for ncclTransportP2pSetup
-  int connect;
+  int connect[NCCL_MAX_CONNS];
  uint32_t* connectSend;
  uint32_t* connectRecv;

@@ -128,7 +128,7 @@ struct ncclComm {
  volatile uint32_t *abortFlag;

  // Flags for enable P2P NET
-  uint32_t *p2pNet;
+  uint32_t p2pNet;
  uint32_t useIntraNet;

  // Device side of the communicator
@@ -163,7 +163,6 @@ struct ncclDirect {
  int down[NCCL_MAX_DIRECT_ARITY];
 };

-#define NCCL_CONN_IDX_P2P (*(comm->p2pNet)*2)
 #define NCCL_CONN_IDX_P2P_NET 2
 #define NCCL_MAX_CONNS 3
 struct ncclPeer {
@@ -208,7 +207,14 @@ struct ncclWorkElem {
      int sendChunkSize;
      int recvChunkSize;
      int32_t delta;
-      uint16_t nThreads;
+      union {
+        struct {
+          uint16_t nThreads:12;
+          uint16_t sendIdx:2;
+          uint16_t recvIdx:2;
+        };
+        uint16_t padding;
+      };
    } p2p;
    struct {
      uint16_t padding[15];
@@ -357,9 +363,6 @@ struct ncclDevComm {
  // Channels, device side
  struct ncclChannel* channels;

-  // Flags for enable P2P NET
-  uint32_t *p2pNet;
-
 #ifdef ENABLE_PROFILING
  // Profiling counters
  struct ncclProf* devProf;
@@ -53,6 +53,8 @@ struct ncclInfo {
  int sendChunkSize;
  uint32_t delta;
  int channelId;
+  uint16_t sendIdx;
+  uint16_t recvIdx;
 };

 #endif
@@ -61,6 +61,8 @@ struct ncclProxyArgs {
  int idle;
  uint64_t hdp_flushed;
  uint8_t connIndex;
+  uint8_t sendIdx;
+  uint8_t recvIdx;

  // Element linking
  pthread_mutex_t mutex;
@@ -377,7 +377,6 @@ static ncclResult_t commFree(ncclComm_t comm) {
    free(comm->intraCC);
  }
  NCCLCHECK(ncclCudaHostFree((void *)comm->abortFlag));
-  NCCLCHECK(ncclCudaHostFree((void *)comm->p2pNet));

  // Poison comm to try and catch a double free
  commPoison(comm);
@@ -430,9 +429,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  comm->hostDevComm.abortFlag = comm->abortFlag;
  STORE(comm->abortFlag, 0);

-  NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->p2pNet, 1));
-  comm->hostDevComm.p2pNet = comm->p2pNet;
-  STORE(comm->p2pNet, 0);
  comm->collOpCount = 0;
  comm->p2pOpCount = 0x8000;

@@ -466,8 +462,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {

  static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels");
  static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels");
-  NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks));
-  NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks*NCCL_MAX_CONNS));
+  NCCLCHECK(ncclCalloc(&comm->connectRecv, comm->nRanks*NCCL_MAX_CONNS));

  comm->p2pSendCount = comm->p2pRecvCount = 0;
  NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
@@ -837,7 +833,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm

  if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
    if (rcclParamP2pNetDisable() == 0) {
-      STORE(comm->p2pNet, 1);
+      comm->p2pNet = 1;
      INFO(NCCL_INIT, "RCCL enabled same node P2P over network");
    }
    else
@@ -287,7 +287,7 @@ ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args)
    sub->sendbytes = 0;
    sub->nsteps = DIVUP(sub->recvbytes, sub->recvChunkSize);
    if (sub->nsteps == 0) sub->nsteps = 1;
-    NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, NCCL_CONN_IDX_P2P));
+    NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, args->recvIdx));
  }
  if (sub->delta > 0 && sendbytesOrig >= ssize_t(0)) {
    int peersend = (comm->rank+sub->delta)%comm->nRanks;
@@ -295,7 +295,7 @@ ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args)
    sub->recvbytes = 0;
    sub->nsteps = DIVUP(sub->sendbytes, sub->sendChunkSize);
    if (sub->nsteps == 0) sub->nsteps = 1;
-    NCCLCHECK(SaveProxy(proxySend, peersend, args, NCCL_CONN_IDX_P2P));
+    NCCLCHECK(SaveProxy(proxySend, peersend, args, args->sendIdx));
  }
  // Reset proxy args for potentially multiple cuda graph launches
  // It is safe as long as SaveProxy copies contents of args to op
@@ -69,12 +69,12 @@ ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel*
  for (int i=0; i<nrecv; i++) {
    int peer = peerRecv[i];
    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
-    comm->connectRecv[peer] |= mask;
+    comm->connectRecv[peer+comm->nRanks*connIndex] |= mask;
  }
  for (int i=0; i<nsend; i++) {
    int peer = peerSend[i];
    if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
-    comm->connectSend[peer] |= mask;
+    comm->connectSend[peer+comm->nRanks*connIndex] |= mask;
  }
  return ncclSuccess;
 }
@@ -98,8 +98,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
    int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
    int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
    int sendPeer = (comm->rank + i) % comm->nRanks;
-    uint32_t recvMask = comm->connectRecv[recvPeer];
-    uint32_t sendMask = comm->connectSend[sendPeer];
+    uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*connIndex];
+    uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*connIndex];

    struct ncclConnect* recvData = data;
    int sendChannels = 0, recvChannels = 0;
@@ -145,7 +145,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
        CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
      }
    }
-    comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
+    comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
  }
  CUDACHECK(hipStreamSynchronize(transportSetupStream));
  CUDACHECK(hipStreamDestroy(transportSetupStream));
@@ -122,6 +122,6 @@ namespace CorrectnessTests
                                testing::Values(2,3,4,5,6,7,8),
                                // In-place or not
                                testing::Values(false),
-                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
+                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_P2P_NET_DISABLE=0", "RCCL_P2P_NET_DISABLE=1")),
                            CorrectnessTest::PrintToStringParamName());
 } // namespace