rocm-systems/src/enqueue.cc

/*************************************************************************
 * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "enqueue.h"
#include "argcheck.h"
#include "coll_net.h"
#include "graph/topo.h"
#include <hip/hip_runtime.h>
#include <hip/hip_ext.h>
#include "gdrwrap.h"
#include "bootstrap.h"
#include <cstring>
#include "channel.h"
#include "rocmwrap.h"
#include "rccl_vars.h"
#include "transport.h"
#include "common.h"
#include "api_trace.h"
#include <cassert>
#include <cstring> // std::memcpy
#include <cinttypes> // PRIx64

// Describes one launchable device kernel: its entry point plus a flag that the
// plan-building code copies into ncclKernelPlan::kernelSpecialized.
// Instances are built with positional aggregate initialisers, so the field
// order matters.
struct ncclKernelMatch {
  void* kernelFn;    // Type-erased address of the device kernel.
  bool specialized;  // Once a plan holds a specialized kernel it is not replaced.
};

// Table of device kernels that a plan can launch; it is indexed with the value
// returned by ncclGetKernelIndex(). When ENABLE_COLLTRACE is defined the table
// also carries the debug variants at indices 2 and 3.
#ifdef ENABLE_COLLTRACE
static ncclKernelMatch const ncclKerns[4] = {
  {(void *)ncclDevKernel_Generic, true},
  {(void *)ncclDevKernel_Generic_4, true},
  {(void *)ncclDevKernelDebug_Generic, true},
  {(void *)ncclDevKernelDebug_Generic_4, true},
};
#else
static ncclKernelMatch const ncclKerns[2] = {
  {(void*)ncclDevKernel_Generic, true},
  {(void*)ncclDevKernel_Generic_4, true},
};
#endif

// Forward declarations of file-local helpers. They are `static`, so their
// definitions live elsewhere in this translation unit.
static ncclResult_t computeColl(struct ncclInfo* info /* input */, int* workFuncIndex, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */);

// Declares the parameter read through ncclParamL1SharedMemoryCarveout(); the default is 0.
NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);

// Helpers used by the add*CollToPlan() functions to build work elements,
// chunking information and proxy ops for a collective.
static ncclResult_t initCollWorkElem(struct ncclInfo* collInfo, struct ncclWorkElem* work);
static ncclResult_t setCollWorkElem(uint64_t workCount, uint64_t workOffset, size_t lastChunkCount, struct ncclWorkElem* work);
static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkElem* work, struct ncclChannel* channel, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[], struct ncclWorkElemReg* workElemReg);
static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nBytes, int nChannels);
static ncclResult_t initCollProxyOp(struct ncclInfo* collInfo, int channelId, uint64_t opCount, uint32_t nsteps, struct ncclProxyOp* proxyOp);
static ncclResult_t getTunerInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps);
static ncclResult_t topoGetAlgoInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps);
// NOTE: the identifier below is spelled with three 'n's; it must match its definition.
static ncclResult_t getChannnelThreadInfo(struct ncclInfo* collInfo);
static ncclResult_t computeCollWorkFunc(struct ncclInfo* collInfo);
static ncclResult_t getPatternInfo(struct ncclInfo* collInfo);
static ncclResult_t getLoopInfo(struct ncclInfo* collInfo);
static ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport);

// Selects which entry of ncclKerns[] should be launched for `comm`.
//
// The base index is 0, or 2 (the debug kernels) when collective tracing is
// compiled in and the communicator has a collTrace thread. Devices whose
// gcnArchName matches "gfx908", and "gfx94" devices reporting more than 80
// multiprocessors, use the base entry; every other device uses base + 1.
//
// NOTE(review): CUDACHECK presumably returns an ncclResult_t on failure; in
// this int-returning function that value would be handed back as if it were a
// kernel index. Confirm that this is acceptable to callers.
int ncclGetKernelIndex(struct ncclComm* comm) {
  // Use the same preprocessor test as the ncclKerns[] definition so that the
  // index range and the table size can never disagree.
#ifdef ENABLE_COLLTRACE
  const int baseIdx = comm->collTraceThread ? 2 : 0;
#else
  const int baseIdx = 0;
#endif
  hipDeviceProp_t devProp;
  CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
  const bool useBaseKernel =
      IsArchMatch(devProp.gcnArchName, "gfx908") ||
      (IsArchMatch(devProp.gcnArchName, "gfx94") && devProp.multiProcessorCount > 80);
  return useBaseKernel ? baseIdx : baseIdx + 1;
}

// Applies the per-kernel function attributes to every distinct entry of
// ncclKerns[] and, when `maxStackSize` is non-null, stores in it the largest
// localSizeBytes reported for those kernels.
//
// cudaArch     - forwarded to ncclShmemDynamicSize() to obtain the dynamic
//                shared memory size to request.
// maxStackSize - optional output; reset to 0 before the kernels are visited.
//
// Returns `result`, which CUDACHECKGOTO updates when an attribute call fails.
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
  constexpr int KernelCount = sizeof(ncclKerns)/sizeof(ncclKerns[0]);
  ncclResult_t result = ncclSuccess;

  if (maxStackSize) *maxStackSize = 0;
  int carveout = ncclParamL1SharedMemoryCarveout();

  // Remember the two most recently handled function pointers so that a kernel
  // listed more than once is only configured the first time.
  void* recent[2] = {nullptr, nullptr};
  for (int k=0; k < KernelCount; k++) {
    void* kernel = ncclKerns[k].kernelFn;
    if (kernel == recent[0] || kernel == recent[1]) continue;
    recent[1] = recent[0];
    recent[0] = kernel;

    if (maxStackSize) {
      // A failed query only warns; `attr` then keeps its zero initialisation.
      cudaFuncAttributes attr = {0};
      if (cudaFuncGetAttributes(&attr, kernel) != cudaSuccess)
        WARN("Failed to get kernel attributes");
      if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
    }

    if (carveout) {
      // On failure jump straight to the label: the remaining attributes are still applied.
      CUDACHECKGOTO(cudaFuncSetAttribute(kernel,
        cudaFuncAttributePreferredSharedMemoryCarveout, carveout),
        result, carveout_done);
    carveout_done:;
    }

    if (ncclShmemDynamicSize(cudaArch) != 0) {
      CUDACHECKGOTO(cudaFuncSetAttribute(kernel,
        cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
        result, next_kernel);
    }
  next_kernel:;
  }
  return result;
}

/*****************************************************************************/
/*       Launch system : synchronization and CUDA kernel launch              */
/*****************************************************************************/
static void appendWorkElemColl(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId,
    int funcIndex, struct ncclWorkElem const *elem) {
  struct ncclKernelPlan::Channel* chan = &plan->channels[channelId];
  struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue);
  if (q && funcIndex == q->work.header.funcIndex
        && elem->nWarps == q->work.elems[0].nWarps
        && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS
        && ncclWorkTypeColl == q->work.header.type) {
    int e = chan->nWorkElem++;
    q->work.elems[e] = *elem; // C++ struct assignment
    return;
  }
  q = ncclMemoryStackAlloc<struct ncclWorkList>(&comm->memScoped);
  q->work.header.type = ncclWorkTypeColl;
  q->work.header.funcIndex = funcIndex;
  q->work.elems[0] = *elem; // C++ struct assignment
  chan->nWorkElem = 1;
  chan->nWork += 1;
  ncclIntruQueueEnqueue(&chan->workQueue, q);
}

static void appendWorkElemColl(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId,
    int funcIndex, struct ncclWorkElemReg const *elem) {
  struct ncclKernelPlan::Channel* chan = &plan->channels[channelId];
  struct ncclWorkList* q = ncclIntruQueueTail(&chan->workQueue);
  if (q && funcIndex == q->work.header.funcIndex
        && elem->elem.nWarps == q->work.regElems[0].elem.nWarps
        && chan->nWorkElem < NCCL_MAX_WORK_ELEMENTS_REG
        && ncclWorkTypeRegColl == q->work.header.type) {
    int e = chan->nWorkElem++;
    q->work.regElems[e] = *elem; // C++ struct assignment
    q->work.regElems[e].elem.isUsed = 1;
    return;
  }
  q = ncclMemoryStackAlloc<struct ncclWorkList>(&comm->memScoped);
  q->work.header.type = ncclWorkTypeRegColl;
  q->work.header.funcIndex = funcIndex;
  q->work.regElems[0] = *elem; // C++ struct assignment
  q->work.regElems[0].elem.isUsed = 1;
  chan->nWorkElem = 1;
  chan->nWork += 1;
  ncclIntruQueueEnqueue(&chan->workQueue, q);
}

static void finishWorkP2p(struct ncclWork* work, int WarpSize) {
  int nElem = 0;
  for (int e=0; e < NCCL_MAX_WORK_ELEMENTS_P2P; e++) {
    if (work->p2pElems[e].p2pType != ncclWorkP2pTypeUnused)
      nElem = e+1;
  }
  int nGroup = 1;
  while (nGroup < nElem) nGroup *= 2;
  int nWarp = 1;
  while (nWarp*nGroup <= (NCCL_MAX_NTHREADS/WarpSize)/2) nWarp *= 2;
  for (int i=0; i < nGroup; i++) {
    work->p2pElems[i].ngroups = nGroup;
    work->p2pElems[i].warpStart = i*(NCCL_MAX_NTHREADS/WarpSize)/nGroup;
    int extraWarp = /*nWarp >= 2 ? i%2 : */0;
    work->p2pElems[i].nWarps = nWarp + extraWarp;
  }
}

// Finalises `work` before launch. Only p2p work items need a finishing step;
// every other type is left untouched.
static void finishWork(struct ncclWork* work, int WarpSize) {
  if (work->header.type != ncclWorkTypeP2p) return;
  finishWorkP2p(work, WarpSize);
}

// Queues one p2p element on channel `channelId` of `plan`.
//
// Receives and sends occupy interleaved slots of a p2p ncclWork: the per-type
// tail index starts at 0 for recv and 1 for send and advances by 2. The
// element is fused into the channel's last work item when `fuseOk` is set, a
// slot of its type is still free, and no earlier element of that type in the
// same work item talks to the same peer. Otherwise the previous p2p work item
// is finalised and a new one is started.
//
// Returns ncclInvalidUsage when the p2p device function is unavailable,
// ncclSuccess otherwise.
static ncclResult_t appendWorkElemP2p(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int channelId,
    struct ncclWorkElemP2p const *elem, bool fuseOk
  ) {
  int funcIndex = ncclDevFuncId_P2p();
  if (funcIndex < 0) {
    WARN("%s: unsupported collective. Please ensure the collective has been enabled in build.", __func__);
    return ncclInvalidUsage;
  }
  struct ncclKernelPlan::Channel* channel = &plan->channels[channelId];
  struct ncclWorkList* tail = ncclIntruQueueTail(&channel->workQueue);
  const int dir = elem->p2pType-1; // index into the per-type tail table

  if (tail != nullptr && tail->work.header.funcIndex == funcIndex) {
    bool fuse = fuseOk && channel->p2pTailElem[dir] < NCCL_MAX_WORK_ELEMENTS_P2P;
    if (fuse) {
      // Two elements of one ncclWork must not use the same peer, otherwise
      // they would drive that connection concurrently.
      for (int prev = channel->p2pTailElem[dir] - 2; prev >= 0; prev -= 2) {
        if (tail->work.p2pElems[prev].peer == elem->peer) {
          fuse = false;
          break;
        }
      }
    }
    if (fuse) {
      int slot = channel->p2pTailElem[dir];
      tail->work.p2pElems[slot] = *elem; // struct copy
      channel->p2pTailElem[dir] += 2;
      return ncclSuccess;
    }
    // Could not fuse: close the previous p2p work item before opening a new one.
    finishWorkP2p(&tail->work, comm->WarpSize);
  }

  struct ncclWorkList* fresh = ncclMemoryStackAlloc<struct ncclWorkList>(&comm->memScoped);
  fresh->work.header.type = ncclWorkTypeP2p;
  fresh->work.header.funcIndex = ncclDevFuncId_P2p();
  channel->p2pTailElem[ncclWorkP2pTypeRecv-1] = 0;
  channel->p2pTailElem[ncclWorkP2pTypeSend-1] = 1;
  fresh->work.p2pElems[channel->p2pTailElem[dir]] = *elem; // struct copy
  channel->p2pTailElem[dir] += 2;
  channel->nWork += 1;
  ncclIntruQueueEnqueue(&channel->workQueue, fresh);
  return ncclSuccess;
}

// Asks ncclProxySaveOp() whether `op` needs proxy handling and, if it does,
// stores a pool-allocated copy on the proxy-op queue of the channel named by
// op->channelId. Errors from ncclProxySaveOp() are propagated.
static ncclResult_t addProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) {
  bool needed = true;
  NCCLCHECK(ncclProxySaveOp(comm, op, &needed));
  if (!needed) return ncclSuccess;

  struct ncclProxyOp* saved = ncclMemoryPoolAlloc<struct ncclProxyOp>(&comm->memPool_ncclProxyOp, &comm->memPermanent);
  *saved = *op; // struct copy
  ncclIntruQueueEnqueue(&plan->channels[op->channelId].proxyOpQueue, saved);
  return ncclSuccess;
}

// Computes the number of proxy steps needed to move `workCount` elements for
// the collective described by `collInfo` and writes it to *steps.
//
// The formula depends on the collective and algorithm:
//  - AllReduce: ring, CollNet-direct and NVLS variants each divide the work by
//    their own per-loop element count; ring additionally multiplies by
//    2 * (nRanks - 1).
//  - ReduceScatter / AllGather on a ring multiply by (nRanks - 1).
//  - Everything else: number of chunks times chunkSteps.
// Always returns ncclSuccess.
static ncclResult_t computeCollSteps(struct ncclInfo* collInfo, size_t workCount, uint32_t* steps) {
  struct ncclComm* comm = collInfo->comm;
  const bool onRing = collInfo->algorithm == NCCL_ALGO_RING;

  if (collInfo->coll == ncclFuncAllReduce) {
    if (onRing) {
      *steps = DIVUP(workCount, comm->nRanks * collInfo->chunkCount) * (comm->nRanks - 1) * 2 * collInfo->chunkSteps;
    } else if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
      *steps = DIVUP(workCount, comm->channels[0].collnetDirect.nHeads * collInfo->chunkCount) * collInfo->chunkSteps;
    } else if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) {
      *steps = DIVUP(workCount, comm->channels[0].nvls.nHeads * collInfo->chunkCount) * collInfo->chunkSteps;
    } else {
      *steps = DIVUP(workCount, collInfo->chunkCount) * collInfo->chunkSteps;
    }
    return ncclSuccess;
  }

  const bool scatterOrGather =
      collInfo->coll == ncclFuncReduceScatter || collInfo->coll == ncclFuncAllGather;
  if (scatterOrGather && onRing) {
    *steps = DIVUP(workCount, collInfo->chunkCount) * (comm->nRanks - 1) * collInfo->chunkSteps;
    return ncclSuccess;
  }

  *steps = DIVUP(workCount, collInfo->chunkCount) * collInfo->chunkSteps;
  return ncclSuccess;
}

// Writes to *alignCount the element-count granularity used when splitting a
// collective across channels. The Simple protocol uses NCCL_SIMPLE_ALIGNMENT
// divided by the element size; every other protocol uses the LL per-thread
// alignment scaled by the thread count. Always returns ncclSuccess.
static ncclResult_t computeCollAlignCount(struct ncclInfo* collInfo, size_t* alignCount) {
  const auto elemSize = ncclTypeSize(collInfo->datatype);

  if (collInfo->protocol == NCCL_PROTO_SIMPLE) {
    *alignCount = NCCL_SIMPLE_ALIGNMENT / elemSize;
    return ncclSuccess;
  }

  // LL128 deliberately shares the LL formula for now;
  // NCCL_LL128_ALIGNMENT_PER_WARP still needs review.
  *alignCount = NCCL_LL_ALIGNMENT_PER_THREAD / elemSize * collInfo->nThreads;
  return ncclSuccess;
}

// Computes the element count of the last chunk for a channel handling
// `workCount` elements and writes it to *lastChunkCount.
//
// For AllReduce with the ring, NVLS/NVLS-tree or CollNet-direct algorithm the
// remainder of the final loop is divided among the participants (ranks or
// heads) and rounded up to a multiple of `alignCount`. In every other case the
// last chunk has the regular chunkCount. Always returns ncclSuccess.
static ncclResult_t computeCollLastChunkInfo(struct ncclInfo* collInfo, size_t workCount, size_t alignCount, size_t* lastChunkCount) {
  struct ncclComm* comm = collInfo->comm;
  const bool isAllReduce = collInfo->coll == ncclFuncAllReduce;

  if (isAllReduce && collInfo->algorithm == NCCL_ALGO_RING) {
    size_t tail = workCount % (comm->nRanks * collInfo->chunkCount);
    *lastChunkCount = DIVUP(DIVUP(tail, comm->nRanks), alignCount) * alignCount;
    return ncclSuccess;
  }
  if (isAllReduce && (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE)) {
    size_t tail = workCount % (comm->channels[0].nvls.nHeads * collInfo->chunkCount);
    *lastChunkCount = DIVUP(DIVUP(tail, comm->channels[0].nvls.nHeads), alignCount) * alignCount;
    return ncclSuccess;
  }
  if (isAllReduce && collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
    size_t tail = workCount % (comm->channels[0].collnetDirect.nHeads * collInfo->chunkCount);
    *lastChunkCount = DIVUP(DIVUP(tail, comm->channels[0].collnetDirect.nHeads), alignCount) * alignCount;
    return ncclSuccess;
  }

  *lastChunkCount = collInfo->chunkCount;
  return ncclSuccess;
}

// Reports, for the collnet-style pattern stored in collInfo->pattern, how many
// steps and how many chunks make up one loop iteration.
//
// Returns ncclInternalError (after a warning) for any pattern other than
// CollnetChain, Nvls or CollnetDirect; the outputs are left untouched then.
static ncclResult_t getCollnetLoopInfo(struct ncclInfo* collInfo, int* nstepsPerLoop, int* nchunksPerLoop) {
  if (collInfo->pattern == ncclPatternCollnetChain) {
    *nchunksPerLoop = 1;
    *nstepsPerLoop = 1;
  } else if (collInfo->pattern == ncclPatternNvls) {
    *nstepsPerLoop = 1;
    *nchunksPerLoop = collInfo->comm->channels[0].nvls.nHeads;
  } else if (collInfo->pattern == ncclPatternCollnetDirect) {
    *nstepsPerLoop = 1;
    *nchunksPerLoop = collInfo->comm->channels[0].collnetDirect.nHeads;
  } else {
    WARN("Unknown collnet pattern %d", collInfo->pattern);
    return ncclInternalError;
  }
  return ncclSuccess;
}

// Adds one collnet-style collective to `plan`.
//
// Channels 0 .. min(collInfo->nChannels, usableChannels) - 1 each receive one
// work element (plain or registered-buffer, depending on collInfo->regBufType)
// and, for non-empty collectives, one proxy op. Each channel's collBytes is
// advanced by an equal share of the collective.
//
// *nWorkBudget is reduced by the number of new ncclWork items created.
// Returns the first error reported by a helper, ncclSuccess otherwise.
static ncclResult_t addCollnetCollToPlan(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int usableChannels,
    struct ncclInfo* collInfo, int* nWorkBudget
  ) {
  ncclResult_t ret = ncclSuccess;
  struct ncclKernelPlan::Channel *chans = plan->channels;
  struct ncclWorkElem workElem;
  // Collective op counts are even: the low bit is 0.
  uint64_t opCount = uint64_t(plan->collOpCount++) << 1 | 0;
  ncclRegBufferType regBufType = collInfo->regBufType;
  int nChannels = std::min(collInfo->nChannels, usableChannels);
  size_t countPerChannel = DIVUP(collInfo->count, nChannels);
  uint32_t typeSize = ncclTypeSize(collInfo->datatype);
  int steps, nchunksPerLoop, nstepsPerLoop, nLoop;

  // Same error-handling style as every other helper call in this function.
  NCCLCHECKGOTO(computeCollChunkInfo(collInfo, collInfo->nBytes, collInfo->nChannels), ret, fail);
  NCCLCHECKGOTO(initCollWorkElem(collInfo, &workElem), ret, fail);
  workElem.nChannels = nChannels;

  NCCLCHECKGOTO(getCollnetLoopInfo(collInfo, &nstepsPerLoop, &nchunksPerLoop), ret, fail);
  nLoop = (int)DIVUP(collInfo->nBytes, (size_t)nChannels * nchunksPerLoop * collInfo->chunkSize);
  steps = nstepsPerLoop * nLoop * collInfo->chunkSteps;

  for (int bid = 0; bid < nChannels; bid++) {
    workElem.bid = bid;
    // Add work elem. Adding nWork before and subtracting it after charges the
    // budget with exactly the number of ncclWork items the append created.
    *nWorkBudget += chans[bid].nWork;
    if (regBufType == NCCL_REGULAR_BUFFER) {
      appendWorkElemColl(comm, plan, bid, collInfo->workFuncIndex, &workElem);
    } else {
      struct ncclWorkElemReg workElemReg;
      NCCLCHECKGOTO(initCollWorkElemReg(comm, &workElem, &comm->channels[bid], regBufType, collInfo->regBufSend, collInfo->regBufRecv, &workElemReg), ret, fail);
      appendWorkElemColl(comm, plan, bid, collInfo->workFuncIndex, &workElemReg);
    }
    *nWorkBudget -= chans[bid].nWork; // subtract delta of chans[bid].nWork

    // Add proxy task. Empty collectives do not make it to the proxy thread
    // since they don't imply synchronization for the user like p2p.
    if (collInfo->nBytes != 0) {
      struct ncclProxyOp proxyOp;
      NCCLCHECKGOTO(initCollProxyOp(collInfo, bid, opCount, steps, &proxyOp), ret, fail);
      NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOp), ret, fail);
    }

    chans[bid].collBytes += countPerChannel * typeSize;
  }

  plan->threadPerBlock = std::max(plan->threadPerBlock, collInfo->nThreads);
  if (!plan->kernelSpecialized) {
    // ncclGetKernelIndex() queries the device properties, so resolve the
    // table entry once instead of once per field.
    const ncclKernelMatch& kern = ncclKerns[ncclGetKernelIndex(comm)];
    plan->kernelFn = kern.kernelFn;
    plan->kernelSpecialized = kern.specialized;
  }

  if (comm->rank == 0) {
    TRACE(NCCL_COLL, "collnetColl enqueue coll %s(%s, %s, %s, %s), nChannels %d, count %ld (nbytes %ld), usableChannel %d, chunkCount %d, funcIndex %d, nThreads %d", collInfo->opName, ncclOpToString(collInfo->op), ncclDatatypeToString(collInfo->datatype), ncclAlgoToString(collInfo->algorithm), ncclProtoToString(collInfo->protocol), collInfo->nChannels, collInfo->count, collInfo->workBytes, usableChannels, collInfo->chunkCount, collInfo->workFuncIndex, collInfo->nThreads);
  }

exit:
  return ret;
fail:
  goto exit;
}

// Adds one collective to `plan`, spreading its elements over the least loaded
// channels.
//
// The element count is split into per-channel shares rounded up to the
// protocol alignment; the channel count is then recomputed from that share.
// The `nChannels` channels with the smallest collBytes among the first
// `usableChannels` are selected, and each receives a work element and (for
// non-empty collectives) a proxy op until the whole count has been assigned.
//
// *nWorkBudget is reduced by the number of new ncclWork items created.
// Returns the first error reported by a helper, ncclSuccess otherwise.
//
// NOTE(review): a zero collInfo->count would make countPerChannels zero and
// the following DIVUP divide by zero; the visible caller skips such
// collectives before scheduling, confirm no other caller can reach this.
static ncclResult_t addTunedCollToPlan(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int usableChannels,
    struct ncclInfo* collInfo, int* nWorkBudget
  ) {
  ncclResult_t ret = ncclSuccess;
  struct ncclKernelPlan::Channel *chans = plan->channels;
  struct ncclWorkElem workElem;
  // Collective op counts are even: the low bit is 0.
  uint64_t opCount = uint64_t(plan->collOpCount++) << 1 | 0;
  uint64_t workCount;
  uint64_t workOffset = 0;
  uint32_t typeSize = ncclTypeSize(collInfo->datatype);
  ncclRegBufferType regBufType = collInfo->regBufType;
  size_t alignCount;
  // Initialised because the trace below reads it even when the distribution
  // loop assigns no work.
  size_t lastChunkCount = 0;
  int least[/*nBid*/MAXCHANNELS];
  int maxIndexInLeast;
  size_t maxBytesInLeast;
  int nChannels = std::min(collInfo->nChannels, usableChannels);
  int rnChannels = 0;
  size_t countPerChannels;
  size_t remCount = collInfo->count;

  NCCLCHECKGOTO(computeCollAlignCount(collInfo, &alignCount), ret, fail);
  countPerChannels = DIVUP(DIVUP(collInfo->count, nChannels), alignCount) * alignCount;
  // Rounding the share up may mean fewer channels are needed than requested.
  nChannels = DIVUP(collInfo->count, countPerChannels);
  NCCLCHECKGOTO(computeCollChunkInfo(collInfo, collInfo->nBytes, nChannels), ret, fail);
  NCCLCHECKGOTO(initCollWorkElem(collInfo, &workElem), ret, fail);

  // Choose the `nBid` least loaded channels to do the work. This ensures
  // all bids go to different channels in case they need to synchronize.
  least[0] = 0;
  maxIndexInLeast = 0;
  maxBytesInLeast = chans[0].collBytes;
  // Initialize least[] such that the first nBid channels are accounted for.
  for (int b = 1; b < nChannels; b++) {
    least[b] = b;
    if (maxBytesInLeast < chans[b].collBytes) {
      maxIndexInLeast = b;
      maxBytesInLeast = chans[b].collBytes;
    }
  }
  // Sort in the rest of the channels. If a channel has less work than the max
  // member of least[], replace that member and compute the new max. We only
  // sort channels when coll algo is not collnet.
  for (int c = nChannels; c < usableChannels; c++) {
    if (chans[c].collBytes < maxBytesInLeast) {
      least[maxIndexInLeast] = c;
      maxBytesInLeast = chans[least[0]].collBytes;
      maxIndexInLeast = 0;
      for (int b = 1; b < nChannels; b++) {
        if (maxBytesInLeast < chans[least[b]].collBytes) {
          maxIndexInLeast = b;
          maxBytesInLeast = chans[least[b]].collBytes;
        }
      }
    }
  }

  for (int bid = 0; bid < nChannels && remCount > 0; bid++) {
    int c = least[bid];

    workCount = std::min(countPerChannels, remCount);
    NCCLCHECKGOTO(computeCollLastChunkInfo(collInfo, workCount, alignCount, &lastChunkCount), ret, fail);
    NCCLCHECKGOTO(setCollWorkElem(workCount, workOffset, lastChunkCount, &workElem), ret, fail);

    // Add work elem. Adding nWork before and subtracting it after charges the
    // budget with exactly the number of ncclWork items the append created.
    *nWorkBudget += chans[c].nWork;
    if (regBufType == NCCL_REGULAR_BUFFER) {
      appendWorkElemColl(comm, plan, c, collInfo->workFuncIndex, &workElem);
    } else {
      struct ncclWorkElemReg workElemReg;
      NCCLCHECKGOTO(initCollWorkElemReg(comm, &workElem, &comm->channels[c], regBufType, collInfo->regBufSend, collInfo->regBufRecv, &workElemReg), ret, fail);
      appendWorkElemColl(comm, plan, c, collInfo->workFuncIndex, &workElemReg);
    }
    *nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork

    // Add proxy task. Empty collectives do not make it to the proxy thread
    // since they don't imply synchronization for the user like p2p.
    if (collInfo->nBytes != 0) {
      uint32_t steps;
      struct ncclProxyOp proxyOp;
      NCCLCHECKGOTO(computeCollSteps(collInfo, workCount, &steps), ret, fail);
      NCCLCHECKGOTO(initCollProxyOp(collInfo, c, opCount, steps, &proxyOp), ret, fail);
      NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOp), ret, fail);
    }

    remCount -= workCount;
    chans[c].collBytes += workCount * typeSize;
    workOffset += workCount;
    rnChannels++;
  }

  plan->threadPerBlock = std::max(plan->threadPerBlock, collInfo->nThreads);
  if (!plan->kernelSpecialized) {
    // ncclGetKernelIndex() queries the device properties, so resolve the
    // table entry once instead of once per field.
    const ncclKernelMatch& kern = ncclKerns[ncclGetKernelIndex(comm)];
    plan->kernelFn = kern.kernelFn;
    plan->kernelSpecialized = kern.specialized;
  }

  if (comm->rank == 0) {
    TRACE(NCCL_COLL, "tunedColl enqueue coll %s(%s, %s, %s, %s), nChannels %d, count %ld (nbytes %ld), usableChannel %d, chunkCount %d, lastChunkCount %ld, funcIndex %d, nThreads %d", collInfo->opName, ncclOpToString(collInfo->op), ncclDatatypeToString(collInfo->datatype), ncclAlgoToString(collInfo->algorithm), ncclProtoToString(collInfo->protocol), rnChannels, collInfo->count, collInfo->workBytes, usableChannels, collInfo->chunkCount, lastChunkCount, collInfo->workFuncIndex, collInfo->nThreads);
  }

exit:
  return ret;
fail:
  goto exit;
}

// Adds one collective to `plan` by filling channels in index order up to
// plan->maxBytesPerChannel each.
//
// For every usable channel the function computes how many aligned elements
// still fit, then appends a work element and (for non-empty collectives) a
// proxy op. Channels that are already full are skipped, and the loop stops
// once all bytes have been assigned. AllToAllPivot is special: every usable
// channel gets a work element carrying bid / nChannels / pivotA2ANumBiRings
// instead of a count and offset.
//
// *nWorkBudget is reduced by the number of new ncclWork items created.
// Returns the first error reported by a helper, ncclSuccess otherwise.
static ncclResult_t addCBDCollToPlan(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int usableChannels,
    struct ncclInfo* collInfo, int* nWorkBudget
  ) {
  ncclResult_t ret = ncclSuccess;
  struct ncclKernelPlan::Channel *chans = plan->channels;
  size_t enqBytes;
  // Collective op counts are even: the low bit is 0.
  uint64_t opCount = uint64_t(plan->collOpCount++) << 1 | 0;
  size_t typeSize = ncclTypeSize(collInfo->datatype);
  size_t workBytesTotal = collInfo->count * typeSize;
  size_t workCountTotal = collInfo->count;
  struct ncclWorkElem workElem;
  size_t workOffset = 0;
  size_t workCount;
  ncclRegBufferType regBufType = collInfo->regBufType;
  size_t alignCount;
  // Initialised because the AllToAllPivot path never assigns it, yet the
  // trace below reads it.
  size_t lastChunkCount = 0;
  int rnChannel = 0;

  NCCLCHECKGOTO(computeCollChunkInfo(collInfo, collInfo->aggnBytes, collInfo->nChannels), ret, fail);
  NCCLCHECKGOTO(computeCollAlignCount(collInfo, &alignCount), ret, fail);
  NCCLCHECKGOTO(initCollWorkElem(collInfo, &workElem), ret, fail);
  for (int c = 0; c < usableChannels; c++) {
    // Bytes that still fit on this channel, rounded up to whole aligned
    // elements but never beyond what is left of the collective.
    enqBytes = std::min(plan->maxBytesPerChannel - chans[c].collBytes, workBytesTotal);
    workCount = std::min(DIVUP(DIVUP(enqBytes, typeSize), alignCount) * alignCount, workCountTotal);
    enqBytes = workCount * typeSize;

    // AllToAllPivot needs bid/nChannels/pivotA2ANumBiRings from ncclWorkElem instead
    if (collInfo->coll == ncclFuncAllToAllPivot) {
      workElem.nChannels = usableChannels;
      workElem.pivotA2ANumBiRings = collInfo->comm->topo->pivotA2ANumBiRings;
      workElem.bid = c;
    } else {
      if (plan->maxBytesPerChannel <= chans[c].collBytes) continue; // channel already full
      if (workBytesTotal == 0) break;                               // nothing left to place

      NCCLCHECKGOTO(computeCollLastChunkInfo(collInfo, workCount, alignCount, &lastChunkCount), ret, fail);
      NCCLCHECKGOTO(setCollWorkElem(workCount, workOffset, lastChunkCount, &workElem), ret, fail);
    }

    // Add work elem. Adding nWork before and subtracting it after charges the
    // budget with exactly the number of ncclWork items the append created.
    *nWorkBudget += chans[c].nWork;
    if (regBufType == NCCL_REGULAR_BUFFER) {
      appendWorkElemColl(comm, plan, c, collInfo->workFuncIndex, &workElem);
    } else {
      struct ncclWorkElemReg workElemReg;
      NCCLCHECKGOTO(initCollWorkElemReg(comm, &workElem, &comm->channels[c], regBufType, collInfo->regBufSend, collInfo->regBufRecv, &workElemReg), ret, fail);
      appendWorkElemColl(comm, plan, c, collInfo->workFuncIndex, &workElemReg);
    }
    *nWorkBudget -= chans[c].nWork; // subtract delta of chans[c].nWork

    // Add proxy task. Empty collectives do not make it to the proxy thread
    // since they don't imply synchronization for the user like p2p.
    if (collInfo->nBytes != 0) {
      uint32_t steps;
      struct ncclProxyOp proxyOp;
      NCCLCHECKGOTO(computeCollSteps(collInfo, workCount, &steps), ret, fail);
      NCCLCHECKGOTO(initCollProxyOp(collInfo, c, opCount, steps, &proxyOp), ret, fail);
      NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOp), ret, fail);
    }

    workBytesTotal -= enqBytes;
    workCountTotal -= workCount;
    chans[c].collBytes += enqBytes;
    workOffset += workCount;
    rnChannel++;
  }

  plan->threadPerBlock = std::max(plan->threadPerBlock, collInfo->nThreads);
  if (!plan->kernelSpecialized) {
    // ncclGetKernelIndex() queries the device properties, so resolve the
    // table entry once instead of once per field.
    const ncclKernelMatch& kern = ncclKerns[ncclGetKernelIndex(comm)];
    plan->kernelFn = kern.kernelFn;
    plan->kernelSpecialized = kern.specialized;
  }

  if (comm->rank == 0) {
    TRACE(NCCL_COLL, "CBDColl enqueue coll %s(%s, %s, %s, %s), nChannels %d, count %ld (nbytes %ld), usableChannel %d, maxBytesPerChannel %ld, chunkCount %d, lastChunkCount %ld, funcIndex %d, nThreads %d", collInfo->opName, ncclOpToString(collInfo->op), ncclDatatypeToString(collInfo->datatype), ncclAlgoToString(collInfo->algorithm), ncclProtoToString(collInfo->protocol), rnChannel, collInfo->count, collInfo->workBytes, usableChannels, plan->maxBytesPerChannel, collInfo->chunkCount, lastChunkCount, collInfo->workFuncIndex, collInfo->nThreads);
  }

exit:
  return ret;
fail:
  goto exit;
}

// Declares the parameter read through ncclParamP2pLLThreshold() (default
// 16384). addP2pToPlan() only considers the LL protocol for transfers of at
// most this many bytes.
NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384);

// Put p2p op in plan assuming there is space in nWorkBudget, so you must
// ensure *nWorkBudget >= 1 upon entry.
//
// comm          - communicator the operation belongs to.
// plan          - kernel plan receiving the work element and proxy op.
// nWorkBudget   - reduced by the number of new ncclWork items created.
// isSendNotRecv - true for a send, false for a receive.
// peer          - rank of the remote side.
// chunk         - part index; taken modulo comm->p2pnChannelsPerPeer to pick
//                 the channel.
// addr, bytes   - user buffer; a null `addr` yields an element with peer -1
//                 and no proxy op.
// connIndex     - connection index stored in the element and the proxy op.
// fuseOk        - whether the element may share an ncclWork with earlier ones.
//
// Returns the first error reported by a helper, ncclSuccess otherwise.
static ncclResult_t addP2pToPlan(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget,
    bool isSendNotRecv, int peer, int chunk, void *addr, size_t bytes, uint32_t connIndex, bool fuseOk
  ) {
  struct ncclInfo info = {
    isSendNotRecv ? ncclFuncSend : ncclFuncRecv,
    isSendNotRecv ? "Send" : "Recv",
    nullptr, addr, bytes, ncclInt8, ncclSum, peer, comm, (cudaStream_t)0,
    /*Args*/1, 1
  };

  int channelId;
  NCCLCHECK(ncclChannelCompute(comm, peer, chunk%comm->p2pnChannelsPerPeer, info.coll, &channelId));
  info.channelId = channelId;

  // 1 is connIndex
  struct ncclConnInfo* conn = isSendNotRecv ?
    &comm->channels[channelId].peers[peer]->send[1].conn : &comm->channels[channelId].peers[peer]->recv[1].conn;
  // do not use LL on gfx12
  info.protocol = ((conn->buffs[NCCL_PROTO_LL] != nullptr) && bytes <= ncclParamP2pLLThreshold() && !IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx12")) ? NCCL_PROTO_LL : NCCL_PROTO_SIMPLE;

  // Registered buffers are only looked up for the Simple protocol.
  int reg = 0;
  if (info.protocol == NCCL_PROTO_SIMPLE) {
    struct ncclReg* regRecord;
    NCCLCHECK(ncclRegFind(comm, addr, bytes, &regRecord));
    reg = regRecord && regRecord->nDevs ? 1 : 0;
  }

  struct ncclProxyOp proxyOp = {};
  // May tune chunksize and set proxyOp.reg=0 if not using the network.
  NCCLCHECK(ncclProxyComputeP2p(&info, &proxyOp, reg));
  proxyOp.connIndex = connIndex;

  struct ncclWorkElemP2p elem = {0};
  elem.proto = info.protocol;
  elem.peer = addr == nullptr ? -1 : peer;
  elem.nWarps = NCCL_MAX_NTHREADS/comm->WarpSize;
  elem.reg = proxyOp.reg;
  elem.p2pType = isSendNotRecv ? ncclWorkP2pTypeSend : ncclWorkP2pTypeRecv;
  // The buffer address and byte count are stored as 32-bit halves.
  elem.buffLo32 = uint32_t(reinterpret_cast<uintptr_t>(addr));
  elem.buffHi32 = reinterpret_cast<uintptr_t>(addr)>>32;
  elem.countLo32 = uint32_t(bytes);
  elem.countHi32 = bytes>>32;
  elem.chunkSize = info.chunkSize; // computed by ncclProxyComputeP2p
  elem.opCount = (uint16_t)comm->opCount;
  elem.connIndex = connIndex;

  // Charge the budget with the number of ncclWork items the append created.
  // The budget is rebalanced before the result is checked so that a failure
  // (which leaves nWork untouched) does not skew *nWorkBudget.
  *nWorkBudget += plan->channels[channelId].nWork;
  ncclResult_t appendResult = appendWorkElemP2p(comm, plan, channelId, &elem, fuseOk);
  *nWorkBudget -= plan->channels[channelId].nWork;
  NCCLCHECK(appendResult);

  // Calculate the opCount after appendWorkElemP2p since it will always return
  // with channel->nWork equal to one plus the work index this p2p settled in.
  // P2p op counts are odd: the low bit is 1.
  proxyOp.opCount = uint64_t(plan->channels[channelId].nWork)<<1 | 1;
  if (addr != nullptr) NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp));
  return ncclSuccess;
}

static void finishPlan(struct ncclKernelPlan* plan) {
  int channelUbound = 0;
  int channelCount = 0;
  //uint64_t channelMask = 0;
  struct channelMasks channelMask;
  for (int i =0; i < MAXCHANNELS/64; i++) {
	channelMask.masks[i] = 0;
  }
  bool hasProxyOps = false;
  for (int c=0; c < MAXCHANNELS; c++) {
    struct ncclWorkList* tail = ncclIntruQueueTail(&plan->channels[c].workQueue);
    if (tail != nullptr) {
      channelUbound = c+1;
      channelCount += 1;
      //channelMask |= 1ull<<c;
      channelMask.masks[c/64] |= 1ull<<(c%64);
      tail->work.header.isLast = 1;
      finishWork(&tail->work, plan->comm->WarpSize);
    }
    hasProxyOps |= !ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue);
  }
  plan->channelUbound = channelUbound;
  plan->channelCount = channelCount;
  plan->channelMask = channelMask;
  plan->hasProxyOps = hasProxyOps;
  plan->threadPerBlock = std::max(plan->threadPerBlock, 3*plan->comm->WarpSize);
}

// Accessor for the local-registration parameter; only declared here, so it is
// defined in another translation unit.
int64_t ncclParamLocalRegister();
// Declares the parameter read through ncclParamGraphRegister() (default 1),
// which gates buffer registration for persistent plans.
NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1);

// Tries to register the user buffers of `info` so the collective can use them
// directly, and records the outcome in info->regBufType.
//
// info->regBufType is first reset to NCCL_REGULAR_BUFFER. The remaining logic
// is only compiled when CUDART_VERSION >= 11030:
//  - NVLS / NVLS-tree with comm->nvlsRegSupport: attempts local registration,
//    then graph registration for persistent plans. On success the channel
//    count is re-tuned and regBufType becomes NCCL_NVLS_REG_BUFFER.
//  - CollNetDirect: IPC-based registration; this branch is currently disabled
//    by the trailing `&& 0` in its condition.
//
// Returns ncclSuccess unless one of the checked calls fails.
static ncclResult_t registerIntraNodeBuffers(
    struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclInfo* info
  ) {
  ncclResult_t result = ncclSuccess;

  info->regBufType = NCCL_REGULAR_BUFFER;
#if CUDART_VERSION >= 11030
  if ((info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) && comm->nvlsRegSupport) {
    bool regBufUsed = false;
    const void *sendbuff = info->sendbuff;
    void *recvbuff = info->recvbuff;

    // Only one side is passed to the registration calls for these collectives.
    if (info->coll == ncclFuncAllGather)
      sendbuff = NULL;
    else if (info->coll == ncclFuncReduceScatter)
      recvbuff = NULL;

    /* first try local registration. */
    if (ncclParamLocalRegister()) {
      ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, &regBufUsed, info->regBufSend, info->regBufRecv);
    }

    // Fall back to graph registration, which is limited to persistent plans.
    if (regBufUsed == false && plan->persistent && ncclParamGraphRegister()) {
      ncclNvlsGraphRegisterBuffer(comm, plan, sendbuff, recvbuff, info->sendbuffSize, info->recvbuffSize, &regBufUsed, info->regBufSend, info->regBufRecv);
    }

    if (regBufUsed) {
      /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to
       * saturate bandwidth. */
      // The chosen count is clamped to [config.minCTAs, config.maxCTAs].
      if (comm->nNodes == 1) {
        if (info->coll == ncclFuncReduceScatter)
          info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5));
        else
          info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4));
      } else {
        info->nChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6));
      }

      info->regBufType = NCCL_NVLS_REG_BUFFER;
    }
  } else if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT &&   // limited to CollNetDirect for now
    comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
    comm->intraRanks < comm->localRanks &&  // only with inter-process & intra-node peers
    plan->persistent && 0) {
    /* Disable CollnetDirect registration since it does not support cuMem* allocated memory. */
    int localRank = comm->localRank;
    cudaPointerAttributes sattr, rattr;

    // Both buffers must be device memory; otherwise keep the regular buffer path.
    CUDACHECK(cudaPointerGetAttributes(&sattr, info->sendbuff));
    CUDACHECK(cudaPointerGetAttributes(&rattr, info->recvbuff));
    if (sattr.type != cudaMemoryTypeDevice || rattr.type != cudaMemoryTypeDevice) return ncclSuccess;

    if (CUPFN(cuMemGetAddressRange) == nullptr) return ncclSuccess;

    struct HandlePair {
      cudaIpcMemHandle_t ipc[2]; // {send, recv}
      size_t offset[2]; // {send, recv}
    };
    struct HandlePair handles[NCCL_MAX_LOCAL_RANKS];

    CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[0], (void*)info->sendbuff), result, fallback);
    CUDACHECKGOTO(cudaIpcGetMemHandle(&handles[localRank].ipc[1], (void*)info->recvbuff), result, fallback);

    // The offset of each buffer inside its allocation is exchanged with the
    // handle so peers can locate the buffer within the mapping they open.
    void *baseSend, *baseRecv;
    size_t size;
    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &size, (CUdeviceptr)info->sendbuff));
    handles[localRank].offset[0] = (char*)info->sendbuff - (char*)baseSend;
    CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &size, (CUdeviceptr)info->recvbuff));
    handles[localRank].offset[1] = (char*)info->recvbuff - (char*)baseRecv;

    NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, handles, sizeof(struct HandlePair)));

    // Open handles locally
    for (int i=0; i < comm->localRanks; i++) {
      if (i == localRank) { // Skip self
        info->regBufSend[i] = nullptr;
        info->regBufRecv[i] = nullptr;
      } else {
        for (int sr=0; sr < 2; sr++) {
          // Get base address of mapping
          void* base;
          CUDACHECK(cudaIpcOpenMemHandle(&base, handles[i].ipc[sr], cudaIpcMemLazyEnablePeerAccess));
          // Get real buffer address by adding offset in the mapping
          (sr == 0 ? info->regBufSend : info->regBufRecv)[i] = (char*)base + handles[i].offset[sr];
          // Enqueue reminder to close memory handle
          struct ncclPointerList* q = ncclMemoryPoolAlloc<struct ncclPointerList>(&comm->memPool_ncclPointerList, &comm->memPermanent);
          q->ptr = base;
          ncclIntruQueueEnqueue(&plan->ipcMemQueue, q);
        }
      }
    }
    info->regBufType = NCCL_IPC_REG_BUFFER;
  }
fallback:
#endif
  return result;
}

// Estimate how many channels a CBD collective will be spread over and store the
// result in collInfo->nChannels.
//
// plan:           supplies maxBytesPerChannel and the per-channel collBytes already assigned.
// collInfo:       collective being sized; reads workBytes/datatype, writes nChannels.
// usableChannels: number of leading plan channels to inspect.
//
// A zero-byte collective gets one channel. Otherwise the first channel that still has
// room below maxBytesPerChannel takes the leading portion and the remainder is split
// into maxBytesPerChannel-sized pieces. If no inspected channel has room,
// collInfo->nChannels is left untouched. Always returns ncclSuccess.
static ncclResult_t getCBDCollnChannel(struct ncclKernelPlan* plan, struct ncclInfo* collInfo, int usableChannels) {
  const size_t totalBytes = collInfo->workBytes;
  const int elemSize = ncclTypeSize(collInfo->datatype);
  // Per-channel capacity expressed in elements (rounded up).
  const size_t maxElemsPerChannel = DIVUP(plan->maxBytesPerChannel, elemSize);
  struct ncclKernelPlan::Channel* channels = plan->channels;

  if (totalBytes == 0) {
    collInfo->nChannels = 1;
    return ncclSuccess;
  }

  for (int c = 0; c < usableChannels; c++) {
    if (channels[c].collBytes < plan->maxBytesPerChannel) {
      // Bytes that fit into the remaining room of this channel, rounded up to whole elements.
      size_t headBytes = std::min(plan->maxBytesPerChannel - channels[c].collBytes, totalBytes);
      headBytes = DIVUP(headBytes, elemSize) * elemSize;
      // Whatever is left is spread over further channels at full capacity.
      size_t remainingElems = (totalBytes - headBytes) / elemSize;
      collInfo->nChannels = 1 + DIVUP(remainingElems, maxElemsPerChannel);
      break;
    }
  }

  return ncclSuccess;
}

// Sort the pending collectives of `comm` and append as many as the budget allows to `plan`.
//
// Phase 1 (only when tasks->collQueue is non-empty): every collective is dequeued. For
// one whose algorithm is still NCCL_ALGO_UNDEF, the counts of the following `->next`
// collectives sharing coll/op/datatype are summed into a scratch ncclInfo so that
// algorithm, protocol, thread count, pattern and work function are chosen once and then
// copied to each member of that run. Each dequeued collective is then routed to
// collnetQueue, collTunedQueue or collCBDQueue.
// Phase 2: collCBDQueue, collnetQueue and collTunedQueue are drained into the plan, in
// that order.
//
// comm:        communicator whose comm->tasks is consumed.
// plan:        kernel plan being filled.
// nWorkBudget: in/out remaining work budget, forwarded to the add*ToPlan() helpers.
//
// Returns ncclSuccess both when everything was scheduled and when the budget ran out
// (the head collective is left in its queue in that case); errors from helpers are
// propagated by NCCLCHECK.
static ncclResult_t scheduleCollTasksToPlan(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget
  ) {
  struct ncclTasks* tasks = &comm->tasks;
  // Bytes that will be distributed over channels by the CBD path; collectives routed
  // to the collnet/tuned queues are subtracted below.
  size_t totalCBDBytes = tasks->workBytesTotal;
  struct ncclInfo* collInfo;

  if (!ncclIntruQueueEmpty(&tasks->collQueue)) {
    int usableChannels = 0, accChannels = 0;

    tasks->usableChannels = 1;
    while (!ncclIntruQueueEmpty(&tasks->collQueue)) {
      collInfo = ncclIntruQueueDequeue(&tasks->collQueue);
      // NOTE(review): zero-count collectives are dropped here without touching
      // tasks->nTasksColl — confirm they are never counted when enqueued.
      if (collInfo->count == 0) continue;
      if (collInfo->algorithm == NCCL_ALGO_UNDEF) {
        // Scratch copy used to tune the whole run of compatible collectives at once.
        struct ncclInfo* aggInfo = ncclMemoryStackAlloc<struct ncclInfo>(&comm->memScoped);
        struct ncclInfo* nextInfo = collInfo->next;
        int nvlsSupport;
        int collNetSupport;

        memcpy(aggInfo, collInfo, sizeof(struct ncclInfo));
        // Accumulate the element counts of the following compatible collectives.
        while (nextInfo) {
          if (nextInfo->coll == aggInfo->coll && nextInfo->opFull.op == aggInfo->opFull.op && nextInfo->datatype == aggInfo->datatype) {
            aggInfo->count += nextInfo->count;
            nextInfo = nextInfo->next;
          } else {
            break;
          }
        }

        nvlsSupport = comm->nvlsSupport && ncclNvlsSupported(aggInfo->opFull.op, aggInfo->datatype);
        NCCLCHECK(getCollNetSupport(aggInfo, &collNetSupport));
        NCCLCHECK(ncclInfoSetDerived(aggInfo, comm->nRanks));
        NCCLCHECK(getTunerInfo(aggInfo, collNetSupport, nvlsSupport, 1));
        NCCLCHECK(topoGetAlgoInfo(aggInfo, collNetSupport, nvlsSupport, 1));
        NCCLCHECK(getChannnelThreadInfo(aggInfo));
        NCCLCHECK(computeCollWorkFunc(aggInfo));
        NCCLCHECK(getPatternInfo(aggInfo));

        // Try to assign algo and proto to all possible collectives
        nextInfo = collInfo;
        while (nextInfo) {
          if (nextInfo->coll == aggInfo->coll && nextInfo->opFull.op == aggInfo->opFull.op && nextInfo->datatype == aggInfo->datatype) {
            NCCLCHECK(ncclInfoSetDerived(nextInfo, comm->nRanks));
            NCCLCHECK(getTunerInfo(nextInfo, collNetSupport, nvlsSupport, 1));
            nextInfo->algorithm = aggInfo->algorithm;
            nextInfo->protocol = aggInfo->protocol;
            nextInfo->nThreads = aggInfo->nThreads;
            nextInfo->pattern = aggInfo->pattern;
            nextInfo->workFuncIndex = aggInfo->workFuncIndex;
            nextInfo->aggnBytes = aggInfo->nBytes;

            NCCLCHECK(getChannnelThreadInfo(nextInfo));
            // if possible, start registration
            // (the return value is not checked — presumably best-effort; confirm)
            registerIntraNodeBuffers(comm, plan, nextInfo);
            // accumulate channels
            accChannels += nextInfo->nChannels;
            nextInfo = nextInfo->next;
          } else {
            break;
          }
        }
      } // end of aggInfo

      // Track the largest channel count any of the collectives may use.
      if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) {
        usableChannels = std::max(usableChannels, comm->nvlsChannels);
      } else {
        usableChannels = std::max(usableChannels, comm->collChannels);
      }

      if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT || collInfo->algorithm == NCCL_ALGO_COLLNET_CHAIN || (collInfo->algorithm == NCCL_ALGO_NVLS && comm->nNodes > 1)) {
        // subtract collective which needs to be executed separately
        totalCBDBytes -= collInfo->workBytes;
        tasks->workBytesTotal -= collInfo->workBytes;
        ncclIntruQueueEnqueue(&tasks->collnetQueue, collInfo);
      } else if (collInfo->userTuned) {
        // subtract collective which needs to be executed separately
        totalCBDBytes -= collInfo->workBytes;
        tasks->workBytesTotal -= collInfo->workBytes;
        ncclIntruQueueEnqueue(&tasks->collTunedQueue, collInfo);
      } else {
        ncclIntruQueueEnqueue(&tasks->collCBDQueue, collInfo);
      }
    }

    tasks->usableChannels = std::min(usableChannels, accChannels);
  }

  /* Calculate maxBytesPerChannel for CBD colls and it should be 16 bytes aligned
   * Note: it is not a hard upper bound for maxBytes, we can relax it if any optimization
   * is needed */
  // NOTE(review): divides by tasks->usableChannels — assumes it is non-zero here; confirm.
  plan->maxBytesPerChannel = DIVUP(DIVUP(totalCBDBytes, tasks->usableChannels), NCCL_BYTES_ALIGNMENT) * NCCL_BYTES_ALIGNMENT;
  // First enqueue CBD colls
  while (!ncclIntruQueueEmpty(&tasks->collCBDQueue)) {
    // Get nChannels and peek whether the budget allows before we enqueue
    collInfo = ncclIntruQueueHead(&tasks->collCBDQueue);
    // NOTE(review): divides by totalCBDBytes — assumes it is non-zero whenever
    // collCBDQueue is non-empty; confirm.
    collInfo->nChannels = DIVUP(collInfo->aggnBytes * tasks->usableChannels, totalCBDBytes);
    // Haven't got nChannels info yet, relax the budget boundary a bit.
    if (*nWorkBudget < collInfo->nChannels) return ncclSuccess;

    collInfo = ncclIntruQueueDequeue(&tasks->collCBDQueue);
    NCCLCHECK(addCBDCollToPlan(comm, plan, tasks->usableChannels, collInfo, nWorkBudget));
    tasks->nTasksColl -= 1;
    tasks->workBytesTotal -= collInfo->count * ncclTypeSize(collInfo->datatype);
  }

  // Then enqueue collnet colls
  while (!ncclIntruQueueEmpty(&tasks->collnetQueue)) {
    collInfo = ncclIntruQueueHead(&tasks->collnetQueue);
    // Out of budget: leave the collective queued and stop.
    if (*nWorkBudget < collInfo->nChannels) return ncclSuccess;

    collInfo = ncclIntruQueueDequeue(&tasks->collnetQueue);
    NCCLCHECK(addCollnetCollToPlan(comm, plan, tasks->usableChannels, collInfo, nWorkBudget));
    tasks->nTasksColl -= 1;
  }

  // Finally enqueue user-tuned colls
  while (!ncclIntruQueueEmpty(&tasks->collTunedQueue)) {
    collInfo = ncclIntruQueueHead(&tasks->collTunedQueue);
    // Out of budget: leave the collective queued and stop.
    if (*nWorkBudget < collInfo->nChannels) return ncclSuccess;

    collInfo = ncclIntruQueueDequeue(&tasks->collTunedQueue);
    NCCLCHECK(addTunedCollToPlan(comm, plan, tasks->usableChannels, collInfo, nWorkBudget));
    tasks->nTasksColl -= 1;
  }

  return ncclSuccess;
}

// Pick the per-channel chunk size for a point-to-point transfer of `totalSize` bytes.
//
// Starts from totalSize split over `minChannels` (but never below `minSize`), then keeps
// doubling the channel count while the chunk is still larger than `maxSize` and the
// count has not passed half of `maxChannels`. The final value is passed through
// alignUp(chunk, minSize).
static size_t calcP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
  size_t chunk = std::max(minSize, divUp(totalSize, minChannels));
  for (int channels = minChannels; chunk > maxSize && channels <= maxChannels/2; ) {
    channels *= 2;
    chunk = divUp(totalSize, channels);
  }
  return alignUp(chunk, minSize);
}

// Tunable read through rcclParamP2pNetThreshold() in scheduleP2pTasksToPlan(): when
// comm->p2pNet is set, a send/recv whose remaining byte count exceeds this value uses
// connection index NCCL_CONN_IDX_P2P_NET. Default: 131072.
// NOTE(review): presumably overridable through an environment variable derived from
// "P2P_NET_THRESHOLD" — confirm against the RCCL_PARAM macro.
RCCL_PARAM(P2pNetThreshold, "P2P_NET_THRESHOLD", 131072);

// Append pending send/recv tasks of `comm` to `plan`, chunk by chunk.
//
// Peers are visited in the order given by tasks->p2pSendOrder / tasks->p2pRecvOrder
// (an entry of -1 means "no peer at this step"). For every step the head send and head
// recv are split into chunks whose maximum size comes from calcP2pChunkSize(), and each
// chunk is added with addP2pToPlan(). Progress is kept in the task's `chunk` field, so
// a task that is only partly scheduled resumes from the right offset on the next call.
//
// comm:        communicator whose comm->tasks is consumed.
// plan:        kernel plan being filled; threadPerBlock/kernelFn/kernelSpecialized may be updated.
// nWorkBudget: in/out remaining work budget, forwarded to addP2pToPlan().
//
// Returns ncclSuccess when all tasks were scheduled or the budget ran out,
// ncclInternalError / ncclInvalidUsage for inconsistent self send/recv, and any error
// from addP2pToPlan().
static ncclResult_t scheduleP2pTasksToPlan(
    struct ncclComm* comm, struct ncclKernelPlan* plan, int* nWorkBudget
  ) {
  struct ncclTasks* tasks = &comm->tasks;
  int nRanks = comm->nRanks;
  struct ncclTasks::Peer* peers = tasks->peers;
  int const *sendOrder = tasks->p2pSendOrder;
  int const *recvOrder = tasks->p2pRecvOrder;

  plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS);
  // Only pick a kernel if the plan does not already carry a specialized one.
  if (!plan->kernelSpecialized) {
    plan->kernelFn = ncclKerns[ncclGetKernelIndex(comm)].kernelFn;
    plan->kernelSpecialized = ncclKerns[ncclGetKernelIndex(comm)].specialized;
  }

  // Compute how much to split operations
  // Natural step size matching buffer steps.
  ssize_t stepSize = comm->p2pChunkSize;
  // Try to use all channels
  int nChannelsMax = comm->p2pnChannelsPerPeer;
  int nChannelsMin = nChannelsMax;
  // Try to use all channels, but one channel per operation.
  while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;

  bool fuseOk = false;
  // We can perform 8 send/recv per round per CTA. Make sure we jump between fused blocks at node boundaries.
  while (tasks->nTasksP2p != 0) {
    for (int i=0; i < tasks->p2pOrderSteps; i++) {
      int sendPeer = sendOrder[i];
      int recvPeer = recvOrder[i];
      struct ncclTaskP2p* send = sendPeer != -1 ? ncclIntruQueueHead(&peers[sendPeer].sendQueue) : NULL;
      struct ncclTaskP2p* recv = recvPeer != -1 ? ncclIntruQueueHead(&peers[recvPeer].recvQueue) : NULL;
      // A self-send must be paired with a self-recv in the same step.
      if (sendPeer == comm->rank) {
        if (recvPeer != comm->rank) {
          WARN("Sendrecv plan not aligned for self");
          return ncclInternalError;
        }
        if (send && recv == nullptr) {
          WARN("Trying to send to self without a matching recv");
          return ncclInvalidUsage;
        }
        if (send == nullptr && recv) {
          WARN("Trying to recv to self without a matching send");
          return ncclInvalidUsage;
        }
      }
      if (send != nullptr || recv != nullptr) {
        char* recvPtr = recv ? (char*)recv->buff : nullptr;
        char* sendPtr = send ? (char*)send->buff : nullptr;
        ssize_t recvBytes = recv ? recv->bytes : 0;
        ssize_t sendBytes = send ? send->bytes : 0;
        // Chunk size bounds depend on whether the communicator spans several nodes.
        ssize_t minSize = comm->nNodes > 1 ? stepSize/2 : stepSize/8;
        ssize_t maxSize = comm->nNodes > 1 ? stepSize : stepSize*32;
        ssize_t recvChunkBytesMax = calcP2pChunkSize(recvBytes, nChannelsMin, nChannelsMax, minSize, maxSize);
        ssize_t sendChunkBytesMax = calcP2pChunkSize(sendBytes, nChannelsMin, nChannelsMax, minSize, maxSize);
        // Zero size send/recv are syncs, encode here with -1.
        recvBytes = recv && recvBytes == 0 ? -1 : recvBytes;
        sendBytes = send && sendBytes == 0 ? -1 : sendBytes;
        // Advance to current chunk. Syncs will always have chunk=0 so no effect on the -1.
        if (recv) recvPtr   += recv->chunk*recvChunkBytesMax;
        if (recv) recvBytes -= recv->chunk*recvChunkBytesMax;
        if (send) sendPtr   += send->chunk*sendChunkBytesMax;
        if (send) sendBytes -= send->chunk*sendChunkBytesMax;

        // Connection index: 1 by default, NCCL_CONN_IDX_P2P_NET for transfers above
        // the P2pNetThreshold parameter when comm->p2pNet is set.
        uint16_t sendIdx = 1, recvIdx = 1;
        if(comm->p2pNet && sendBytes > rcclParamP2pNetThreshold())
          sendIdx = NCCL_CONN_IDX_P2P_NET;
        if(comm->p2pNet && recvBytes > rcclParamP2pNetThreshold())
          recvIdx = NCCL_CONN_IDX_P2P_NET;

        do {
          // Fusion is reset at the start of every group of NCCL_MAX_WORK_ELEMENTS_P2P/2 steps.
          if ((i % (NCCL_MAX_WORK_ELEMENTS_P2P/2)) == 0) fuseOk = false;
          ssize_t recvChunkBytes = std::min(recvBytes, recvChunkBytesMax); // -1 preserved
          ssize_t sendChunkBytes = std::min(sendBytes, sendChunkBytesMax);
          if (recvChunkBytes != 0) {
            if (recvChunkBytes == -1) recvChunkBytes = 0;
            if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget
            NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/false, recvPeer, recv->chunk, recvPtr, recvChunkBytes, recvIdx, fuseOk));
            fuseOk = true;
            recvPtr += recvChunkBytes;
            recvBytes -= recvChunkBytes;
            recv->chunk += 1;
            // Task fully scheduled: remove it from the peer's queue.
            if (recvBytes <= 0) {
              recvBytes = 0; // in case still -1
              ncclIntruQueueDequeue(&peers[recvPeer].recvQueue);
              tasks->nTasksP2p -= 1;
            }
          }
          if (sendChunkBytes != 0) {
            if (sendChunkBytes == -1) sendChunkBytes = 0;
            if (*nWorkBudget < 1) return ncclSuccess; // ensure room in budget
            NCCLCHECK(addP2pToPlan(comm, plan, nWorkBudget, /*isSendNotRecv=*/true, sendPeer, send->chunk, sendPtr, sendChunkBytes, sendIdx, fuseOk));
            fuseOk = true;
            sendPtr += sendChunkBytes;
            sendBytes -= sendChunkBytes;
            send->chunk += 1;
            // Task fully scheduled: remove it from the peer's queue.
            if (sendBytes <= 0) {
              sendBytes = 0; // in case still -1
              ncclIntruQueueDequeue(&peers[sendPeer].sendQueue);
              tasks->nTasksP2p -= 1;
            }
          }
        } while (sendBytes != 0 || recvBytes != 0);
      }
    }
  }
  return ncclSuccess;
}

// Ordering test for 32-bit counters that are allowed to wrap around.
// Returns true when the wrapped difference a-b lies in the upper half of the 32-bit
// range, i.e. `a` is considered to come before `b`.
static inline bool rollingLess32(uint32_t a, uint32_t b) {
  const uint32_t wrappedDiff = a - b;
  // A set top bit is exactly "greater than 0x7fffffff".
  return (wrappedDiff & uint32_t(0x80000000u)) != 0;
}
// Minimum of two 32-bit counters that are allowed to wrap around.
// Returns `a` when the wrapped difference b-a lies in the lower half of the 32-bit
// range (including equality), otherwise `b`.
static inline uint32_t rollingMin32(uint32_t a, uint32_t b) {
  const uint32_t wrappedDiff = b - a;
  // A set top bit means the difference exceeds 0x7fffffff, so `b` is the earlier one.
  if ((wrappedDiff & uint32_t(0x80000000u)) != 0) {
    return b;
  }
  return a;
}

// Spin until it's safe to increase comm->workFifoSent to desiredSent.
//
// comm:        communicator owning the work fifo; workFifoAckdMin, the per-channel
//              workFifoSent values and comm->workFifoDone[] may be updated.
// desiredSent: rolling 32-bit index the caller wants to advance workFifoSent to.
//
// Fast path: returns immediately when workFifoAckdMin + workFifoDepth already covers
// desiredSent. Otherwise polls the per-channel counters in comm->workFifoDone,
// recomputes the minimum acknowledged index and yields the CPU between attempts.
static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredSent) {
  if (__builtin_expect(rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent), false)) {
    while (1) {
      // We have to poll for notifications from device.
      uint32_t* doneLive = comm->workFifoDone;
      // Snapshot of every channel's counter taken in one pass.
      uint32_t ackd[MAXCHANNELS];
      for (int c=0; c < MAXCHANNELS; c++) {
        ackd[c] = __atomic_load_n(&doneLive[c], __ATOMIC_RELAXED);
      }
      // Compiler-only fence to prevent fusion of loops to encourage dense loads.
      __atomic_signal_fence(__ATOMIC_SEQ_CST);

      uint32_t ackdAll = comm->workFifoSent;
      for (int c=0; c < MAXCHANNELS; c++) {
        // ackdAll is min over all non-quiesced channels
        if (ackd[c] != comm->channels[c].workFifoSent)
          ackdAll = rollingMin32(ackdAll, ackd[c]);
      }

      // Compiler only fence to prevent fusion of loops to encourage dense stores.
      __atomic_signal_fence(__ATOMIC_SEQ_CST);

      for (int c=0; c < MAXCHANNELS; c++) {
        // Advance counter on quiesced channels so they don't lag behind
        // too far where they could get lost in 32-bit wraparound.
        if (ackd[c] == comm->channels[c].workFifoSent) {
          comm->channels[c].workFifoSent = ackdAll;
          __atomic_store_n(&doneLive[c], ackdAll, __ATOMIC_RELAXED);
        }
      }
      comm->workFifoAckdMin = ackdAll;

      // See if that was enough.
      if (!rollingLess32(comm->workFifoAckdMin + comm->workFifoDepth, desiredSent)) break;
      sched_yield();
    }
  }
}

// Copy every ncclWork queued in `plan` to where the kernel will read it and set
// plan->workHead accordingly.
//
// Non-persistent plans write into the comm's work fifo (comm->workFifoHeap), waiting
// for space via waitWorkFifoAvailable(), and point plan->workHead into
// comm->devWorkFifoHeap. Persistent plans stage the work in scoped host memory and copy
// it into a buffer obtained from ncclCudaMalloc().
//
// Layout: the first work of each channel that has work occupies one of the first
// plan->channelCount slots (in channel order); any further work of a channel is
// appended after those slots and chained through header.workNext.
//
// Returns ncclSuccess, or an error from ncclCudaMalloc/ncclCudaMemcpy.
static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  bool persistent = plan->persistent;
  int channelUbound = plan->channelUbound;
  // Total number of work structs over all channels of the plan.
  int nWork = 0;
  for (int c=0; c < channelUbound; c++) nWork += plan->channels[c].nWork;

  struct ncclWork* workHeap;
  if (!persistent) {
    workHeap = comm->workFifoHeap;
  } else {
    workHeap = ncclMemoryStackAlloc<struct ncclWork>(&comm->memScoped, nWork);
  }
  // Persistent buffers are indexed linearly; the fifo wraps at workFifoDepth
  // (the mask form assumes workFifoDepth is a power of two — TODO confirm).
  uint32_t ixMask = persistent ? ~uint32_t(0) : comm->workFifoDepth-1;
  uint32_t ixSent;
  if (persistent) {
    ixSent = 0;
  } else {
    ixSent = comm->workFifoSent;
    // First work for a channel has to be at workHeap+blockIdx.x which means
    // we cannot tolerate fifo wraparound. So round up to the wrap boundary
    // if not doing so would incur crossing it.
    if (((ixSent + plan->channelCount-1) & ixMask) < (ixSent & ixMask)) {
      ixSent = (ixSent + ixMask) & ~ixMask;
      // Need to update workFifoSent so waitWorkFifoAvailable() knows we've
      // skipped those elements. Consider if all the channels report quiesced,
      // this way the skipped slots will be considered consumed as well.
      comm->workFifoSent = ixSent;
    }
    waitWorkFifoAvailable(comm, ixSent + nWork);
  }
  // ixHead: slot of the first channel's first work; ixSent: next free slot after the
  // channelCount leading slots.
  uint32_t ixHead = ixSent;
  ixSent += plan->channelCount;
  int channelsWithWork = 0; // number of channels below `c` with work structs.
  for (int c=0; c < channelUbound; c++) {
    struct ncclWorkList* q = ncclIntruQueueHead(&plan->channels[c].workQueue);
    // Offset of first work equals number of channels below with work.
    uint32_t ix = ixHead + channelsWithWork;
    channelsWithWork += q != nullptr ? 1 : 0;
    while (q != nullptr) {
      if (q->next != nullptr) {
        // Link to the slot the next work of this channel is about to receive,
        // expressed relative to the head slot.
        q->work.header.workNext = int32_t(ixSent & ixMask) - int32_t(ixHead & ixMask);
      } else {
        q->work.header.inFifo = !persistent ? 1 : 0;
        // Tell channel to ack us back ix+1 indicating that all slots up to and
        // including ix have been consumed.
        q->work.header.doneAcks = ix+1;
        comm->channels[c].workFifoSent = ix+1;
      }
      workHeap[ix & ixMask] = q->work; // C++ struct assignment
      q = q->next;
      if (q != nullptr) ix = ixSent++;
    }
  }

  if (!persistent) {
    comm->workFifoSent = ixSent;
    // Fence issued only when the fifo heap has a GDR handle.
    if (comm->workFifoHeapGdrHandle != nullptr) wc_store_fence();
    plan->workHead = &comm->devWorkFifoHeap[ixHead & ixMask];
  } else {
    // NOTE(review): the size argument is nWork — presumably an element count for
    // these helpers rather than a byte count; confirm.
    NCCLCHECK(ncclCudaMalloc(&plan->workHead, nWork));
    NCCLCHECK(ncclCudaMemcpy(plan->workHead, workHeap, nWork));
  }
  return ncclSuccess;
}

// Hand all proxy operations queued in `plan` to ncclProxySaveOp().
//
// The per-channel proxyOpQueue lists are merged by repeatedly taking the head with the
// smallest opCount (with the low tag bit rotated to the top so collectives sort before
// p2p operations). Each op's plan-relative opCount is temporarily rebased onto the
// comm's running counters (collOpCount, or the channel's p2pOpCount) for the save call
// and then restored so the plan can be uploaded again.
//
// Side effects: advances comm->sharedRes->collOpCount by plan->collOpCount and each
// channel's p2pOpCount by the p2p ops seen on it; for non-persistent plans the ops are
// returned to the memory pool and the queues are reset.
//
// Returns ncclSuccess or the error from ncclProxySaveOp().
static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  // Snapshot of the counter before this plan's collectives are accounted for.
  uint64_t collOpCount = comm->sharedRes->collOpCount;
  // Advance comm's collOpCount by number of colls in this plan.
  comm->sharedRes->collOpCount += plan->collOpCount;

  uint64_t p2pOpBump[MAXCHANNELS];
  struct ncclProxyOp* heads[MAXCHANNELS];
  uint64_t headIds[MAXCHANNELS];
  int nHeads = 0;
  for (int c=0; c < plan->channelUbound; c++) {
    p2pOpBump[c] = 0;
    heads[c] = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue);
    nHeads += (heads[c] != nullptr) ? 1 : 0;
    // Empty channels get the maximum id so they never win the minimum search.
    headIds[c] = (heads[c] != nullptr) ? heads[c]->opCount : uint64_t(-1);
  }

  while (nHeads != 0) {
    int minChan = -1;
    uint64_t minId = uint64_t(-1);
    // We store the heads[c]->opCount in headIds[c] specifically to remove indirect
    // loads from this loop which speeds it up considerably.
    for (int c=0; c < plan->channelUbound; c++) {
      uint64_t id = headIds[c];
      id = (id>>1 | id<<63); // Move tag bit to order collectives before p2p's
      if (id < minId) { minChan = c; minId = id; }
    }

    struct ncclProxyOp* q = heads[minChan];
    uint64_t oldId = headIds[minChan]; // same as q->opCount
    // Advance heads[c]
    heads[minChan] = q->enqNext;
    if (q->enqNext == nullptr) nHeads -= 1;
    headIds[minChan] = (q->enqNext != nullptr) ? q->enqNext->opCount : uint64_t(-1);

    // Ignoring the bottom tag bit, opCount's are zero-based within plan so
    // translate them to the tip of the comm's history.
    if (oldId & 1) { // p2p
      // opCount is monotonic increasing within a plan's channel so just
      // remember last value to compute max.
      p2pOpBump[minChan] = (oldId>>1) + 1; // +1 to ensure next plan doesn't collide
      q->opCount = (comm->sharedRes->p2pOpCount[minChan]<<1) + oldId;
    } else { // coll
      q->opCount = (collOpCount<<1) + oldId;
    }

    NCCLCHECK(ncclProxySaveOp(comm, q, nullptr));
    q->opCount = oldId; // Restore for next uploadProxyOps()
    if (!plan->persistent) {
      // Non-persistent kernels upload ops only once so can be free'd here.
      ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q);
    }
  }

  for (int c=0; c < plan->channelUbound; c++) {
    // Erase proxyOpQueue since all ops were free'd back to mempool.
    if (!plan->persistent) ncclIntruQueueConstruct(&plan->channels[c].proxyOpQueue);
    // Advance channel's p2pOpCount by number of p2p's in this plan channel.
    comm->sharedRes->p2pOpCount[c] += p2pOpBump[c];
  }
  return ncclSuccess;
}

// Host-side work for one plan: push its proxy operations, start the proxy, and for
// non-persistent plans hand the plan back for reclaiming.
// Returns ncclSuccess or the first error from uploadProxyOps()/ncclProxyStart().
static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  NCCLCHECK(uploadProxyOps(comm, plan));
  NCCLCHECK(ncclProxyStart(comm));

  // Persistent plans are not handed back for reclaiming here.
  if (plan->persistent) return ncclSuccess;

  // Tell the main thread the plan can be reclaimed; from here on the plan may be
  // reclaimed concurrently.
  ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer);
  return ncclSuccess;
}

// Host-function entry point (registered via ncclStrongStreamLaunchHost) that runs
// hostStreamPlanTask() for the plan passed as the opaque argument. Failures cannot be
// returned from a callback, so they are only logged.
static void HIPRT_CB hostStreamPlanCallback(void *plan_) {
  NVTX3_FUNC_RANGE_IN(nccl_domain);
  struct ncclKernelPlan* kernelPlan = static_cast<struct ncclKernelPlan*>(plan_);
  const ncclResult_t status = hostStreamPlanTask(kernelPlan->comm, kernelPlan);
  if (status == ncclSuccess) return;
  WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(status));
}

// Callback (installed as plan->reclaimer.fn) that releases everything owned by a
// kernel plan and returns the plan itself to comm->memPool_ncclKernelPlan.
//
// For persistent plans this also drops one comm->persistentRefs reference, frees the
// device work buffer, returns the remaining proxy ops to their pool, closes the IPC
// memory handles recorded in ipcMemQueue and deregisters the NVLS buffers recorded in
// nvlsMcHandleQueue.
//
// Returns ncclSuccess, or the error from ncclCudaFree()/ncclNvlsDeregBuffer().
static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) {
  // The callback object is recovered as the plan itself; this relies on the callback
  // being the plan's first member (per the original author's note).
  struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me;

  if (plan->persistent) {
    comm->persistentRefs -= 1;
    NCCLCHECK(ncclCudaFree(plan->workHead));

    // Give every still-queued proxy op back to its pool.
    for (int c = 0; c < plan->channelUbound; c++) {
      struct ncclProxyOp* following = nullptr;
      for (struct ncclProxyOp* op = ncclIntruQueueHead(&plan->channels[c].proxyOpQueue); op != nullptr; op = following) {
        following = op->enqNext; // read before the node goes back to the pool
        ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, op);
      }
    }

    // Close IPC mappings; a failing close is ignored.
    for (;;) {
      if (ncclIntruQueueEmpty(&plan->ipcMemQueue)) break;
      struct ncclPointerList* node = ncclIntruQueueDequeue(&plan->ipcMemQueue);
      CUDACHECKIGNORE(cudaIpcCloseMemHandle(node->ptr));
      ncclMemoryPoolFree(&comm->memPool_ncclPointerList, node);
    }

    // Deregister NVLS buffers and release their handle records.
    for (;;) {
      if (ncclIntruQueueEmpty(&plan->nvlsMcHandleQueue)) break;
      struct ncclNvlsMcHandleList* handle = ncclIntruQueueDequeue(&plan->nvlsMcHandleQueue);
      NCCLCHECK(ncclNvlsDeregBuffer(&handle->mcHandle, handle->ptr, handle->dev, handle->size));
      INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)handle->ptr, handle->dev, handle->size);
      ncclMemoryPoolFree(&comm->memPool_ncclNvlsHandleList, handle);
    }
  }

  ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
  return ncclSuccess;
}

// Destructor registered with ncclCudaGraphAddDestructor(): walks the `next`-linked
// list of plans starting at `plans_` and queues each plan's reclaimer on the
// callback queue of the first plan's communicator.
static void persistentDestructor(void* plans_) {
  struct ncclKernelPlan* head = static_cast<struct ncclKernelPlan*>(plans_);
  struct ncclComm* owner = head->comm;
  struct ncclKernelPlan* following = nullptr;
  for (struct ncclKernelPlan* cur = head; cur != nullptr; cur = following) {
    // Fetch the link first: once enqueued, the plan is handed off for reclaiming.
    following = cur->next;
    ncclIntruQueueMpscEnqueue(&owner->callbackQueue, &cur->reclaimer);
  }
}

// Turn all tasks queued on `comm` into kernel plans and set up the stream
// dependencies required before those kernels are launched.
//
// Steps:
//   1. Poll pending callbacks, then push a scoped-memory frame for work structs.
//   2. While tasks remain, allocate a plan, fill it from the collective tasks first
//      and from the p2p tasks once no collectives are left, then finish it.
//   3. Acquire the device stream and wire up the stream dependencies described below.
//   4. When host tasks are needed (graph capture, outstanding persistent plans, or
//      blocking launches), launch hostStreamPlanCallback for every plan with proxy ops.
//   5. When capturing, register persistentDestructor for the plan list.
//
// Returns ncclSuccess on success; the scoped-memory frame stays pushed in that case.
// On any failure after the frame was pushed, control goes through `failure:` which
// pops the frame again before the error is returned.
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
  ncclResult_t result = ncclSuccess;
  struct ncclTasks* tasks = &comm->tasks;
  bool persistent = ncclCudaGraphValid(tasks->capturingGraph);
  int nPlans = 0;

  // Poll for callbacks sent to us from other threads. Typically these free
  // resources back to our memory pools.
  NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false));

  // We already have one frame present which holds all of our tasks (which we
  // are about to schedule). Now push an additional frame for allocating
  // work structs (see appendWorkElem() variants all use scoped allocation).
  ncclMemoryStackPush(&comm->memScoped);

  if (tasks->nTasksColl + tasks->nTasksP2p != 0) {
    do {
      struct ncclKernelPlan* plan = ncclMemoryPoolAlloc<struct ncclKernelPlan>(&comm->memPool_ncclKernelPlan, &comm->memPermanent);
      ncclIntruQueueEnqueue(&comm->planQueue, plan);
      nPlans += 1;
      plan->comm = comm;
      plan->reclaimer.fn = reclaimPlan;
      plan->persistent = persistent;

      // Non-persistent kernels fill up at most half of our fifo per kernel.
      int nWorkBudget = plan->persistent ? INT_MAX : comm->workFifoDepth/2;
      int nWorkBudgetOld = nWorkBudget;

      // Drain coll tasks first. This is essential since we partition tasks based
      // on the work budget and p2p work isn't collective. If we were to drain p2p
      // first, the place where we cut the kernel could vary by rank which would
      // cause the "shortest channel first" channel picker to have divergent results.
      if (tasks->nTasksColl != 0) {
        NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &nWorkBudget), result, failure);
      }
      // And only drain p2p tasks once colls are depleted.
      if (tasks->nTasksColl == 0 && tasks->nTasksP2p != 0) {
        NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &nWorkBudget), result, failure);
      }
      if (nWorkBudget == nWorkBudgetOld) {
        // We weren't able to fit any tasks into our budget which means now we're
        // stuck in an infinite loop. We defer this check until here, instead of
        // doing it in comm init, to permit testing with insanely shallow queues
        // for cases where that's expected to still work (e.g. few channels).
        WARN("'NCCL_WORK_FIFO_DEPTH=%d' is too small. Minimum value is %d", comm->workFifoDepth, 2*MAXCHANNELS);
        result = ncclInvalidUsage;
        goto failure;
      }
      finishPlan(plan);
    } while (tasks->nTasksColl + tasks->nTasksP2p != 0);

    struct ncclKernelPlan* planHead = ncclIntruQueueHead(&comm->planQueue);
    comm->unlaunchedPlansHead = planHead;

    // Semantically we want these dependencies for the kernels launched:
    //   1. Launch host task on hostStream.
    //   2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...}
    //   3. {deviceStream, userStream[i]...} depend on kernel.
    // We achieve this by:
    //   1. userStream[0] waits on deviceStream
    //   2. deviceStream waits on each of userStream[1...]
    //   3. host task launch on hostStream
    //   4. userStream[0] waits on hostStream
    //   5. kernel launch on userStream[0]
    //   6. deviceStream waits on userStream[0]
    //   7. userStream[1...] each waits on deviceStream
    // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires
    // at least one of the two streams to be strong-stream.
    cudaStream_t launchStream = tasks->streams->stream;
    NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, failure);

    if (tasks->numStreams != 1 || persistent) {
      // Create dependency for device stream on user streams. First from extra user
      // streams to deviceStream. Then deviceStream to first user stream.
      for (struct ncclCudaStreamList* l=tasks->streams->next; l != nullptr; l = l->next) {
        NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure);
      }
      NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure);
    } else if (tasks->streams->stream != comm->lastStream && comm->lastStream != nullptr && !persistent) {
      // Stream changed from last call, create dependency against last NCCL kernel launch.
      // The GOTO form is required here: a plain early return would skip `failure:`
      // and leave the scoped-memory frame pushed above on the stack.
      CUDACHECKGOTO(hipStreamWaitEvent(tasks->streams->stream, comm->doneEvent, 0), result, failure);
    }

    if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) {
      // We have to launch host tasks to push proxy args. We are careful to only
      // do this if necessary since host tasks impose a high performance cost in CUDA.
      bool acquired = false;
      for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) {
        if (plan->hasProxyOps) {
          // Acquire the host stream lazily, only once a plan actually needs it.
          if (!acquired) {
            acquired = true;
            NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure);
          }
          NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure);
        }
      }
      if (acquired) {
        // Make to-be-launched kernels dependent on just-launched host stream tasks.
        NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure);
        NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure);
      }
    }

    if (persistent) {
      // Each captured plan holds a reference until persistentDestructor hands it
      // back and reclaimPlan() drops it.
      comm->persistentRefs += nPlans;
      NCCLCHECKGOTO(ncclCudaGraphAddDestructor(tasks->capturingGraph, persistentDestructor, (void*)planHead), result, failure);
    }
  }

  // The block below is only reachable through `goto failure`.
  if (false) {
  failure:
    ncclMemoryStackPop(&comm->memScoped); // deallocate ncclWork's
  }
  return result;
}

// Pre-launch hook for one plan: uploads the plan's work structs via uploadWork().
// Returns ncclSuccess, or the error propagated from uploadWork() by NCCLCHECK.
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  // This code is called after we've checked in to the intra-process barrier
  // but before launching the kernel. We are not allowed to call CUDA unless the
  // kernel launch is captured.
  NCCLCHECK(uploadWork(comm, plan));
  return ncclSuccess;
}

#if CUDART_VERSION >= 12000
// NCCL uses the "Remote" Mem Sync domain by default.
// Read through ncclParamMemSyncDomain() in ncclLaunchKernel() to fill the
// cudaLaunchAttributeMemSyncDomain launch attribute; only compiled for CUDA runtime
// 12.0 and later.
NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote);
#endif

// Launch the device kernel described by `plan` on the first user stream.
//
// Launch geometry: plan->channelCount blocks of plan->threadPerBlock threads; the
// kernel receives {&comm->devComm, &plan->channelMask, &plan->workHead}.
//
// Fast path: with exactly one user stream and a non-persistent plan the kernel goes
// through hipExtLaunchKernel(), passing comm->doneEvent as its stop-event argument,
// and the stream is remembered in comm->lastStream.
// Otherwise the kernel is launched with cudaLaunchKernelExC() (only when built and run
// against CUDA >= 11.8) or with plain cudaLaunchKernel().
//
// Returns ncclSuccess, or the error reported through CUDACHECK/NCCLCHECK.
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  struct ncclTasks* tasks = &comm->tasks;
  void* fn = plan->kernelFn;
  cudaStream_t launchStream = tasks->streams->stream;
  dim3 grid{(unsigned)plan->channelCount, 1, 1};
  dim3 block{(unsigned)plan->threadPerBlock, 1, 1};
  size_t smem = ncclShmemDynamicSize(comm->cudaArch);
  void* args[3] = {&comm->devComm, &plan->channelMask, &plan->workHead};

  const bool singleStreamEager = (tasks->numStreams == 1) && !plan->persistent;
  if (singleStreamEager) {
    CUDACHECK(hipExtLaunchKernel(fn, grid, block, args, 0, launchStream, NULL, comm->doneEvent, 0));
    comm->lastStream = launchStream;
    return ncclSuccess;
  }

  #if CUDART_VERSION >= 11080
  int driverVersion;
  NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
  if (driverVersion >= 11080) {
    int cc = comm->compCap;
    // Thread block clusters are only requested on compute capability 9.0.
    unsigned int clusterSize = 0;
    if (cc == 90) clusterSize = comm->config.cgaClusterSize;

    cudaLaunchConfig_t launchConfig = {0};
    cudaLaunchAttribute launchAttrs[3];
    int nAttrs = 0;
    /* Cooperative Group Array (CGA)
     * sm90 and later add a level of hierarchy, Thread Block Clusters, that groups
     * several blocks of the grid. Blocks of a cluster are guaranteed to be scheduled
     * concurrently on a group of SMs so they can synchronize and exchange data.
     * The maximum cluster size is 8 and it must divide the grid dimensions.
     */
    if (clusterSize) {
      // Fall back to clusters of one block when the grid is not divisible.
      if (grid.x % clusterSize) clusterSize = 1;
      launchAttrs[nAttrs].id = cudaLaunchAttributeClusterDimension;
      launchAttrs[nAttrs].val.clusterDim = {clusterSize, 1, 1};
      nAttrs++;
      launchAttrs[nAttrs].id = cudaLaunchAttributeClusterSchedulingPolicyPreference;
      launchAttrs[nAttrs].val.clusterSchedulingPolicyPreference = cudaClusterSchedulingPolicySpread;
      nAttrs++;
    }
    #if CUDART_VERSION >= 12000
    if (cc >= 90 && driverVersion >= 12000) {
      // Select the NCCL Mem Sync domain (CUDA 12.0 and later, sm90).
      launchAttrs[nAttrs].id = cudaLaunchAttributeMemSyncDomain;
      launchAttrs[nAttrs].val.memSyncDomain = (cudaLaunchMemSyncDomain) ncclParamMemSyncDomain();
      nAttrs++;
    }
    #endif
    launchConfig.stream = launchStream;
    launchConfig.gridDim = grid;
    launchConfig.blockDim = block;
    launchConfig.dynamicSmemBytes = smem;
    launchConfig.attrs = launchAttrs;
    launchConfig.numAttrs = nAttrs;

    CUDACHECK(cudaLaunchKernelExC(&launchConfig, fn, args));
    return ncclSuccess;
  }
  #endif
  // Plain launch without extra attributes.
  CUDACHECK(cudaLaunchKernel(fn, grid, block, args, smem, launchStream));
  return ncclSuccess;
}

// Post-launch hook for one plan: makes sure its proxy ops are submitted and that a
// non-persistent plan is eventually reclaimed.
// Returns ncclSuccess, or the error from hostStreamPlanTask().
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  // Same condition ncclLaunchPrepare uses to decide whether host-stream tasks are launched.
  const bool hostStreamInUse = plan->persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking;

  if (!hostStreamInUse) {
    // No host stream involved: submit proxy ops and reclamation right here.
    NCCLCHECK(hostStreamPlanTask(comm, plan));
    return ncclSuccess;
  }

  // The host stream handles proxy ops and reclamation, but ncclLaunchPrepare only
  // pushes a callback for plans that have proxy ops. A non-persistent plan without
  // proxy ops therefore has to be queued for reclamation here.
  if (!(plan->persistent || plan->hasProxyOps)) {
    ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer);
  }
  return ncclSuccess;
}

// Finish a launch round: pop the scoped-memory frame pushed by ncclLaunchPrepare(),
// reset the plan queue and the task stream list, re-establish the fan-out stream
// dependencies and release the device stream.
//
// Each stream operation deliberately continues with the next step when it fails (the
// `resumeN` labels directly follow the checks), so the remaining cleanup always runs.
// NOTE(review): if NCCLCHECKGOTO assigns `result` on every call, a later successful
// call overwrites an earlier failure and only the last status is returned — confirm
// against the macro definition.
ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
  ncclResult_t result = ncclSuccess;
  struct ncclTasks* tasks = &comm->tasks;
  bool persistent = ncclCudaGraphValid(tasks->capturingGraph);
  tasks->workBytesTotal = 0; // Just in case subtraction during scheduleCollTasksToPlan() doesn't get to 0

  // Deallocate ncclWork's. This frame exists so long as ncclLaunchPrepare
  // succeeded, and if ncclLaunchPrepare didn't succeed we wouldn't be here.
  ncclMemoryStackPop(&comm->memScoped);

  if (!ncclIntruQueueEmpty(&comm->planQueue)) {
    // Reset queue to empty without destroying plans since those will be sent
    // back to us for reclaiming via callbackQueue.
    ncclIntruQueueConstruct(&comm->planQueue);
    cudaStream_t launchStream = tasks->streams->stream; // First user stream gets launch
    // Create dependency for deviceStream on launchStream. We know that deviceStream
    // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare),
    // so we can say that launchStream subsumes it.
    if (persistent || tasks->numStreams != 1) NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1);
  resume1:
    // Create dependency for other user streams (skip launch stream) on deviceStream.
    // Again, the user streams haven't been touched since deviceStream waited on them
    // so we can say they are subsumed by deviceStream.
    struct ncclCudaStreamList* sl = tasks->streams->next;
    tasks->streams = nullptr; // Reset comm->tasks.streams to empty.
    // Skipped entirely in the single-stream, non-captured case.
    while (sl != nullptr && (tasks->numStreams != 1 || persistent)) {
      NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2);
    resume2:
      sl = sl->next;
    }
    tasks->numStreams = 0;
    // Release device stream as acquired in ncclLaunchPrepare()
    NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, resume3);
  resume3:;
  }
  return result;
}

/*****************************************************************************/
/* Enqueueing system : computation of kernel and proxy operations parameters */
/*****************************************************************************/

// Report whether CollNet can be used for `info`.
// The communicator-wide flag is the starting point; for the reducing
// collectives it is additionally masked by the per-(op, datatype) support
// matrix. Always returns ncclSuccess.
static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetSupport) {
  struct ncclComm* comm = info->comm;
  int support = comm->collNetSupport;

  const bool reduces = info->coll == ncclFuncAllReduce ||
                       info->coll == ncclFuncReduce ||
                       info->coll == ncclFuncReduceScatter;
  if (reduces) {
    // ncclAvg and user-created (PreMulSum) ops are looked up as ncclSum.
    ncclRedOp_t netOp = info->op;
    if (info->op == ncclAvg || info->op >= ncclNumOps) netOp = ncclSum;
    support &= comm->collNetSupportMatrix[netOp][info->datatype];
  }

  *collNetSupport = support;
  return ncclSuccess;
}

// Choose the algorithm/protocol pair for a collective from the topology cost model.
//
// collInfo       : collective being planned; algorithm and protocol are written here.
// collNetSupport : CollNet algorithms are only considered when this is 1.
// nvlsSupport    : NVLS algorithms are only considered when this is 1.
// numPipeOps     : number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency.
//
// Single-rank communicators and ncclFuncAllToAllPivot always get RING/SIMPLE.
// Otherwise, when either field is still undefined, every (algorithm, protocol)
// candidate is timed with ncclTopoGetAlgoTime and the fastest one is kept;
// candidates flagged as "backup" are only used when no regular candidate
// produced a non-negative time.
//
// Returns ncclInternalError when neither a regular nor a backup candidate is
// available; errors from ncclTopoGetAlgoTime are propagated.
static ncclResult_t topoGetAlgoInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps) {
  struct ncclComm* comm = collInfo->comm;
  if (comm->nRanks == 1 || collInfo->coll == ncclFuncAllToAllPivot) {
    collInfo->algorithm = NCCL_ALGO_RING;
    collInfo->protocol = NCCL_PROTO_SIMPLE;
  }
  else if (collInfo->algorithm == NCCL_ALGO_UNDEF || collInfo->protocol == NCCL_PROTO_UNDEF) {
    float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
    float backupMinTime = 3600000000.0;
    bool backup = false;
    int backupAlgo = NCCL_ALGO_UNDEF; // back up algo and proto if no algo/proto is picked up.
    int backupProto = NCCL_PROTO_UNDEF;
    // Find algorithm / protocol. Both fields are reset so that the check after
    // the search can tell whether a regular candidate was found.
    collInfo->algorithm = NCCL_ALGO_UNDEF;
    collInfo->protocol = NCCL_PROTO_UNDEF;
    int nAlgos = NCCL_NUM_ALGORITHMS;
    for (int a=0; a<nAlgos; a++) {
      if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue;
      if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1) continue;
      if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
      /* now we only support single-node NVLS allgather and reducescatter */
      if (a == NCCL_ALGO_NVLS && (collInfo->coll == ncclFuncAllGather || collInfo->coll == ncclFuncReduceScatter) && comm->nNodes > 1) continue;

      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
        // LL128 is only considered on all-XGMI topologies.
        if (p == NCCL_PROTO_LL128 && comm->topo->type != RCCL_TOPO_XGMI_ALL) continue;
        float time;
        NCCLCHECK(ncclTopoGetAlgoTime(collInfo, a, p, numPipeOps, &time, &backup));
        if (!backup) {
          if (time >= 0 && time < minTime) {
            collInfo->algorithm = a;
            collInfo->protocol = p;
            minTime = time;
          }
        } else {
          if (time >= 0 && time < backupMinTime) {
            backupAlgo = a;
            backupProto = p;
            backupMinTime = time;
          }
        }
      }
    }

    if (collInfo->algorithm == NCCL_ALGO_UNDEF || collInfo->protocol == NCCL_PROTO_UNDEF) {
      if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) {
        WARN("Error : no algorithm/protocol available");
        return ncclInternalError;
      }
      collInfo->algorithm = backupAlgo;
      collInfo->protocol = backupProto;
      // Report the time of the pair actually selected; minTime still holds the
      // one-hour sentinel because no regular candidate was accepted.
      minTime = backupMinTime;
    }
    if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", collInfo->nBytes, collInfo->algorithm, collInfo->protocol, minTime);
    TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", collInfo->nBytes, collInfo->algorithm, collInfo->protocol, minTime);
  }

  return ncclSuccess;
}

// Use the default topo-based tuner if tuner plugin is not successful.
// Call the plugin first. Let it set algo+proto, and/or nChannels.
// Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto.
// Finally, nChannels will be overridden by the plugin setting.
static ncclResult_t getTunerInfo(struct ncclInfo* collInfo, int collNetSupport, int nvlsSupport, int numPipeOps) {
  // Begin with nothing decided so that whatever the plugin leaves untouched
  // can be recognised afterwards.
  collInfo->nChannels = 0;
  collInfo->protocol = NCCL_PROTO_UNDEF;
  collInfo->algorithm = NCCL_ALGO_UNDEF;

  auto* plugin = collInfo->comm->tuner;
  if (plugin != NULL) {
    NCCLCHECK(plugin->getCollInfo(
          collInfo->coll, collInfo->nBytes,
          collNetSupport, nvlsSupport, numPipeOps,
          &collInfo->algorithm, &collInfo->protocol, &collInfo->nChannels));
  }

  /* The nChannels decision is only honored when it was set through the tuner
   * plugin or the coll picks a collnet algorithm. In every other case
   * nChannels is decided from the maxBytesPerChannel. */
  collInfo->userTuned = (collInfo->nChannels != 0) ? true : false;
  return ncclSuccess;
}

/* Compute nChannels and nThreads.
 *
 * Reads collInfo->algorithm, protocol, nBytes and nChannels and writes
 * collInfo->nChannels and collInfo->nThreads. For AllReduce on the two
 * topology special cases handled at the end of the function it may also
 * overwrite collInfo->algorithm and collInfo->protocol.
 * Always returns ncclSuccess. */
static ncclResult_t getChannnelThreadInfo(struct ncclInfo* collInfo) {
  struct ncclComm *comm = collInfo->comm;
  // Start from the channel count already stored in collInfo when it is
  // positive, otherwise from the communicator's channel count.
  int nc = (collInfo->nChannels > 0) ? collInfo->nChannels : comm->nChannels;
  int nt = comm->maxThreads[collInfo->algorithm][collInfo->protocol];
  int threadThreshold = comm->threadThresholds[collInfo->algorithm][collInfo->protocol];
  if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
    // CollNet channel tuning
    // Walk nc down towards successive limits 16, 8, 4, 2, 1 for as long as the
    // message is smaller than nc*nt*nHeads*threadThreshold. The threshold is
    // halved each time nc reaches 1.5x the current limit.
    int ncSwitch = 16;
    bool flag = true;
    while (ncSwitch >= 1 && flag) {
      while ((flag = collInfo->nBytes < nc*nt*collInfo->comm->channels[0].collnetDirect.nHeads*threadThreshold) && nc > ncSwitch) {
        if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2;
        nc--;
      }
      ncSwitch /= 2;
    }
  } else if (collInfo->algorithm == NCCL_ALGO_NVLS || collInfo->algorithm == NCCL_ALGO_NVLS_TREE) {
    // NVLS should not need more than 16 channels to get peak BW.
    nc = comm->nvlsChannels;
  } else {
    // Ring/Tree channel tuning
    // Shrink the channel count (down to 1) while the message is smaller than
    // nc*nt*threadThreshold. Only non-HIP builds go on to halve the thread count.
    while (collInfo->nBytes < nc*nt*threadThreshold) {
      if (nc >= 2) nc--;
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
      // do not reduce threads count on VEGA
#else
      else if ((nt % 128) == 0) nt/=2;
#endif
      else break;
    }
  }
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  // HIP builds keep nt as computed above (no extra sync warps, no minimum).
#else
  if (collInfo->protocol == NCCL_PROTO_SIMPLE) {
    if (collInfo->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync
    // More threads or sync warps needed due to split thread model
    if (collInfo->algorithm == NCCL_ALGO_TREE) nt += 4*WARP_SIZE;
  }
  // Never launch fewer than three warps.
  nt = nt/WARP_SIZE < 3 ? 3*WARP_SIZE : nt;
#endif
  if (collInfo->coll == ncclFuncAllReduce && comm->topo->pivotA2ANumBiRings == 3) {
    // Topology-specific AllReduce override. The environment is inspected only
    // on the first call; the answer is cached in a function-local static
    // (-2 = not yet checked, 0 = neither variable set, 1 = at least one set).
    static int userTuneInput = -2;
    if (userTuneInput == -2) {
      const char *protoStr = getenv("NCCL_PROTO");
      const char *algoStr = getenv("NCCL_ALGO");
      if (!protoStr && !algoStr)
        userTuneInput = 0;
      else
        userTuneInput = 1;
    }
    collInfo->nChannels = nc;
    if (!userTuneInput) {
      // always respect user settings
      // (this branch only runs when neither NCCL_PROTO nor NCCL_ALGO is set)
      if (collInfo->nBytes <= 2200008) {
        // Small messages: LL tree, capped at 24 channels.
        collInfo->protocol = NCCL_PROTO_LL;
        collInfo->algorithm = NCCL_ALGO_TREE;
        collInfo->nChannels = std::min(24, comm->nChannels);
      } else {
        // Larger messages: Simple ring with the channel count computed above.
        collInfo->protocol = NCCL_PROTO_SIMPLE;
        collInfo->algorithm = NCCL_ALGO_RING;
      }
    }
  } else if (collInfo->coll == ncclFuncAllReduce && comm->topo->treeDefined == 1) {
    // A tree is defined in the topology: force AllReduce onto the tree
    // algorithm. The protocol is left unchanged.
    collInfo->algorithm = NCCL_ALGO_TREE;
    collInfo->nChannels = nc;
  } else {
    collInfo->nChannels = nc;
  }
  collInfo->nThreads = nt;
  return ncclSuccess;
}

// Derive collInfo->pattern from the collective and the selected algorithm.
// Returns ncclInternalError (after a WARN) for a collective that has no
// pattern mapping here, ncclSuccess otherwise.
static ncclResult_t getPatternInfo(struct ncclInfo* collInfo) {
  const auto algo = collInfo->algorithm;

  switch (collInfo->coll) {
    case ncclFuncBroadcast:
      if (algo == NCCL_ALGO_TREE) collInfo->pattern = ncclPatternTreeDown;
      else collInfo->pattern = ncclPatternPipelineFrom;
      break;

    case ncclFuncReduce:
      if (algo == NCCL_ALGO_TREE) collInfo->pattern = ncclPatternTreeUp;
      else collInfo->pattern = ncclPatternPipelineTo;
      break;

    case ncclFuncReduceScatter:
    case ncclFuncAllGather:
      if (algo == NCCL_ALGO_NVLS) collInfo->pattern = ncclPatternNvls;
      else if (algo == NCCL_ALGO_COLLNET_DIRECT) collInfo->pattern = ncclPatternCollnetDirect;
      else collInfo->pattern = ncclPatternRing;
      break;

    case ncclFuncAllToAllPivot:
      collInfo->pattern = ncclPatternRing;
      break;

    case ncclFuncAllReduce:
      // Anything that is not NVLS, CollNet or Tree falls back to the
      // two-pass ring pattern.
      if (algo == NCCL_ALGO_NVLS) collInfo->pattern = ncclPatternNvls;
      else if (algo == NCCL_ALGO_NVLS_TREE) collInfo->pattern = ncclPatternNvlsTree;
      else if (algo == NCCL_ALGO_COLLNET_DIRECT) collInfo->pattern = ncclPatternCollnetDirect;
      else if (algo == NCCL_ALGO_COLLNET_CHAIN) collInfo->pattern = ncclPatternCollnetChain;
      else if (algo == NCCL_ALGO_TREE) collInfo->pattern = ncclPatternTreeUpDown;
      else collInfo->pattern = ncclPatternRingTwice;
      break;

    default:
      WARN("Unknown pattern for collective %d algorithm %d", collInfo->coll, collInfo->algorithm);
      return ncclInternalError;
  }
  return ncclSuccess;
}

// Provides rcclParamIntraNetThreshold(). In this file, Simple-protocol ring
// collectives whose nBytes exceeds this value are routed through
// NCCL_CONN_IDX_P2P_NET when comm->useIntraNet is set.
// NOTE(review): 8388608 (8 MiB) is presumably the default and the string the
// environment-variable suffix; confirm against the RCCL_PARAM macro.
RCCL_PARAM(IntraNetThreshold, "INTRANET_THRESHOLD", 8388608);

// Resolve the device function index for this collective and store it in
// collInfo->workFuncIndex. A negative index means no matching device function
// exists and is reported as ncclInvalidUsage.
static ncclResult_t computeCollWorkFunc(struct ncclInfo* collInfo) {
  collInfo->workFuncIndex = ncclDevFuncId(collInfo->coll,
                                          collInfo->opFull.op,
                                          collInfo->datatype,
                                          collInfo->algorithm,
                                          collInfo->protocol);
  const bool found = !(collInfo->workFuncIndex < 0);
  if (found) return ncclSuccess;

  WARN("%s: unsupported collective. Please ensure the collective has been enabled in build.", __func__);
  return ncclInvalidUsage;
}

// Fill the per-collective fields of a device work element from collInfo.
// The per-channel fields (workCount/workOffset/lastChunkCount) are set
// separately by setCollWorkElem. Always returns ncclSuccess.
static ncclResult_t initCollWorkElem(struct ncclInfo* collInfo, struct ncclWorkElem* work) {
  struct ncclComm* comm = collInfo->comm;
  const bool ringSimple = collInfo->protocol == NCCL_PROTO_SIMPLE && collInfo->algorithm == NCCL_ALGO_RING;

  // Bookkeeping flags.
  work->isUsed = 1;
  work->regUsed = 0;
  work->opCount = comm->opCount;

  // Buffers and sizes.
  work->sendbuff = collInfo->sendbuff;
  work->recvbuff = collInfo->recvbuff;
  work->count = collInfo->count;
  work->chunkCount = collInfo->chunkCount;
  work->root = collInfo->root;
  work->nWarps = collInfo->nThreads / WARP_SIZE;

  // Reduction operator argument.
  work->redOpArg = collInfo->opFull.scalarArg;
  work->redOpArgIsPtr = collInfo->opFull.scalarArgIsPtr;

  // Simple-protocol rings above the intra-net threshold use the P2P-net
  // connection when the communicator has intra-node networking enabled.
  const bool viaNet = ringSimple && comm->useIntraNet && collInfo->nBytes > rcclParamIntraNetThreshold();
  work->connIndex = viaNet ? NCCL_CONN_IDX_P2P_NET : 0;

  work->oneNode = (comm->nNodes == 1) ? 1 : 0;

  if (collInfo->algorithm != NCCL_ALGO_COLLNET_DIRECT) {
    work->direct = 0;
  } else if (collInfo->nBytes / collInfo->nChannels <= 1024 * 1024) {
    // Set direct direction for broadcast-gather (read or write)
    work->direct = NCCL_DIRECT_WRITE;
  } else {
    work->direct = NCCL_DIRECT_READ;
  }
  return ncclSuccess;
}

// Record the slice of the collective assigned to one work element:
// where it starts (workOffset), how many elements it covers (workCount),
// and the size of its final chunk. Always returns ncclSuccess.
static ncclResult_t setCollWorkElem(uint64_t workCount, uint64_t workOffset, size_t lastChunkCount, struct ncclWorkElem* work) {
  struct ncclWorkElem& elem = *work;
  elem.workOffset = workOffset;
  elem.workCount = workCount;
  elem.lastChunkCount = lastChunkCount;
  return ncclSuccess;
}

// Build a registered-buffer work element from a plain one.
// `work` is copied into workElemReg->elem with regUsed set, then the peer
// buffer tables are filled according to the registration type:
//   NCCL_IPC_REG_BUFFER  - one entry per collnetDirect down/up peer, looked up
//                          through the peer's local rank;
//   NCCL_NVLS_REG_BUFFER - a single send/recv pair in slot 0.
// Any other type is rejected with ncclInvalidArgument and workElemReg is left
// untouched.
static ncclResult_t initCollWorkElemReg(struct ncclComm* comm, struct ncclWorkElem* work, struct ncclChannel* channel, ncclRegBufferType regBufType, void* regBufSend[], void* regBufRecv[], struct ncclWorkElemReg* workElemReg) {
  if (regBufType != NCCL_IPC_REG_BUFFER && regBufType != NCCL_NVLS_REG_BUFFER) {
    /* impossible value */
    WARN("Invalid regBufType %d\n", regBufType);
    return ncclInvalidArgument;
  }

  workElemReg->elem = *work;
  workElemReg->elem.regUsed = 1;

  if (regBufType == NCCL_NVLS_REG_BUFFER) {
    /* NVLS only has one send and recv buffer registered */
    workElemReg->dnInputs[0] = regBufSend[0];
    workElemReg->dnOutputs[0] = regBufRecv[0];
    return ncclSuccess;
  }

  // IPC registration: the peer lists are terminated by -1.
  for (int d = 0; d < NCCL_MAX_DIRECT_ARITY && channel->collnetDirect.down[d] != -1; d++) {
    const int slot = comm->rankToLocalRank[channel->collnetDirect.down[d]]; // Get intra-node slot
    workElemReg->dnInputs[d] = regBufSend[slot];  // Input buffer of leaf peer
    workElemReg->dnOutputs[d] = regBufRecv[slot]; // Output buffer of leaf peer
  }
  for (int u = 0; u < NCCL_MAX_DIRECT_ARITY && channel->collnetDirect.up[u] != -1; u++) {
    const int slot = comm->rankToLocalRank[channel->collnetDirect.up[u]];
    workElemReg->upOutputs[u] = regBufRecv[slot]; // Output buffer of root peer
  }
  return ncclSuccess;
}

// Provides ncclParamNvlsTreeChunkSize(), the upper bound on the NVLS_TREE
// chunk size used by computeCollChunkInfo. The value -2 is treated there as
// "not set" and replaced by a node-count-dependent limit.
// NOTE(review): -2 is presumably the default when the variable is absent;
// confirm against the NCCL_PARAM macro.
NCCL_PARAM(NvlsTreeChunkSize, "NVLSTREE_MAX_CHUNKSIZE", -2);

// Compute the chunk geometry for a collective and store it in collInfo
// (chunkSize, chunkCount, chunkSteps, sliceSteps, stepSize).
//
// collInfo  : algorithm, protocol, pattern, datatype and comm are read; the
//             five fields listed above are written.
// nBytes    : message size used by the CollNet/NVLS/Tree-LL128 heuristics.
// nChannels : channel count used by the same heuristics.
//
// The chunk starts at stepSize*chunkSteps for the protocol's buffer and is
// then shrunk by algorithm-specific heuristics for small messages.
// Always returns ncclSuccess.
//
// NOTE(review): the Tree/Simple and Ring/Simple branches read
// collInfo->nBytes and collInfo->nChannels, whereas every other branch uses
// the nBytes/nChannels arguments - confirm callers keep the two in sync.
static ncclResult_t computeCollChunkInfo(struct ncclInfo* collInfo, size_t nBytes, int nChannels) {
  int stepSize = collInfo->comm->buffSizes[collInfo->protocol] / NCCL_STEPS;
  // Only Simple-protocol rings use the caller-provided chunk/slice step
  // counts; every other combination uses one step per chunk and per slice.
  int chunkSteps = (collInfo->protocol == NCCL_PROTO_SIMPLE && collInfo->algorithm == NCCL_ALGO_RING) ? collInfo->chunkSteps : 1;
  int sliceSteps = (collInfo->protocol == NCCL_PROTO_SIMPLE && collInfo->algorithm == NCCL_ALGO_RING) ? collInfo->sliceSteps : 1;
  int chunkSize = stepSize * chunkSteps;

  // Protocol adjustments: LL uses half of the step; LL128 is scaled by the
  // ratio of data elements to line elements.
  if (collInfo->protocol == NCCL_PROTO_LL) chunkSize /= 2;
  if (collInfo->protocol == NCCL_PROTO_LL128) chunkSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;

  if (collInfo->algorithm == NCCL_ALGO_TREE && collInfo->protocol == NCCL_PROTO_SIMPLE) {
    if (collInfo->pattern == ncclPatternTreeUpDown) {
      // Optimize chunkSize / nSteps
      // Halve the chunk (never below 32 KiB) while the number of chunks per
      // channel is small relative to the tree depth.
      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < collInfo->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < collInfo->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < collInfo->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
    }
  } else if (collInfo->algorithm == NCCL_ALGO_RING && collInfo->protocol == NCCL_PROTO_SIMPLE) {
    if (collInfo->pattern == ncclPatternPipelineFrom || collInfo->pattern == ncclPatternPipelineTo) {
      // Optimize chunkSize / nSteps
      // Pipeline patterns: halve the chunk (never below 32 KiB) until each
      // channel has enough chunks at the current size.
      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < 64 && chunkSize > 262144) chunkSize /= 2;
      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < 32 && chunkSize > 131072) chunkSize /= 2;
      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < 16 && chunkSize > 65536) chunkSize /= 2;
      while (collInfo->nBytes / (collInfo->nChannels*chunkSize) < 8 && chunkSize > 32768) chunkSize /= 2;
    }
  } else if (collInfo->algorithm == NCCL_ALGO_COLLNET_DIRECT) {
    // Optimize chunkSize / nSteps
    // Same scheme, with the work also divided across the collnetDirect heads.
    while (nBytes / (nChannels * collInfo->comm->channels[0].collnetDirect.nHeads * chunkSize) < collInfo->comm->channels[0].collnetDirect.depth * 64 && chunkSize > 131072) chunkSize /= 2;
    while (nBytes / (nChannels * collInfo->comm->channels[0].collnetDirect.nHeads * chunkSize) < collInfo->comm->channels[0].collnetDirect.depth * 8 && chunkSize > 65536) chunkSize /= 2;
    while (nBytes / (nChannels * collInfo->comm->channels[0].collnetDirect.nHeads * chunkSize) < collInfo->comm->channels[0].collnetDirect.depth * 8 && chunkSize > 32768) chunkSize /= 2;
  } else if (collInfo->algorithm == NCCL_ALGO_COLLNET_CHAIN) {
    // CollNet chain always sizes steps from the Simple-protocol buffer and
    // caps the initial chunk at 256 KiB.
    stepSize = collInfo->comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
    chunkSize = std::min(256 * 1024, stepSize * chunkSteps);
    while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth * 64 && chunkSize > 131072) chunkSize /= 2;
    while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2;
    while (nBytes / (nChannels * chunkSize) < collInfo->comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2;
  } else if (collInfo->algorithm == NCCL_ALGO_NVLS) {
    // Cap at 128 KiB, or 32 KiB on multi-node runs whose AllReduce NVLS
    // bandwidth entry is below 150.
    int maxChunkSize = 131072;
    if (collInfo->comm->nNodes > 1 && collInfo->comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768;
    if (chunkSize > maxChunkSize) chunkSize = maxChunkSize;
    // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
    uint64_t concurrentOps = nChannels * collInfo->comm->channels[0].nvls.nHeads;
    // Step the chunk down to fixed sizes (64K, 32K, 16K) for small messages.
    if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
    if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
    if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384;
  } else if (collInfo->algorithm == NCCL_ALGO_NVLS_TREE) {
    // Use uint64_t so that concurrentOps*chunkSize*X does not overflow
    uint64_t concurrentOps = nChannels * collInfo->comm->channels[0].nvls.nHeads;
    // -2 means the parameter was left unset: use 64 KiB on 4 or more nodes,
    // otherwise leave the chunk size uncapped.
    int maxChunkSize = ncclParamNvlsTreeChunkSize();
    if (maxChunkSize == -2) maxChunkSize = collInfo->comm->nNodes >= 4 ? 65536 : chunkSize;
    chunkSize = std::min(chunkSize, maxChunkSize);
    if ((nBytes < (32 * (concurrentOps * chunkSize))) && (chunkSize > 262144)) chunkSize = 262144;
    if ((nBytes < (16 * (concurrentOps * chunkSize))) && (chunkSize > 131072)) chunkSize = 131072;
    if ((nBytes < (4 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536;
    if ((nBytes < (1 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768;
  } else if (collInfo->algorithm == NCCL_ALGO_TREE && collInfo->protocol == NCCL_PROTO_LL128) {
    // Tree/LL128: the target number of chunks per channel is derived from an
    // estimated step count (1 + log2(nNodes) + 0.1*ppn) scaled by the
    // ranks-per-node ratio.
    int nNodes = collInfo->comm->nNodes;
    float ppn = collInfo->comm->nRanks / (float)nNodes;
    float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn;
    while (nBytes / (nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
    while (nBytes / (nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
  }

  // Publish the result; chunkCount is the chunk size expressed in elements.
  collInfo->chunkSize = chunkSize;
  collInfo->chunkCount = chunkSize / ncclTypeSize(collInfo->datatype);
  collInfo->chunkSteps = chunkSteps;
  collInfo->sliceSteps = sliceSteps;
  collInfo->stepSize = stepSize;
  return ncclSuccess;
}

// Populate a proxy operation descriptor for one channel of a collective.
// Everything except channelId, opCount and nsteps is taken from collInfo.
// Always returns ncclSuccess.
static ncclResult_t initCollProxyOp(struct ncclInfo* collInfo, int channelId, uint64_t opCount, uint32_t nsteps, struct ncclProxyOp* proxyOp) {
  struct ncclComm* comm = collInfo->comm;

  // Identity of the operation.
  proxyOp->channelId = channelId;
  proxyOp->opCount = opCount;
  proxyOp->coll = collInfo->coll;
  proxyOp->root = collInfo->root;
  proxyOp->pattern = collInfo->pattern;
  proxyOp->protocol = collInfo->protocol;
  proxyOp->dtype = collInfo->datatype;
  proxyOp->reg = 0;

  // Step and chunk geometry.
  proxyOp->nsteps = nsteps;
  proxyOp->chunkSize = collInfo->chunkSize;
  proxyOp->chunkSteps = collInfo->chunkSteps;
  proxyOp->sliceSteps = collInfo->sliceSteps;
  // This is used by P2P to reduce the receive buffer size. We don't use it in collectives
  // because some protocols need to transmit more than the total size, plus they sometimes
  // round up
  proxyOp->nbytes = collInfo->stepSize * proxyOp->sliceSteps;

  // Network sees avg as sum
  const bool scaledSum = collInfo->opFull.op == ncclDevPreMulSum || collInfo->opFull.op == ncclDevSumPostDiv;
  if (scaledSum) {
    proxyOp->redOp = ncclSum;
  } else {
    proxyOp->redOp = collInfo->opFull.proxyOp;
  }

  // Simple-protocol rings above the intra-net threshold go through the
  // P2P-net connection when intra-node networking is enabled.
  const bool ringSimple = collInfo->protocol == NCCL_PROTO_SIMPLE && collInfo->algorithm == NCCL_ALGO_RING;
  const bool viaNet = ringSimple && comm->useIntraNet && collInfo->nBytes > rcclParamIntraNetThreshold();
  proxyOp->connIndex = viaNet ? NCCL_CONN_IDX_P2P_NET : 0;

  if (collInfo->pattern != ncclPatternCollnetDirect) return ncclSuccess;

  // CollNet-direct specifics.
  proxyOp->specifics.collnetDirect.nNodes = comm->nNodes;
  proxyOp->specifics.collnetDirect.node = comm->node;
  const bool perRankSized = collInfo->coll == ncclFuncAllGather || collInfo->coll == ncclFuncReduceScatter;
  if (perRankSized) {
    proxyOp->specifics.collnetDirect.sizePerRank = collInfo->count * ncclTypeSize(collInfo->datatype);
  }
  return ncclSuccess;
}

// Translate a host-side reduction operator into the device representation.
//
// opFull   : output; op, scalarArg, scalarArgIsPtr and proxyOp are filled in.
// op       : builtin operator or a handle from ncclRedOpCreatePreMulSum.
// datatype : element type the reduction will run on.
// comm     : supplies nRanks (for ncclAvg) and the user-op table.
//
// Returns ncclInvalidArgument when a user-created op was registered for a
// different datatype, ncclSuccess otherwise.
static ncclResult_t hostToDevRedOp(
    ncclDevRedOpFull *opFull, ncclRedOp_t op, ncclDataType_t datatype, ncclComm *comm
  ) {
  // Anonymous union: all members share storage, so a scalar written through
  // its typed member is read back as raw bits through u64 further down.
  union {
    int8_t i8;
    uint8_t u8;
    int32_t i32;
    uint32_t u32;
    int64_t i64;
    uint64_t u64;
    half f16;
    float f32;
    double f64;
#if defined(RCCL_BFLOAT16)
    hip_bfloat16 bf16;
#endif
#if defined(RCCL_FLOAT8)
    rccl_float8 fp8_e4m3;
    rccl_bfloat8 fp8_e5m2;
#endif
    void *ptr;
  };
  // Clear all 64 bits first so that narrower members leave the rest zero.
  u64 = 0;
  opFull->scalarArgIsPtr = false;
  opFull->proxyOp = op;

  // allBits has the low `nbits` bits set; signBit is the top bit of that width.
  int nbits = 8*ncclTypeSize(datatype);
  uint64_t allBits = uint64_t(-1)>>(64-nbits);
  uint64_t signBit = allBits^(allBits>>1);

  switch (int(op)) {
  // Sum and Prod carry no scalar argument; scalarArg is not written for them.
  case ncclSum:  opFull->op = ncclDevSum;  break;
  case ncclProd: opFull->op = ncclDevProd; break;
  case ncclMin:
  case ncclMax:
    opFull->op = ncclDevMinMax;
    opFull->scalarArg = 0;
    // The xormask used by ncclFuncMinMax<[u]int> is the XOR of the sign bit
    // for signed (opposed to unsigned) types and all the bits for max (opposed to min).
    if (datatype==ncclInt8 || datatype==ncclInt32 || datatype==ncclInt64) {
      opFull->scalarArg ^= signBit;
    }
    opFull->scalarArg ^= (op == ncclMax) ? allBits : 0;
    break;
  case ncclAvg:
    // Integers: sum, then divide by nRanks. Floating point: multiply each
    // input by 1/nRanks, then sum.
    // NOTE(review): there is no default case, so a datatype not listed here
    // leaves opFull->op unassigned - presumably ruled out by argument checks.
    switch ((int)datatype) {
    case ncclInt8:  case ncclInt32:  case ncclInt64:
    case ncclUint8: case ncclUint32: case ncclUint64:
      opFull->op = ncclDevSumPostDiv;
      u64 = comm->nRanks;
      break;
    case ncclFloat16:
      opFull->op = ncclDevPreMulSum;
      f16 = __float2half(float(1.0/comm->nRanks)); // __double2half not supported pre CUDA 11.x
      break;
#if defined(RCCL_BFLOAT16)
    case ncclBfloat16:
      opFull->op = ncclDevPreMulSum;
      bf16 = (hip_bfloat16)(float(1.0/comm->nRanks));
      break;
#endif
#if defined(RCCL_FLOAT8)
    case ncclFp8E4M3:
      opFull->op = ncclDevPreMulSum;
      fp8_e4m3 = static_cast<rccl_float8>(float(1.0/comm->nRanks));
      break;
    case ncclFp8E5M2:
      opFull->op = ncclDevPreMulSum;
      fp8_e5m2 = static_cast<rccl_bfloat8>(float(1.0/comm->nRanks));
      break;
#endif
    case ncclFloat32:
      opFull->op = ncclDevPreMulSum;
      f32 = float(1.0/comm->nRanks);
      break;
    case ncclFloat64:
      opFull->op = ncclDevPreMulSum;
      f64 = 1.0/comm->nRanks;
      break;
    }
    // Hand the scalar over as an immediate: the raw bits of the union.
    opFull->scalarArgIsPtr = false;
    opFull->scalarArg = u64;
    break;
  default: // user created
    // Undo the per-communicator mangling to get the index into userRedOps.
    int ix = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps);
    ncclUserRedOp *user = &comm->userRedOps[ix];
    if (datatype != user->datatype) {
      WARN("Data type supplied to user-created ncclRedOp_t does not match type "
           "given to reduction operation");
      return ncclInvalidArgument;
    }
    // Whole-struct copy: this also replaces the proxyOp and scalarArgIsPtr
    // values assigned at the top of the function.
    *opFull = user->opFull;
    break;
  }
  return ncclSuccess;
}

// Ordering used when inserting collectives into the sorted task queue.
// Tasks are compared by collective, then datatype, then device reduction op,
// then element count. Returns 1 when `a` ranks strictly after `b` on the
// first differing key, and -1 otherwise (including when all keys are equal).
static int collCmp(struct ncclInfo *a, struct ncclInfo *b) {
  if (a->coll != b->coll) {
    return (a->coll > b->coll) ? 1 : -1;
  }
  if (a->datatype != b->datatype) {
    return (a->datatype > b->datatype) ? 1 : -1;
  }
  if (a->opFull.op != b->opFull.op) {
    return (a->opFull.op > b->opFull.op) ? 1 : -1;
  }
  return (a->count > b->count) ? 1 : -1;
}

// Converts `info` to a task and adds it to `comm->tasks`. The exception is with
// single rank communicators, collectives are issued as `ncclMemcpyAsync`s and
// thus don't need a task.
//
// Three paths:
//   - zero-count collectives return immediately (Send/Recv are still queued);
//   - Send/Recv are queued per peer and the channels that still need a
//     connection are flagged for pre-connect;
//   - other collectives are copied into comm->memScoped and inserted into the
//     sorted collective queue.
// Finally the user stream is recorded in tasks->streams if it has not been
// seen yet. Returns the first error from a helper, or ncclInvalidUsage when
// the streams of one group mix different capture graphs.
static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
  ncclTasks *tasks = &comm->tasks;

  // Nothing to do for an empty collective; point-to-point ops are queued even when empty.
  if (info->count == 0 && info->coll != ncclFuncSend && info->coll != ncclFuncRecv) return ncclSuccess;
  if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) {
    // For Send/Recv the peer rank is carried in info->root.
    int peer = info->root;
    ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
    bool isSendNotRecv = info->coll == ncclFuncSend;

    // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
    ncclGroupCommJoin(info->comm);
    struct ncclTaskP2p* p2p = ncclMemoryStackAlloc<struct ncclTaskP2p>(&comm->memScoped);
    // The buffer is taken from info->recvbuff for sends and receives alike.
    p2p->buff = (void*)info->recvbuff;
    p2p->bytes = nBytes;
    p2p->chunk = 0;
    ncclIntruQueueEnqueue(
      isSendNotRecv ? &tasks->peers[peer].sendQueue : &tasks->peers[peer].recvQueue,
      p2p);
    tasks->nTasksP2p += 1;

    // Mark channels that need pre-connect
    if (comm->rank != peer) {
      int channelBaseId;
      NCCLCHECK(ncclChannelComputeBase(comm, peer, info->coll, &channelBaseId));
      // Only the first send (resp. recv) to a peer in this group does the scan.
      if (!(isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen)) {
        (isSendNotRecv ? tasks->peers[peer].sendSeen : tasks->peers[peer].recvSeen) = true;
        for (int c=0; c < comm->p2pnChannelsPerPeer; c++) {
          int channelId;
          NCCLCHECK(ncclChannelComputeFromBase(comm, channelBaseId, c, &channelId));
          // The connect masks are arrays of 64-bit words: word channelId/64,
          // bit channelId%64. The P2P-net connector uses a second bank of
          // masks offset by nRanks*NCCL_CONN_IDX_P2P_NET.
          if (isSendNotRecv) {
            if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector
              //comm->connectSend[peer] |= (1UL<<channelId);
	      comm->connectSend[peer].masks[channelId/64] |= (1UL<<(channelId%64));
              ncclGroupCommPreconnect(comm);
            }
            if (comm->p2pNet && comm->channels[channelId].peers[peer]->send[NCCL_CONN_IDX_P2P_NET].connected == 0) {
              //comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
	      comm->connectSend[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
              ncclGroupCommPreconnect(comm);
            }
          } else {
            if (comm->channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector
              //comm->connectRecv[peer] |= (1UL<<channelId);
	      comm->connectRecv[peer].masks[channelId/64] |= (1UL<<(channelId%64));
              ncclGroupCommPreconnect(comm);
            }
            if (comm->p2pNet && comm->channels[channelId].peers[peer]->recv[NCCL_CONN_IDX_P2P_NET].connected == 0) {
              //comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET] |= (1UL<<channelId);
	      comm->connectRecv[peer+comm->nRanks*NCCL_CONN_IDX_P2P_NET].masks[channelId/64] |= (1UL<<(channelId%64));
              ncclGroupCommPreconnect(comm);
            }
          }
        }
      }
    }
  } else {
    // Copy reduction op state from op handle into info struct here since the
    // op handle may be destroyed before ncclGroupEnd().
    NCCLCHECK(hostToDevRedOp(&info->opFull, info->op, info->datatype, comm));

    if (comm->nRanks == 1) {
      // Single rank: launched directly. This returns before the stream
      // bookkeeping below, so the stream is not recorded in tasks->streams.
      NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, info->opFull, info->datatype, info->stream));
      return ncclSuccess;
    } else {
      // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
      ncclGroupCommJoin(info->comm);
      struct ncclInfo* t = ncclMemoryStackAlloc<struct ncclInfo>(&comm->memScoped);
      // Reset the tuning fields on the caller's info before taking the copy.
      info->nChannels = 0;
      info->nThreads = 0;
      info->algorithm = NCCL_ALGO_UNDEF;
      info->protocol = NCCL_PROTO_UNDEF;
      info->userTuned = false;
      memcpy(t, info, sizeof(struct ncclInfo));
      // Inserted in collCmp order.
      ncclIntruQueueSortEnqueue(&tasks->collQueue, t, collCmp);
      tasks->workBytesTotal += info->count * ncclTypeSize(info->datatype);
      tasks->nTasksColl += 1;
    }
  }

  // Record the user stream. streamRecent short-circuits the common case of
  // consecutive operations on the same stream.
  if (info->stream != tasks->streamRecent || tasks->streams == nullptr) {
    tasks->streamRecent = info->stream;
    struct ncclCudaStreamList* l = tasks->streams;
    while (true) {
      if (l == nullptr) { // Got to the end, this must be a new stream.
        struct ncclCudaGraph graph;
        NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream))
        if (tasks->streams != nullptr && !ncclCudaGraphSame(tasks->capturingGraph, graph)) {
          WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph.");
          return ncclInvalidUsage;
        }
        tasks->capturingGraph = graph; // C++ struct assignment
        // Add stream to list
        // (pushed at the head, so the list is in reverse order of first use)
        l = ncclMemoryStackAlloc<struct ncclCudaStreamList>(&comm->memScoped);
        l->stream = info->stream;
        l->next = tasks->streams;
        tasks->streams = l;
        tasks->numStreams++;
        break;
      }
      if (l->stream == info->stream)
        break; // Already seen stream.
      l = l->next;
    }
  }
  return ncclSuccess;
}

// Validate one collective / point-to-point request and append it to the
// communicator's task list.
//
// The body is wrapped in an internal group start/end pair. Validation
// failures jump to `fail`, which records the error on non-blocking
// communicators and then goes through the common `exit` path.
// Returns the validation/append result; for non-blocking communicators the
// result is replaced by the communicator's async error state after the group
// has ended.
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
  NCCLCHECK(ncclGroupStartInternal());
  ncclResult_t ret = ncclSuccess;
  // -1 means the current device was not changed and needs no restore.
  int devOld = -1;

  NCCLCHECKGOTO(PtrCheck(info->comm, info->opName, "comm"), ret, fail);
  // Check whether communicator is ready to communicate
  NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail);

  // With pointer checking enabled, switch to the communicator's device
  // before the argument checks; the previous device is restored at `exit`.
  if (info->comm->checkPointers) {
    CUDACHECKGOTO(cudaGetDevice(&devOld), ret, fail);
    CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, fail);
  }
  NCCLCHECKGOTO(ArgsCheck(info), ret, fail);

  INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p task %d globalrank %d",
      info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
      info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream,
      info->comm->tasks.nTasksP2p + info->comm->tasks.nTasksColl,
      info->comm->localRankToRank[info->comm->localRank]);
  TRACE_CALL("nccl%s(%" PRIx64 ",%" PRIx64 ",%zi,%d,%d,%d,%p,%p)", info->opName, reinterpret_cast<int64_t>(info->sendbuff), reinterpret_cast<int64_t>(info->recvbuff), info->count, info->datatype, info->op, info->root, info->comm, info->stream);

  NCCLCHECKGOTO(taskAppend(info->comm, info), ret, fail);

exit:
  // NOTE(review): CUDACHECK/NCCLCHECK presumably return from the function on
  // failure; if so, a failing device restore here would skip
  // ncclGroupEndInternal() - confirm against the macro definitions.
  if (devOld != -1) CUDACHECK(cudaSetDevice(devOld));
  ncclGroupErrCheck(ret);
  NCCLCHECK(ncclGroupEndInternal());
  /* if depth is 1, ncclGroupEndInternal() will trigger group ops. The state can change
   * so we have to check state here. */
  if (info->comm && !info->comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(info->comm, &ret)) };
  return ret;
fail:
  // info->comm may be NULL here (the PtrCheck above can be what failed).
  if (info->comm && !info->comm->config.blocking) (void) ncclCommSetAsyncError(info->comm, ret);
  goto exit;
}

NCCL_API(ncclResult_t, ncclRedOpCreatePreMulSum, ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
// Create a user-defined "pre-multiply then sum" reduction operator.
//
// op        : output; receives the (communicator-mangled) operator handle.
// scalar    : the multiplier. With ncclScalarHostImmediate it is read here,
//             once, as one element of `datatype`; with any other residence
//             the pointer itself is stored and dereferenced later.
// datatype  : element type the operator will be used with.
// residence : where `scalar` lives (see above).
// comm      : communicator that owns the operator.
//
// Returns ncclInvalidArgument for a null comm/op/scalar or for a datatype
// whose size does not fit the 64-bit immediate slot; errors from
// ncclCommEnsureReady are propagated. No communicator state is modified when
// an error is returned.
ncclResult_t ncclRedOpCreatePreMulSum_impl(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) {
  NCCLCHECK(PtrCheck(comm, "ncclRedOpCreatePreMulSum", "comm"));
  /* join init thread before creating PreMulSum op. */
  NCCLCHECK(ncclCommEnsureReady(comm));
  // Both pointers are dereferenced (or later handed to the device) below.
  NCCLCHECK(PtrCheck(op, "ncclRedOpCreatePreMulSum", "op"));
  NCCLCHECK(PtrCheck(scalar, "ncclRedOpCreatePreMulSum", "scalar"));

  // Validate the immediate's size before touching the free list so that a
  // bad datatype cannot overrun the 64-bit scalarArg or leak a slot.
  int scalarBytes = 0;
  if (residence == ncclScalarHostImmediate) {
    scalarBytes = ncclTypeSize(datatype);
    if (scalarBytes < 1 || scalarBytes > (int)sizeof(uint64_t)) {
      WARN("ncclRedOpCreatePreMulSum : invalid datatype %d", (int)datatype);
      return ncclInvalidArgument;
    }
  }

  if (comm->userRedOpFreeHead == comm->userRedOpCapacity) {
    // double capacity and resize
    int cap = 2*comm->userRedOpCapacity;
    if (cap < 4) cap = 4;
    ncclUserRedOp *ops = new ncclUserRedOp[cap];
    // Skip the copy when there is nothing to copy: the old table may not have
    // been allocated yet, and memcpy must not be given a null source.
    if (comm->userRedOpCapacity > 0) {
      std::memcpy(ops, comm->userRedOps, comm->userRedOpCapacity*sizeof(ncclUserRedOp));
    }
    // Chain the new slots into the free list; the last one points at `cap`,
    // which is the "list exhausted" marker tested above.
    for(int ix=comm->userRedOpCapacity; ix < cap; ix++)
      ops[ix].freeNext = ix + 1;
    delete[] comm->userRedOps;
    comm->userRedOps = ops;
    comm->userRedOpCapacity = cap;
  }
  // pop from free list
  int ix = comm->userRedOpFreeHead;
  ncclUserRedOp *user = &comm->userRedOps[ix];
  comm->userRedOpFreeHead = user->freeNext;

  user->freeNext = -1; // allocated
  user->datatype = datatype;
  user->opFull.op = ncclDevPreMulSum;
  // hostToDevRedOp copies opFull wholesale, so give every field a defined
  // value. The proxy treats pre-multiplied sums as plain sums.
  user->opFull.proxyOp = ncclSum;
  if (residence == ncclScalarHostImmediate) {
    user->opFull.scalarArgIsPtr = false;
    // Clear first: the copy below only writes the low scalarBytes bytes and
    // the slot may have been used before.
    user->opFull.scalarArg = 0;
    std::memcpy(&user->opFull.scalarArg, scalar, scalarBytes);
  } else {
    user->opFull.scalarArgIsPtr = true;
    user->opFull.scalarArg = reinterpret_cast<uint64_t>(scalar);
  }
  *op = ncclRedOp_t(int(ncclNumOps) + ix);
  *op = ncclUserRedOpMangle(comm, *op);
  TRACE_CALL("ncclRedOpCreatePreMulSum(%d,%p,%d,%d,%p)", *op, scalar, datatype, residence, comm);
  return ncclSuccess;
}

NCCL_API(ncclResult_t, ncclRedOpDestroy, ncclRedOp_t op, ncclComm_t comm);
// Release a user-created reduction operator and return its slot to the
// communicator's free list.
// Returns ncclInvalidArgument when `op` is a builtin, is outside the valid
// handle range, `comm` is NULL, or the handle does not name a live operator
// of this communicator.
ncclResult_t ncclRedOpDestroy_impl(ncclRedOp_t op, ncclComm_t comm) {
  const int handle = int(op);

  if (handle >= 0 && handle < int(ncclNumOps)) {
    WARN("ncclRedOpDestroy : operator is a NCCL builtin.");
    return ncclInvalidArgument;
  }
  if (handle < 0 || handle > int(ncclMaxRedOp)) {
    WARN("ncclRedOpDestroy :  operator is garbage.");
    return ncclInvalidArgument;
  }
  if (comm == NULL) {
    WARN("ncclRedOpDestroy : invalid communicator passed.");
    return ncclInvalidArgument;
  }

  // Undo the per-communicator mangling to recover the table slot. A slot is
  // live only while its freeNext is -1.
  const int slot = int(ncclUserRedOpMangle(comm, op)) - int(ncclNumOps);
  const bool live = slot < comm->userRedOpCapacity && comm->userRedOps[slot].freeNext == -1;
  if (!live) {
    WARN("ncclRedOpDestroy : operator unknown to this communicator.");
    return ncclInvalidArgument;
  }

  // push to free list
  ncclUserRedOp* entry = &comm->userRedOps[slot];
  entry->freeNext = comm->userRedOpFreeHead;
  comm->userRedOpFreeHead = slot;
  TRACE_CALL("ncclRedOpDestroy(%d,%p)", op, comm);
  return ncclSuccess;
}