/*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_

// #include "transport.h"
#include "p2p.h"
#include "collectives.h"
#include "nccl_tuner.h"
#include "proxy.h"
#include "strongstream.h"
#include "nccl_net.h"
#include "register.h"
#include "graph.h"
#include "nvmlwrap.h"
#include "profiler.h"
#include "allocator.h"
#include "latency_profiler/CollTrace.h"
#include "rccl_common.h"
#include "recorder.h"

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  #define HIPRT_CB
#else
#if CUDART_VERSION < 9000
// Local definition of cudaLaunchParams, only compiled when CUDART_VERSION < 9000.
struct cudaLaunchParams {
  void *func;
  dim3 gridDim;
  dim3 blockDim;
  void **args;
  size_t sharedMem;
  cudaStream_t stream;
};
#endif
#endif

#define CACHE_LINE_SIZE 64
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL

// Channels / LL tuning
#define NCCL_LL_THREAD_THRESHOLD 8
#define NCCL_LL128_THREAD_THRESHOLD 8
#define NCCL_SIMPLE_THREAD_THRESHOLD 64

// Send-side connection memory. The outer union with pad3 makes the struct at
// least MEM_ALIGN bytes; pad1 pushes ptrExchange to offset CACHE_LINE_SIZE and
// pad2 sizes the ptrExchange/redOpArgExchange group to CACHE_LINE_SIZE bytes.
struct ncclSendMem {
  union {
    struct {
      uint64_t head;
      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
      void* ptrExchange;
      uint64_t redOpArgExchange[2];
      char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)];
      int offsFifo[NCCL_STEPS];
    };
    char pad3[MEM_ALIGN];
  };
};

// Receive-side connection memory. The outer union with pad4 makes the struct
// at least MEM_ALIGN bytes; pad1 pushes connFifo to offset CACHE_LINE_SIZE.
struct ncclRecvMem {
  union {
    struct {
      uint64_t tail;
      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
      struct ncclConnFifo connFifo[NCCL_STEPS];
      int flush; // For GDRCopy-based flush
    };
    char pad4[MEM_ALIGN];
  };
};

enum helperThreadState {ThreadStart, ThreadStop};

#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)

// State for a graph helper thread: lock/condition pair, requested thread state
// and an array of NCCL_IPC_POOL_SIZE IPC base pointers with head/tail indices.
// NOTE(review): ipcHead/ipcTail look like ring-buffer cursors over ipcBases --
// confirm against the code that uses them.
struct ncclGraphHelperResources {
  ncclComm* comm;
  pthread_mutex_t threadLock;
  pthread_cond_t threadCond;
  enum helperThreadState threadState;
  void* ipcBases[NCCL_IPC_POOL_SIZE];
  int ipcTail;
  int ipcHead;
};

// One slot of the per-communicator user reduction-op table.
struct ncclUserRedOp {
  int freeNext; // -1=allocated, otherwise index of next free entry in array
  ncclDataType_t datatype;
  ncclDevRedOpFull opFull;
};

// Per-node rank table: number of local ranks and the localRank -> rank map.
struct ncclNodeRanks {
  int localRanks;
  int* localRankToRank;
};

// Identity, size and member ranks of a clique.
struct cliqueInfo {
  int id;
  int size;
  int *ranks;
};

// Singly-linked destructor record: `fn` receives the node itself as `me`.
struct ncclDestructor {
  struct ncclDestructor* next;
  void* obj;
  ncclResult_t(*fn)(struct ncclDestructor* me);
};

// Singly-linked callback record: `fn` receives the communicator and the node.
struct ncclCommCallback {
  struct ncclCommCallback* next;
  ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
};

// Like ncclCommCallback, but additionally carries a cudaEvent_t.
struct ncclCommEventCallback {
  struct ncclCommEventCallback* next;
  cudaEvent_t event;
  ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommEventCallback* cb);
};

// Reference-counted resources referenced from ncclComm::sharedRes.
struct ncclSharedResources {
  int refCount;
  struct ncclComm* owner; /* comm which creates this shared res. */
  struct ncclChannelPeer* peers[MAXCHANNELS];
  struct ncclDevChannelPeer* devPeers[MAXCHANNELS];
  /* P2P operation counter, one per channel */
  uint64_t p2pOpCount[MAXCHANNELS];
  /* Collective operation counter */
  uint64_t collOpCount;
  int tpNRanks;
  int tpNLocalRanks;
  int tpNChannels;
  int tpP2pNChannels;
  int tpP2pChunkSize;
  uint64_t magic;

  // top parent rank to localRank translation table
  int* tpRankToLocalRank;
  // Internal streams
  struct ncclStrongStream deviceStream, hostStream;
  int persistentRefs;
  cudaEvent_t launchEvent, scratchEvent;

  /* proxy related shared res */
  struct ncclProxyState* proxyState;
};

// Host-side description of one channel: peer tables, the per-algorithm
// topology structures (ring, trees, collnet, nvls) and work-fifo bookkeeping.
struct ncclChannel {
  struct ncclChannelPeer** peers;
  struct ncclDevChannelPeer** devPeers;
  /* devPeer pointer array used for host side access */
  struct ncclDevChannelPeer** devPeersHostPtr;
  struct ncclRing ring;
  int* devRingUserRanks;
  struct ncclTree tree;

  struct ncclTree collnetChain;
  struct ncclDirect collnetDirect;
  struct ncclTree binTree;
  struct ncclNvls nvls;

  int id; // index of this channel
  uint32_t workFifoProduced; // +1 successor of last used work fifo byte

  /* comm split sharable resources */
  struct ncclChannelPeer* collnetPeers;
  struct ncclDevChannelPeer* collnetDevPeers;
  struct ncclChannelPeer* nvlsPeers;
  struct ncclDevChannelPeer* nvlsDevPeers;
};

// Linked-list node wrapping one ncclDevWorkBatch.
struct ncclWorkBatchList {
  struct ncclWorkBatchList* next;
  struct ncclDevWorkBatch batch;
};

// 16-byte aligned linked-list header; the work payload follows the node.
struct alignas(16) ncclWorkList {
  struct ncclWorkList* next;
  enum ncclDevWorkType workType;
  int size; // Size of struct following this node
  // ncclDevWorkColl, ncclDevWorkColLReg, ncclDevWorkP2p[]...
};

// Linked-list node recording a collnet handle together with its buffer, size
// and proxy connector.
struct ncclCollnetHandleList {
  struct ncclCollnetHandleList *next;
  void* collnetHandle;
  size_t size;
  const void* buffer;
  struct ncclProxyConnector* proxyconn;
};

// A queued collective operation. The first group of fields holds the call
// arguments; the fields after "Computed later" are filled in afterwards.
struct ncclTaskColl {
  struct ncclTaskColl* next;
  ncclFunc_t func;
  void const* sendbuff;
  void* recvbuff;
  void const* acc;
  size_t count;
  int root;
  ncclDataType_t datatype;
  ncclRedOp_t opHost;
  struct ncclDevRedOpFull opDev;
  int chunkSteps, sliceSteps;
  // Computed later:
  size_t trafficBytes;
  int32_t nMaxChannels:8;
  int32_t nWarps:8;
  int32_t algorithm:8, protocol:8, pipeline:8;
  uint32_t isCollnet:1, isNvls:1;
  uint32_t devFuncId:30;
  int regBufType;
  uint64_t opCount;
  // number of elements in planner->ipcMemQueue associated with this collective
  int nCleanupQueueElts;

  void* sendMhandle;
  void* recvMhandle;
  void** sendNetHandles;
  void** recvNetHandles;
  void** srecvNetHandles;
  // index for IPC record lookup
  uintptr_t sendbuffOffset;
  uintptr_t recvbuffOffset;
  uintptr_t* sendbuffRmtAddrs;
  uintptr_t* recvbuffRmtAddrs;

  // Profiler plugin
  int eActivationMask;
  void* eventHandle;
  uint8_t nChannels;
};

// A queued point-to-point operation.
struct ncclTaskP2p {
  struct ncclTaskP2p* next;
  ncclFunc_t func;
  void* buff;
  size_t count;
  ncclDataType_t datatype;
  int root;
  size_t bytes;
  uint64_t opCount;

  // Profiler plugin
  int eActivationMask;
  void* eventHandle;
  uint8_t nChannels;
};

// One kernel launch together with the work, tasks and proxy ops attached to it.
// NOTE(review): the ncclIntruQueue members below carry no template arguments in
// this copy of the file; if ncclIntruQueue is a class template the arguments
// were probably lost in extraction -- confirm against the original header.
struct ncclKernelPlan {
  // A kernel plan is also a callback that reclaims itself. Hence this must
  // be the first member.
  struct ncclCommCallback reclaimer;

  struct ncclComm* comm;
  struct ncclKernelPlan* next;

  bool persistent; // aka captured in a graph
  bool isHostCbEnq;
  bool isSymColl;
  enum ncclDevWorkStorageType workStorageType;
  bool kernelSpecialized;
  void* kernelFn;
  union {
    struct ncclDevKernelArgs* kernelArgs;
    struct ncclSymDevArgs* kernelSymArgs;
  };
  size_t kernelArgsSize;
  struct channelMasks channelMask;
  bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
  int threadPerBlock;

  int collOpCount; // Number of collectives in this plan.
  int nWorkBatches; // Number of work batches.
  size_t workBytes; // Sum size of all work (in the fifo) in bytes.
  struct ncclIntruQueue workQueue;
  struct ncclIntruQueue cleanupQueue;
  void* workBufPersistent;

  struct ncclIntruQueue p2pTaskQueue;
  struct ncclIntruQueue collTaskQueue;
  struct ncclIntruQueue proxyOpQueue;

  // Profiler plugin
  void* groupEventHandle;
};

////////////////////////////////////////////////////////////////////////////////
// Roughly sorts ncclTaskColl's by their size descending. This structure is
// self-referential, meaning that pointers it contains internally may point
// into the structure itself. This means that it is NOT memcpy-moveable:

// NOTE(review): the text between `UnitSize = 1<` and `>UnitLog2, BitsPerPow2);`
// is missing from this copy of the file. The remaining members of
// ncclTaskCollSorter and the signature and opening statements of what is
// presumably ncclTaskCollSorterInsert (it uses `me`, `x` and `bin`) are not
// visible here. The tokens below are left exactly as found; restore them from
// the original header before building.
struct ncclTaskCollSorter {
  static constexpr int UnitLog2 = 10; // 1K
  static constexpr size_t UnitSize = 1<>UnitLog2, BitsPerPow2);
  bin = BinCount-1 - bin; // descending bin

  if (me->bins[bin] == nullptr) {
    if (me->binEdge <= bin) {
      me->binEdge = bin+1;
      me->bins[bin] = me->tail ? &me->tail->next : &me->head;
      me->tail = x;
    } else {
      // Find successor non-empty bin after this one.
      int succ = bin+1;
      while (me->bins[succ] == nullptr) succ++;
      // What was our successor's head's previous is now our head's previous.
      me->bins[bin] = me->bins[succ];
      // The first node we insert is our tail, so that becomes our successor's
      // head's new previous.
      me->bins[succ] = &x->next;
    }
  }
  // Push a new head for this bin.
  x->next = *me->bins[bin];
  *me->bins[bin] = x;
}

// Returns true when the sorter holds no tasks (its head pointer is null).
inline bool ncclTaskCollSorterEmpty(struct ncclTaskCollSorter* me) {
  return me->head == nullptr;
}

// Reset sorter and return sorted linked list of its coll tasks.
// The sorter is zeroed with memset only when it was non-empty.
inline struct ncclTaskColl* ncclTaskCollSorterDequeueAll(struct ncclTaskCollSorter* me) {
  struct ncclTaskColl* head = me->head;
  if (head != nullptr) memset(me, 0, sizeof(*me));
  return head;
}

////////////////////////////////////////////////////////////////////////////////

// Linked-list node holding one cudaStream_t.
struct ncclCudaStreamList {
  struct ncclCudaStreamList *next;
  cudaStream_t stream;
};

// Per-communicator planning state, embedded in ncclComm as `planner`.
// NOTE(review): as in ncclKernelPlan, the ncclIntruQueue members here carry no
// template arguments in this copy of the file -- confirm against the original.
struct ncclKernelPlanner {
  //////////////////////////////////////////////////////////////////////////////
  // State for accumulating tasks between ncclGroupStart/End()
  //////////////////////////////////////////////////////////////////////////////

  struct Peer {
    bool sendSeen, recvSeen;
    struct ncclIntruQueue sendQueue;
    struct ncclIntruQueue recvQueue;
  };
  struct ncclTaskCollSorter collSorter;
  struct Peer* peers/*[nRanks]*/;
  int nTasksColl, nTasksP2p;
  bool persistent;
  bool isSymColl;

  // The list of user streams aggregated over all tasks present.
  struct ncclCudaStreamList* streams;
  // Keep track of the number of user streams
  int numStreams;
  // The most recent user stream. Ignored if streams==nullptr
  cudaStream_t streamRecent;
  // The graph capturing all user streams or invalid if none. Thus we restrict the
  // user that all streams must be captured in the same graph or not captured
  // at all. Technically we could probably relax this, but that would mean
  // collecting a different `ncclTasks` per graph and one for non-graph.
  struct ncclCudaGraph capturingGraph;

  //////////////////////////////////////////////////////////////////////////////
  // Lists of tasks to be assembled into plans.
  //////////////////////////////////////////////////////////////////////////////

  struct ncclIntruQueue collTaskQueue;
  struct ncclIntruQueue collWorkQueue;
  struct ncclIntruQueue tmpCollWorkQueue;
  struct ncclIntruQueue collCleanupQueue;

  //////////////////////////////////////////////////////////////////////////////
  // State for building current (Work-In-Progress) plan:
  //////////////////////////////////////////////////////////////////////////////

  struct WipPlan {
    struct Channel {
      struct {
        int workBytes; // Sum size of work metadata referenced by this batch.
        int nP2ps; // Number of p2p works in this batch
        int p2pRounds[NCCL_MAX_DEV_WORK_P2P_PER_BATCH]; // which rounds are present in this batch.
      } wipBatch; // work-in-progress batch which will be next tail of workBatchQueue
      int nWorkBatchesP2p; // number of p2p batches for this channel.
      struct ncclIntruQueue workBatchQueue;
      struct ncclIntruQueue proxyOpQueue;
    } channels[MAXCHANNELS];
  } wipPlan;

  //////////////////////////////////////////////////////////////////////////////
  // State for launching built plans:
  //////////////////////////////////////////////////////////////////////////////

  // List of kernel plans built from tasks.
  struct ncclIntruQueue planQueue;
  // First of the unlaunched kernels in `planQueue`
  struct ncclKernelPlan* unlaunchedPlansHead;
};

#define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28.
struct ncclPeerInfo { int rank; int cudaDev; int nvmlDev; int gdrSupport; bool hasFineGrain; uint64_t hostHash; uint64_t pidHash; dev_t shmDev; int64_t busId; struct ncclComm* comm; int cudaCompCap; size_t totalGlobalMem; // MNNVL support nvmlGpuFabricInfoV_t fabricInfo; int cuMemSupport; int version; }; typedef enum ncclGroupTaskType { ncclGroupTaskTypeCollective = 0, ncclGroupTaskTypeSymRegister = 1, ncclGroupTaskTypeNum = 2, } ncclGroupTaskType_t; struct ncclComm { uint64_t startMagic; struct ncclMemoryStack memPermanent, memScoped; // List of destructors to run when comm is destructed struct ncclDestructor* destructorHead; struct ncclCudaContext* context; struct ncclSharedResources* sharedRes; /* map to top parent ranks. */ int* topParentRanks; int* topParentLocalRanks; struct ncclChannel channels[MAXCHANNELS]; struct ncclPeerInfo* peerInfo; struct ncclTopoSystem* topo; struct ncclProxyConnector* gproxyConn; struct ncclIntruQueue legacyRegCleanupQueue; bool peerInfoValid; ncclNet_t* ncclNet; int netPluginIndex; int ncclNetVer; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; void* bootstrap; // Bitmasks for ncclTransportP2pSetup struct channelMasks* connectSend; struct channelMasks* connectRecv; struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; int maxTreePattern; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported bool directMode; int cuMemSupport; uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. 
uint64_t commHash; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator int cudaDev; // my cuda device index int nvmlDev; // my nvml device index int compCap; // compute capability of the GPU int minCompCap, maxCompCap; // min/max compute capability in the communicator int64_t busId; // my PCI bus ID in int format cpu_set_t cpuAffinity; // CPU affinity of the GPU int WarpSize; int cudaArch; // matches __CUDA_ARCH__ of device int cpuArch; // architecture - As defined in src/include/graph.h, e.g. x86/arm/ppc/mixed int cpuVendor; // vendor - As defined in src/include/graph.h int node; int nNodes; int rcclUseOneSlice; // RCCL: true if this comm is using one slice per primitive int gfx942CheapFenceOff; // RCCL: true if gfx942 cheap fence is disabled int localRank; int localRanks; int maxLocalRanks; int* rankToNode; int* rankToLocalRank; int* localRankToRank; // localRanks and localRanktoRank for all nodes struct ncclNodeRanks* nodeRanks; // MNNVL: Multi-Node NVLink int MNNVL; // true when MNNVL is available struct cliqueInfo clique; // Our MNNVL clique information int cliqueRank; // Our rank within the MNNVL clique bool checkPointers; bool dmaBufSupport; // Counter for tracking CUDA launches (P2P and collectives included) uint64_t opCount; // Collective operation counter uint64_t collOpCount; // Channels for collectives int nChannels; // connection nChannels int collChannels; // enqueue nChannels int nvlsChannels; // enqueue nChannels // all nvls heads stored to check if we can splitShare int nvlsHeads[MAXCHANNELS]; // Channels (per peer) for p2p int p2pnChannels; int p2pnChannelsPerPeer; // Should this comm allocate LL buffers for network P2P connections? 
bool allocP2pNetLLBuffers; // Buffer sizes int buffSizes[NCCL_NUM_PROTOCOLS]; int p2pChunkSize; int nvlsChunkSize; // Algorithm/Protocols thresholds ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; uint64_t minMaxLLRange[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE]; /* This attribute can indicate the states of communicators and return code of * asynchronous NCCL operations. */ ncclResult_t asyncResult; // Flag to ask NCCL kernels to abort uint32_t* abortFlag; uint32_t* abortFlagDev; int* abortFlagRefCount; uint32_t* childAbortFlag; uint32_t* childAbortFlagDev; uint32_t destroyFlag; // Flags for enable P2P NET uint32_t p2pNet; uint32_t useIntraNet; bool hasFineGrain; // Device side of the communicator (for cudaFree's) struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm struct ncclSymDevComm symDevComm; uint32_t workArgsBytes; // max size of kernel args uint32_t workFifoBytes; // size of workFifoBuf, power of 2 void* workFifoBuf; void* workFifoBufDev; void* workFifoBufGdrHandle; // Monotonic number of bytes (mod 1<<32) sent to fifo. 
uint32_t workFifoProduced; uint32_t workFifoProducedLastRecorded; uint32_t workFifoConsumed; // Intra-process sync struct ncclComm* intraComm0; // leader of intra-process comms (self possible) struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head int intraRank; int intraRanks; uint32_t intraBarrierPhase; char intraPad1[64 - sizeof(uint64_t)]; uint64_t intraBarrierCounter; // only used if this is intraComm0 char intraPad2[64 - sizeof(uint64_t)]; uint64_t intraBarrierGate; // only used if this is intraComm0 struct ncclProxyState* proxyState; int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ // Whether this communicator uses collNet bool isOneRPN; uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes]; int* collNetHeads; int collNetHeadsNum; int* collNetDenseToUserRank; int* collNetUserToDenseRank; /* sharable collNet proxy progress resource. */ struct ncclCollNetSharedRes* collNetSharedRes; // NVLink SHARP (NVLS) support int nvlsSupport; int nvlsRegSupport; /* sharable NVLS resource. */ struct ncclNvlsSharedRes* nvlsResources; // pools backed by comm->memPermanent struct ncclMemoryPool memPool_ncclTaskColl; struct ncclMemoryPool memPool_ncclTaskP2p; struct ncclMemoryPool memPool_ncclProxyOp; struct ncclMemoryPool memPool_ncclKernelPlan; // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when // this comm is not yet in a group. struct ncclComm* groupNext[ncclGroupTaskTypeNum]; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. struct ncclComm* preconnectNext; int localPersistentRefs; // number of persistent plan-lists capturing this comm struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; struct ncclKernelPlanner planner; hipStream_t sideStream; // [RCCL] Cached non-captured stream cudaMemPool_t memPool; // Queue of events and associated callbacks for cleaning up asynchronous work. 
// Using this is preferable to using CUDA host callbacks because host callbacks // won't allow the work following the callback to run until the callback completes, // which comes at expense to perf. struct ncclIntruQueue eventCallbackQueue; // user-created reduction ops int userRedOpCapacity, userRedOpFreeHead; ncclUserRedOp *userRedOps; // Queue of things for the main thread to do int reclaimSteps; struct ncclIntruQueueMpsc callbackQueue; hipEvent_t doneEvent; hipStream_t lastStream; std::unique_ptr ctrace; #ifdef ENABLE_COLLTRACE struct ncclCollTrace* collTrace; union ncclCollTraceTail *collTraceTail; pthread_t collTraceThread; volatile bool collTraceExit; bool collTraceEnabled; #endif #ifdef ENABLE_FAULT_INJECTION uint64_t faults; #endif ncclConfig_t config; // initState is to more conveniently reclaim resources when errors happen. ncclResult_t initState; // flag to indicate if ncclCommFinalize() is called bool finalizeCalled; // shared structures for finalization int finalizeRankCnt; #if defined(ENABLE_MSCCLPP) // Whether this comm is compatible with MSCCLPP bool mscclppCompatible; struct mscclppComm* mscclpp_comm; size_t mscclpp_threshold; bool mscclppForceEnable; #endif // Whether this comm is compatible with MSCCL bool mscclCompatible; // group job to support multi-thread FT struct ncclGroupJob *groupJob; // Flag indicating if this communicator shares resources with parent or children bool shareResources; // Tuning plugin int tunerPluginLoaded; ncclTuner_t* tuner; void *tunerContext; // Profiler plugin void* profilerContext; uint64_t seqNumber[NCCL_NUM_FUNCTIONS]; struct ncclProfilerProxy profiler; // buffer registration cache struct ncclRegCache regCache; int isAllNvlink; bool isAllDirectP2p; int symmetricSupport; bool useNetPXN; bool useGdr; int splitCount; // symmetric buffer uint8_t* baseUCSymPtr; uint8_t* baseMCSymPtr; size_t baseStride; size_t symAllocHead; CUmemGenericAllocationHandle symMCHandle; struct ncclIntruQueue symRegTaskQueue; // Unroll 
factor for comm [RCCL] int unroll; // custom collective [RCCL] bool enableCustColl; uint64_t endMagic; }; static_assert(offsetof(struct ncclComm, startMagic) == 0, "startMagic must be the first field of ncclComm"); static_assert(offsetof(struct ncclComm, endMagic) == sizeof(struct ncclComm) - sizeof(uint64_t), "endMagic must be the last field of ncclComm"); enum ncclLaunchMode { ncclLaunchModeInvalid=0, ncclLaunchModeParallel, ncclLaunchModeGroup }; extern enum ncclLaunchMode ncclParamLaunchMode; void ncclCommPushFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) { ncclResult_t result = ncclSuccess; struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome); while (cb != nullptr) { struct ncclCommCallback* next = cb->next; ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb if (res1 != ncclSuccess) result = res1; cb = next; } NCCLCHECK(result); return ncclSuccess; } inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm, bool waitSome) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); while (true) { struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue); if (cb == nullptr) break; cudaError_t ok; if (waitSome) { ok = cudaEventSynchronize(cb->event); waitSome = false; } else { ok = cudaEventQuery(cb->event); if (ok == cudaErrorNotReady) break; } ncclIntruQueueDequeue(&comm->eventCallbackQueue); if (ok == cudaSuccess) { NCCLCHECKGOTO(cb->fn(comm, cb), result, finish); } else { CUDACHECKGOTO(ok, result, finish); } } finish: CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return ncclSuccess; } inline void 
ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { int phase = comm->intraBarrierPhase; if (comm->intraRanks == 1) { // Release everyone (just me). comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); } else { struct ncclComm* comm0 = comm->intraComm0; uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); if (uint32_t(count) == uint32_t(comm->intraRanks)) { // Reset. __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED); // Release everyone. __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); } } } // returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x) inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) { struct ncclComm* comm0 = comm->intraComm0; comm->intraBarrierPhase ^= 1; uint32_t phase = comm->intraBarrierPhase; uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); if ((gate & 1) != phase) { uint64_t t0 = clockNano(); do { // Spin vigorously for first 5us. if (clockNano()-t0 >= 5*1000) sched_yield(); gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); } while ((gate & 1) != phase); } if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); return gate>>32; } // Scrambles the bits of non-builtin values of ncclRedOp_t according to the // communicator memory address. Used to catch bugs so that integer handles // associated with this communicator won't collide with handles of other // communicatrs. This function is its own inverse. static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) { // Preserve the built-in values. 
if(int(op) < int(ncclNumOps)) return op; uint64_t h = reinterpret_cast(comm); h ^= h >> 32; h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant h >>= 32; // h is now an excellent 32-bit hash of the comm pointer h &= int(ncclMaxRedOp); // ncclMaxRedOp is a power of 2 minus 1 int op1 = int(h) ^ int(op); // Since builtin values are preserved, we also have to preserve their preimage. return op1 < int(ncclNumOps) ? op : ncclRedOp_t(op1); } ncclResult_t ncclCommEnsureReady(ncclComm_t comm); ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState); #endif