Merge remote-tracking branch 'nccl/master' into develop
Este commit está contenido en:
+2
-7
@@ -175,12 +175,6 @@ set(CC_SOURCES
|
||||
src/collectives/all_to_all_api.cc
|
||||
src/collectives/all_to_allv_api.cc
|
||||
src/channel.cc
|
||||
#src/clique/CliqueManager.cc # RCCL
|
||||
#src/clique/HandleCache.cc # RCCL
|
||||
#src/clique/HandleShm.cc # RCCL
|
||||
#src/clique/Hash.cc # RCCL
|
||||
#src/clique/MsgQueue.cc # RCCL
|
||||
#src/clique/ShmObject.cc # RCCL
|
||||
src/misc/argcheck.cc
|
||||
src/misc/nvmlwrap_stub.cc
|
||||
src/misc/utils.cc
|
||||
@@ -193,6 +187,8 @@ set(CC_SOURCES
|
||||
src/misc/signals.cc # RCCL
|
||||
src/misc/socket.cc
|
||||
src/misc/param.cc
|
||||
src/misc/rocmwrap.cc
|
||||
src/misc/strongstream.cc
|
||||
src/transport/coll_net.cc
|
||||
src/transport/net.cc
|
||||
src/transport/net_ib.cc
|
||||
@@ -208,7 +204,6 @@ set(CC_SOURCES
|
||||
src/enqueue.cc
|
||||
${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
|
||||
|
||||
|
||||
foreach(filename ${CC_SOURCES})
|
||||
list(APPEND CPP_SOURCES ${filename})
|
||||
endforeach(filename)
|
||||
|
||||
Archivo normal → Archivo ejecutable
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 12
|
||||
NCCL_PATCH := 12
|
||||
NCCL_MINOR := 13
|
||||
NCCL_PATCH := 4
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+2
-1
@@ -10,7 +10,8 @@ include ../makefiles/version.mk
|
||||
##### src files
|
||||
INCEXPORTS := nccl.h nccl_net.h
|
||||
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \
|
||||
misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc \
|
||||
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
|
||||
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
|
||||
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
|
||||
collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
|
||||
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
|
||||
|
||||
+4
-1
@@ -106,6 +106,7 @@ static void *bootstrapRoot(void* args) {
|
||||
do {
|
||||
struct ncclSocket sock;
|
||||
sock.abortFlag = NULL;
|
||||
/* bootstrap root thread always uses blocking ncclSocketAccept. */
|
||||
NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
|
||||
close(sock.fd);
|
||||
@@ -140,6 +141,7 @@ static void *bootstrapRoot(void* args) {
|
||||
int next = (r+1) % nranks;
|
||||
struct ncclSocket sock;
|
||||
sock.abortFlag = NULL;
|
||||
sock.asyncFlag = 0;
|
||||
memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
|
||||
NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
|
||||
@@ -289,7 +291,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
|
||||
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
|
||||
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
|
||||
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -324,6 +326,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
|
||||
struct bootstrapState* state = (struct bootstrapState*)commState;
|
||||
struct ncclSocket sock;
|
||||
sock.abortFlag = state->abortFlag;
|
||||
sock.asyncFlag = 0;
|
||||
memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(ncclSocketConnect(&sock));
|
||||
NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
|
||||
|
||||
+24
-46
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -9,75 +8,54 @@
|
||||
#include "param.h"
|
||||
#include "gdrwrap.h"
|
||||
|
||||
// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory
|
||||
NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1);
|
||||
|
||||
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
|
||||
struct ncclChannel* channel = comm->channels+channelid;
|
||||
ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
|
||||
struct ncclChannel* channel = &comm->channels[channelId];
|
||||
if (channel->id != -1) return ncclSuccess;
|
||||
channel->id = channelid;
|
||||
|
||||
// Ring index to user rank table.
|
||||
NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
|
||||
int nRanks = comm->nRanks;
|
||||
channel->id = channelId;
|
||||
channel->workFifoSent = 0;
|
||||
|
||||
// Communication structures with peers.
|
||||
NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
|
||||
NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
|
||||
for (size_t i=0; i<comm->nRanks+1; ++i) {
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
channel->peers[i].send[b].comm = comm;
|
||||
channel->peers[i].recv[b].comm = comm;
|
||||
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));
|
||||
|
||||
// The extra one (nRanks+1) is for collnet root (i.e. network)
|
||||
channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nRanks+1);
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nRanks+1, comm->deviceStream.stream));
|
||||
ncclCommPushCudaFree(comm, channel->devPeers);
|
||||
|
||||
channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
|
||||
NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.stream));
|
||||
ncclCommPushCudaFree(comm, channel->devRingUserRanks);
|
||||
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream));
|
||||
|
||||
for (int r=0; r < nRanks+1; ++r) {
|
||||
for (int b=0; b < NCCL_MAX_CONNS; b++) {
|
||||
channel->peers[r].send[b].comm = comm;
|
||||
channel->peers[r].recv[b].comm = comm;
|
||||
}
|
||||
}
|
||||
|
||||
// Per-channel operation list.
|
||||
NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
|
||||
if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
|
||||
// GDRCOPY support
|
||||
// We allocate a workFifo in GDR mapped CUDA memory
|
||||
// But we still allocate the Host workFifo so that we
|
||||
// can copy the work elements to CUDA memory on kernel launch
|
||||
NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc));
|
||||
} else {
|
||||
// The device workFifo is the Host one
|
||||
channel->workFifoDev = channel->workFifo;
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
|
||||
if (channel->id == -1) return ncclSuccess;
|
||||
// Operation list
|
||||
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
|
||||
if (channel->gdrMemDesc) {
|
||||
// GDRCOPY support
|
||||
NCCLCHECK(ncclGdrCudaFree(channel->gdrMemDesc));
|
||||
}
|
||||
|
||||
// Free Ring index to rank tables
|
||||
free(channel->ring.userRanks);
|
||||
CUDACHECK(hipFree(channel->ring.devUserRanks));
|
||||
|
||||
// Free transport proxy resources
|
||||
// Note: free all send resources first due to CollNet arrangement
|
||||
for (int r=0; r<nRanks+1; r++) {
|
||||
struct ncclPeer* peer = channel->peers+r;
|
||||
struct ncclChannelPeer* peer = channel->peers+r;
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
|
||||
}
|
||||
}
|
||||
for (int r=0; r<nRanks+1; r++) {
|
||||
struct ncclPeer* peer = channel->peers+r;
|
||||
struct ncclChannelPeer* peer = channel->peers+r;
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
|
||||
}
|
||||
}
|
||||
|
||||
// Free the peer structures.
|
||||
CUDACHECK(hipFree(channel->devPeers));
|
||||
free(channel->peers);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ include ../../../makefiles/version.mk
|
||||
BUILDDIR ?= $(abspath ../../../build)
|
||||
OBJDIR := $(BUILDDIR)/obj/collectives/device
|
||||
|
||||
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu alltoall_pivot.cu
|
||||
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu
|
||||
|
||||
LIBSRCFILES += functions.cu
|
||||
|
||||
|
||||
@@ -13,11 +13,11 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->header.nWarps*WARP_SIZE;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclRing *ring = &ncclShmem->channel.ring;
|
||||
const int *ringRanks = ring->devUserRanks;
|
||||
const int *ringRanks = ring->userRanks;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
#include "devcomm.h"
|
||||
#include "collectives.h"
|
||||
#include "primitives.h"
|
||||
//#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
@@ -18,7 +17,7 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->header.nWarps*WARP_SIZE;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclRing *ring = &ncclShmem->channel.ring;
|
||||
@@ -187,11 +186,6 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
chunk = modRanks(ringIx + 1);
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY)
|
||||
if (tid == 0) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
@@ -200,6 +194,10 @@ namespace {
|
||||
}
|
||||
#endif
|
||||
|
||||
// Make final copy from buffer to dest.
|
||||
chunk = modRanks(ringIx + 1);
|
||||
offset = calcOffset(chunk);
|
||||
nelem = min(realChunkSize, size-offset);
|
||||
prims.directRecv(offset, nelem);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
|
||||
@@ -223,7 +221,7 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->header.nWarps*WARP_SIZE;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclTree *tree = &ncclShmem->channel.tree;
|
||||
@@ -375,7 +373,7 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->header.nWarps*WARP_SIZE;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclTree *tree = &ncclShmem->channel.tree;
|
||||
@@ -600,9 +598,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
|
||||
const int hasUp = (tree->up[0] >= 0) ? 1 : 0;
|
||||
const int hasDn = (tree->down[0] >= 0) ? 1 : 0;
|
||||
const int nThreadsScatter = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
|
||||
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0);
|
||||
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0);
|
||||
const int nThreadsBcast = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 1*COLLNET_COPY_THREADS);
|
||||
const int nThreadsReduce = args->header.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
|
||||
const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
|
||||
const int tidStartBcast = nThreadsGather;
|
||||
const int tidStartScatter = tidStartBcast + nThreadsBcast;
|
||||
const int tidStartReduce = tidStartScatter + nThreadsScatter;
|
||||
|
||||
@@ -8,4 +8,4 @@
|
||||
#include "common.h"
|
||||
#include "collectives.h"
|
||||
|
||||
IMPL_COLL_ALLTOALL_PIVOT(AllToAllPivot);
|
||||
IMPL_COLL_F(AllToAllPivot);
|
||||
|
||||
@@ -12,7 +12,7 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->header.nWarps*WARP_SIZE;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nranks = ncclShmem->comm.nRanks;
|
||||
const ncclRing *ring = &ncclShmem->channel.ring;
|
||||
@@ -29,11 +29,11 @@ namespace {
|
||||
const ssize_t prims_size = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLTOALL_PIVOT_CHUNKSTEPS : 1));
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
|
||||
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0, args->connIndex << 16);
|
||||
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0);
|
||||
|
||||
for (int num_hops = 0; num_hops <= nranks / 2; num_hops++) {
|
||||
const int src_rank = ring->devUserRanks[(nranks - num_hops) % nranks];
|
||||
const int dst_rank = ring->devUserRanks[num_hops];
|
||||
const int src_rank = ring->userRanks[(nranks - num_hops) % nranks];
|
||||
const int dst_rank = ring->userRanks[num_hops];
|
||||
const ssize_t send_offset =
|
||||
dst_rank * num_elems * elem_size + chunk_offset +
|
||||
(src_rank == dst_rank ? pivot_direction * chunk_size / 2 : 0);
|
||||
|
||||
@@ -12,7 +12,7 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->header.nWarps*WARP_SIZE;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclRing *ring = &ncclShmem->channel.ring;
|
||||
@@ -20,8 +20,8 @@ namespace {
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
const int rank = ring->devUserRanks[0];
|
||||
const int nextRank = ring->devUserRanks[1];
|
||||
const int rank = ring->userRanks[0];
|
||||
const int nextRank = ring->userRanks[1];
|
||||
const int root = args->root;
|
||||
|
||||
T *inputBuf = (T*)args->sendbuff;
|
||||
|
||||
+192
-197
@@ -10,7 +10,6 @@
|
||||
|
||||
#include "collectives.h"
|
||||
#include "devcomm.h"
|
||||
#include "op128.h"
|
||||
|
||||
#define COLL_UNROLL 2
|
||||
#define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree
|
||||
@@ -320,154 +319,71 @@ class ncclFunction {
|
||||
};
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
#define traceColl(elem,launch_type) \
|
||||
#define traceColl(launch_type) { \
|
||||
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
shmem.comm.collTrace[pos].bid = blockIdx.x; \
|
||||
shmem.comm.collTrace[pos].funcIndex = shmem.work.header.funcIndex; \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (shmem.comm.collTrace[pos].data_0)); \
|
||||
if (elem.header.type == ncclWorkTypeP2p) { \
|
||||
struct ncclWorkElemP2p *p2pElems = (struct ncclWorkElemP2p *)&elem; \
|
||||
shmem.comm.collTrace[pos].p2p[0].connIndex = p2pElems[0].connIndex; \
|
||||
shmem.comm.collTrace[pos].p2pOpCount[0] = p2pElems[0].opCount; \
|
||||
shmem.comm.collTrace[pos].p2p[0].ngroups = p2pElems[0].ngroups; \
|
||||
shmem.comm.collTrace[pos].p2p[0].nWarps = p2pElems[0].nWarps; \
|
||||
shmem.comm.collTrace[pos].p2p[0].warpStart = p2pElems[0].warpStart; \
|
||||
shmem.comm.collTrace[pos].p2p[0].peer = (uint16_t)(p2pElems[0].peer); \
|
||||
shmem.comm.collTrace[pos].p2p[1].connIndex = p2pElems[1].connIndex; \
|
||||
shmem.comm.collTrace[pos].p2pOpCount[1] = p2pElems[1].opCount; \
|
||||
shmem.comm.collTrace[pos].p2p[1].ngroups = p2pElems[1].ngroups; \
|
||||
shmem.comm.collTrace[pos].p2p[1].nWarps = p2pElems[1].nWarps; \
|
||||
shmem.comm.collTrace[pos].p2p[1].warpStart = p2pElems[1].warpStart; \
|
||||
shmem.comm.collTrace[pos].p2p[1].peer = (uint16_t)(p2pElems[1].peer); \
|
||||
shmem.comm.collTrace[pos].type = (ncclCollTraceP2pElemType|launch_type); \
|
||||
} else { \
|
||||
shmem.comm.collTrace[pos].opCount = elem.opCount; \
|
||||
shmem.comm.collTrace[pos].coll.nWarps = elem.header.nWarps; \
|
||||
shmem.comm.collTrace[pos].coll.bid = elem.bid; \
|
||||
shmem.comm.collTrace[pos].coll.nChannels = elem.nChannels; \
|
||||
shmem.comm.collTrace[pos].type = (ncclCollTraceCollElemType|launch_type); \
|
||||
}
|
||||
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->funcIndex = shmem.work.header.funcIndex; \
|
||||
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \
|
||||
if (shmem.work.header.type == ncclWorkTypeP2p) { \
|
||||
struct ncclWorkElemP2p *p2pElems = shmem.work.p2pElems; \
|
||||
collTrace->p2p[0].connIndex = 0; \
|
||||
collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
|
||||
collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \
|
||||
collTrace->p2p[0].nWarps = p2pElems[0].nWarps; \
|
||||
collTrace->p2p[0].warpStart = p2pElems[0].warpStart; \
|
||||
collTrace->p2p[0].peer = p2pElems[0].p2pType == ncclWorkP2pTypeRecv ? (uint16_t)(p2pElems[0].peer) : -1; \
|
||||
collTrace->p2p[1].connIndex = 0; \
|
||||
collTrace->p2pOpCount[1] = p2pElems[1].opCount; \
|
||||
collTrace->p2p[1].ngroups = p2pElems[1].ngroups; \
|
||||
collTrace->p2p[1].nWarps = p2pElems[1].nWarps; \
|
||||
collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \
|
||||
collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
|
||||
} else if (shmem.work.header.type == ncclWorkTypeColl) { \
|
||||
struct ncclWorkElem *elems = shmem.work.elems; \
|
||||
collTrace->opCount = elems[0].opCount; \
|
||||
collTrace->coll.nWarps = elems[0].nWarps; \
|
||||
collTrace->coll.bid = elems[0].bid; \
|
||||
collTrace->coll.nChannels = elems[0].nChannels; \
|
||||
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define traceKernelLaunch(elem,firstLaunch) { \
|
||||
traceColl(elem,(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType)); \
|
||||
#define traceKernelLaunch(firstLaunch) { \
|
||||
traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
|
||||
}
|
||||
#define traceKernelEnd() { \
|
||||
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
shmem.comm.collTrace[pos].bid = bid; \
|
||||
shmem.comm.collTrace[pos].type = ncclCollTraceKernelEndType; \
|
||||
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceKernelEndType; \
|
||||
}
|
||||
#define traceAbort() { \
|
||||
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
shmem.comm.collTrace[pos].bid = bid; \
|
||||
shmem.comm.collTrace[pos].type = ncclCollTraceAbortType; \
|
||||
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->type = ncclCollTraceAbortType; \
|
||||
}
|
||||
// traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
|
||||
#define traceData(data2, data4, data8_0, data8_1) { \
|
||||
uint32_t pos = atomicAdd(ncclShmem->comm.collTraceTail, 1)%COLLTRACE_NUM_ITEMS; \
|
||||
ncclShmem->comm.collTrace[pos].bid = blockIdx.x; \
|
||||
ncclShmem->comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
ncclShmem->comm.collTrace[pos].funcIndex = data2; \
|
||||
ncclShmem->comm.collTrace[pos].data_0 = data4; \
|
||||
ncclShmem->comm.collTrace[pos].opCount = data8_0; \
|
||||
ncclShmem->comm.collTrace[pos].data_1 = data8_1; \
|
||||
ncclShmem->comm.collTrace[pos].type = ncclCollTraceDataType; \
|
||||
uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
|
||||
struct ncclCollTrace* collTrace = ncclShmem->comm.collTrace+pos; \
|
||||
collTrace->bid = blockIdx.x; \
|
||||
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
collTrace->funcIndex = data2; \
|
||||
collTrace->data_0 = data4; \
|
||||
collTrace->opCount = data8_0; \
|
||||
collTrace->data_1 = data8_1; \
|
||||
collTrace->type = ncclCollTraceDataType; \
|
||||
}
|
||||
#else
|
||||
#define traceKernelLaunch()
|
||||
#define traceAbort()
|
||||
#define traceData(data2, data4, data8_0, data8_1)
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
#define __insert_timestamp(line_num) do { \
|
||||
if (shmem.prof.count < PROFILE_NUM_ITEMS) { \
|
||||
shmem.prof.elem[shmem.prof.count].line = line_num; \
|
||||
shmem.prof.elem[shmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
shmem.prof.count++; \
|
||||
} \
|
||||
} while(0);
|
||||
#else
|
||||
#define __insert_timestamp(line_num)
|
||||
#endif
|
||||
|
||||
// Copy src to dst and fill extra size with zeroes
|
||||
template<typename Tdst, typename Tsrc>
|
||||
__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) {
|
||||
static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0,
|
||||
"copyToShmem needs sizes which are multiple of 16B");
|
||||
static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small");
|
||||
static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle");
|
||||
uint64_t *d = reinterpret_cast<uint64_t*>(dst);
|
||||
uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
|
||||
uint64_t *shmemPtr = d;
|
||||
int offset = 2*tid;
|
||||
uint64_t v0, v1;
|
||||
if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) {
|
||||
v0 = v1 = 0ULL;
|
||||
} else {
|
||||
v0 = s[offset] ; v1 = s[offset+1];
|
||||
}
|
||||
if (offset < sizeof(Tdst)/sizeof(uint64_t)) {
|
||||
shmemPtr[offset] = v0; shmemPtr[offset+1] = v1;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
__device__ int copyToShmem(T *dst, T const *src, int turn=0) {
|
||||
static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh");
|
||||
uint64_t *d = reinterpret_cast<uint64_t*>(dst);
|
||||
uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
|
||||
int t = threadIdx.x - turn;
|
||||
if (t < 0) t += blockDim.x;
|
||||
int n = sizeof(T)/sizeof(uint64_t);
|
||||
|
||||
int delta = (n + WARP_SIZE-1) & -WARP_SIZE; // round up to warp lane 0
|
||||
if (delta < blockDim.x) {
|
||||
turn += delta;
|
||||
if (turn >= blockDim.x) turn -= blockDim.x;
|
||||
}
|
||||
else
|
||||
turn = 0;
|
||||
|
||||
n -= t;
|
||||
d += t;
|
||||
s += t;
|
||||
#pragma unroll
|
||||
for (int i=0; i < divUp(sizeof(T), WARP_SIZE*sizeof(uint64_t)); i++) {
|
||||
if (n > 0) {
|
||||
*d = *s;
|
||||
d += blockDim.x;
|
||||
s += blockDim.x;
|
||||
n -= blockDim.x;
|
||||
}
|
||||
}
|
||||
return turn;
|
||||
}
|
||||
|
||||
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
|
||||
struct RunWorkElement {
|
||||
__device__ void run(ncclWorkElem*) {
|
||||
// Put NOT IMPLEMENTED behavior here.
|
||||
}
|
||||
};
|
||||
|
||||
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
|
||||
struct RunWork {
|
||||
// This __forceinline__ is necessary. The compiler was inserting a function call
|
||||
// here from the LL ncclKernel.
|
||||
__device__ __forceinline__ void run(ncclWork *w) {
|
||||
int wid = threadIdx.x / WARP_SIZE;
|
||||
int inc = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1;
|
||||
#pragma unroll 1
|
||||
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) {
|
||||
if (wid < w->header.nWarps)
|
||||
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(&w->elems[e]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct ncclShmemGroup {
|
||||
ncclConnInfo *recvConns[NCCL_MAX_DIRECT_ARITY];
|
||||
@@ -484,18 +400,67 @@ struct ncclShmemData {
|
||||
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
|
||||
};
|
||||
uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
|
||||
struct ncclDevComm comm;
|
||||
struct ncclChannel channel;
|
||||
uint64_t pad[2];
|
||||
struct ncclWork work;
|
||||
int channelId;
|
||||
alignas(16) struct ncclDevComm comm;
|
||||
alignas(16) struct ncclDevChannel channel;
|
||||
alignas(16) struct ncclWork work;
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf prof;
|
||||
#endif
|
||||
};
|
||||
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
#define __insert_timestamp(line_num) do { \
|
||||
if (shmem.prof.count < PROFILE_NUM_ITEMS) { \
|
||||
shmem.prof.elem[shmem.prof.count].line = line_num; \
|
||||
shmem.prof.elem[shmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
|
||||
shmem.prof.count++; \
|
||||
} \
|
||||
} while(0);
|
||||
#else
|
||||
#define __insert_timestamp(line_num)
|
||||
#endif
|
||||
|
||||
// Copy 16-byte aligned data. You must call with at least `(bytes+15)/16` threads.
|
||||
inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int bytes) {
|
||||
int offset = 16*tid;
|
||||
if (offset < bytes) {
|
||||
ulong2 *src2, *dst2;
|
||||
src2 = (ulong2*)((char const*)src + offset);
|
||||
dst2 = (ulong2*)((char*)dst + offset);
|
||||
dst2->x = src2->x;
|
||||
dst2->y = src2->y;
|
||||
}
|
||||
}
|
||||
|
||||
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
|
||||
struct RunWorkElement {
|
||||
__device__ void run(ncclWorkElem*) {
|
||||
// Put NOT IMPLEMENTED behavior here.
|
||||
}
|
||||
};
|
||||
|
||||
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
|
||||
struct RunWork {
|
||||
// This __forceinline__ is necessary. The compiler was inserting a function call
|
||||
// here from the LL ncclKernel.
|
||||
__device__ __forceinline__ void run(ncclWork *w) {
|
||||
int wid = threadIdx.x / WARP_SIZE;
|
||||
ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0];
|
||||
int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem);
|
||||
#pragma unroll 1
|
||||
while ((char*)we + stride <= (char*)(w+1) && we->isUsed) {
|
||||
if (wid < we->nWarps) {
|
||||
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(we);
|
||||
}
|
||||
we = (ncclWorkElem*)((char*)we + stride);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
|
||||
if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) {
|
||||
if (we->isUsed && we->redOpArgIsPtr) {
|
||||
/* redOpArg is a pointer to the scalar value, so we'll dereference it
|
||||
* here so that redOpArg holds the bits of the scalar going forward.
|
||||
* The tricky thing is we don't know its type T since that's encoded in
|
||||
@@ -518,10 +483,10 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
|
||||
extern __device__ struct ncclShmemData *ncclShmem;
|
||||
|
||||
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE, bool USING_LL128>
|
||||
__device__ void ncclKernel(struct ncclDevComm* comm) {
|
||||
__device__ void ncclKernel(
|
||||
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
|
||||
) {
|
||||
int tid = threadIdx.x;
|
||||
int nthreads = blockDim.x;
|
||||
int bid = blockIdx.x;
|
||||
__shared__ struct ncclShmemData shmem;
|
||||
ncclShmem = &shmem;
|
||||
if (tid == 0) {
|
||||
@@ -529,47 +494,72 @@ __device__ void ncclKernel(struct ncclDevComm* comm) {
|
||||
shmem.groups[i].barrier = 0;
|
||||
for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
// To map blockId to channelId, we need the n'th set bit of channelMask which
|
||||
// is the inverse of counting the number of set bits among the first n.
|
||||
if (tid < WARP_SIZE) {
|
||||
int x = tid;
|
||||
if (channelMask & (1ull<<x)) {
|
||||
int y = __popcll(channelMask & ((1ull<<x)-1));
|
||||
if (blockIdx.x == y) shmem.channelId = x;
|
||||
}
|
||||
if (32 < MAXCHANNELS) {
|
||||
x = 32 + tid;
|
||||
if (channelMask & (1ull<<x)) {
|
||||
int y = __popcll(channelMask & ((1ull<<x)-1));
|
||||
if (blockIdx.x == y) shmem.channelId = x;
|
||||
}
|
||||
}
|
||||
}
|
||||
__syncthreads(); // publish shmem.channelId
|
||||
int channelId = shmem.channelId;
|
||||
|
||||
int turn = copyToShmem(&shmem.comm, comm);
|
||||
if (true) {
|
||||
void *dst, *src;
|
||||
int bytes;
|
||||
// Use first 3 warps to load comm, channel, and work into shmem
|
||||
switch (tid/WARP_SIZE) {
|
||||
case 0:
|
||||
dst = &shmem.comm;
|
||||
src = comm;
|
||||
bytes = sizeof(ncclDevComm);
|
||||
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
|
||||
break;
|
||||
case 1:
|
||||
// Get address of channel without incurring indirect load from ncclDevComm::channels
|
||||
dst = &shmem.channel;
|
||||
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
|
||||
bytes = sizeof(ncclDevChannel);
|
||||
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
|
||||
break;
|
||||
case 2:
|
||||
dst = &shmem.work;
|
||||
src = workHead + blockIdx.x;
|
||||
bytes = sizeof(ncclWork);
|
||||
static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
|
||||
break;
|
||||
default:
|
||||
bytes = 0;
|
||||
break;
|
||||
}
|
||||
copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
|
||||
}
|
||||
__syncthreads(); // publish shmem
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (tid == 0) {
|
||||
shmem.prof.count = 0;
|
||||
shmem.prof.seq = shmem.comm.devProf[bid].seq;
|
||||
shmem.prof.seq = shmem.comm.devProf[blockIdx.x].seq;
|
||||
}
|
||||
#endif
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
// get address of channel without incurring indirect load from ncclDevComm::channels
|
||||
ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid];
|
||||
turn = copyToShmem(&shmem.channel, channel, turn);
|
||||
|
||||
__syncthreads(); // publish ncclShmem
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
|
||||
ncclWork *workFifoHost = shmem.channel.workFifo;
|
||||
ncclWork *workFifoDev = shmem.channel.workFifoDev;
|
||||
int workFifoIx = shmem.channel.index;
|
||||
bool firstLaunch = true;
|
||||
if (COLLTRACE && tid == 0) traceKernelLaunch(true);
|
||||
|
||||
while (true) {
|
||||
copyToShmem(&shmem.work, &workFifoDev[workFifoIx], tid, nthreads);
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
{ // Check whether the last operation was aborted and make sure all threads exit
|
||||
int aborted = tid == 0 ? *comm->abortFlag : 0;
|
||||
if (__any(aborted)) { // publish shmem.work
|
||||
if (COLLTRACE && tid == 0) traceAbort();
|
||||
break;
|
||||
}
|
||||
if (tid == 0)
|
||||
workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused;
|
||||
// Notify host that all fifo reads are complete.
|
||||
if (tid == 0 && shmem.work.header.isLast && shmem.work.header.inFifo) {
|
||||
*shmem.channel.workFifoDone = shmem.work.header.doneAcks;
|
||||
}
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
|
||||
workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS;
|
||||
if (tid == 0)
|
||||
channel->index = workFifoIx; // write back to real channel, not shmem shadow
|
||||
|
||||
__syncwarp();
|
||||
if (shmem.work.header.type == ncclWorkTypeColl) {
|
||||
@@ -579,52 +569,57 @@ __device__ void ncclKernel(struct ncclDevComm* comm) {
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (COLLTRACE && tid == 0) {
|
||||
traceKernelLaunch(shmem.work.elems[0],firstLaunch);
|
||||
firstLaunch = false;
|
||||
#pragma unroll 1
|
||||
for(int e=1; e < NCCL_MAX_WORK_ELEMENTS && shmem.work.elems[e].header.type != ncclWorkTypeUnused; e ++) {
|
||||
traceColl(shmem.work.elems[e], 0);
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
if (shmem.work.header.funcIndex == FnIndex) {
|
||||
RunWork<Fn, T, RedOp, Algo, Proto>().run(&shmem.work);
|
||||
} else {
|
||||
NCCL_CALL_FUNCTIONS<USING_LL128>(shmem.work.header.funcIndex);
|
||||
}
|
||||
|
||||
int workIxNext = shmem.work.header.workNext;
|
||||
__syncthreads();
|
||||
if (shmem.work.header.isLast) break;
|
||||
|
||||
copyToShmem16(tid, &shmem.work, workHead + workIxNext, sizeof(ncclWork));
|
||||
|
||||
{ // Check whether the last operation was aborted and make sure all threads exit
|
||||
int aborted = tid == 0 ? *comm->abortFlag : 0;
|
||||
if (__any(aborted)) { // publish shmem.work
|
||||
traceAbort();
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (tid == 0) __insert_timestamp(__LINE__);
|
||||
if (shmem.work.header.funcIndex == FnIndex)
|
||||
RunWork<Fn, T, RedOp, Algo, Proto>().run(&shmem.work);
|
||||
else
|
||||
NCCL_CALL_FUNCTIONS<USING_LL128>(shmem.work.header.funcIndex);
|
||||
|
||||
if (shmem.work.header.isLast) break;
|
||||
__syncthreads();
|
||||
if (COLLTRACE && tid == 0) traceColl(false);
|
||||
}
|
||||
if (COLLTRACE && tid == 0) traceKernelEnd()
|
||||
if (COLLTRACE && tid == 0) traceKernelEnd();
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (shmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
|
||||
__syncthreads();
|
||||
copyToShmem(shmem.comm.devProf+MAXCHANNELS*shmem.prof.seq+blockIdx.x, &shmem.prof);
|
||||
if (tid == 0) shmem.comm.devProf[bid].seq++;
|
||||
copyToShmem16(tid, shmem.comm.devProf+MAXCHANNELS*shmem.prof.seq+blockIdx.x, &shmem.prof, sizeof(struct ncclProf));
|
||||
if (tid == 0) shmem.comm.devProf[blockIdx.x].seq++;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
|
||||
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
|
||||
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, false>(comm); \
|
||||
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
|
||||
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, false>(comm, channelMask, workHead); \
|
||||
} \
|
||||
\
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
|
||||
__global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
|
||||
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, false>(comm); \
|
||||
__global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
|
||||
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, false>(comm, channelMask, workHead); \
|
||||
} \
|
||||
\
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
|
||||
__global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
|
||||
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, true>(comm); \
|
||||
__global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
|
||||
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, true>(comm, channelMask, workHead); \
|
||||
} \
|
||||
\
|
||||
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
|
||||
__global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
|
||||
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, true>(comm); \
|
||||
__global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
|
||||
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, true>(comm, channelMask, workHead); \
|
||||
}
|
||||
|
||||
// Examples : AllReduce, RING, LL, Sum, uint8
|
||||
@@ -683,7 +678,7 @@ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
|
||||
IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
|
||||
|
||||
// AllToAll Pivot primitive only has one function.
|
||||
#define IMPL_COLL_ALLTOALL_PIVOT(func) \
|
||||
#define IMPL_COLL_F(func) \
|
||||
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -23,7 +23,7 @@ __device__ struct ncclShmemData* ncclShmem;
|
||||
NCCL_FUNC5(func, RING, devredop, type, nullify), \
|
||||
NCCL_FUNC5(func, COLLNET, devredop, type, nullify)
|
||||
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
// Must be consistent with ncclDataType_t
|
||||
#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
|
||||
NCCL_FUNC4(func, devredop, int8_t, 0), \
|
||||
@@ -35,7 +35,7 @@ __device__ struct ncclShmemData* ncclShmem;
|
||||
NCCL_FUNC4(func, devredop, half, nullForFloat), \
|
||||
NCCL_FUNC4(func, devredop, float, nullForFloat), \
|
||||
NCCL_FUNC4(func, devredop, double, nullForFloat), \
|
||||
NCCL_FUNC4(func, devredop, rccl_bfloat16, nullForFloat)
|
||||
NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat)
|
||||
#define NCCL_FUNCS3B(func, devredop) \
|
||||
NCCL_FUNC4(func, devredop, int8_t, 0), \
|
||||
NCCL_FUNC4(func, devredop, int8_t, 0), \
|
||||
@@ -89,13 +89,12 @@ __device__ struct ncclShmemData* ncclShmem;
|
||||
NCCL_FUNCS3B(func, Sum)
|
||||
|
||||
// Must be consistent with the ncclFuncSet enum
|
||||
__device__ ncclKern_t ncclFuncs[2+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
|
||||
__device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
|
||||
// Don't try to initialize the host shadow copy of this device-side global
|
||||
// variable. There is no host pointer to a device-side function, which
|
||||
// confuses clang. This will be fixed in the next clang release.
|
||||
#if __CUDA_ARCH__
|
||||
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
|
||||
NCCL_FUNC_NAME(AllToAllPivot, RING, SIMPLE, Sum, int8_t),
|
||||
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t),
|
||||
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t),
|
||||
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t),
|
||||
@@ -105,8 +104,8 @@ __device__ ncclKern_t ncclFuncs[2+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedO
|
||||
NCCL_ONERANK_REDUCE_NAME(PreMulSum, half),
|
||||
NCCL_ONERANK_REDUCE_NAME(PreMulSum, float),
|
||||
NCCL_ONERANK_REDUCE_NAME(PreMulSum, double),
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16),
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16),
|
||||
#endif
|
||||
NCCL_FUNCS2B(Broadcast),
|
||||
NCCL_FUNCS2A(Reduce),
|
||||
|
||||
@@ -17,7 +17,7 @@ namespace {
|
||||
int tid = threadIdx.x;
|
||||
int tn = blockDim.x;
|
||||
#pragma unroll 1
|
||||
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) {
|
||||
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) {
|
||||
ncclWorkElem *we = &w->elems[e];
|
||||
intptr_t eltN = we->count;
|
||||
int bid = we->bid;
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,8 +7,6 @@
|
||||
#ifndef OP128_H_
|
||||
#define OP128_H_
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
|
||||
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
|
||||
: "=l"(v0), "=l"(v1) : "l"(ptr));
|
||||
@@ -67,6 +64,5 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1
|
||||
v0 = tmp8[0];
|
||||
v1 = tmp8[1];
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -155,5 +155,4 @@ struct PrimitivesWithoutDirect {
|
||||
#include "prims_simple.h"
|
||||
#include "prims_ll.h"
|
||||
#include "prims_ll128.h"
|
||||
|
||||
#endif
|
||||
|
||||
@@ -183,7 +183,7 @@ private:
|
||||
|
||||
template<int BeginIx>
|
||||
__device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) {
|
||||
#pragma unroll
|
||||
#pragma unroll 1
|
||||
for (int i=BeginIx; i < MaxRecv; i++) {
|
||||
if (i < fan.nrecv()) {
|
||||
union ncclLLFifoLine* src = recvPtr(i) + offset;
|
||||
@@ -412,7 +412,7 @@ private:
|
||||
}
|
||||
if (RECV) {
|
||||
data = !SRC ? peerData : MULTI<RedOp,T>()(redOp, peerData, data);
|
||||
#pragma unroll MaxRecv
|
||||
#pragma unroll 1
|
||||
for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
|
||||
peerData = readLLFinish(offset, line, i);
|
||||
data = MULTI<RedOp,T>()(redOp, peerData, data);
|
||||
@@ -502,11 +502,11 @@ private:
|
||||
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
|
||||
int nrecv=0, nsend=0;
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
|
||||
loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv);
|
||||
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
|
||||
nrecv++;
|
||||
}
|
||||
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
|
||||
loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend);
|
||||
loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend);
|
||||
nsend++;
|
||||
}
|
||||
this->fan = Fan(nrecv, nsend);
|
||||
|
||||
@@ -5,11 +5,12 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "op128.h"
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
|
||||
#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
|
||||
|
||||
|
||||
template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
|
||||
class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
|
||||
public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> {
|
||||
@@ -53,6 +54,15 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
|
||||
uint64_t* barriers;
|
||||
uint64_t* barrier_next;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
public:
|
||||
int npKitCtxIdx = 0;
|
||||
uint64_t npKitDataProcessEntryTime = 0;
|
||||
uint64_t npKitDataProcessExitTime = 0;
|
||||
uint64_t npKitDataProcessTotalTime = 0;
|
||||
private:
|
||||
#endif
|
||||
|
||||
inline __device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (nthreads != WARP_SIZE)
|
||||
@@ -405,11 +415,11 @@ public:
|
||||
auto *channel = &ncclShmem->channel;
|
||||
int nrecv=0, nsend=0;
|
||||
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
|
||||
loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv);
|
||||
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
|
||||
nrecv++;
|
||||
}
|
||||
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
|
||||
loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend);
|
||||
loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend);
|
||||
nsend++;
|
||||
}
|
||||
this->fan = Fan(nrecv, nsend);
|
||||
|
||||
@@ -50,7 +50,6 @@ class Primitives<
|
||||
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
|
||||
uint64_t* barriers;
|
||||
uint64_t* barrier_next;
|
||||
const uint64_t opCount;
|
||||
uint32_t* next_hdp_reg;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
@@ -377,6 +376,7 @@ private:
|
||||
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
|
||||
subBarrier();
|
||||
#pragma unroll 1
|
||||
// Loop over peers
|
||||
for (int j=0; j<fan.nsend(); j++) {
|
||||
int i = (j+shift)%fan.nsend();
|
||||
int peerOffset = i*peerElem;
|
||||
@@ -423,9 +423,9 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadRecvConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) {
|
||||
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) {
|
||||
auto *conn = &peer->recv[connIndex].conn;
|
||||
auto *conn = &peer->recv[connIndex];
|
||||
step = conn->step;
|
||||
step = roundUp(step, SlicePerChunk*StepPerSlice);
|
||||
if (flags & RolePostRecv) {
|
||||
@@ -463,14 +463,14 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void loadSendConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) {
|
||||
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
|
||||
if (flags & (RoleWaitSend|RolePostSend)) {
|
||||
auto *conn = &peer->send[connIndex].conn;
|
||||
auto *conn = &peer->send[connIndex];
|
||||
step = conn->step;
|
||||
step = roundUp(step, SlicePerChunk*StepPerSlice);
|
||||
if (flags & RolePostSend) {
|
||||
connStepPtr = conn->tail;
|
||||
next_hdp_reg = conn->next_hdp_reg;
|
||||
next_hdp_reg = conn->next_hdp_reg;
|
||||
}
|
||||
if (flags & RoleWaitSend) {
|
||||
ncclShmem->groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
|
||||
@@ -513,8 +513,7 @@ private:
|
||||
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
|
||||
):
|
||||
tid(tid),
|
||||
stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)),
|
||||
opCount(ncclShmem->work.elems[0].opCount) {
|
||||
stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
|
||||
|
||||
// For send operations, we need an extra warp to overlap the threadfence and the copy
|
||||
this->nthreads = nthreads;
|
||||
@@ -552,8 +551,8 @@ private:
|
||||
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
|
||||
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
|
||||
|
||||
loadRecvConn(&ncclShmem->channel.devPeers[peer], connIndex, e);
|
||||
loadSendConn(&ncclShmem->channel.devPeers[peer], connIndex, e);
|
||||
loadRecvConn(&ncclShmem->channel.peers[peer], connIndex, e);
|
||||
loadSendConn(&ncclShmem->channel.peers[peer], connIndex, e);
|
||||
|
||||
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->header.nWarps*WARP_SIZE;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclRing *ring = &ncclShmem->channel.ring;
|
||||
@@ -23,7 +23,7 @@ namespace {
|
||||
const ssize_t loopSize = nChannels*chunkSize;
|
||||
const ssize_t size = args->count;
|
||||
const int rank = ncclShmem->comm.rank;
|
||||
const int prevRank = ring->devUserRanks[nranks-1];
|
||||
const int prevRank = ring->userRanks[nranks-1];
|
||||
const int root = args->root;
|
||||
|
||||
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
|
||||
|
||||
@@ -13,11 +13,11 @@ namespace {
|
||||
template<typename T, typename RedOp, typename Proto>
|
||||
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
|
||||
const int tid = threadIdx.x;
|
||||
const int nthreads = args->header.nWarps*WARP_SIZE;
|
||||
const int nthreads = args->nWarps*WARP_SIZE;
|
||||
const int bid = args->bid;
|
||||
const int nChannels = args->nChannels;
|
||||
ncclRing *ring = &ncclShmem->channel.ring;
|
||||
int const *ringRanks = ring->devUserRanks;
|
||||
int const *ringRanks = ring->userRanks;
|
||||
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
|
||||
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
|
||||
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
|
||||
|
||||
@@ -15,6 +15,8 @@
|
||||
template<typename T, typename RedOp>
|
||||
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
__device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
|
||||
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
|
||||
size_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
bool isNpKitThread = (tid == 0);
|
||||
@@ -38,34 +40,35 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
|
||||
if (args->peer == ncclShmem->comm.rank) {
|
||||
struct ncclWorkElemP2p* recvArgs = args-1;
|
||||
if (args->buff != recvArgs->buff) {
|
||||
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
|
||||
if (buff != recvBuff) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count);
|
||||
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count);
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT)
|
||||
if (isNpKitThread) {
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
|
||||
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
|
||||
}
|
||||
#endif
|
||||
@@ -73,11 +76,10 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
}
|
||||
} else {
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
ssize_t const count = args->count;
|
||||
int const chunkSize = args->chunkSize/sizeof(T);
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
|
||||
(tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group);
|
||||
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (isNpKitThread) {
|
||||
@@ -93,9 +95,9 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
}
|
||||
#endif
|
||||
|
||||
ssize_t offset = 0;
|
||||
size_t offset = 0;
|
||||
do {
|
||||
int nelem = min(chunkSize, count-offset);
|
||||
int nelem = min(size_t(chunkSize), count-offset);
|
||||
prims.directSend(offset, offset, nelem);
|
||||
offset += nelem;
|
||||
} while(offset < count);
|
||||
@@ -133,11 +135,12 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
|
||||
if (args->peer != ncclShmem->comm.rank) {
|
||||
using Proto = ProtoSimple<1, 1>;
|
||||
ssize_t const count = args->count;
|
||||
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
|
||||
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
|
||||
int const chunkSize = args->chunkSize/sizeof(T);
|
||||
int const peer = args->peer;
|
||||
Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
|
||||
(tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group);
|
||||
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group);
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
if (isNpKitThread) {
|
||||
@@ -153,9 +156,9 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
}
|
||||
#endif
|
||||
|
||||
ssize_t offset = 0;
|
||||
size_t offset = 0;
|
||||
do {
|
||||
int nelem = min(chunkSize, count-offset);
|
||||
int nelem = min(size_t(chunkSize), count-offset);
|
||||
prims.directRecv(offset, nelem);
|
||||
offset += nelem;
|
||||
} while(offset < count);
|
||||
@@ -182,11 +185,11 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
|
||||
#define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
|
||||
int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
|
||||
args += group;
|
||||
if (args->header.type == ncclWorkTypeUnused) return;
|
||||
|
||||
tid -= args->warpStart * WARP_SIZE;
|
||||
int nthreads = args->nWarps * WARP_SIZE;
|
||||
group |= (args->connIndex<<16); // Used to select connIndex 1
|
||||
|
||||
if (args->p2pType == ncclWorkP2pTypeUnused) return;
|
||||
if (tid >= nthreads || args->peer == -1) return;
|
||||
if ((group%2) == 0) {
|
||||
runRecv(tid, nthreads, group, args);
|
||||
|
||||
+59
-39
@@ -9,29 +9,37 @@
|
||||
#include "nccl_net.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <sys/syscall.h>
|
||||
|
||||
int ncclDebugLevel = -1;
|
||||
static int pid = -1;
|
||||
static char hostname[1024];
|
||||
thread_local int ncclDebugNoWarn = 0;
|
||||
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
|
||||
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
|
||||
FILE *ncclDebugFile = stdout;
|
||||
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
std::chrono::steady_clock::time_point ncclEpoch;
|
||||
|
||||
static __thread int tid = -1;
|
||||
|
||||
void ncclDebugInit() {
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
|
||||
const char* nccl_debug = getenv("NCCL_DEBUG");
|
||||
int tempNcclDebugLevel = -1;
|
||||
if (nccl_debug == NULL) {
|
||||
ncclDebugLevel = NCCL_LOG_NONE;
|
||||
tempNcclDebugLevel = NCCL_LOG_NONE;
|
||||
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_VERSION;
|
||||
tempNcclDebugLevel = NCCL_LOG_VERSION;
|
||||
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_WARN;
|
||||
tempNcclDebugLevel = NCCL_LOG_WARN;
|
||||
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_INFO;
|
||||
tempNcclDebugLevel = NCCL_LOG_INFO;
|
||||
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_ABORT;
|
||||
tempNcclDebugLevel = NCCL_LOG_ABORT;
|
||||
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
|
||||
ncclDebugLevel = NCCL_LOG_TRACE;
|
||||
tempNcclDebugLevel = NCCL_LOG_TRACE;
|
||||
}
|
||||
|
||||
/* Parse the NCCL_DEBUG_SUBSYS env var
|
||||
@@ -65,6 +73,8 @@ void ncclDebugInit() {
|
||||
mask = NCCL_ENV;
|
||||
} else if (strcasecmp(subsys, "ALLOC") == 0) {
|
||||
mask = NCCL_ALLOC;
|
||||
} else if (strcasecmp(subsys, "CALL") == 0) {
|
||||
mask = NCCL_CALL;
|
||||
} else if (strcasecmp(subsys, "ALL") == 0) {
|
||||
mask = NCCL_ALL;
|
||||
}
|
||||
@@ -76,12 +86,16 @@ void ncclDebugInit() {
|
||||
free(ncclDebugSubsys);
|
||||
}
|
||||
|
||||
// Cache pid and hostname
|
||||
getHostName(hostname, 1024, '.');
|
||||
pid = getpid();
|
||||
|
||||
/* Parse and expand the NCCL_DEBUG_FILE path and
|
||||
* then create the debug file. But don't bother unless the
|
||||
* NCCL_DEBUG level is > VERSION
|
||||
*/
|
||||
const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
|
||||
if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
|
||||
if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
|
||||
int c = 0;
|
||||
char debugFn[PATH_MAX+1] = "";
|
||||
char *dfn = debugFn;
|
||||
@@ -95,12 +109,10 @@ void ncclDebugInit() {
|
||||
*dfn++ = '%';
|
||||
break;
|
||||
case 'h': // %h = hostname
|
||||
char hostname[1024];
|
||||
getHostName(hostname, 1024, '.');
|
||||
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
|
||||
break;
|
||||
case 'p': // %p = pid
|
||||
dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
|
||||
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
|
||||
break;
|
||||
default: // Echo everything we don't understand
|
||||
*dfn++ = '%';
|
||||
@@ -111,15 +123,15 @@ void ncclDebugInit() {
|
||||
*dfn = '\0';
|
||||
if (debugFn[0] != '\0') {
|
||||
FILE *file = fopen(debugFn, "w");
|
||||
if (file != NULL) {
|
||||
if (file != nullptr) {
|
||||
setbuf(file, nullptr); // disable buffering
|
||||
ncclDebugFile = file;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
ncclEpoch = std::chrono::high_resolution_clock::now();
|
||||
#endif
|
||||
ncclEpoch = std::chrono::steady_clock::now();
|
||||
__atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
}
|
||||
|
||||
@@ -128,45 +140,53 @@ void ncclDebugInit() {
|
||||
* they can share the debugging mechanisms and output files
|
||||
*/
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
|
||||
if (ncclDebugLevel == -1) ncclDebugInit();
|
||||
if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit();
|
||||
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
|
||||
|
||||
// Save the last error (WARN) as a human readable string
|
||||
if (level == NCCL_LOG_WARN) {
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
va_list vargs;
|
||||
va_start(vargs, fmt);
|
||||
(void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs);
|
||||
va_end(vargs);
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
}
|
||||
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
|
||||
|
||||
// Gather the rank information. This can take > 1us so we want to make sure
|
||||
// we only do it when needed.
|
||||
char hostname[1024];
|
||||
getHostName(hostname, 1024, '.');
|
||||
if (tid == -1) {
|
||||
tid = syscall(SYS_gettid);
|
||||
}
|
||||
|
||||
int cudaDev;
|
||||
hipGetDevice(&cudaDev);
|
||||
int pid = getpid();
|
||||
int tid = syscall(SYS_gettid);
|
||||
if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
|
||||
hipGetDevice(&cudaDev);
|
||||
}
|
||||
|
||||
char buffer[1024];
|
||||
size_t len = 0;
|
||||
pthread_mutex_lock(&ncclDebugLock);
|
||||
if (level == NCCL_LOG_WARN)
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
|
||||
else if (level == NCCL_LOG_INFO)
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
|
||||
#ifdef ENABLE_TRACE
|
||||
else if (level == NCCL_LOG_TRACE) {
|
||||
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
|
||||
if (level == NCCL_LOG_WARN) {
|
||||
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
|
||||
hostname, pid, tid, cudaDev, filefunc, line);
|
||||
} else if (level == NCCL_LOG_INFO) {
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
|
||||
} else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid);
|
||||
} else if (level == NCCL_LOG_TRACE) {
|
||||
auto delta = std::chrono::steady_clock::now() - ncclEpoch;
|
||||
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
|
||||
len = snprintf(buffer, sizeof(buffer),
|
||||
"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line);
|
||||
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ",
|
||||
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (len) {
|
||||
va_list vargs;
|
||||
va_start(vargs, fmt);
|
||||
(void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
|
||||
va_end(vargs);
|
||||
fprintf(ncclDebugFile,"%s\n", buffer);
|
||||
fflush(ncclDebugFile);
|
||||
buffer[len++] = '\n';
|
||||
fwrite(buffer, 1, len, ncclDebugFile);
|
||||
}
|
||||
pthread_mutex_unlock(&ncclDebugLock);
|
||||
}
|
||||
|
||||
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
|
||||
|
||||
+1027
-942
La diferencia del archivo ha sido suprimido porque es demasiado grande
Cargar Diff
+12
-13
@@ -448,10 +448,10 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
|
||||
|
||||
// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
|
||||
// remote proxies without risking deadlocks
|
||||
int ncclPxnDisable() {
|
||||
int ncclPxnDisable(struct ncclComm* comm) {
|
||||
static int pxnDisable = -1;
|
||||
if (pxnDisable == -1) {
|
||||
if (ncclNetVersion() == 4) {
|
||||
if (comm && ncclNetVersion(comm) == 4) {
|
||||
INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
|
||||
pxnDisable = 1;
|
||||
} else {
|
||||
@@ -490,7 +490,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
|
||||
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) {
|
||||
// Precompute paths between GPUs/NICs.
|
||||
|
||||
// Remove everything in case we're re-computing
|
||||
@@ -518,16 +518,16 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
||||
}
|
||||
}
|
||||
|
||||
if (peerInfos == NULL) continue;
|
||||
if (comm == NULL) continue;
|
||||
// Remove GPUs we can't talk to because of containers.
|
||||
struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank[0];
|
||||
struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank[0];
|
||||
for (int p=0; p<system->nodes[GPU].count; p++) {
|
||||
if (p == g) continue;
|
||||
struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank[0];
|
||||
struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank[0];
|
||||
int shm;
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
|
||||
int p2p;
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
|
||||
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
|
||||
if (shm == 0 && p2p == 0) {
|
||||
// Mark this peer as inaccessible. We'll trim it later.
|
||||
system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
|
||||
@@ -543,7 +543,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
|
||||
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
|
||||
if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
|
||||
if (ncclPxnDisable(comm) != 1 && gpu->paths[NET][n].type > PATH_PXB) {
|
||||
int pxnGpu = -1;
|
||||
|
||||
for (int p=0; p<system->nodes[GPU].count; p++) {
|
||||
@@ -556,7 +556,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
|
||||
pxnGpu = p;
|
||||
|
||||
int netDev;
|
||||
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank[0], &netDev));
|
||||
// To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one.
|
||||
if (netDev == netNode->id) break;
|
||||
@@ -602,8 +601,8 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
|
||||
}
|
||||
for (int j=0; j<gpu->gpu.nRanksPerGpu; j++ ) {
|
||||
if (gpu->gpu.rank[j] == comm->rank) {
|
||||
myDomain = domains[g];
|
||||
break;
|
||||
myDomain = domains[g];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -768,7 +767,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
|
||||
// We want to spread channels used when there aren't many and progressively
|
||||
// fill the whole space of nChannels. To do so we mirror the bits in the
|
||||
// nChannels space.
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
for (int c=0; c<comm->p2pnChannels; c++) {
|
||||
int mirror = 0;
|
||||
for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
|
||||
comm->p2pChannels[c] = mirror;
|
||||
|
||||
@@ -275,8 +275,8 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
for (int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
|
||||
if (system->nodes[GPU].nodes[i].gpu.rank[j] == nextRank) {
|
||||
*g = i;
|
||||
return ncclSuccess;
|
||||
*g = i;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1103,10 +1103,14 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
|
||||
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev));
|
||||
*proxyRank = rank;
|
||||
|
||||
int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel();
|
||||
int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel();
|
||||
// See whether we can use the remote rank preferred device.
|
||||
if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) {
|
||||
int netDev = comm->peerInfo[peerRank].netDev;
|
||||
// Find local NIC number close to local cudaDev
|
||||
int cudaDev = comm->peerInfo[peerRank].cudaDev;
|
||||
int localRank;
|
||||
if (ncclTopoDevToRank(comm->topo, cudaDev, &localRank) != ncclSuccess) return ncclSuccess;
|
||||
int netDev = comm->peerInfo[localRank].netDev;
|
||||
int n;
|
||||
// Check that device exists on our node
|
||||
if (ncclParamCrossNic() == 0) {
|
||||
|
||||
+13
-9
@@ -724,11 +724,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
|
||||
// so we start with collnet so that it has precedence.
|
||||
int netDevCount = 0;
|
||||
if (collNetSupport()) {
|
||||
NCCLCHECK(collNetDevices(&netDevCount));
|
||||
if (collNetSupport(comm)) {
|
||||
NCCLCHECK(collNetDevices(comm, &netDevCount));
|
||||
for (int n=0; n<netDevCount; n++) {
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(collNetGetProperties(n, &props));
|
||||
NCCLCHECK(collNetGetProperties(comm, n, &props));
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
|
||||
@@ -737,16 +737,18 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
|
||||
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
|
||||
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
|
||||
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
|
||||
}
|
||||
}
|
||||
if (netDevCount == 0) {
|
||||
NCCLCHECK(ncclNetDevices(&netDevCount));
|
||||
NCCLCHECK(ncclNetDevices(comm, &netDevCount));
|
||||
}
|
||||
for (int n=0; n<netDevCount; n++) {
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(ncclNetGetProperties(n, &props));
|
||||
NCCLCHECK(ncclNetGetProperties(comm, n, &props));
|
||||
struct ncclXmlNode* netNode;
|
||||
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
|
||||
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
|
||||
@@ -756,7 +758,9 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
|
||||
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
|
||||
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
|
||||
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
|
||||
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
|
||||
}
|
||||
|
||||
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
|
||||
@@ -903,8 +907,8 @@ ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int*
|
||||
for (int g=0; g<system->nodes[GPU].count; g++) {
|
||||
for ( int j=0; j<system->nodes[GPU].nodes[g].gpu.nRanksPerGpu; j++ ){
|
||||
if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) {
|
||||
*localRank = g;
|
||||
return ncclSuccess;
|
||||
*localRank = g;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+14
-3
@@ -198,20 +198,31 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
|
||||
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||
for (int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
|
||||
if (system->nodes[GPU].nodes[i].gpu.rank[j] == rank) {
|
||||
*index = i;
|
||||
return ncclSuccess;
|
||||
*index = i;
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, int* rank) {
|
||||
*rank = -1;
|
||||
for (int i=0; i<system->nodes[GPU].count; i++) {
|
||||
if (system->nodes[GPU].nodes[i].gpu.dev == dev) {
|
||||
*rank = system->nodes[GPU].nodes[i].gpu.rank[0];
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
// Returns XGMI speed in GB/s
|
||||
static float ncclTopoXGMISpeed(int gcn) {
|
||||
return gcn == 910 ? MI200_XGMI_WIDTH : VEGA_XGMI_WIDTH;
|
||||
}
|
||||
|
||||
#define ncclGetKernelIndex(p_comm) \
|
||||
(((p_comm)->topo->ll128Enabled ? 1 : 0)*2 + ((p_comm)->hostDevComm.collTraceThread ? 1 : 0))
|
||||
(((p_comm)->topo->ll128Enabled ? 1 : 0)*2 + ((p_comm)->collTraceThread ? 1 : 0))
|
||||
|
||||
#endif
|
||||
|
||||
@@ -235,11 +235,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, simpleDefaultThreads);
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
|
||||
#else
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
|
||||
|
||||
+212
-396
@@ -11,446 +11,262 @@
|
||||
#include "transport.h"
|
||||
#include "channel.h"
|
||||
|
||||
#define MAX_ASYNC_OPS 128
|
||||
thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
|
||||
thread_local int ncclGroupIndex = 0;
|
||||
thread_local int ncclGroupMode = 0;
|
||||
thread_local ncclResult_t ncclGroupError = ncclSuccess;
|
||||
extern struct allocationTracker allocTracker[];
|
||||
__thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting
|
||||
__thread ncclResult_t ncclGroupError = ncclSuccess;
|
||||
__thread struct ncclComm* ncclGroupCommHead = nullptr;
|
||||
__thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr;
|
||||
__thread struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> ncclAsyncJobs;
|
||||
|
||||
bool ncclAsyncMode() {
|
||||
return ncclGroupMode > 0;
|
||||
}
|
||||
|
||||
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret) {
|
||||
if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct ncclInitArgs {
|
||||
ncclInitFunc_t func;
|
||||
int cudaDev;
|
||||
ncclComm_t* newcomm;
|
||||
int ndev;
|
||||
ncclUniqueId commId;
|
||||
int myrank;
|
||||
int virtualId;
|
||||
};
|
||||
struct ncclCollArgs {
|
||||
ncclComm_t comm;
|
||||
uint16_t connIndex;
|
||||
};
|
||||
|
||||
enum ncclAsyncFuncType {
|
||||
ASYNC_FUNC_INVALID = 0,
|
||||
ASYNC_FUNC_INIT = 1,
|
||||
ASYNC_FUNC_COLL = 2,
|
||||
};
|
||||
struct ncclAsyncArgs {
|
||||
ncclResult_t ret;
|
||||
enum ncclAsyncFuncType funcType;
|
||||
union {
|
||||
ncclCollArgs coll;
|
||||
ncclInitArgs init;
|
||||
};
|
||||
};
|
||||
|
||||
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
|
||||
|
||||
void* ncclAsyncThreadMain(void* args_) {
|
||||
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
|
||||
NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank,
|
||||
args->init.cudaDev, args->init.virtualId));
|
||||
return args;
|
||||
}
|
||||
|
||||
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev, int virtualId) {
|
||||
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
|
||||
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
|
||||
return ncclAsyncErrCheck(ncclInvalidUsage);
|
||||
ncclResult_t ncclAsyncLaunch(
|
||||
struct ncclAsyncJob* job,
|
||||
ncclResult_t(*func)(struct ncclAsyncJob*),
|
||||
void(*undo)(struct ncclAsyncJob*),
|
||||
void(*destructor)(void*)
|
||||
) {
|
||||
if (0 == ncclGroupDepth) {
|
||||
ncclResult_t res = func(job);
|
||||
if (res != ncclSuccess && undo) undo(job);
|
||||
if (destructor) destructor(job);
|
||||
return res;
|
||||
} else {
|
||||
job->func = func;
|
||||
job->undo = undo;
|
||||
job->destructor = destructor;
|
||||
ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
|
||||
return ncclSuccess;
|
||||
}
|
||||
int index = ncclGroupIndex++;
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+index;
|
||||
args->funcType = ASYNC_FUNC_INIT;
|
||||
args->init.func = func;
|
||||
args->init.cudaDev = cudaDev;
|
||||
args->init.newcomm = newcomm;
|
||||
args->init.ndev = ndev;
|
||||
memcpy(&args->init.commId, &commId, sizeof(commId));
|
||||
args->init.myrank = myrank;
|
||||
args->init.virtualId = virtualId;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclAsyncColl(ncclComm_t comm) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs;
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
if (args->coll.comm == comm) return ncclSuccess;
|
||||
args++;
|
||||
void* ncclAsyncJobMain(void* arg) {
|
||||
struct ncclAsyncJob* job = (struct ncclAsyncJob*)arg;
|
||||
job->result = job->func(job);
|
||||
if (job->result != ncclSuccess) {
|
||||
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result);
|
||||
}
|
||||
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
|
||||
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
|
||||
return ncclAsyncErrCheck(ncclInvalidUsage);
|
||||
}
|
||||
ncclGroupIndex++;
|
||||
args->funcType = ASYNC_FUNC_COLL;
|
||||
args->coll.comm = comm;
|
||||
return ncclSuccess;
|
||||
return arg;
|
||||
}
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupStart);
|
||||
ncclResult_t ncclGroupStart() {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
if (ncclGroupMode == 0) {
|
||||
memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
|
||||
}
|
||||
ncclGroupMode++;
|
||||
NCCLCHECK(ncclGroupStartInternal());
|
||||
TRACE_CALL("ncclGroupStart()");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
|
||||
struct ncclInfo info = { ncclFuncSend, "Send",
|
||||
NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
|
||||
1, 1 };
|
||||
info.channelId = channelId;
|
||||
info.opCount = opCount;
|
||||
info.connIndex = connIndex;
|
||||
NCCLCHECK(ncclSetupP2pKernel(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
|
||||
struct ncclInfo info = { ncclFuncRecv, "Recv",
|
||||
NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
|
||||
1, 1 };
|
||||
info.channelId = channelId;
|
||||
info.opCount = opCount;
|
||||
info.connIndex = connIndex;
|
||||
NCCLCHECK(ncclSetupP2pKernel(&info));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void* ncclAsyncThreadPreconnect(void* args_) {
|
||||
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, args->coll.connIndex));
|
||||
return args;
|
||||
}
|
||||
|
||||
static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
|
||||
size_t size = std::max(minSize, DIVUP(totalSize, minChannels));
|
||||
int nChannels = minChannels;
|
||||
while (size > maxSize && nChannels <= maxChannels/2) {
|
||||
nChannels *= 2;
|
||||
size = DIVUP(totalSize, nChannels);
|
||||
}
|
||||
ALIGN_SIZE(size, minSize);
|
||||
return size;
|
||||
}
|
||||
|
||||
RCCL_PARAM(P2pNetThreshold, "P2P_NET_THRESHOLD", 131072);
|
||||
|
||||
NCCL_API(ncclResult_t, ncclGroupEnd);
|
||||
ncclResult_t ncclGroupEnd() {
|
||||
NVTX3_FUNC_RANGE_IN(nccl_domain);
|
||||
if (ncclGroupMode == 0) {
|
||||
NCCLCHECK(ncclGroupEndInternal());
|
||||
TRACE_CALL("ncclGroupEnd()");
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
struct ncclPreconnectJob {
|
||||
struct ncclAsyncJob base;
|
||||
struct ncclComm* comm;
|
||||
};
|
||||
ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) {
|
||||
struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_;
|
||||
struct ncclComm* comm = job->comm;
|
||||
CUDACHECK(hipSetDevice(comm->cudaDev));
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
|
||||
if (comm->p2pNet) NCCLCHECK(ncclTransportP2pSetup(comm, NULL, NCCL_CONN_IDX_P2P_NET));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t doLaunches(struct ncclComm* head) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
struct ncclComm* cliqueComm0 = head->intraComm0;
|
||||
struct ncclComm* cliqueHead = head;
|
||||
struct ncclComm* cliqueNextHead;
|
||||
bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup;
|
||||
// This outer loop iterates over cliques of comms which are siblings of the
|
||||
// same global entity. We calculate a clique as all comms which have the same
|
||||
// `intraComm0` value.
|
||||
do {
|
||||
struct ncclComm* comm = cliqueHead;
|
||||
bool capturingYes = false, capturingNo = false;
|
||||
do {
|
||||
(ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true;
|
||||
CUDACHECKGOTO(hipSetDevice(comm->cudaDev), result, failure);
|
||||
NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
|
||||
if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
|
||||
comm = comm->groupNext;
|
||||
} while (comm != nullptr && comm->intraComm0 == cliqueComm0);
|
||||
cliqueNextHead = comm;
|
||||
|
||||
if (capturingYes && capturingNo) {
|
||||
// We have entered barriers but are aborting without leaving them. Thus
|
||||
// these comms are permanently trashed. We need a good mechanism for
|
||||
// tracking and reporting that.
|
||||
WARN("Either none or all communicators in a ncclGroup() can be CUDA graph captured.");
|
||||
result = ncclInvalidUsage;
|
||||
goto failure;
|
||||
}
|
||||
|
||||
while (true) { // Iterate rounds of launches for clique.
|
||||
bool moreRounds;
|
||||
comm = cliqueHead;
|
||||
do { // Iterate clique members.
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
if (useBarrier) {
|
||||
// Barrier reduction result tells us if this was the final round.
|
||||
moreRounds = 0 != ncclCommIntraBarrierOut(comm);
|
||||
} else {
|
||||
moreRounds = comm->unlaunchedPlansHead != nullptr;
|
||||
}
|
||||
if (moreRounds) {
|
||||
// Pop next unlaunched kernel
|
||||
struct ncclKernelPlan* plan = comm->unlaunchedPlansHead;
|
||||
if (plan != nullptr) {
|
||||
comm->unlaunchedPlansHead = plan->next;
|
||||
CUDACHECKGOTO(hipSetDevice(comm->cudaDev), result, failure);
|
||||
NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure);
|
||||
NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
|
||||
}
|
||||
// Barrier reduction input indicates if we require further rounds.
|
||||
if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0);
|
||||
if (plan != nullptr) {
|
||||
NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure);
|
||||
}
|
||||
} else { // Final round.
|
||||
CUDACHECKGOTO(hipSetDevice(comm->cudaDev), result, failure);
|
||||
NCCLCHECKGOTO(ncclLaunchFinish(comm), result, failure);
|
||||
}
|
||||
comm = next;
|
||||
} while (comm != cliqueNextHead);
|
||||
if (!moreRounds) break;
|
||||
}
|
||||
cliqueHead = cliqueNextHead;
|
||||
} while (cliqueHead != nullptr);
|
||||
failure:
|
||||
return result;
|
||||
}
|
||||
|
||||
ncclResult_t ncclGroupEndInternal() {
|
||||
if (ncclGroupDepth == 0) {
|
||||
WARN("ncclGroupEnd: not in a group call.");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
ncclGroupMode--;
|
||||
if (ncclGroupMode > 0) return ncclSuccess;
|
||||
ncclGroupDepth--;
|
||||
if (ncclGroupDepth > 0) return ncclSuccess;
|
||||
|
||||
int savedDev;
|
||||
CUDACHECK(hipGetDevice(&savedDev));
|
||||
int activeThreads = 0;
|
||||
int doneArray[MAX_ASYNC_OPS];
|
||||
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
|
||||
|
||||
ncclResult_t ret = ncclGroupError;
|
||||
int usingCudaGraphAll = -1;
|
||||
hipGraph_t* graphs = NULL;
|
||||
if (ret != ncclSuccess) goto group_cleanup;
|
||||
bool jobsDone = false;
|
||||
if (ret != ncclSuccess) goto failure;
|
||||
|
||||
/* Launch async ncclCommInitRank */
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT) {
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
|
||||
activeThreads++;
|
||||
doneArray[i] = 0;
|
||||
}
|
||||
}
|
||||
/* For init, since we use threads, we just wait for threads to complete */
|
||||
while (activeThreads) {
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
|
||||
int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
|
||||
if (err == EBUSY) continue;
|
||||
if (err != 0) ret = ncclSystemError;
|
||||
if (args->ret != ncclSuccess) ret = args->ret;
|
||||
doneArray[i] = 1;
|
||||
activeThreads--;
|
||||
}
|
||||
}
|
||||
if (ncclGroupCommPreconnectHead != nullptr) {
|
||||
struct ncclComm* comm = ncclGroupCommPreconnectHead;
|
||||
do {
|
||||
struct ncclPreconnectJob* job;
|
||||
NCCLCHECK(ncclCalloc(&job, 1));
|
||||
job->base.func = ncclPreconnectFunc;
|
||||
job->base.undo = nullptr;
|
||||
job->base.destructor = free;
|
||||
job->comm = comm;
|
||||
ncclIntruQueueEnqueue(&ncclAsyncJobs, &job->base);
|
||||
|
||||
struct ncclComm* next = comm->preconnectNext;
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
comm = next;
|
||||
} while (comm != nullptr);
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[1]) {
|
||||
args->coll.connIndex = 1;
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
|
||||
}
|
||||
}
|
||||
if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
|
||||
do {
|
||||
pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job);
|
||||
job = job->next;
|
||||
} while (job != nullptr);
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[1]) {
|
||||
int err = pthread_join(ncclGroupThreads[i], NULL);
|
||||
job = ncclIntruQueueHead(&ncclAsyncJobs);
|
||||
do {
|
||||
int err = pthread_join(job->thread, nullptr);
|
||||
if (err != 0) {
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
return ncclSystemError;
|
||||
ret = ncclSystemError;
|
||||
}
|
||||
INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
|
||||
NCCLCHECKGOTO(args->ret, ret, end);
|
||||
args->coll.comm->connect[1] = 0;
|
||||
}
|
||||
if (ret == ncclSuccess && job->result != ncclSuccess) ret = job->result;
|
||||
job = job->next;
|
||||
} while (job != nullptr);
|
||||
|
||||
jobsDone = true;
|
||||
if (ret != ncclSuccess) goto failure;
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
|
||||
args->coll.connIndex = NCCL_CONN_IDX_P2P_NET;
|
||||
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
|
||||
}
|
||||
if (ncclGroupCommHead != nullptr) {
|
||||
NCCLCHECKGOTO(doLaunches(ncclGroupCommHead), ret, failure);
|
||||
do {
|
||||
struct ncclComm* comm = ncclGroupCommHead;
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
ncclGroupCommLeave(comm);
|
||||
ncclGroupCommHead = next;
|
||||
} while (ncclGroupCommHead != nullptr);
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
|
||||
int err = pthread_join(ncclGroupThreads[i], NULL);
|
||||
if (err != 0) {
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
return ncclSystemError;
|
||||
if (false) {
|
||||
failure:
|
||||
struct ncclComm* comm = ncclGroupCommHead;
|
||||
while (comm != nullptr) {
|
||||
struct ncclComm* next = comm->groupNext;
|
||||
ncclGroupCommLeave(comm); // overwrites comm->groupNext
|
||||
// We don't know if preconnect succeeded or happened at all, so clear
|
||||
// the flags that let `taskAppend()` skip over checking if preconnect
|
||||
// is needed.
|
||||
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
for (int i=0; i < comm->nRanks; i++) {
|
||||
comm->tasks.peers[i].sendSeen = false;
|
||||
comm->tasks.peers[i].recvSeen = false;
|
||||
comm->connectSend[i] = 0;
|
||||
comm->connectRecv[i] = 0;
|
||||
comm->connectSend[i+comm->nRanks*NCCL_CONN_IDX_P2P_NET] = 0;
|
||||
comm->connectRecv[i+comm->nRanks*NCCL_CONN_IDX_P2P_NET] = 0;
|
||||
}
|
||||
INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P NET preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
|
||||
NCCLCHECKGOTO(args->ret, ret, end);
|
||||
args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
int node = comm->node;
|
||||
int nNodes = comm->nNodes;
|
||||
int localRank = comm->localRank;
|
||||
|
||||
// Compute how much to split operations
|
||||
// Natural step size matching buffer steps.
|
||||
ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
|
||||
// Try to use all channels
|
||||
int nChannelsMax = comm->p2pnChannelsPerPeer;
|
||||
int nChannelsMin = nChannelsMax;
|
||||
// Try to use all channels, but one channel per operation.
|
||||
//while (nChannelsMin*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels) && nChannelsMin > 1) nChannelsMin /= 2;
|
||||
// Avoid overloading channels with 8+ operations as we loose the sync warp, hence a bit of bandwidth.
|
||||
//while (nChannelsMax*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels)*4 && nChannelsMax > 1) nChannelsMax /= 2;
|
||||
|
||||
while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
|
||||
// schedule delta 0, +1, -1, +2, -2, ...
|
||||
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
|
||||
for (int d=0; d<=nNodes/4; d++) {
|
||||
int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
|
||||
int index = 0;
|
||||
int delta = deltas[index];
|
||||
sched_delta:
|
||||
uint32_t recvNode = (node+nNodes-delta)%nNodes;
|
||||
uint32_t sendNode = (node+delta)%nNodes;
|
||||
int steps = comm->maxLocalRanks;
|
||||
for (int s=0; s<steps; s++) {
|
||||
int recvIndex = (localRank-s+steps)%steps;
|
||||
int recvPeer = recvIndex<comm->nodeRanks[recvNode].localRanks ? comm->nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
|
||||
int sendIndex = (localRank+s)%steps;
|
||||
int sendPeer = sendIndex<comm->nodeRanks[sendNode].localRanks ? comm->nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
|
||||
struct ncclP2Pinfo* recv = recvPeer != -1 && comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL;
|
||||
struct ncclP2Pinfo* send = sendPeer != -1 && comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL;
|
||||
if (recv != NULL || send != NULL) {
|
||||
ssize_t totRecvBytes = -1, totSendBytes = -1;
|
||||
if (recv != NULL) totRecvBytes = recv->nbytes;
|
||||
if (send != NULL) totSendBytes = send->nbytes;
|
||||
if (recv) comm->p2pRecvCount--;
|
||||
if (send) comm->p2pSendCount--;
|
||||
if (recvPeer == comm->rank) { // Check self send/recv
|
||||
if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; }
|
||||
if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; }
|
||||
if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; }
|
||||
}
|
||||
void* recvBuff = recv ? recv->buff : NULL;
|
||||
void* sendBuff = send ? send->buff : NULL;
|
||||
// After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL.
|
||||
if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle();
|
||||
if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle();
|
||||
|
||||
ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
|
||||
ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
|
||||
|
||||
uint16_t sendIdx = 1, recvIdx = 1;
|
||||
if(comm->p2pNet && totSendBytes > rcclParamP2pNetThreshold())
|
||||
sendIdx = NCCL_CONN_IDX_P2P_NET;
|
||||
if(comm->p2pNet && totRecvBytes > rcclParamP2pNetThreshold())
|
||||
recvIdx = NCCL_CONN_IDX_P2P_NET;
|
||||
|
||||
ssize_t sendOffset = 0;
|
||||
ssize_t recvOffset = 0;
|
||||
int sendRemaining = 1, recvRemaining = 1;
|
||||
int chunk = 0;
|
||||
do {
|
||||
int channelId;
|
||||
// Shuffle channels with s intra-node, and delta inter-node. Inter-node, make sure
|
||||
// to use multiple channels to guarantee progress on all ranks from the same node.
|
||||
ssize_t recvbytes = totRecvBytes-recvOffset;
|
||||
ssize_t sendbytes = totSendBytes-sendOffset;
|
||||
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
|
||||
if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
|
||||
// 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
|
||||
// (total size == 0), otherwise set size to -1.
|
||||
if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL;
|
||||
if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL;
|
||||
if (send || recv) {
|
||||
if (recv) {
|
||||
NCCLCHECK(ncclChannelCompute(comm, recvPeer, chunk%comm->p2pnChannelsPerPeer, ncclFuncRecv, &channelId));
|
||||
}
|
||||
else
|
||||
recvPeer = -1;
|
||||
if (send) {
|
||||
NCCLCHECK(ncclChannelCompute(comm, sendPeer, chunk%comm->p2pnChannelsPerPeer, ncclFuncSend, &channelId));
|
||||
}
|
||||
else
|
||||
sendPeer = -1;
|
||||
NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, recv ? ((char*)recvBuff)+recvOffset : NULL, recv ? recv->opCount : 0, recvIdx), ret, group_cleanup);
|
||||
NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, send ? ((char*)sendBuff)+sendOffset : NULL, send ? send->opCount : 0, sendIdx), ret, group_cleanup);
|
||||
}
|
||||
recvOffset += recvChunkSize;
|
||||
sendOffset += sendChunkSize;
|
||||
chunk++;
|
||||
} while (sendRemaining || recvRemaining);
|
||||
comm->unlaunchedPlansHead = nullptr;
|
||||
// Reclaim abandoned kernel plan memory. Note ncclWork structs were already
|
||||
// reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
|
||||
while (!ncclIntruQueueEmpty(&comm->planQueue)) {
|
||||
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue);
|
||||
// Persistent plans will be reclaimed via the callbackQueue when the
|
||||
// graph drops its UserObject reference.
|
||||
if (!plan->persistent) {
|
||||
for (int c=0; c < MAXCHANNELS; c++) {
|
||||
while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) {
|
||||
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue);
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
|
||||
}
|
||||
}
|
||||
index++;
|
||||
if (index == 1 && deltas[1] == deltas[0]) index++;
|
||||
if (index == 2 && deltas[2] == deltas[0]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[2]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[1]) index++;
|
||||
if (index < 4) {
|
||||
delta = deltas[index];
|
||||
goto sched_delta;
|
||||
}
|
||||
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
|
||||
}
|
||||
}
|
||||
// Reset comm->tasks to empty.
|
||||
comm->tasks.nTasksColl = 0;
|
||||
comm->tasks.nTasksP2p = 0;
|
||||
comm->tasks.streams = nullptr;
|
||||
ncclIntruQueueConstruct(&comm->tasks.collQueue);
|
||||
comm->tasks.collBytesTotal = 0;
|
||||
for (int i=0; i < comm->nRanks; i++) {
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
|
||||
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
|
||||
}
|
||||
comm = next;
|
||||
}
|
||||
}
|
||||
|
||||
/* Collectives are done in four steps :
|
||||
* 0. Save kernels previously enqueued. Compute channel, algo, proto, etc.
|
||||
* 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
|
||||
* 2. Barrier Wait. No CUDA call is permitted
|
||||
* 3. Enqueue Events. CUDA event wait/enqueue.
|
||||
* This is needed because step 2 cannot call any CUDA primitive, otherwise if
|
||||
* cudaFree happens between 1 and 3, it could block that CUDA call and
|
||||
* prevent some ranks from launching their network threads, which would
|
||||
* prevent the NCCL call from completing, blocking the cudaFree call.
|
||||
*/
|
||||
|
||||
// Check whether we are in cuda graph mode
|
||||
NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex));
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
ncclComm_t comm = args->coll.comm;
|
||||
NCCLCHECKGOTO(ncclGetCudaGraph(comm, graphs+i), ret, group_cleanup);
|
||||
if (usingCudaGraphAll == -1) {
|
||||
usingCudaGraphAll = comm->usingCudaGraph;
|
||||
} else if (usingCudaGraphAll != comm->usingCudaGraph) {
|
||||
WARN("Illegal to have some communicators in graph mode while others not");
|
||||
ret = ncclInvalidUsage;
|
||||
goto group_cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
ncclComm_t comm = args->coll.comm;
|
||||
NCCLCHECKGOTO(ncclSetupAsyncKernels(comm), ret, group_cleanup);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
if (args->coll.comm->userStream == hipStreamDefault/* ||
|
||||
args->coll.comm->userStream == hipStreamPerThread ||
|
||||
args->coll.comm->userStream == hipStreamLegacy*/)
|
||||
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
|
||||
if (usingCudaGraphAll == 1) {
|
||||
NCCLCHECKGOTO(ncclCudaGraphHostSetup(args->coll.comm, graphs[i]), ret, end);
|
||||
} else {
|
||||
ncclEnqueueHostSetup<0>(args->coll.comm->enqueueInfo);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclLaunchBarrier(args->coll.comm), ret, end);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
|
||||
NCCLCHECKGOTO(ncclLaunchKernel(args->coll.comm), ret, end);
|
||||
}
|
||||
}
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_COLL) {
|
||||
if (args->coll.comm->userStream == hipStreamDefault/* ||
|
||||
args->coll.comm->userStream == hipStreamPerThread ||
|
||||
args->coll.comm->userStream == hipStreamLegacy*/)
|
||||
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
|
||||
NCCLCHECKGOTO(ncclRecordEvents(args->coll.comm), ret, end);
|
||||
NCCLCHECKGOTO(ncclLaunchReset(args->coll.comm), ret, end);
|
||||
}
|
||||
while (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
|
||||
struct ncclAsyncJob* job = ncclIntruQueueDequeue(&ncclAsyncJobs);
|
||||
if (ret != ncclSuccess && jobsDone && job->undo) job->undo(job);
|
||||
if (job->destructor) job->destructor((void*)job);
|
||||
}
|
||||
|
||||
goto end;
|
||||
group_cleanup:
|
||||
if (ret != ncclSuccess) {
|
||||
// At least one call in the group failed. Since we want to make that group
|
||||
// an atomic operation, we need to cancel all operations.
|
||||
for (int i=0; i<ncclGroupIndex; i++) {
|
||||
struct ncclAsyncArgs* args = ncclGroupArgs+i;
|
||||
if (args->funcType == ASYNC_FUNC_INIT) {
|
||||
if (args->init.newcomm) ncclCommDestroy(*args->init.newcomm);
|
||||
*args->init.newcomm = NULL;
|
||||
} else {
|
||||
struct ncclComm* comm = args->coll.comm;
|
||||
// Reset aggregation counters
|
||||
comm->asyncOpCount = 0;
|
||||
comm->asyncTotalSize = 0;
|
||||
// Dequeue p2p lists
|
||||
if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
|
||||
for (int peer=0; peer<comm->nRanks; peer++) {
|
||||
if (comm->p2pSends[peer]) comm->p2pSends[peer]->recycle();
|
||||
if (comm->p2pRecvs[peer]) comm->p2pRecvs[peer]->recycle();
|
||||
}
|
||||
comm->p2pSendCount = comm->p2pRecvCount = 0;
|
||||
}
|
||||
ncclLaunchReset(comm);
|
||||
}
|
||||
}
|
||||
}
|
||||
end:
|
||||
ncclGroupError = ncclSuccess;
|
||||
ncclGroupIndex = 0;
|
||||
ncclGroupCommHead = nullptr;
|
||||
ncclGroupCommPreconnectHead = nullptr;
|
||||
CUDACHECK(hipSetDevice(savedDev)); // do other clean-ups first before calling hipSetDevice, because this call can fail too
|
||||
if (graphs) free(graphs);
|
||||
return ret;
|
||||
}
|
||||
|
||||
Archivo normal → Archivo ejecutable
+123
-39
@@ -11,28 +11,40 @@
|
||||
#include "nccl.h"
|
||||
#include "checks.h"
|
||||
#include "align.h"
|
||||
#include "utils.h"
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "rccl_vars.h"
|
||||
|
||||
uint64_t clockNano(); // from utils.h with which we have a circular dependency
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
CUDACHECK(hipHostMalloc(ptr, nelem*sizeof(T), hipHostMallocMapped));
|
||||
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
uint64_t time = 0;
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
time = clockNano();
|
||||
CUDACHECKGOTO(hipHostMalloc(ptr, nelem*sizeof(T), hipHostMallocMapped), result, finish);
|
||||
time = clockNano() - time;
|
||||
memset(*ptr, 0, nelem*sizeof(T));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
return ncclSuccess;
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p seconds: hipHostAlloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
|
||||
finish:
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
||||
inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
||||
CUDACHECK(hipHostFree(ptr));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
|
||||
void* p = malloc(nelem*sizeof(T));
|
||||
if (p == NULL) {
|
||||
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
|
||||
@@ -46,7 +58,7 @@ static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc,
|
||||
#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
||||
ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
|
||||
if (nelem < oldNelem) return ncclInternalError;
|
||||
if (nelem == oldNelem) return ncclSuccess;
|
||||
|
||||
@@ -78,54 +90,126 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
|
||||
extern struct allocationTracker allocTracker[];
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
|
||||
// Need async stream for P2P pre-connect + CUDA Graph
|
||||
static bool streamCreated = false;
|
||||
static hipStream_t stream;
|
||||
if (rcclParamEnableHipGraph() && !streamCreated)
|
||||
{
|
||||
// Create stream only once to avoid performance penalty
|
||||
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
streamCreated = true;
|
||||
}
|
||||
|
||||
ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
uint64_t time = clockNano();
|
||||
if (isFineGrain)
|
||||
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
|
||||
else
|
||||
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
|
||||
CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
time = clockNano() - time;
|
||||
finish:
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaMalloc(...) ncclCudaMallocDebug( __FILE__, __LINE__, __VA_ARGS__)
|
||||
|
||||
if (rcclParamEnableHipGraph()) {
|
||||
CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
|
||||
CUDACHECK(hipStreamSynchronize(stream));
|
||||
// NOTE: Currently the re-used stream is not destroyed
|
||||
//CUDACHECK(hipStreamDestroy(stream));
|
||||
} else {
|
||||
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
CUDACHECK(hipStreamSynchronize(NULL));
|
||||
}
|
||||
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
uint64_t time0=0, time1=0, time2=0;
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
hipStream_t stream;
|
||||
time0 = clockNano();
|
||||
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
|
||||
time1 = clockNano();
|
||||
if (isFineGrain)
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
|
||||
else
|
||||
CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
time2 = clockNano();
|
||||
CUDACHECKGOTO(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
CUDACHECKGOTO(hipStreamSynchronize(stream), result, finish);
|
||||
CUDACHECKGOTO(hipStreamDestroy(stream), result, finish);
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_SEQ_CST);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_SEQ_CST);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_RELAXED);
|
||||
}
|
||||
return ncclSuccess;
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipStreamCreateWithFlags=%g hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time1-time0)/1.e9, double(time2-time1)/1.e9);
|
||||
finish:
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__FILE__, __LINE__, __VA_ARGS__)
|
||||
|
||||
template <typename T>
|
||||
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
|
||||
return ncclSuccess;
|
||||
ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, bool isFineGrain = false) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
uint64_t time = 0;
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
*ptr = nullptr;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
time = clockNano();
|
||||
if (isFineGrain)
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
|
||||
else
|
||||
CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
|
||||
time = clockNano() - time;
|
||||
CUDACHECKGOTO(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
if (dev < MAX_ALLOC_TRACK_NGPU) {
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
|
||||
__atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_RELAXED);
|
||||
}
|
||||
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
|
||||
finish:
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__FILE__, __LINE__, __VA_ARGS__)
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
// Need a side stream so as not to interfere with graph capture.
|
||||
hipStream_t stream;
|
||||
CUDACHECKGOTO(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking), result, finish);
|
||||
NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish);
|
||||
CUDACHECKGOTO(hipStreamSynchronize(stream), result, finish);
|
||||
CUDACHECKGOTO(hipStreamDestroy(stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, hipStream_t stream) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(hipMemcpyAsync(dst, src, nelem*sizeof(T), hipMemcpyDefault, stream), result, finish);
|
||||
finish:
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
ncclResult_t ncclCudaFree(T* ptr) {
|
||||
ncclResult_t result = ncclSuccess;
|
||||
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
CUDACHECKGOTO(hipFree(ptr), result, finish);
|
||||
finish:
|
||||
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
|
||||
// allocated on separate pages as those pages will be marked DONTFORK
|
||||
// and if they are shared, that could cause a crash in a child process
|
||||
static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
|
||||
inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
|
||||
size_t page_size = sysconf(_SC_PAGESIZE);
|
||||
void* p;
|
||||
int size_aligned = ROUNDUP(size, page_size);
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
@@ -31,7 +31,8 @@ static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int
|
||||
}
|
||||
|
||||
static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
|
||||
*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
|
||||
//*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
|
||||
*channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
// Check CUDA calls
|
||||
// Check CUDA RT calls
|
||||
#define CUDACHECK(cmd) do { \
|
||||
hipError_t err = cmd; \
|
||||
if( err != hipSuccess ) { \
|
||||
@@ -143,9 +143,9 @@
|
||||
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
|
||||
} while (!(cond));
|
||||
|
||||
#define NCCLCHECKTHREAD(a) do { \
|
||||
if ((args->ret = (a)) != ncclSuccess) { \
|
||||
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
|
||||
#define NCCLCHECKTHREAD(a, args) do { \
|
||||
if (((args)->ret = (a)) != ncclSuccess) { \
|
||||
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
|
||||
return args; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
+17
-16
@@ -10,25 +10,26 @@
|
||||
#include "nccl.h"
|
||||
#include "nccl_net.h"
|
||||
|
||||
extern ncclCollNet_t* ncclCollNet;
|
||||
typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
|
||||
|
||||
// Translation to external API
|
||||
static const char* collNetName() { return ncclCollNet->name; }
|
||||
static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; }
|
||||
static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
|
||||
static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
|
||||
static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
|
||||
NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; }
|
||||
static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; }
|
||||
static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
|
||||
static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
|
||||
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
/* DMA-BUF support */
|
||||
static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
|
||||
NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; }
|
||||
static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
|
||||
static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; }
|
||||
static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; }
|
||||
|
||||
#endif
|
||||
|
||||
@@ -47,10 +47,10 @@ struct ncclDevRedOpFull {
|
||||
/* Declare all collective operations */
|
||||
#define DECL5(func, algo, proto, devredop, type) \
|
||||
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
|
||||
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
|
||||
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
|
||||
extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
|
||||
extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm);
|
||||
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
|
||||
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
|
||||
extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
|
||||
extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
|
||||
|
||||
#define CONCAT(a,b) a##b
|
||||
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
|
||||
|
||||
+197
-78
@@ -10,25 +10,13 @@
|
||||
|
||||
#include "transport.h"
|
||||
#include "p2p.h"
|
||||
// [RCCL]
|
||||
//#include "clique/CliqueManager.h"
|
||||
// [/RCCL]
|
||||
|
||||
// Convert volatile access to atomic
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
|
||||
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
|
||||
#else
|
||||
#define LOAD(VAR) *(VAR)
|
||||
#define STORE(DST, SRC) *(DST) = (SRC)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define HIPRT_CB
|
||||
#else
|
||||
#include "collectives.h"
|
||||
#include "proxy.h"
|
||||
#include "strongstream.h"
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define HIPRT_CB
|
||||
#else
|
||||
#if CUDART_VERSION < 9000
|
||||
struct cudaLaunchParams {
|
||||
void *func;
|
||||
@@ -77,8 +65,6 @@ struct ncclRecvMem {
|
||||
};
|
||||
};
|
||||
|
||||
typedef hipError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*);
|
||||
|
||||
enum helperThreadState {ThreadStart, ThreadStop};
|
||||
|
||||
#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)
|
||||
@@ -104,15 +90,87 @@ struct ncclNodeRanks {
|
||||
int* localRankToRank;
|
||||
};
|
||||
|
||||
struct ncclComm {
|
||||
struct ncclChannel channels[MAXCHANNELS];
|
||||
struct ncclDestructor {
|
||||
struct ncclDestructor* next;
|
||||
void* obj;
|
||||
ncclResult_t(*fn)(struct ncclDestructor* me);
|
||||
};
|
||||
|
||||
struct ncclCommCallback {
|
||||
struct ncclCommCallback* next;
|
||||
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
|
||||
};
|
||||
|
||||
// Per-channel state stored in ncclComm::channels[].
// NOTE(review): appears to be the host-side counterpart of ncclDevChannel
// (it carries both `peers` and `devPeers`) -- confirm.
struct ncclChannel {
|
||||
struct ncclChannelPeer* peers; // per-peer connectors (full ncclConnector form)
|
||||
struct ncclDevChannelPeer* devPeers; // stripped per-peer ncclConnInfo form; presumably device memory -- confirm
|
||||
struct ncclRing ring;
|
||||
int* devRingUserRanks; // presumably the device copy of ring.userRanks -- confirm
|
||||
struct ncclTree tree;
|
||||
struct ncclDirect collTree;
|
||||
int id; // index of this channel
|
||||
uint32_t workFifoSent; // last used work index+1
|
||||
uint64_t p2pOpCount; // p2p operation counter for this channel
|
||||
};
|
||||
|
||||
// Intrusive singly linked list node wrapping one ncclWork. Used as the element
// type of ncclKernelPlan::Channel::workQueue, which links nodes through `next`.
struct ncclWorkList {
|
||||
struct ncclWorkList* next; // queue link
|
||||
struct ncclWork work; // payload
|
||||
};
|
||||
|
||||
// Intrusive singly linked list node holding one opaque pointer. Used as the
// element type of ncclKernelPlan::ipcMemQueue, which links nodes through `next`.
struct ncclPointerList {
|
||||
struct ncclPointerList* next; // queue link
|
||||
void *ptr; // stored pointer; meaning is defined by the queue's user
|
||||
};
|
||||
|
||||
// Description of one kernel launch: which channels take part, the work and
// proxy operations queued on each of them, and the bookkeeping needed to
// reclaim the plan afterwards. Plans are chained through `next`
// (ncclComm::planQueue is keyed on that member).
struct ncclKernelPlan {
|
||||
// A kernel plan is also a callback that reclaims itself. Hence this must
|
||||
// be the first member.
|
||||
struct ncclCommCallback reclaimer;
|
||||
struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup
|
||||
|
||||
struct ncclComm* comm; // communicator this plan belongs to
|
||||
struct ncclKernelPlan* next; // link used by the plan queue
|
||||
|
||||
bool persistent; // aka captured in a graph
|
||||
void *kernelFn; // kernel entry point to launch
|
||||
int channelUbound; // only channels c < channelUbound are present
|
||||
int channelCount; // number of channels present
|
||||
uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask)
|
||||
bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
|
||||
int threadPerBlock;
|
||||
// workHeap fields are null until uploadWorkFifo() or preparePersistentKernel()
|
||||
struct ncclWork* workHead;
|
||||
|
||||
int collOpCount; // zero based for this plan
|
||||
|
||||
struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
|
||||
|
||||
// Per-channel portion of the plan.
struct Channel {
|
||||
int nWork;
|
||||
// The two members below share storage; which one is meaningful depends on
// the kind of work queued on the channel.
union {
|
||||
int nWorkElem; // used for coll and reg coll
|
||||
int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1
|
||||
};
|
||||
size_t collBytes;
|
||||
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
|
||||
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
|
||||
} channels[MAXCHANNELS];
|
||||
};
|
||||
|
||||
struct ncclComm {
|
||||
struct ncclMemoryStack memPermanent, memScoped;
|
||||
// List of destructors to run when comm is destructed
|
||||
struct ncclDestructor* destructorHead;
|
||||
|
||||
struct ncclChannel channels[MAXCHANNELS];
|
||||
struct ncclPeerInfo* peerInfo;
|
||||
struct ncclTopoSystem* topo;
|
||||
|
||||
ncclNet_t* ncclNet;
|
||||
ncclCollNet_t* ncclCollNet;
|
||||
void* bootstrap;
|
||||
// Bitmasks for ncclTransportP2pSetup
|
||||
int connect[NCCL_MAX_CONNS];
|
||||
uint32_t* connectSend;
|
||||
uint32_t* connectRecv;
|
||||
|
||||
@@ -135,19 +193,13 @@ struct ncclComm {
|
||||
// localRanks and localRanktoRank for all nodes
|
||||
struct ncclNodeRanks* nodeRanks;
|
||||
|
||||
enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
|
||||
hipStream_t userStream;
|
||||
bool userStreamSet;
|
||||
hipEvent_t doneEvent;
|
||||
hipEvent_t intDoneEvent;
|
||||
bool checkPointers;
|
||||
bool dmaBufSupport;
|
||||
|
||||
// Counter for tracking CUDA launches (P2P and collectives included)
|
||||
uint64_t opCount;
|
||||
// Collective operation counter
|
||||
uint64_t collOpCount;
|
||||
// P2P operation counter
|
||||
uint64_t p2pOpCount;
|
||||
|
||||
// Channels for collectives
|
||||
int nChannels;
|
||||
@@ -165,10 +217,6 @@ struct ncclComm {
|
||||
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
|
||||
// An internal CUDA stream for NCCL kernel CGMD launches
|
||||
int groupCudaStream;
|
||||
hipStream_t groupStream;
|
||||
|
||||
// Whether there has been a fatal error in this communicator.
|
||||
ncclResult_t fatalError;
|
||||
|
||||
@@ -178,26 +226,33 @@ struct ncclComm {
|
||||
// Flags for enable P2P NET
|
||||
uint32_t p2pNet;
|
||||
uint32_t useIntraNet;
|
||||
bool hasFineGrain;
|
||||
|
||||
// Device side of the communicator
|
||||
struct ncclDevComm *devComm;
|
||||
// Host copy of the devComm (to free CUDA allocs)
|
||||
struct ncclDevComm hostDevComm;
|
||||
// Device side of the communicator (for cudaFree's)
|
||||
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
|
||||
|
||||
// Operation pool.
|
||||
int workFifoDepth; // size of workFifoHeap[], power of 2
|
||||
struct ncclWork* workFifoHeap;
|
||||
struct ncclWork* devWorkFifoHeap;
|
||||
void* workFifoHeapGdrHandle;
|
||||
|
||||
// Work completion notification
|
||||
uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
|
||||
uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
|
||||
uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.
|
||||
|
||||
// Intra-process sync
|
||||
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
|
||||
struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head
|
||||
int intraRefs; // reference count from intra-process comms (zero if not leader else intraRanks)
|
||||
int intraRank;
|
||||
int intraRanks;
|
||||
int* intraBarrier;
|
||||
int intraPhase;
|
||||
|
||||
// Storage for deferred intra-process launch
|
||||
hipLaunchParams * intraParams;
|
||||
hipLaunchParams *myParams;
|
||||
pthread_t* intraThreads;
|
||||
int* intraCudaDevs;
|
||||
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
|
||||
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
|
||||
void* argsptrs[1];
|
||||
uint32_t intraBarrierPhase;
|
||||
char intraPad1[64 - sizeof(uint64_t)];
|
||||
uint64_t intraBarrierCounter; // only used if this is intraComm0
|
||||
char intraPad2[64 - sizeof(uint64_t)];
|
||||
uint64_t intraBarrierGate; // only used if this is intraComm0
|
||||
|
||||
struct ncclProxyState proxyState;
|
||||
|
||||
@@ -205,44 +260,108 @@ struct ncclComm {
|
||||
int collNetSupport;
|
||||
int intraHighestTransportType;
|
||||
|
||||
// Store info of async operations
|
||||
struct ncclInfo* asyncOps;
|
||||
int asyncOpCount;
|
||||
size_t asyncTotalSize;
|
||||
ssize_t channelSize;
|
||||
int lastChannel;
|
||||
enum { ROUND_ROBIN, SHORTEST_QUEUE } asyncAllocMode;
|
||||
size_t channelSize; // User requested work size (bytes) for channel partitions
|
||||
|
||||
// List of async p2p operations queued under group semantics
|
||||
ncclP2Plist** p2pSends;
|
||||
ncclP2Plist** p2pRecvs;
|
||||
int p2pSendCount;
|
||||
int p2pRecvCount;
|
||||
// Internal streams
|
||||
struct ncclStrongStream deviceStream, hostStream;
|
||||
|
||||
// [RCCL]
|
||||
//CliqueManager* cliqueManager; // CliqueManager handles pointer collection / distribution for clique-based kernels
|
||||
//int rootPid; // Process ID of root
|
||||
// [/RCCL]
|
||||
|
||||
// Store info for cudaGraph
|
||||
int usingCudaGraph; // Only use it during capture time, not launch time
|
||||
struct ncclQueueInfo* enqueueInfo;
|
||||
int nQueueInfoCreated;
|
||||
int nQueueInfoDestroyed;
|
||||
hipGraphNode_t lastSetupNode;
|
||||
unsigned long long lastCudaGraphId;
|
||||
int driverVersion;
|
||||
pfn_cuMemGetAddressRange_t pfnCuMemGetAddressRange;
|
||||
pthread_t graphHelperThread;
|
||||
struct ncclGraphHelperResources* graphHelperResources;
|
||||
int disableGraphHelper;
|
||||
int graphRegister;
|
||||
// pools backed by comm->memPermanent
|
||||
struct ncclMemoryPool memPool_ncclProxyOp;
|
||||
struct ncclMemoryPool memPool_ncclKernelPlan;
|
||||
struct ncclMemoryPool memPool_ncclPointerList;
|
||||
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
|
||||
// this comm is not yet in a group.
|
||||
struct ncclComm* groupNext;
|
||||
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
|
||||
struct ncclComm* preconnectNext;
|
||||
int persistentRefs; // number of persistent plan-lists capturing this comm
|
||||
struct ncclTasks tasks;
|
||||
|
||||
// user-created reduction ops
|
||||
int userRedOpCapacity, userRedOpFreeHead;
|
||||
ncclUserRedOp *userRedOps;
|
||||
|
||||
// Queue of things for the main thread to do
|
||||
struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
|
||||
|
||||
// List of kernel plans built from tasks.
|
||||
struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
|
||||
// First of the unlaunched kernels in `planQueue`
|
||||
struct ncclKernelPlan* unlaunchedPlansHead;
|
||||
|
||||
hipEvent_t doneEvent;
|
||||
hipStream_t lastStream;
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
struct ncclCollTrace* collTrace;
|
||||
volatile uint32_t *collTraceTail;
|
||||
pthread_t collTraceThread;
|
||||
volatile bool collTraceExit;
|
||||
#endif
|
||||
};
|
||||
|
||||
// Set to true during an `atexit()` handler. We use this to intentionally leak
|
||||
// unfreed CUDA resources when cleaning up after return of `main()` to avoid
|
||||
// CUDA calls after CUDA runtime teardown.
|
||||
extern bool ncclMainExited;
|
||||
|
||||
// Kernel launch strategy selector. The zero value is reserved as "invalid" so
// that a zero-initialized variable is distinguishable from a real choice.
enum ncclLaunchMode {
|
||||
ncclLaunchModeInvalid=0, // not set / unrecognized
|
||||
ncclLaunchModeParallel,
|
||||
ncclLaunchModeGroup
|
||||
};
|
||||
extern enum ncclLaunchMode ncclParamLaunchMode;
|
||||
|
||||
void ncclCommPushFree(struct ncclComm* comm, void* buf);
|
||||
void ncclCommPushCudaFree(struct ncclComm* comm, void* buf);
|
||||
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf);
|
||||
void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle);
|
||||
|
||||
// Runs every callback currently queued on `comm->callbackQueue` without
// blocking (waitSome=false). Callbacks execute in the order the dequeued list
// presents them; the first one that fails makes NCCLCHECK return its error and
// the remaining callbacks of this batch are not run.
inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm) {
  struct ncclCommCallback* pending =
      ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/false);
  for (struct ncclCommCallback* following; pending != nullptr; pending = following) {
    // Fetch the successor first: the callback may reclaim `pending`'s memory.
    following = pending->next;
    NCCLCHECK(pending->fn(comm, pending));
  }
  return ncclSuccess;
}
|
||||
|
||||
// Arrival half of the intra-process barrier. Each participating comm calls
// this once per barrier round, contributing the 32-bit value `x`.
//
// The leader's 64-bit counter packs two quantities: the low 32 bits count
// arrivals and the high 32 bits accumulate the sum of all `x`. The caller whose
// increment makes the arrival count equal intraRanks resets the counter and
// publishes the gate word: high 32 bits = accumulated sum, low bit = flipped
// phase. ncclCommIntraBarrierOut() waits on that gate.
inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) {
|
||||
int phase = comm->intraBarrierPhase;
|
||||
if (comm->intraRanks == 1) {
|
||||
// Release everyone (just me).
|
||||
// NOTE(review): writes comm's own gate while BarrierOut reads
// comm->intraComm0's gate; assumes intraComm0 == comm here -- confirm.
comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1);
|
||||
} else {
|
||||
struct ncclComm* comm0 = comm->intraComm0;
|
||||
// One atomic add both counts this arrival (+1) and adds x to the high word.
uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE);
|
||||
if (uint32_t(count) == uint32_t(comm->intraRanks)) {
|
||||
// Reset.
|
||||
__atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED);
|
||||
// Release everyone.
|
||||
// count>>32<<32 clears the arrival count, keeping only the summed x values.
__atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Departure half of the intra-process barrier: flips this comm's local phase,
// then waits until the low bit of the leader's gate word matches it.
// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x)
|
||||
inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) {
|
||||
struct ncclComm* comm0 = comm->intraComm0;
|
||||
comm->intraBarrierPhase ^= 1;
|
||||
uint32_t phase = comm->intraBarrierPhase;
|
||||
uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
|
||||
if ((gate & 1) != phase) {
|
||||
uint64_t t0 = clockNano();
|
||||
do {
|
||||
// Spin vigorously for first 5us.
|
||||
// After that, yield the CPU on every iteration while still polling.
if (clockNano()-t0 >= 5*1000) sched_yield();
|
||||
gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
|
||||
} while ((gate & 1) != phase);
|
||||
}
|
||||
// The gate was read with relaxed loads; the fence supplies acquire ordering
// once the barrier is passed. Skipped when this comm is the only participant.
if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE);
|
||||
return gate>>32;
|
||||
}
|
||||
|
||||
// Scrambles the bits of non-builtin values of ncclRedOp_t according to the
|
||||
// communicator memory address. Used to catch bugs so that integer handles
|
||||
// associated with this communicator won't collide with handles of other
|
||||
|
||||
@@ -37,7 +37,9 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
||||
case ncclUint8:
|
||||
return 1;
|
||||
case ncclFloat16:
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
case ncclBfloat16:
|
||||
#endif
|
||||
return 2;
|
||||
case ncclInt32:
|
||||
case ncclUint32:
|
||||
@@ -54,6 +56,7 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
|
||||
|
||||
#include "debug.h"
|
||||
#include "checks.h"
|
||||
#include "rocmwrap.h"
|
||||
#include "alloc.h"
|
||||
#include "utils.h"
|
||||
#include "param.h"
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_CUDAWRAP_H_
|
||||
#define NCCL_CUDAWRAP_H_
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
#include <cudaTypedefs.h>
|
||||
#else
|
||||
typedef CUresult (CUDAAPI *PFN_cuInit)(unsigned int Flags);
|
||||
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion)(int *driverVersion);
|
||||
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
|
||||
#endif
|
||||
|
||||
#define CUPFN(symbol) pfn_##symbol
|
||||
|
||||
// Check CUDA PFN driver calls
// `cmd` is written as a driver call, e.g. cuFoo(args); the macro pastes the
// `pfn_` prefix onto it so the call goes through the loaded function pointer.
// On failure it logs the driver's error string with WARN and makes the
// ENCLOSING function return ncclUnhandledCudaError.
|
||||
#define CUCHECK(cmd) do { \
|
||||
CUresult err = pfn_##cmd; \
|
||||
if( err != CUDA_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
(void) pfn_cuGetErrorString(err, &errStr); \
|
||||
WARN("Cuda failure '%s'", errStr); \
|
||||
return ncclUnhandledCudaError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
// Same check as CUCHECK, but instead of returning it stores
// ncclUnhandledCudaError into `res` and jumps to `label`, so the caller can
// run its own cleanup code.
#define CUCHECKGOTO(cmd, res, label) do { \
|
||||
CUresult err = pfn_##cmd; \
|
||||
if( err != CUDA_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
(void) pfn_cuGetErrorString(err, &errStr); \
|
||||
WARN("Cuda failure '%s'", errStr); \
|
||||
res = ncclUnhandledCudaError; \
|
||||
goto label; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
// Report failure but clear error and continue
// The failure is logged at INFO level (with file and line) and execution falls
// through; no status is propagated to the caller.
|
||||
#define CUCHECKIGNORE(cmd) do { \
|
||||
CUresult err = pfn_##cmd; \
|
||||
if( err != CUDA_SUCCESS ) { \
|
||||
const char *errStr; \
|
||||
(void) pfn_cuGetErrorString(err, &errStr); \
|
||||
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr); \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
// Variant for thread entry functions that return their argument block: on
// failure it logs the numeric CUresult (no error string lookup), records
// ncclUnhandledCudaError in args->ret and returns `args` from the enclosing
// function.
#define CUCHECKTHREAD(cmd, args) do { \
|
||||
CUresult err = pfn_##cmd; \
|
||||
if (err != CUDA_SUCCESS) { \
|
||||
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \
|
||||
args->ret = ncclUnhandledCudaError; \
|
||||
return args; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate_v3020);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
|
||||
#if CUDA_VERSION >= 11070
|
||||
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* CUDA Driver functions loaded with dlsym() */
|
||||
DECLARE_CUDA_PFN_EXTERN(cuInit);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion);
|
||||
DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress);
|
||||
|
||||
|
||||
ncclResult_t cudaLibraryInit(void);
|
||||
|
||||
#endif
|
||||
@@ -10,8 +10,8 @@
|
||||
#include "nccl_net.h"
|
||||
#include <stdio.h>
|
||||
#include <chrono>
|
||||
#include <type_traits>
|
||||
|
||||
#include <sys/syscall.h>
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include <pthread.h>
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
extern int ncclDebugLevel;
|
||||
extern uint64_t ncclDebugMask;
|
||||
extern pthread_mutex_t ncclDebugOutputLock;
|
||||
extern pthread_mutex_t ncclDebugLock;
|
||||
extern FILE *ncclDebugFile;
|
||||
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
|
||||
@@ -29,13 +29,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
|
||||
|
||||
// Let code temporarily downgrade WARN into INFO
|
||||
extern thread_local int ncclDebugNoWarn;
|
||||
extern char ncclLastError[];
|
||||
|
||||
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
|
||||
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||
#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)
|
||||
|
||||
#ifdef ENABLE_TRACE
|
||||
#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
|
||||
extern std::chrono::high_resolution_clock::time_point ncclEpoch;
|
||||
extern std::chrono::steady_clock::time_point ncclEpoch;
|
||||
#else
|
||||
#define TRACE(...)
|
||||
#endif
|
||||
|
||||
+87
-88
@@ -15,9 +15,6 @@
|
||||
#include "npkit/npkit_struct.h"
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
// [RCCL] Support for clique-based kernels
|
||||
//#include "clique/CliqueCommon.h"
|
||||
// [/RCCL]
|
||||
|
||||
|
||||
#define NCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
|
||||
@@ -33,7 +30,6 @@ extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
|
||||
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
|
||||
#define NCCL_PROTO_LL 0
|
||||
#define NCCL_PROTO_LL128 1
|
||||
#define NCCL_PROTO_CLIQUE 1 // [RCCL] Clique takes up same protocol as unused LL128
|
||||
#define NCCL_PROTO_SIMPLE 2
|
||||
extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
|
||||
|
||||
@@ -83,10 +79,6 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
|
||||
#define NCCL_LL128_MAX_NTHREADS 256
|
||||
#define NCCL_LL128_ELEMS_PER_THREAD 28
|
||||
|
||||
// Receiving from up to 3 sources is more compute intensive than sending
|
||||
// to 3 dests. Use 70% for reduce and 30% for bcast.
|
||||
#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
|
||||
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
|
||||
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
|
||||
|
||||
@@ -145,7 +137,6 @@ struct ncclRing {
|
||||
// since we need to know how the user expects data to be ordered across
|
||||
// devices. Ordered from current device.
|
||||
int* userRanks;
|
||||
int* devUserRanks;
|
||||
|
||||
int index; // This rank's index in the ring
|
||||
};
|
||||
@@ -171,7 +162,7 @@ struct ncclDirect {
|
||||
|
||||
#define NCCL_CONN_IDX_P2P_NET 2
|
||||
#define NCCL_MAX_CONNS 3
|
||||
struct ncclPeer {
|
||||
struct ncclChannelPeer {
|
||||
struct ncclConnector send[NCCL_MAX_CONNS];
|
||||
struct ncclConnector recv[NCCL_MAX_CONNS];
|
||||
};
|
||||
@@ -185,31 +176,43 @@ struct ncclDevComm;
|
||||
/* Make sure to adjust padding at the end of ncclWorkElem. */
|
||||
#define NCCL_WORK_SIZE 256
|
||||
|
||||
enum ncclWorkElemType : uint8_t {
|
||||
enum ncclWorkType : uint8_t {
|
||||
ncclWorkTypeUnused=0,
|
||||
ncclWorkTypeColl=1,
|
||||
ncclWorkTypeP2p=2,
|
||||
ncclWorkTypeRegColl=3
|
||||
};
|
||||
enum ncclWorkElemSubType : uint8_t {
|
||||
ncclWorkSubTypeUnused =0,
|
||||
ncclWorkSubTypeSend,
|
||||
ncclWorkSubTypeRecv
|
||||
enum ncclWorkP2PType : uint8_t {
|
||||
ncclWorkP2pTypeUnused=0,
|
||||
ncclWorkP2pTypeSend,
|
||||
ncclWorkP2pTypeRecv
|
||||
};
|
||||
|
||||
struct ncclWorkElemHeader {
|
||||
struct ncclWorkHeader {
|
||||
union {
|
||||
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
|
||||
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
|
||||
};
|
||||
uint16_t funcIndex;
|
||||
enum ncclWorkElemType type;
|
||||
uint8_t nWarps:5;
|
||||
uint8_t isLast:1;
|
||||
uint8_t isLast:1; // last work for this kernel
|
||||
uint8_t inFifo:1; // is this work in the fifo
|
||||
enum ncclWorkType type;
|
||||
};
|
||||
|
||||
struct ncclWorkElem {
|
||||
struct ncclWorkElemHeader header;
|
||||
uint8_t regUsed;
|
||||
union {
|
||||
uint8_t flagBits;
|
||||
struct {
|
||||
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, pad_0:1, nWarps:4;
|
||||
};
|
||||
};
|
||||
uint8_t direct;
|
||||
uint8_t redOpArgIsPtr;
|
||||
uint8_t pad_0;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
struct {
|
||||
uint32_t root:30;
|
||||
uint32_t connIndex:2;
|
||||
};
|
||||
|
||||
const void * sendbuff;
|
||||
void * recvbuff;
|
||||
@@ -221,29 +224,40 @@ struct ncclWorkElem {
|
||||
// Instead, it needs the number of bidirectional rings.
|
||||
size_t pivotA2ANumBiRings;
|
||||
};
|
||||
uint32_t root;
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
uint16_t connIndex;
|
||||
uint64_t redOpArg;
|
||||
uint64_t opCount;
|
||||
};
|
||||
static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWorkElem size must be a multiple of ncclWork size");
|
||||
|
||||
static_assert((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem) == 4, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 4");
|
||||
#define NCCL_MAX_WORK_ELEMENTS 1
|
||||
|
||||
struct ncclWorkElemP2p {
|
||||
struct ncclWorkElemHeader header;
|
||||
int32_t peer;
|
||||
void* buff;
|
||||
size_t count;
|
||||
struct {
|
||||
int32_t peer:30;
|
||||
uint32_t connIndex:2;
|
||||
};
|
||||
union {
|
||||
uint16_t flagBits;
|
||||
struct {
|
||||
enum ncclWorkP2PType p2pType:4;
|
||||
uint16_t nWarps:4;
|
||||
uint16_t warpStart:4;
|
||||
uint16_t ngroups:4;
|
||||
};
|
||||
};
|
||||
uint16_t opCount;
|
||||
// Important not to use any fields with greater than 4-byte alignment since
|
||||
// we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
|
||||
// there were 8-byte fields.
|
||||
//void* buff;
|
||||
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
|
||||
//size_t count;
|
||||
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
|
||||
int chunkSize;
|
||||
uint8_t ngroups:4;
|
||||
uint8_t warpStart:4;
|
||||
uint8_t nWarps:4;
|
||||
enum ncclWorkElemSubType subType:4;
|
||||
uint16_t opCount:12;
|
||||
uint16_t connIndex:4;
|
||||
};
|
||||
static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWorkElemP2p size must be a multiple of ncclWork size");
|
||||
|
||||
static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) == 8, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 8");
|
||||
#define NCCL_MAX_WORK_ELEMENTS_P2P 2
|
||||
|
||||
struct ncclWorkElemReg {
|
||||
struct ncclWorkElem elem;
|
||||
@@ -251,56 +265,31 @@ struct ncclWorkElemReg {
|
||||
void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
|
||||
void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
|
||||
};
|
||||
static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size");
|
||||
static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size");
|
||||
|
||||
#define NCCL_MAX_WORK_ELEMENTS 1
|
||||
#define NCCL_MAX_WORK_ELEMENTS_P2P 2
|
||||
#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg))
|
||||
#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg))
|
||||
static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 1");
|
||||
|
||||
// Number of named barriers supported by CUDA
|
||||
#define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE)
|
||||
|
||||
struct ncclWork {
|
||||
struct ncclWorkHeader header;
|
||||
union {
|
||||
char pad[NCCL_WORK_SIZE];
|
||||
struct ncclWorkElemHeader header;
|
||||
char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)];
|
||||
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
|
||||
struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
|
||||
struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE");
|
||||
static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0");
|
||||
|
||||
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned");
|
||||
|
||||
struct ncclChannel {
|
||||
union {
|
||||
struct {
|
||||
struct ncclRing ring;
|
||||
struct ncclTree tree;
|
||||
struct ncclDirect collTree;
|
||||
|
||||
int id;
|
||||
|
||||
// Communication structures
|
||||
struct ncclPeer* peers;
|
||||
struct ncclPeer* devPeers;
|
||||
|
||||
// Operation list for aggregation
|
||||
struct ncclWork* workFifo;
|
||||
int workCount;
|
||||
size_t totalSize;
|
||||
uint64_t workFifoTail; // Only used by CPU
|
||||
uint16_t index; // Only used by GPU
|
||||
|
||||
// GDRCOPY support
|
||||
struct ncclWork* workFifoGdr;
|
||||
struct ncclWork* workFifoDev;
|
||||
void* gdrMemDesc;
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
// Per-peer connection info for one channel, with one entry per connection
// index (NCCL_MAX_CONNS) in each direction.
struct ncclDevChannelPeer {
|
||||
// Stripped version of ncclChannelPeer where we only keep the ncclConnInfo
|
||||
// instead of the full ncclConnector.
|
||||
struct ncclConnInfo send[NCCL_MAX_CONNS]; // outgoing connections
|
||||
struct ncclConnInfo recv[NCCL_MAX_CONNS]; // incoming connections
|
||||
};
|
||||
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
|
||||
#pragma pack(pop) /* restore original alignment from stack */
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
@@ -361,38 +350,48 @@ static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must
|
||||
#define COLLTRACE_NUM_ITEMS 8192
|
||||
#endif
|
||||
|
||||
// Channel description referenced from ncclDevComm::channels ("Channels, device
// side"). Declared with 16-byte alignment.
struct alignas(16) ncclDevChannel {
|
||||
struct ncclDevChannelPeer *peers; // per-peer connection info for this channel
|
||||
struct ncclRing ring;
|
||||
struct ncclTree tree;
|
||||
struct ncclDirect collTree;
|
||||
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
|
||||
};
|
||||
|
||||
struct ncclDevComm {
|
||||
int rank;
|
||||
int nRanks;
|
||||
int buffSizes[NCCL_NUM_PROTOCOLS];
|
||||
|
||||
// Operation list for aggregation
|
||||
int workFifoDepth;
|
||||
struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
|
||||
|
||||
// Flag to ask NCCL kernels to abort
|
||||
volatile uint32_t *abortFlag;
|
||||
volatile uint32_t* abortFlag;
|
||||
|
||||
// Channels, device side
|
||||
struct ncclChannel* channels;
|
||||
struct ncclDevChannel* channels/*[MAXCHANNELS]*/;
|
||||
|
||||
#if defined(ENABLE_NPKIT)
|
||||
NpKitEventCollectContext* npKitEventCollectContexts;
|
||||
uint64_t* cpuTimestamp;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
// Profiling counters
|
||||
struct ncclProf* devProf;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
struct ncclCollTrace* collTrace;
|
||||
uint32_t collTraceHead, *collTraceTail;
|
||||
volatile uint32_t *collTraceTail;
|
||||
pthread_t collTraceThread;
|
||||
bool collTraceExit;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf* devProf;
|
||||
#endif
|
||||
};
|
||||
|
||||
struct ncclDevCommAndChannels {
|
||||
ncclDevComm comm;
|
||||
ncclChannel channels[MAXCHANNELS];
|
||||
struct alignas(16) ncclDevCommAndChannels {
|
||||
struct ncclDevComm comm;
|
||||
struct ncclDevChannel channels[MAXCHANNELS];
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
+6
-113
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -11,6 +10,7 @@
|
||||
#include "comm.h"
|
||||
#include "group.h"
|
||||
#include "collectives.h"
|
||||
#include "utils.h"
|
||||
|
||||
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
|
||||
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
|
||||
@@ -19,117 +19,10 @@ size_t ncclKernMaxLocalSize();
|
||||
size_t ncclKernLocalSize(int i);
|
||||
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
|
||||
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
|
||||
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchKernel(ncclComm_t comm);
|
||||
ncclResult_t ncclRecordEvents(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchReset(ncclComm_t comm);
|
||||
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
|
||||
ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
|
||||
template<int USING_CUDA_GRAPH>
|
||||
void HIPRT_CB ncclEnqueueHostSetup(void* arg);
|
||||
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph);
|
||||
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph);
|
||||
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
|
||||
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
|
||||
ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
|
||||
|
||||
// Registered-buffer bookkeeping for one queued operation, with one slot per
// local rank. ncclDestroyQueueInfo() walks the first nBuffs slots of the
// *Base arrays to release their IPC mappings.
struct ncclBuffRegInfo {
|
||||
void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS]; // pointers handed to IPC close at teardown
|
||||
void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS]; // pointers handed to IPC close at teardown
|
||||
void* sendbuffs[NCCL_MAX_LOCAL_RANKS]; // presumably the usable send addresses within the mappings -- confirm
|
||||
void* recvbuffs[NCCL_MAX_LOCAL_RANKS]; // presumably the usable recv addresses within the mappings -- confirm
|
||||
int nBuffs; // number of slots in use
|
||||
};
|
||||
|
||||
// Enqueue information (for kernel and proxy) for each operation
|
||||
struct ncclQueueElem {
|
||||
struct ncclWork work; // kernel-side work descriptor
|
||||
struct ncclProxyOp proxyOp; // proxy-side operation descriptor
|
||||
struct ncclBuffRegInfo buffRegInfo; // registered buffers attached to this operation
|
||||
};
|
||||
|
||||
typedef ncclRecyclableList<struct ncclQueueElem> ncclQueueElemList;
|
||||
|
||||
// Structure passed to CUDA graph
// Created by ncclCreateQueueInfo(), reset by ncclResetQueueInfo() and released
// by ncclDestroyQueueInfo().
|
||||
struct ncclQueueInfo {
|
||||
ncclComm_t comm; // owning communicator
|
||||
int maxChannels; // Dynamic version of gridDim
|
||||
ncclResult_t ret; // Return value of host setup call
|
||||
int nRegBuffs; // zero means ncclDestroyQueueInfo() has no IPC handles to hand off
|
||||
ncclQueueElemList* elemList; // heap-allocated (new/delete) list of queued elements
|
||||
};
|
||||
|
||||
// Allocates a zeroed ncclQueueInfo into *eqInfo, binds it to `comm`, gives it
// an empty element list and bumps the communicator's creation counter.
// Returns early (via NCCLCHECK) if the allocation fails.
static ncclResult_t ncclCreateQueueInfo(struct ncclQueueInfo** eqInfo, ncclComm_t comm) {
  NCCLCHECK(ncclCalloc(eqInfo, 1));
  struct ncclQueueInfo* created = *eqInfo;
  created->comm = comm;
  created->elemList = new ncclQueueElemList();
  // Counted on the communicator just stored above.
  created->comm->nQueueInfoCreated += 1;
  return ncclSuccess;
}
|
||||
|
||||
// Reset element queue
|
||||
static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
|
||||
if (eqInfo == NULL) return ncclInternalError;
|
||||
eqInfo->maxChannels = 0;
|
||||
eqInfo->ret = ncclSuccess;
|
||||
eqInfo->nRegBuffs = 0;
|
||||
eqInfo->elemList->recycle();
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Destroy enqueue info space
|
||||
// used by both CUDA graph and non CUDA graph
// Takes a void* so that it can be installed as a generic destroy callback.
// A NULL argument is a no-op. Base pointers of registered buffers are handed
// to the communicator's graph helper thread for closing; then the element
// list and the info object are freed and comm->nQueueInfoDestroyed is bumped.
|
||||
static void ncclDestroyQueueInfo(void* ptr) {
|
||||
if (ptr == NULL) return;
|
||||
struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr;
|
||||
struct ncclComm* comm = eqInfo->comm;
|
||||
// Close IPC mem handles for registered buffers
|
||||
struct ncclQueueElem* eqElem = eqInfo->elemList->begin();
|
||||
#if 0
|
||||
// Ideally, the deregistration should happen here
|
||||
// but currently the destroy function of CUDA objects does not allow CUDA API calls
|
||||
while (eqElem != NULL) {
|
||||
for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
|
||||
if (i == eqInfo->comm->localRank) continue;
|
||||
CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i]));
|
||||
CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i]));
|
||||
}
|
||||
eqElem = eqInfo->elemList->getNext();
|
||||
}
|
||||
#else
|
||||
// Instead, we push these pointers to a pool owned by ncclComm
|
||||
// and ask a helper thread to close mem handles
|
||||
struct ncclGraphHelperResources* res = comm->graphHelperResources;
|
||||
int ipcTailOld = 0;
|
||||
// Nothing to hand off without helper resources, a helper thread, or any
// registered buffers: go straight to freeing.
if (res == NULL || (!comm->graphHelperThread) || eqInfo->nRegBuffs == 0) goto skip;
|
||||
|
||||
pthread_mutex_lock(&res->threadLock);
|
||||
ipcTailOld = res->ipcTail;
|
||||
while (eqElem != NULL) {
|
||||
for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
|
||||
// Append each non-NULL base pointer at the tail of the circular pool.
// NOTE(review): no check is visible here that the tail does not overrun
// entries the helper thread has not consumed yet -- confirm.
if (eqElem->buffRegInfo.sendbuffsBase[i] != NULL) {
|
||||
res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.sendbuffsBase[i];
|
||||
res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE;
|
||||
}
|
||||
if (eqElem->buffRegInfo.recvbuffsBase[i] != NULL) {
|
||||
res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.recvbuffsBase[i];
|
||||
res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE;
|
||||
}
|
||||
}
|
||||
eqElem = eqInfo->elemList->getNext();
|
||||
}
|
||||
// Wake the helper only if something was actually queued.
if (res->ipcTail != ipcTailOld) {
|
||||
res->threadState = ThreadStart;
|
||||
// NOTE(review): the printed difference can be negative when ipcTail wrapped
// around the pool during this call.
TRACE(NCCL_COLL, "CUDA Graph destroy function signaling helper thread with %d IPC handles", res->ipcTail-ipcTailOld);
|
||||
pthread_cond_signal(&res->threadCond);
|
||||
}
|
||||
pthread_mutex_unlock(&res->threadLock);
|
||||
#endif
|
||||
|
||||
skip:
|
||||
delete eqInfo->elemList;
|
||||
free(eqInfo);
|
||||
comm->nQueueInfoDestroyed++;
|
||||
return;
|
||||
}
|
||||
#endif // End include guard
|
||||
|
||||
@@ -263,7 +263,7 @@ static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
|
||||
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
|
||||
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
|
||||
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
|
||||
CUDACHECK(hipFree(md->gdrDevMem));
|
||||
CUDACHECK(cudaFree(md->gdrDevMem));
|
||||
free(md);
|
||||
|
||||
return ncclSuccess;
|
||||
|
||||
Archivo normal → Archivo ejecutable
@@ -24,7 +24,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
|
||||
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
|
||||
|
||||
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
|
||||
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||
void ncclTopoFree(struct ncclTopoSystem* system);
|
||||
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
|
||||
@@ -37,7 +37,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int ne
|
||||
#define MAX_XGMI_INTER_GPUS 4
|
||||
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
|
||||
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
|
||||
int ncclPxnDisable();
|
||||
int ncclPxnDisable(struct ncclComm* comm);
|
||||
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
|
||||
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
|
||||
|
||||
|
||||
+70
-7
@@ -11,15 +11,78 @@
|
||||
#include "nccl.h"
|
||||
#include "comm.h"
|
||||
|
||||
bool ncclAsyncMode();
|
||||
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
|
||||
ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
|
||||
void ncclGroupCommJoin(struct ncclComm* comm);
|
||||
void ncclGroupCommPreconnect(struct ncclComm* comm);
|
||||
void ncclGroupCommLeave(struct ncclComm* comm);
|
||||
|
||||
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev, int virtualId);
|
||||
// Descriptor of one asynchronous job. The three callbacks have the same
// types as the func/undo/destructor arguments of ncclAsyncLaunch().
struct ncclAsyncJob {
|
||||
// Link to another job; presumably chains the jobs queued in a group -- confirm.
struct ncclAsyncJob* next;
|
||||
// NOTE(review): looks like the thread that runs `func` -- confirm in group.cc.
pthread_t thread;
|
||||
// Presumably holds the value returned by `func` -- confirm.
ncclResult_t result;
|
||||
// Job body; receives the job itself and returns an ncclResult_t.
ncclResult_t(*func)(struct ncclAsyncJob*);
|
||||
// Receives the job; presumably reverts its effects on failure -- confirm.
void(*undo)(struct ncclAsyncJob*);
|
||||
// Takes a void*; presumably releases the job's storage -- confirm.
void(*destructor)(void*);
|
||||
};
|
||||
|
||||
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev, int virtualId);
|
||||
ncclResult_t ncclAsyncLaunch(
|
||||
struct ncclAsyncJob* job,
|
||||
ncclResult_t(*func)(struct ncclAsyncJob*),
|
||||
void(*undo)(struct ncclAsyncJob*),
|
||||
void(*destructor)(void*)
|
||||
);
|
||||
|
||||
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
ncclResult_t ncclGroupStartInternal();
|
||||
ncclResult_t ncclGroupEndInternal();
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
|
||||
extern __thread ncclResult_t ncclGroupError;
|
||||
extern __thread struct ncclComm* ncclGroupCommHead;
|
||||
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
|
||||
|
||||
// Enter one more level of group nesting on the calling thread.
// Always returns ncclSuccess.
inline ncclResult_t ncclGroupStartInternal() {
  ncclGroupDepth += 1;
  return ncclSuccess;
}
|
||||
|
||||
// Record `ret` as the calling thread's group error while inside a group,
// then hand `ret` back unchanged. Outside a group nothing is recorded.
// A success value never overwrites an error that is already stored.
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
  if (ncclGroupDepth <= 0) return ret;
  if (ret != ncclSuccess || ncclGroupError == ncclSuccess) {
    ncclGroupError = ret;
  }
  return ret;
}
|
||||
|
||||
// Add comm to this thread's group
|
||||
inline void ncclGroupCommJoin(struct ncclComm* comm) {
|
||||
if (comm->groupNext == reinterpret_cast<struct ncclComm*>(0x1)) {
|
||||
// Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
|
||||
// the users program order yet insures siblings occur consecutively. This
|
||||
// is required by doLaunches() in "group.cc".
|
||||
struct ncclComm** pp = &ncclGroupCommHead;
|
||||
while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
|
||||
pp = &(*pp)->groupNext;
|
||||
comm->groupNext = *pp;
|
||||
*pp = comm;
|
||||
// Comms gets a new memory stack scope upon joining. Each task batched for
|
||||
// this comm is allocated there.
|
||||
ncclMemoryStackPush(&comm->memScoped);
|
||||
}
|
||||
}
|
||||
|
||||
// Add comm to this thread's group needing preconnect
|
||||
inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
|
||||
if (comm->preconnectNext == reinterpret_cast<struct ncclComm*>(0x1)) {
|
||||
comm->preconnectNext = ncclGroupCommPreconnectHead;
|
||||
ncclGroupCommPreconnectHead = comm;
|
||||
}
|
||||
}
|
||||
|
||||
// Comm has left group
|
||||
inline void ncclGroupCommLeave(struct ncclComm* comm) {
|
||||
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
|
||||
ncclMemoryStackPop(&comm->memScoped);
|
||||
}
|
||||
|
||||
ncclResult_t ncclAsyncColl(ncclComm_t comm);
|
||||
#endif
|
||||
|
||||
@@ -1067,6 +1067,9 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
|
||||
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
|
||||
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
|
||||
ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
|
||||
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
|
||||
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
|
||||
ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
|
||||
ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
|
||||
|
||||
+63
-2
@@ -11,6 +11,9 @@
|
||||
#include "nccl.h"
|
||||
#include "devcomm.h"
|
||||
#include "collectives.h"
|
||||
#include "core.h"
|
||||
#include "utils.h"
|
||||
#include "strongstream.h"
|
||||
|
||||
typedef enum : uint8_t {
|
||||
ncclPatternRing,
|
||||
@@ -53,8 +56,66 @@ struct ncclInfo {
|
||||
int nchunksPerLoop;
|
||||
int chunkSize;
|
||||
int channelId;
|
||||
uint16_t connIndex;
|
||||
uint64_t opCount;
|
||||
};
|
||||
|
||||
// Fill in the fields of `info` that follow from count/datatype/coll:
//  - nBytes is count * element size;
//  - AllGather, Broadcast and AllToAllPivot are re-expressed as ncclInt8
//    with count equal to the byte size;
//  - for AllGather and ReduceScatter, nBytes is scaled by nRanks.
// Always returns ncclSuccess.
inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
  const bool isAllGather = (info->coll == ncclFuncAllGather);
  const bool treatAsBytes = isAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAllPivot;
  const bool countIsPerRank = isAllGather || info->coll == ncclFuncReduceScatter;

  info->nBytes = info->count * ncclTypeSize(info->datatype);
  if (treatAsBytes) {
    info->count = info->nBytes;
    info->datatype = ncclInt8;
  }
  if (countIsPerRank) {
    info->nBytes *= nRanks; // count is per rank
  }
  return ncclSuccess;
}
|
||||
|
||||
// One queued collective operation. `next` is the intrusive link used by
// ncclTasks::collQueue. The remaining fields look like the arguments of the
// user's collective call -- confirm against the enqueue code.
struct ncclTaskColl {
|
||||
struct ncclTaskColl* next;
|
||||
ncclFunc_t func;
|
||||
void const* sendbuff;
|
||||
void* recvbuff;
|
||||
size_t count;
|
||||
int root;
|
||||
ncclDataType_t datatype;
|
||||
ncclDevRedOpFull op;
|
||||
int chunkSteps, sliceSteps;
|
||||
};
|
||||
// One queued point-to-point operation. `next` is the intrusive link used by
// the sendQueue/recvQueue members of ncclTasks::Peer.
struct ncclTaskP2p {
|
||||
ncclTaskP2p *next;
|
||||
// Buffer and its size in bytes; whether this is a send or a receive
// presumably follows from which Peer queue holds the task -- confirm.
void *buff;
|
||||
size_t bytes;
|
||||
// Stateful chunk index. If a p2p gets "cut" over two plans this keeps track
|
||||
// of where it left off.
|
||||
int chunk;
|
||||
};
|
||||
|
||||
// Node of a singly linked list of streams; ncclTasks::streams points to one.
struct ncclCudaStreamList {
|
||||
struct ncclCudaStreamList *next;
|
||||
hipStream_t stream;
|
||||
};
|
||||
|
||||
// Collective and point-to-point tasks that have been queued, together with
// the user streams they were issued on.
struct ncclTasks {
|
||||
// P2p state for one peer: a send queue and a receive queue of ncclTaskP2p.
struct Peer {
|
||||
// NOTE(review): presumably set once a send/recv to this peer was queued -- confirm.
bool sendSeen, recvSeen;
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
|
||||
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
|
||||
};
|
||||
// Queue of collective tasks, linked through ncclTaskColl::next.
struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
|
||||
// Presumably the summed byte size of the tasks in collQueue -- confirm.
size_t collBytesTotal;
|
||||
// The three arrays below are annotated as having nRanks entries.
struct Peer* peers/*[nRanks]*/;
|
||||
int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/;
|
||||
// Task counters for the collective and p2p kinds.
int nTasksColl, nTasksP2p;
|
||||
|
||||
// The list of user streams aggregated over all tasks present.
|
||||
struct ncclCudaStreamList* streams;
|
||||
// Keep track of the number of user streams
|
||||
int numStreams;
|
||||
// The most recent user stream. Ignored if streams==nullptr
|
||||
hipStream_t streamRecent;
|
||||
// The graph capturing all user streams or invalid if none. Thus we restrict the
|
||||
// user that all streams must be captured in the same graph or not captured
|
||||
// at all. Technically we could probably relax this, but that would mean
|
||||
// collecting a different `ncclTasks` per graph and one for non-graph.
|
||||
struct ncclCudaGraph capturingGraph;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
+107
-14
@@ -14,12 +14,13 @@
|
||||
|
||||
#define NCCL_PTR_HOST 0x1
|
||||
#define NCCL_PTR_CUDA 0x2
|
||||
#define NCCL_PTR_DMABUF 0x4
|
||||
|
||||
// Maximum number of requests per comm object
|
||||
#define NCCL_NET_MAX_REQUESTS 8
|
||||
|
||||
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
|
||||
|
||||
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
|
||||
|
||||
@@ -28,15 +29,15 @@ typedef struct {
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
uint64_t guid; // Unique identifier for the NIC chip. Important for
|
||||
// cards with multiple PCI functions (Physical or virtual).
|
||||
int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
|
||||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
|
||||
int speed; // Port speed in Mbps.
|
||||
int port; // Port number.
|
||||
float latency; // Network latency
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
int maxRecvs; // Maximum number of grouped receives.
|
||||
}ncclNetProperties_v5_t;
|
||||
}ncclNetProperties_v6_t;
|
||||
|
||||
typedef ncclNetProperties_v5_t ncclNetProperties_t;
|
||||
typedef ncclNetProperties_v6_t ncclNetProperties_t;
|
||||
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
@@ -46,7 +47,103 @@ typedef struct {
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Connect to a handle and return a sending comm object for that peer.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with sendComm == NULL with the expectation that
|
||||
// it will be called again until sendComm != NULL.
|
||||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
|
||||
// Finalize connection establishment after remote peer has called connect.
|
||||
// This call must not block for the connection to be established, and instead
|
||||
// should return successfully with recvComm == NULL with the expectation that
|
||||
// it will be called again until recvComm != NULL.
|
||||
ncclResult_t (*accept)(void* listenComm, void** recvComm);
|
||||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
|
||||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* comm, void* mhandle);
|
||||
// Asynchronous send to a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
|
||||
// Asynchronous recv from a peer.
|
||||
// May return request == NULL if the call cannot be performed (or would block)
|
||||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* sizes);
|
||||
// Close and free send/recv comm objects
|
||||
ncclResult_t (*closeSend)(void* sendComm);
|
||||
ncclResult_t (*closeRecv)(void* recvComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v6_t;
|
||||
|
||||
typedef ncclNet_v6_t ncclNet_t;
|
||||
|
||||
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
|
||||
|
||||
// Version 6 of the collective-network plugin interface: a table of function
// pointers, each returning an ncclResult_t. NCCL_COLLNET_PLUGIN_SYMBOL names
// the symbol (ncclCollNetPlugin_v6) associated with this version.
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the collective network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters capable of doing collective operations.
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
|
||||
// Create a group for collective operations. handles have been created
|
||||
// using listen() above. rank indicates caller's rank in the collective network.
|
||||
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
|
||||
// Returns whether a reduction operation on a data type is supported.
|
||||
// 1 for supported, 0 otherwise.
|
||||
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
|
||||
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
|
||||
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
|
||||
/* DMA-BUF support */
// Unlike regMr, takes a size_t size plus an offset and a file descriptor;
// presumably `fd` is the DMA-BUF descriptor backing `data` -- confirm.
|
||||
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
|
||||
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
|
||||
// Performs an asynchronous allreduce operation on the collective group.
|
||||
// May return request == NULL if the call cannot be performed (or would block).
|
||||
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
|
||||
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
|
||||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
|
||||
// visible to the GPU
|
||||
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
|
||||
// Test whether a request is complete. If size is not NULL, it returns the
|
||||
// number of bytes sent/received.
|
||||
ncclResult_t (*test)(void* request, int* done, int* size);
|
||||
// Close and free collective comm objects
|
||||
ncclResult_t (*closeColl)(void* collComm);
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v6_t;
|
||||
|
||||
typedef ncclCollNet_v6_t ncclCollNet_t;
|
||||
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6
|
||||
|
||||
// v5 struct for backwards compatibility
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
// Initialize the network.
|
||||
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
|
||||
// Return the number of adapters.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create a connection.
|
||||
@@ -83,10 +180,7 @@ typedef struct {
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v5_t;
|
||||
|
||||
typedef ncclNet_v5_t ncclNet_t;
|
||||
|
||||
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5
|
||||
|
||||
// v5 struct for backwards compatibility
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
@@ -96,7 +190,7 @@ typedef struct {
|
||||
// If ndev returns 0, all other functions might be set to NULL.
|
||||
ncclResult_t (*devices)(int* ndev);
|
||||
// Get various device properties.
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
|
||||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
|
||||
// Create a receiving object and provide a handle to connect to it. The
|
||||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
|
||||
// between ranks to create connections.
|
||||
@@ -125,10 +219,7 @@ typedef struct {
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclCollNet_v5_t;
|
||||
|
||||
typedef ncclCollNet_v5_t ncclCollNet_t;
|
||||
|
||||
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5
|
||||
|
||||
// v4 struct for backwards compatibility
|
||||
typedef struct {
|
||||
char* name; // Used mostly for logging.
|
||||
char* pciPath; // Path to the PCI device in /sys.
|
||||
@@ -140,6 +231,7 @@ typedef struct {
|
||||
int maxComms; // Maximum number of comms we can create
|
||||
} ncclNetProperties_v4_t;
|
||||
|
||||
// v4 struct for backwards compatibility
|
||||
typedef struct {
|
||||
// Name of the network (mainly for logs)
|
||||
const char* name;
|
||||
@@ -179,6 +271,7 @@ typedef struct {
|
||||
ncclResult_t (*closeListen)(void* listenComm);
|
||||
} ncclNet_v4_t;
|
||||
|
||||
// v4 struct for backwards compatibility
|
||||
typedef struct {
|
||||
// Name of the collective network (mainly for logs)
|
||||
const char* name;
|
||||
|
||||
+22
-19
@@ -9,33 +9,36 @@
|
||||
|
||||
#include "nccl.h"
|
||||
#include "nccl_net.h"
|
||||
#include "comm.h"
|
||||
#include "checks.h"
|
||||
|
||||
extern ncclNet_t* ncclNet;
|
||||
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
|
||||
|
||||
ncclResult_t ncclNetInit();
|
||||
int ncclNetVersion();
|
||||
ncclResult_t ncclNetPluginInit();
|
||||
ncclResult_t ncclNetInit(struct ncclComm* comm);
|
||||
int ncclNetVersion(struct ncclComm* comm);
|
||||
|
||||
// Translation to external API
|
||||
static const char* ncclNetName() { return ncclNet->name; }
|
||||
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
// Per-communicator forwarding wrappers: each one calls the same-named entry
// of the network table stored in comm->ncclNet. `comm` is the NCCL
// communicator; netComm/sendComm/recvComm/listenComm are the opaque objects
// passed through to the table. Except for ncclNetName(), every wrapper goes
// through NCCLCHECK and otherwise returns ncclSuccess. NCCLCHECK presumably
// returns the failing result to the caller -- confirm in checks.h.
static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; }
|
||||
static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; }
|
||||
/* DMA-BUF support */
|
||||
static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; }
|
||||
static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; }
|
||||
|
||||
// Test whether the current GPU supports GPU Direct RDMA.
|
||||
ncclResult_t ncclGpuGdrSupport(int* gdrSupport);
|
||||
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
|
||||
|
||||
extern ncclNet_t ncclNetIb;
|
||||
extern ncclNet_t ncclNetSocket;
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#include "cuda.h"
|
||||
#include "hip/hip_runtime.h"
|
||||
|
||||
#ifndef NVTOOLSEXT_CUDA_V3
|
||||
#define NVTOOLSEXT_CUDA_V3
|
||||
@@ -42,10 +42,10 @@ extern "C" {
|
||||
*/
|
||||
typedef enum nvtxResourceCUDAType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
|
||||
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
|
||||
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
|
||||
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
|
||||
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* hipDevice_t */
|
||||
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* hipCtx_t */
|
||||
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* hipStream_t */
|
||||
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* hipEvent_t */
|
||||
} nvtxResourceCUDAType_t;
|
||||
|
||||
|
||||
@@ -59,8 +59,8 @@ typedef enum nvtxResourceCUDAType_t
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
@@ -73,16 +73,16 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* na
|
||||
*
|
||||
* \par Example:
|
||||
* \code
|
||||
* CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
|
||||
* if ( CUDA_SUCCESS != status )
|
||||
* hipError_t status = hipCtxCreate( &cuContext, 0, cuDevice );
|
||||
* if ( hipSuccess != status )
|
||||
* goto Error;
|
||||
* nvtxNameCuContext(cuContext, "CTX_NAME");
|
||||
* \endcode
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
@@ -95,8 +95,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t*
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
@@ -109,8 +109,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* na
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(hipEvent_t event, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/** @} */ /* END RESOURCE_NAMING */
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
|
||||
#include "nvToolsExt.h"
|
||||
|
||||
#include "cuda.h"
|
||||
#include "driver_types.h"
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/driver_types.h"
|
||||
|
||||
#ifndef NVTOOLSEXT_CUDART_V3
|
||||
#define NVTOOLSEXT_CUDART_V3
|
||||
@@ -44,8 +44,8 @@ extern "C" {
|
||||
typedef enum nvtxResourceCUDARTType_t
|
||||
{
|
||||
NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
|
||||
NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
|
||||
NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
|
||||
NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* hipStream_t */
|
||||
NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* hipEvent_t */
|
||||
} nvtxResourceCUDARTType_t;
|
||||
|
||||
|
||||
@@ -73,8 +73,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
@@ -87,8 +87,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar
|
||||
*
|
||||
* \version \NVTX_VERSION_1
|
||||
* @{ */
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* name);
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(hipEvent_t event, const wchar_t* name);
|
||||
/** @} */
|
||||
|
||||
/** @} */ /* END RESOURCE_NAMING */
|
||||
|
||||
@@ -16,10 +16,10 @@ extern "C" {
|
||||
|
||||
typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(hipStream_t stream, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(hipStream_t stream, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(hipEvent_t event, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(hipEvent_t event, const wchar_t* name);
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
|
||||
{
|
||||
@@ -39,7 +39,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
|
||||
@@ -48,7 +48,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char*
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_t* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
|
||||
@@ -57,7 +57,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
|
||||
@@ -66,7 +66,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* na
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(hipEvent_t event, const wchar_t* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
|
||||
|
||||
@@ -15,16 +15,16 @@
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(hipDevice_t device, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(hipDevice_t device, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(hipCtx_t context, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(hipCtx_t context, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(hipStream_t stream, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(hipStream_t stream, const wchar_t* name);
|
||||
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(hipEvent_t event, const char* name);
|
||||
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(hipEvent_t event, const wchar_t* name);
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
|
||||
@@ -33,7 +33,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
|
||||
@@ -42,7 +42,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* na
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
|
||||
@@ -51,7 +51,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* na
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
|
||||
@@ -60,7 +60,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t*
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
|
||||
@@ -69,7 +69,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
|
||||
@@ -78,7 +78,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* na
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
|
||||
@@ -87,7 +87,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
|
||||
#endif /*NVTX_DISABLE*/
|
||||
}
|
||||
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
|
||||
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(hipEvent_t event, const wchar_t* name)
|
||||
{
|
||||
#ifndef NVTX_DISABLE
|
||||
nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
/* ------ Dependency-free types binary-compatible with real types ------- */
|
||||
|
||||
/* In order to avoid having the NVTX core API headers depend on non-NVTX
|
||||
* headers like cuda.h, NVTX defines binary-compatible types to use for
|
||||
* headers like hip/hip_runtime.h, NVTX defines binary-compatible types to use for
|
||||
* safely making the initialization versions of all NVTX functions without
|
||||
* needing to have definitions for the real types. */
|
||||
|
||||
|
||||
@@ -9,21 +9,4 @@
|
||||
#ifndef NCCL_P2P_H_
|
||||
#define NCCL_P2P_H_
|
||||
|
||||
struct ncclP2Pinfo {
|
||||
void* buff;
|
||||
ssize_t nbytes;
|
||||
uint64_t opCount;
|
||||
};
|
||||
|
||||
typedef ncclRecyclableList<struct ncclP2Pinfo> ncclP2Plist;
|
||||
|
||||
static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes, uint64_t opCount) {
|
||||
if (p2p == NULL) p2p = new ncclP2Plist();
|
||||
struct ncclP2Pinfo* next;
|
||||
NCCLCHECK(p2p->getNewElem(&next));
|
||||
next->buff = buff;
|
||||
next->nbytes = nBytes;
|
||||
next->opCount = opCount;
|
||||
return ncclSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
+18
-10
@@ -26,18 +26,26 @@ struct ncclProxyOp {
|
||||
int channelId;
|
||||
int nsteps;
|
||||
ssize_t nbytes;
|
||||
int root;
|
||||
struct {
|
||||
int root:30;
|
||||
uint32_t connIndex:2;
|
||||
};
|
||||
int next;
|
||||
|
||||
uint64_t opCount;
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int chunkSize;
|
||||
ncclDataType_t dtype;
|
||||
ncclRedOp_t redOp;
|
||||
ncclPattern_t pattern; // uint8_t
|
||||
uint8_t /*ncclDataType_t*/ dtype;
|
||||
uint8_t /*ncclDevRedOp_t*/ redOp;
|
||||
uint8_t /*ncclPattern_t*/ pattern;
|
||||
uint8_t protocol;
|
||||
uint16_t connIndex;
|
||||
|
||||
union {
|
||||
uint64_t unused;
|
||||
// For use by enqueue.cc
|
||||
struct ncclProxyOp *enqNext;
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");
|
||||
|
||||
@@ -73,9 +81,9 @@ struct ncclProxyArgs {
|
||||
int sliceSteps;
|
||||
int chunkSteps;
|
||||
int chunkSize;
|
||||
ncclDataType_t dtype;
|
||||
ncclRedOp_t redOp;
|
||||
ncclPattern_t pattern;
|
||||
uint8_t /*ncclDataType_t*/ dtype;
|
||||
uint8_t /*ncclDevRedOp_t*/ redOp;
|
||||
uint8_t /*ncclPattern_t*/ pattern;
|
||||
uint8_t protocol;
|
||||
int state;
|
||||
char* sharedBuff[NCCL_STEPS];
|
||||
@@ -164,6 +172,7 @@ struct ncclProxyState {
|
||||
pthread_t thread;
|
||||
struct ncclSocket* listenSock;
|
||||
int stop;
|
||||
hipCtx_t cudaCtx;
|
||||
|
||||
// Used by main thread
|
||||
union ncclSocketAddress* peerAddresses;
|
||||
@@ -193,9 +202,8 @@ enum proxyMode {
|
||||
proxyTo = 2
|
||||
};
|
||||
|
||||
ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks);
|
||||
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
|
||||
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp);
|
||||
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp);
|
||||
ncclResult_t ncclProxyStart(struct ncclComm* comm);
|
||||
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
|
||||
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_ROCMWRAP_H_
|
||||
#define NCCL_ROCMWRAP_H_
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
|
||||
typedef hsa_status_t (*PFN_hsa_init)();
|
||||
typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, void* value);
|
||||
typedef hsa_status_t (*PFN_hsa_status_string)(hsa_status_t status, const char ** status_string);
|
||||
typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset);
|
||||
|
||||
|
||||
#define CUPFN(symbol) pfn_##symbol
|
||||
|
||||
// Check a ROCr call made through a pfn_<symbol> function pointer.
// `cmd` is the call written without the pfn_ prefix, e.g. CUCHECK(hsa_init()).
// On any status other than HSA_STATUS_SUCCESS this logs the status string via
// WARN and returns ncclUnhandledCudaError from the enclosing function.
#define CUCHECK(cmd) do {                                       \
  hsa_status_t err = pfn_##cmd;                                 \
  if( err != HSA_STATUS_SUCCESS ) {                             \
    /* Pre-set so WARN never formats an uninitialized pointer   \
     * should the status-string lookup itself fail. */          \
    const char *errStr = "unknown error";                       \
    pfn_hsa_status_string(err, &errStr);                        \
    WARN("ROCr failure '%s'", errStr);                          \
    return ncclUnhandledCudaError;                              \
  }                                                             \
} while(false)
|
||||
|
||||
// Like a checked ROCr call, but on failure stores ncclUnhandledCudaError in
// `res` and jumps to `label` instead of returning.
// `cmd` is the call written without the pfn_ prefix.
#define CUCHECKGOTO(cmd, res, label) do {                       \
  hsa_status_t err = pfn_##cmd;                                 \
  if( err != HSA_STATUS_SUCCESS ) {                             \
    /* Pre-set so WARN never formats an uninitialized pointer   \
     * should the status-string lookup itself fail. */          \
    const char *errStr = "unknown error";                       \
    pfn_hsa_status_string(err, &errStr);                        \
    WARN("ROCr failure '%s'", errStr);                          \
    res = ncclUnhandledCudaError;                               \
    goto label;                                                 \
  }                                                             \
} while(false)
|
||||
|
||||
// Report a failed ROCr call at INFO level and continue; control flow of the
// enclosing function is not altered.
// `cmd` is the call written without the pfn_ prefix.
#define CUCHECKIGNORE(cmd) do {                                             \
  hsa_status_t err = pfn_##cmd;                                             \
  if( err != HSA_STATUS_SUCCESS ) {                                         \
    /* Pre-set so INFO never formats an uninitialized pointer               \
     * should the status-string lookup itself fail. */                      \
    const char *errStr = "unknown error";                                   \
    pfn_hsa_status_string(err, &errStr);                                    \
    INFO(NCCL_ALL,"%s:%d ROCr failure '%s'", __FILE__, __LINE__, errStr);   \
  }                                                                         \
} while(false)
|
||||
|
||||
// Checked ROCr call for use inside a thread entry function: on failure logs
// the numeric status at INFO level, records ncclUnhandledCudaError in
// args->ret and returns `args` from the enclosing function.
// `args` is parenthesized so that any pointer-valued expression may be passed.
#define CUCHECKTHREAD(cmd, args) do {                                       \
  hsa_status_t err = pfn_##cmd;                                             \
  if (err != HSA_STATUS_SUCCESS) {                                          \
    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err);  \
    (args)->ret = ncclUnhandledCudaError;                                   \
    return (args);                                                          \
  }                                                                         \
} while(false)
|
||||
|
||||
#define DECLARE_ROCM_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
|
||||
|
||||
DECLARE_ROCM_PFN_EXTERN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
|
||||
|
||||
/* ROCr Driver functions loaded with dlsym() */
|
||||
DECLARE_ROCM_PFN_EXTERN(hsa_init);
|
||||
DECLARE_ROCM_PFN_EXTERN(hsa_system_get_info);
|
||||
DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
|
||||
|
||||
ncclResult_t rocmLibraryInit(void);
|
||||
|
||||
#endif
|
||||
Archivo normal → Archivo ejecutable
@@ -0,0 +1,142 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_STRONGSTREAM_H_
|
||||
#define NCCL_STRONGSTREAM_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "checks.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes
 * easily. When CUDART_VERSION < 11030 (or is undefined) the struct has no
 * members and the helpers below collapse to constants.
 */
struct ncclCudaGraph {
#if CUDART_VERSION >= 11030
  cudaGraph_t graph;
  uint64_t graphId;
#endif
};

// Returns the value that stands for "no graph": a null handle and an
// all-ones graphId.
inline struct ncclCudaGraph ncclCudaGraphNull() {
  struct ncclCudaGraph tmp;
#if CUDART_VERSION >= 11030
  tmp.graph = nullptr;
  // UINT64_MAX matches graphId's type and is provided by <stdint.h>, which
  // this header includes directly (ULLONG_MAX would need <limits.h>).
  tmp.graphId = UINT64_MAX;
#endif
  return tmp;
}

// True when `graph` holds a non-null handle; always false without graph support.
inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) {
#if CUDART_VERSION >= 11030
  return graph.graph != nullptr;
#else
  return false;
#endif
}

// True when both values carry the same graphId; without graph support every
// value compares as the same.
inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) {
#if CUDART_VERSION >= 11030
  return a.graphId == b.graphId;
#else
  return true;
#endif
}
|
||||
|
||||
ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, hipStream_t stream);
|
||||
ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, hipHostFn_t fn, void* arg);
|
||||
|
||||
|
||||
/* ncclStrongStream: An abstraction over CUDA streams that do not lose their
|
||||
* identity while being captured. Regular streams have the deficiency that the
|
||||
* captured form of a stream in one graph launch has no relation to the
|
||||
* uncaptured stream or to the captured form in other graph launches. This makes
|
||||
* streams unfit for the use of serializing access to a persistent resource.
|
||||
* Strong streams have been introduced to address this need.
|
||||
*
|
||||
* Constraints of using strong streams:
|
||||
*
|
||||
* - Operations that enqueue work to the strong stream need to be enclosed by
|
||||
* ncclStrongStream[Acquire/Release] pairs. Acquire/release act like fences,
|
||||
* the strong stream is not stateful so there is no harm in redundant acquire
|
||||
* or releases.
|
||||
*
|
||||
* - An {Acquire; ...; Release} sequence must not be concurrent with any
|
||||
* other operations against the strong stream including graph launches which
|
||||
* reference this stream.
|
||||
*
|
||||
* - All strong stream functions take a "graph" parameter which must reference
|
||||
* the currently capturing graph, or null if none.
|
||||
*/
|
||||
struct ncclStrongStream;
|
||||
|
||||
ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss);
|
||||
ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss);
|
||||
|
||||
// Has this strong stream ever been captured in a graph.
|
||||
bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss);
|
||||
|
||||
// Acquire-fence the strong stream.
|
||||
ncclResult_t ncclStrongStreamAcquire(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss
|
||||
);
|
||||
|
||||
// Acquire-fence the strong stream assuming no graph is capturing. This permits
|
||||
// the caller to enqueue directly to the `ss->stream` member using native CUDA
|
||||
// calls. Strong stream must be released via:
|
||||
// ncclStrongStreamRelease(ncclCudaGraphNull(), ss);
|
||||
ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss);
|
||||
|
||||
// Release-fence of the strong stream.
|
||||
ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss);
|
||||
|
||||
// Add a host launch to the stream.
|
||||
ncclResult_t ncclStrongStreamLaunchHost(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss,
|
||||
hipHostFn_t fn, void* arg
|
||||
);
|
||||
// Add a kernel launch to the stream.
|
||||
ncclResult_t ncclStrongStreamLaunchKernel(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* ss,
|
||||
void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes
|
||||
);
|
||||
// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
|
||||
ncclResult_t ncclStrongStreamWaitStream(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
|
||||
);
|
||||
// `b` must be capturing within `graph`.
|
||||
ncclResult_t ncclStrongStreamWaitStream(
|
||||
struct ncclCudaGraph graph, struct ncclStrongStream* a, hipStream_t b
|
||||
);
|
||||
// `a` must be capturing within `graph`.
|
||||
ncclResult_t ncclStrongStreamWaitStream(
|
||||
struct ncclCudaGraph graph, hipStream_t a, struct ncclStrongStream* b
|
||||
);
|
||||
|
||||
// Synchronization does not need the strong stream to be acquired.
|
||||
ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ncclStrongStream {
|
||||
hipStream_t stream;
|
||||
hipEvent_t event;
|
||||
#if CUDART_VERSION >= 11030
|
||||
cudaGraphNode_t node; // null if never captured, otherwise never null again
|
||||
uint64_t graphId:63, eventIsLagging:1;
|
||||
#endif
|
||||
};
|
||||
|
||||
// Reports whether the strong stream has a graph node recorded, i.e. whether
// ss->node is non-null. Without graph support (CUDART_VERSION < 11030) the
// answer is always false.
inline bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss) {
#if CUDART_VERSION >= 11030
  bool const everCaptured = (ss->node != nullptr);
  return everCaptured;
#else
  return false;
#endif
}
|
||||
|
||||
#endif
|
||||
@@ -21,7 +21,12 @@
|
||||
|
||||
#include "proxy.h"
|
||||
|
||||
extern struct ncclTransport ncclTransports[];
|
||||
extern struct ncclTransport p2pTransport;
|
||||
extern struct ncclTransport shmTransport;
|
||||
extern struct ncclTransport netTransport;
|
||||
extern struct ncclTransport collNetTransport;
|
||||
|
||||
extern struct ncclTransport* ncclTransports[];
|
||||
|
||||
// Forward declarations
|
||||
struct ncclRing;
|
||||
@@ -66,7 +71,7 @@ struct ncclTransport {
|
||||
struct ncclTransportComm recv;
|
||||
};
|
||||
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
|
||||
|
||||
enum { collNetRecv=0, collNetSend=1 };
|
||||
|
||||
+433
-64
@@ -8,8 +8,12 @@
|
||||
#define NCCL_UTILS_H_
|
||||
|
||||
#include "nccl.h"
|
||||
#include "alloc.h"
|
||||
#include "checks.h"
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
#include <sched.h>
|
||||
#include <new>
|
||||
|
||||
int ncclCudaCompCap();
|
||||
|
||||
@@ -38,81 +42,446 @@ static long log2i(long n) {
|
||||
return l;
|
||||
}
|
||||
|
||||
// Recyclable list that avoids frequent malloc/free
|
||||
// Reads CLOCK_MONOTONIC and returns the reading as a single count of
// nanoseconds (seconds scaled by 1e9 plus the nanosecond remainder).
inline uint64_t clockNano() {
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);
  uint64_t const wholeSeconds = (uint64_t)now.tv_sec;
  return wholeSeconds*1000*1000*1000 + now.tv_nsec;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Atomically adds one to *refs using relaxed memory ordering.
// The updated value is not reported to the caller.
template<typename Int>
inline void ncclAtomicRefCountIncrement(Int* refs) {
  (void)__atomic_add_fetch(refs, 1, __ATOMIC_RELAXED);
}
|
||||
|
||||
// Atomically subtracts one from *refs with acquire-release ordering and
// returns the value the counter holds after the subtraction.
template<typename Int>
inline Int ncclAtomicRefCountDecrement(Int* refs) {
  Int const remaining = __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL);
  return remaining;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that
|
||||
* granularity of LIFO is not per object, instead frames containing many objects
|
||||
* are pushed and popped. Therefore deallocation is extremely cheap since it's
|
||||
* done at the frame granularity.
|
||||
*
|
||||
* The initial state of the stack is with one frame, the "nil" frame, which
|
||||
* cannot be popped. Therefore objects allocated in the nil frame cannot be
|
||||
* deallocated sooner than stack destruction.
|
||||
*/
|
||||
struct ncclMemoryStack;
|
||||
|
||||
void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
|
||||
void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
|
||||
void ncclMemoryStackPush(struct ncclMemoryStack* me);
|
||||
void ncclMemoryStackPop(struct ncclMemoryStack* me);
|
||||
template<typename T>
|
||||
struct ncclListElem {
|
||||
T data;
|
||||
struct ncclListElem* next;
|
||||
T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/* ncclMemoryPool: A free-list of same-sized allocations. It is invalid for
|
||||
* a pool instance to ever hold objects whose types have differing
|
||||
* (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by
|
||||
* a backing `ncclMemoryStack` passed during Alloc(). If memory
|
||||
* backing any currently held object is deallocated then it is an error to do
|
||||
* anything other than reconstruct it, after which it is a valid empty pool.
|
||||
*/
|
||||
struct ncclMemoryPool;
|
||||
|
||||
// Equivalent to zero-initialization
|
||||
void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
|
||||
template<typename T>
|
||||
T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
|
||||
template<typename T>
|
||||
void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
|
||||
void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
|
||||
* field is given via the `next` template argument.
|
||||
*
|
||||
* Example:
|
||||
* struct Foo {
|
||||
* struct Foo *next1, *next2; // can be a member of two lists at once
|
||||
* };
|
||||
* ncclIntruQueue<Foo, &Foo::next1> list1;
|
||||
* ncclIntruQueue<Foo, &Foo::next2> list2;
|
||||
*/
|
||||
template<typename T, T *T::*next>
|
||||
struct ncclIntruQueue;
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me);
|
||||
template<typename T, T *T::*next>
|
||||
bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me);
|
||||
template<typename T, T *T::*next>
|
||||
T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me);
|
||||
template<typename T, T *T::*next>
|
||||
void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x);
|
||||
template<typename T, T *T::*next>
|
||||
T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me);
|
||||
template<typename T, T *T::*next>
|
||||
T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me);
|
||||
template<typename T, T *T::*next>
|
||||
void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *memPool);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
|
||||
* and "cond" fields are part of the public interface.
|
||||
*/
|
||||
struct ncclThreadSignal {
|
||||
pthread_mutex_t mutex;
|
||||
pthread_cond_t cond;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
class ncclRecyclableList {
|
||||
private:
|
||||
struct ncclListElem<T>* head;
|
||||
struct ncclListElem<T>* tail;
|
||||
struct ncclListElem<T>* cursor;
|
||||
int n;
|
||||
// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
|
||||
constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();
|
||||
|
||||
public:
|
||||
ncclRecyclableList() {
|
||||
tail = cursor = head = NULL;
|
||||
n = 0;
|
||||
}
|
||||
void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
|
||||
void ncclThreadSignalDestruct(struct ncclThreadSignal* me);
|
||||
|
||||
int count() const { return n; }
|
||||
// A convenience instance per-thread.
|
||||
extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;
|
||||
|
||||
// Get a new element from the list and return pointer
|
||||
ncclResult_t getNewElem(T** dataOut) {
|
||||
if (tail != NULL) {
|
||||
*dataOut = &tail->data;
|
||||
memset(*dataOut, 0, sizeof(T));
|
||||
} else {
|
||||
NCCLCHECK(ncclCalloc(&tail, 1));
|
||||
*dataOut = &tail->data;
|
||||
cursor = head = tail;
|
||||
}
|
||||
if (tail->next == NULL) {
|
||||
NCCLCHECK(ncclCalloc(&tail->next, 1));
|
||||
}
|
||||
tail = tail->next;
|
||||
n += 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
T* begin() {
|
||||
if (head == NULL || head == tail) return NULL;
|
||||
cursor = head->next;
|
||||
return &head->data;
|
||||
}
|
||||
template<typename T, T *T::*next>
|
||||
struct ncclIntruQueueMpsc;
|
||||
|
||||
// Get next element from the list during an iteration
|
||||
T* getNext() {
|
||||
// tail always points to the next element to be enqueued
|
||||
// hence does not contain valid data
|
||||
if (cursor == NULL || cursor == tail) return NULL;
|
||||
T* rv = &cursor->data;
|
||||
cursor = cursor->next;
|
||||
return rv;
|
||||
}
|
||||
template<typename T, T *T::*next>
|
||||
void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T,next>* me);
|
||||
template<typename T, T *T::*next>
|
||||
bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T,next>* me);
|
||||
// Enqueue element. Returns true if queue is not abandoned. Even if queue is
|
||||
// abandoned the element is enqueued, so the caller needs to make arrangements for
|
||||
// the queue to be tended.
|
||||
template<typename T, T *T::*next>
|
||||
bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T,next>* me, T* x);
|
||||
// Dequeue all elements at a glance. If there aren't any and `waitSome` is
|
||||
// true then this call will wait until it can return a non empty list.
|
||||
template<typename T, T *T::*next>
|
||||
T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T,next>* me, bool waitSome);
|
||||
// Dequeue all elements and set queue to abandoned state.
|
||||
template<typename T, T *T::*next>
|
||||
T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc<T,next>* me);
|
||||
|
||||
T* peakNext() {
|
||||
if (cursor == NULL || cursor == tail) return NULL;
|
||||
return &cursor->data;
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Recycle the list without freeing the space
|
||||
void recycle() {
|
||||
tail = cursor = head;
|
||||
n = 0;
|
||||
}
|
||||
// State of a frame-based bump allocator: allocate() advances
// topFrame.bumper inside the current hunk, and ncclMemoryStackPush/Pop save
// and restore the top frame.
struct ncclMemoryStack {
|
||||
// Header describing one hunk of backing memory.
struct Hunk {
|
||||
struct Hunk* above; // reverse stack pointer
|
||||
size_t size; // size of this allocation (including this header struct)
|
||||
};
|
||||
struct Unhunk { // proxy header for objects allocated out-of-hunk
|
||||
struct Unhunk* next;
|
||||
// Freed with free() when the owning frame is popped.
void* obj;
|
||||
};
|
||||
// Allocation state that ncclMemoryStackPush snapshots and
// ncclMemoryStackPop restores.
struct Frame {
|
||||
struct Hunk* hunk; // top of non-empty hunks
|
||||
uintptr_t bumper, end; // points into top hunk
|
||||
// Out-of-hunk objects owned by this frame.
struct Unhunk* unhunks;
|
||||
// Snapshot of the frame underneath; nullptr after construction.
struct Frame* below;
|
||||
};
|
||||
|
||||
// NOTE(review): this destructor is named for ncclRecyclableList and uses
// members this struct does not declare; it looks like removed-side diff
// residue from the page extraction — confirm before relying on it.
~ncclRecyclableList() {
|
||||
while (head != NULL) {
|
||||
struct ncclListElem<T>* temp = head;
|
||||
head = head->next;
|
||||
free(temp);
|
||||
}
|
||||
}
|
||||
// Slow path taken by allocate() when the request does not fit in the
// current hunk (defined elsewhere).
static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align);
|
||||
// Raw allocation; does not zero the returned memory.
static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align);
|
||||
|
||||
// Zero-sized placeholder hunk that the initial top frame points at.
struct Hunk stub;
|
||||
// Frame currently being allocated from.
struct Frame topFrame;
|
||||
};
|
||||
|
||||
inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) {
|
||||
me->stub.above = nullptr;
|
||||
me->stub.size = 0;
|
||||
me->topFrame.hunk = &me->stub;
|
||||
me->topFrame.bumper = 0;
|
||||
me->topFrame.end = 0;
|
||||
me->topFrame.unhunks = nullptr;
|
||||
me->topFrame.below = nullptr;
|
||||
}
|
||||
|
||||
inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) {
|
||||
uintptr_t o = (me->topFrame.bumper + align-1) & -uintptr_t(align);
|
||||
void* obj;
|
||||
if (__builtin_expect(o + size <= me->topFrame.end, true)) {
|
||||
me->topFrame.bumper = o + size;
|
||||
obj = reinterpret_cast<void*>(o);
|
||||
} else {
|
||||
obj = allocateSpilled(me, size, align);
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
|
||||
void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T));
|
||||
memset(obj, 0, n*sizeof(T));
|
||||
return (T*)obj;
|
||||
}
|
||||
|
||||
inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
|
||||
using Frame = ncclMemoryStack::Frame;
|
||||
Frame tmp = me->topFrame;
|
||||
Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame));
|
||||
*snapshot = tmp; // C++ struct assignment
|
||||
me->topFrame.unhunks = nullptr;
|
||||
me->topFrame.below = snapshot;
|
||||
}
|
||||
|
||||
inline void ncclMemoryStackPop(struct ncclMemoryStack* me) {
|
||||
ncclMemoryStack::Unhunk* un = me->topFrame.unhunks;
|
||||
while (un != nullptr) {
|
||||
free(un->obj);
|
||||
un = un->next;
|
||||
}
|
||||
me->topFrame = *me->topFrame.below; // C++ struct assignment
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Singly linked free list of recycled cells. ncclMemoryPoolAlloc pops from
// `head` (falling back to a backing ncclMemoryStack when the list is empty)
// and ncclMemoryPoolFree pushes onto `head`.
struct ncclMemoryPool {
  // Link overlaid on the start of a cell while it sits in the free list.
  struct Cell {
    Cell *next;
  };

  // Storage for one object of `Size` bytes aligned to `Align`, sharing its
  // bytes with the free-list link.
  template<int Size, int Align>
  union CellSized {
    Cell cell;
    alignas(Align) char space[Size];
  };

  Cell* head; // first free cell; nullptr when the list is empty
  Cell* tail; // meaningful only when head != nullptr
};
|
||||
|
||||
// Start with an empty free list. `tail` is deliberately left untouched: it is
// only read while `head` is non-null.
inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) {
  me->head = nullptr;
}
|
||||
|
||||
template<typename T>
|
||||
inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
|
||||
using Cell = ncclMemoryPool::Cell;
|
||||
using CellSized = ncclMemoryPool::CellSized<sizeof(T), alignof(T)>;
|
||||
Cell* cell;
|
||||
if (__builtin_expect(me->head != nullptr, true)) {
|
||||
cell = me->head;
|
||||
me->head = cell->next;
|
||||
} else {
|
||||
// Use the internal allocate() since it doesn't memset to 0 yet.
|
||||
cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(CellSized), alignof(CellSized));
|
||||
}
|
||||
memset(cell, 0, sizeof(T));
|
||||
return reinterpret_cast<T*>(cell);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) {
|
||||
using Cell = ncclMemoryPool::Cell;
|
||||
Cell* cell = reinterpret_cast<Cell*>(obj);
|
||||
cell->next = me->head;
|
||||
if (me->head == nullptr) me->tail = cell;
|
||||
me->head = cell;
|
||||
}
|
||||
|
||||
inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) {
|
||||
if (from->head != nullptr) {
|
||||
from->tail->next = me->head;
|
||||
if (me->head == nullptr) me->tail = from->tail;
|
||||
me->head = from->head;
|
||||
from->head = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Intrusive singly linked FIFO: elements are chained through their own
// `next` member pointer, so the queue stores only the two end pointers.
template<typename T, T *T::*next>
struct ncclIntruQueue {
  T *head; // oldest element; nullptr when the queue is empty
  T *tail; // newest element
};
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
inline void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me) {
|
||||
me->head = nullptr;
|
||||
me->tail = nullptr;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
inline bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me) {
|
||||
return me->head == nullptr;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
inline T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me) {
|
||||
return me->head;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
inline T* ncclIntruQueueTail(ncclIntruQueue<T,next> *me) {
|
||||
return me->tail;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
inline void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x) {
|
||||
x->*next = nullptr;
|
||||
(me->head ? me->tail->*next : me->head) = x;
|
||||
me->tail = x;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
inline T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me) {
|
||||
T *ans = me->head;
|
||||
me->head = ans->*next;
|
||||
if (me->head == nullptr) me->tail = nullptr;
|
||||
return ans;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me) {
|
||||
T *ans = me->head;
|
||||
if (ans != nullptr) {
|
||||
me->head = ans->*next;
|
||||
if (me->head == nullptr) me->tail = nullptr;
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *pool) {
|
||||
T *head = me->head;
|
||||
me->head = nullptr;
|
||||
me->tail = nullptr;
|
||||
while (head != nullptr) {
|
||||
T *tmp = head->*next;
|
||||
ncclMemoryPoolFree(pool, tmp);
|
||||
head = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compile-time initial value for a ncclThreadSignal, built from the pthread
// static initializers for its mutex and condition variable.
constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() {
  return ncclThreadSignal{PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER};
}
|
||||
|
||||
// Run-time initialisation of a ncclThreadSignal with default attributes.
// The pthread return codes are not inspected.
inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) {
  pthread_mutex_t* mtx = &me->mutex;
  pthread_cond_t* cv = &me->cond;
  pthread_mutex_init(mtx, nullptr);
  pthread_cond_init(cv, nullptr);
}
|
||||
|
||||
// Destroy the mutex and condition variable of `me`.
// The pthread return codes are not inspected.
inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) {
  pthread_mutex_t* mtx = &me->mutex;
  pthread_cond_t* cv = &me->cond;
  pthread_mutex_destroy(mtx);
  pthread_cond_destroy(cv);
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Intrusive queue whose `tail` word is updated with atomic exchanges by
// enqueuers and drained in one go by the dequeue/abandon functions.
//
// `tail` is either a small state code or the address of the last element:
//   0x0  empty
//   0x1  empty, a consumer has registered `waiting` and may be asleep
//   0x2  abandoned
//   else pointer to the most recently enqueued element
template<typename T, T *T::*next>
struct ncclIntruQueueMpsc {
  T* head;                          // first element of the pending chain, or nullptr
  uintptr_t tail;                   // state code or last element, see above
  struct ncclThreadSignal* waiting; // signal of the sleeping consumer
};
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T,next>* me) {
|
||||
me->head = nullptr;
|
||||
me->tail = 0x0;
|
||||
me->waiting = nullptr;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T,next>* me) {
|
||||
return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc<T,next>* me, T* x) {
|
||||
__atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED);
|
||||
uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast<uintptr_t>(x), __ATOMIC_ACQ_REL);
|
||||
T* prev = reinterpret_cast<T*>(utail);
|
||||
T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next);
|
||||
__atomic_store_n(prevNext, x, __ATOMIC_RELAXED);
|
||||
if (utail == 0x1) { // waiting
|
||||
__atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting
|
||||
// This lock/unlock is essential to ensure we don't race ahead of the consumer
|
||||
// and signal the cond before they begin waiting on it.
|
||||
struct ncclThreadSignal* waiting = me->waiting;
|
||||
pthread_mutex_lock(&waiting->mutex);
|
||||
pthread_mutex_unlock(&waiting->mutex);
|
||||
pthread_cond_broadcast(&waiting->cond);
|
||||
}
|
||||
return utail != 0x2; // not abandoned
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc<T,next>* me, bool waitSome) {
|
||||
T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
|
||||
if (head == nullptr) {
|
||||
if (!waitSome) return nullptr;
|
||||
uint64_t t0 = clockNano();
|
||||
bool sleeping = false;
|
||||
do {
|
||||
if (clockNano()-t0 >= 10*1000) { // spin for first 10us
|
||||
struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance;
|
||||
pthread_mutex_lock(&waitSignal->mutex);
|
||||
uintptr_t expected = sleeping ? 0x1 : 0x0;
|
||||
uintptr_t desired = 0x1;
|
||||
me->waiting = waitSignal; // release done by successful compare exchange
|
||||
if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
|
||||
sleeping = true;
|
||||
pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
|
||||
}
|
||||
pthread_mutex_unlock(&waitSignal->mutex);
|
||||
}
|
||||
head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
|
||||
} while (head == nullptr);
|
||||
}
|
||||
|
||||
__atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
|
||||
uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL);
|
||||
T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
|
||||
T *x = head;
|
||||
while (x != tail) {
|
||||
T *x1;
|
||||
int spins = 0;
|
||||
while (true) {
|
||||
x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
|
||||
if (x1 != nullptr) break;
|
||||
if (++spins == 1024) { spins = 1024-1; sched_yield(); }
|
||||
}
|
||||
x = x1;
|
||||
}
|
||||
return head;
|
||||
}
|
||||
|
||||
template<typename T, T *T::*next>
|
||||
T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc<T,next>* me) {
|
||||
uintptr_t expected = 0x0;
|
||||
if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
|
||||
return nullptr;
|
||||
} else {
|
||||
int spins = 0;
|
||||
T* head;
|
||||
while (true) {
|
||||
head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
|
||||
if (head != nullptr) break;
|
||||
if (++spins == 1024) { spins = 1024-1; sched_yield(); }
|
||||
}
|
||||
__atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
|
||||
uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL);
|
||||
T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
|
||||
T *x = head;
|
||||
while (x != tail) {
|
||||
T *x1;
|
||||
spins = 0;
|
||||
while (true) {
|
||||
x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
|
||||
if (x1 != nullptr) break;
|
||||
if (++spins == 1024) { spins = 1024-1; sched_yield(); }
|
||||
}
|
||||
x = x1;
|
||||
}
|
||||
return head;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
+421
-312
La diferencia del archivo ha sido suprimido porque es demasiado grande
Cargar Diff
@@ -45,12 +45,7 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
|
||||
return ncclInvalidArgument;
|
||||
}
|
||||
// Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P/AllToAllPivot calls to chars.
|
||||
info->nBytes = info->count * ncclTypeSize(info->datatype);
|
||||
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAllPivot) {
|
||||
info->count = info->nBytes;
|
||||
info->datatype = ncclInt8;
|
||||
}
|
||||
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
|
||||
NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks));
|
||||
|
||||
if (info->op < 0 || ncclMaxRedOp < info->op) {
|
||||
WARN("%s : invalid reduction operation %d", info->opName, info->op);
|
||||
|
||||
@@ -0,0 +1,163 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "nccl.h"
|
||||
#include "debug.h"
|
||||
#include "cudawrap.h"
|
||||
|
||||
#include <dlfcn.h>
|
||||
|
||||
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
|
||||
DECLARE_CUDA_PFN(cuDeviceGet);
|
||||
DECLARE_CUDA_PFN(cuDeviceGetAttribute);
|
||||
DECLARE_CUDA_PFN(cuGetErrorString);
|
||||
DECLARE_CUDA_PFN(cuGetErrorName);
|
||||
/* enqueue.cc */
|
||||
DECLARE_CUDA_PFN(cuMemGetAddressRange);
|
||||
/* proxy.cc */
|
||||
DECLARE_CUDA_PFN(cuCtxCreate_v3020);
|
||||
DECLARE_CUDA_PFN(cuCtxDestroy);
|
||||
DECLARE_CUDA_PFN(cuCtxSetCurrent);
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* transport/collNet.cc/net.cc*/
|
||||
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* CUDA Driver functions loaded with dlsym() */
|
||||
DECLARE_CUDA_PFN(cuInit);
|
||||
DECLARE_CUDA_PFN(cuDriverGetVersion);
|
||||
DECLARE_CUDA_PFN(cuGetProcAddress);
|
||||
|
||||
static enum { cudaUninitialized, cudaInitializing, cudaInitialized, cudaError } cudaState = cudaUninitialized;
|
||||
|
||||
#define CUDA_DRIVER_MIN_VERSION 11030
|
||||
|
||||
static void *cudaLib;
|
||||
static int cudaDriverVersion;
|
||||
|
||||
#if CUDART_VERSION >= 11030
|
||||
/*
|
||||
Load the CUDA symbols
|
||||
*/
|
||||
static int cudaPfnFuncLoader(void) {
|
||||
CUresult res;
|
||||
|
||||
#define LOAD_SYM(symbol, ignore) do { \
|
||||
res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), cudaDriverVersion, 0); \
|
||||
if (res != 0) { \
|
||||
if (!ignore) { \
|
||||
WARN("Retrieve %s version %d failed with %d", #symbol, cudaDriverVersion, res); \
|
||||
return ncclSystemError; } \
|
||||
} } while(0)
|
||||
|
||||
LOAD_SYM(cuGetErrorString, 0);
|
||||
LOAD_SYM(cuGetErrorName, 0);
|
||||
LOAD_SYM(cuDeviceGet, 0);
|
||||
LOAD_SYM(cuDeviceGetAttribute, 0);
|
||||
LOAD_SYM(cuMemGetAddressRange, 1);
|
||||
LOAD_SYM(cuCtxCreate_v3020, 1);
|
||||
LOAD_SYM(cuCtxDestroy, 1);
|
||||
LOAD_SYM(cuCtxSetCurrent, 1);
|
||||
#if CUDA_VERSION >= 11070
|
||||
LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Load the CUDA driver library once per process and resolve the entry
// points used through the pfn_* pointers.
//
// The library is "libcuda.so", prefixed verbatim with the value of the
// NCCL_CUDA_PATH environment variable when it is set (no separator is
// inserted, so the value needs its own trailing '/').
//
// Returns ncclSuccess once initialisation has completed, ncclSystemError if
// it failed now or on an earlier call. The first caller performs the work;
// concurrent callers yield until it has finished and report its outcome.
//
// Fix: the dlopen-failure warning passed `ncclCudaPath` to both "%s"
// conversions. It is NULL whenever NCCL_CUDA_PATH is unset (undefined
// behaviour for "%s"), and the message never showed the path that was
// actually tried. The warning now prints `path` and a null-safe value.
ncclResult_t cudaLibraryInit(void) {
  CUresult res;

  if (cudaState == cudaInitialized)
    return ncclSuccess;
  if (cudaState == cudaError)
    return ncclSystemError;

  if (__sync_bool_compare_and_swap(&cudaState, cudaUninitialized, cudaInitializing) == false) {
    // Another thread raced in front of us. Wait for it to be done.
    while (cudaState == cudaInitializing) sched_yield();
    return (cudaState == cudaInitialized) ? ncclSuccess : ncclSystemError;
  }

  /*
   * Load CUDA driver library
   */
  char path[1024];
  char *ncclCudaPath = getenv("NCCL_CUDA_PATH");
  if (ncclCudaPath == NULL)
    snprintf(path, sizeof(path), "%s", "libcuda.so");
  else
    snprintf(path, sizeof(path), "%s%s", ncclCudaPath, "libcuda.so");

  cudaLib = dlopen(path, RTLD_LAZY);
  if (cudaLib == NULL) {
    WARN("Failed to find CUDA library in %s (NCCL_CUDA_PATH=%s)", path, ncclCudaPath ? ncclCudaPath : "(null)");
    goto error;
  }

  /*
   * Load initial CUDA functions
   */

  pfn_cuInit = (PFN_cuInit) dlsym(cudaLib, "cuInit");
  if (pfn_cuInit == NULL) {
    WARN("Failed to load CUDA missing symbol cuInit");
    goto error;
  }

  pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion) dlsym(cudaLib, "cuDriverGetVersion");
  if (pfn_cuDriverGetVersion == NULL) {
    WARN("Failed to load CUDA missing symbol cuDriverGetVersion");
    goto error;
  }

  res = pfn_cuDriverGetVersion(&cudaDriverVersion);
  if (res != 0) {
    WARN("cuDriverGetVersion failed with %d", res);
    goto error;
  }

  INFO(NCCL_INIT, "cudaDriverVersion %d", cudaDriverVersion);

  if (cudaDriverVersion < CUDA_DRIVER_MIN_VERSION) {
    // Too old a driver: fail without a warning, for backwards compatibility.
    goto error;
  }

  pfn_cuGetProcAddress = (PFN_cuGetProcAddress) dlsym(cudaLib, "cuGetProcAddress");
  if (pfn_cuGetProcAddress == NULL) {
    WARN("Failed to load CUDA missing symbol cuGetProcAddress");
    goto error;
  }

  /*
   * Required to initialize the CUDA Driver.
   * Multiple calls of cuInit() will return immediately
   * without making any relevant change
   */
  pfn_cuInit(0);

#if CUDART_VERSION >= 11030
  if (cudaPfnFuncLoader()) {
    WARN("CUDA some PFN functions not found in the library");
    goto error;
  }
#endif

  cudaState = cudaInitialized;
  return ncclSuccess;

error:
  cudaState = cudaError;
  return ncclSystemError;
}
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ ncclResult_t wrap_gdr_symbols(void) {
|
||||
|
||||
if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) {
|
||||
// Another thread raced in front of us. Wait for it to be done.
|
||||
while (gdrState == gdrInitializing) pthread_yield();
|
||||
while (gdrState == gdrInitializing) sched_yield();
|
||||
return (gdrState == gdrInitialized) ? ncclSuccess : ncclSystemError;
|
||||
}
|
||||
|
||||
|
||||
+20
-3
@@ -30,6 +30,8 @@ struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
|
||||
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
|
||||
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
|
||||
struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
|
||||
/* DMA-BUF support */
|
||||
struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
|
||||
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
|
||||
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
|
||||
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
|
||||
@@ -49,7 +51,7 @@ ncclResult_t wrap_ibv_symbols(void) {
|
||||
|
||||
if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) {
|
||||
// Another thread raced in front of us. Wait for it to be done.
|
||||
while (ibvState == ibvInitializing) pthread_yield();
|
||||
while (ibvState == ibvInitializing) sched_yield();
|
||||
return (ibvState == ibvInitialized) ? ncclSuccess : ncclSystemError;
|
||||
}
|
||||
|
||||
@@ -98,6 +100,8 @@ ncclResult_t wrap_ibv_symbols(void) {
|
||||
LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
|
||||
// Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
|
||||
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
|
||||
// Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12
|
||||
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12");
|
||||
LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
|
||||
LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
|
||||
LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
|
||||
@@ -126,6 +130,7 @@ teardown:
|
||||
ibv_internal_dealloc_pd = NULL;
|
||||
ibv_internal_reg_mr = NULL;
|
||||
ibv_internal_reg_mr_iova2 = NULL;
|
||||
ibv_internal_reg_dmabuf_mr = NULL;
|
||||
ibv_internal_dereg_mr = NULL;
|
||||
ibv_internal_create_cq = NULL;
|
||||
ibv_internal_destroy_cq = NULL;
|
||||
@@ -259,7 +264,7 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
|
||||
IBV_PTR_CHECK(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
|
||||
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
|
||||
}
|
||||
|
||||
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
|
||||
@@ -275,7 +280,19 @@ ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void
|
||||
return ncclInternalError;
|
||||
}
|
||||
if (ret == NULL) { return ncclSuccess; } // Assume dummy call
|
||||
IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
|
||||
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
|
||||
}
|
||||
|
||||
/* DMA-BUF support */
|
||||
// Checked wrapper around the dynamically loaded ibv_reg_dmabuf_mr: forwards
// all arguments to ibv_internal_reg_dmabuf_mr and hands the result to
// IBV_PTR_CHECK_ERRNO together with *ret and NULL.
// NOTE(review): the macro is defined elsewhere; presumably it stores the
// returned pointer in *ret, treats NULL as failure and returns from this
// function — confirm against its definition.
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
|
||||
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
|
||||
}
|
||||
|
||||
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
|
||||
if (ibv_internal_reg_dmabuf_mr == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
|
||||
}
|
||||
|
||||
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "nccl.h"
|
||||
#include "debug.h"
|
||||
#include "rocmwrap.h"
|
||||
|
||||
#include <dlfcn.h>
|
||||
|
||||
#define DECLARE_ROCM_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
|
||||
|
||||
DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
|
||||
|
||||
/* ROCr Driver functions loaded with dlsym() */
|
||||
DECLARE_ROCM_PFN(hsa_init);
|
||||
DECLARE_ROCM_PFN(hsa_system_get_info);
|
||||
DECLARE_ROCM_PFN(hsa_status_string);
|
||||
|
||||
static enum { hsaUninitialized, hsaInitializing, hsaInitialized, hsaError } hsaState = hsaUninitialized;
|
||||
|
||||
static void *hsaLib;
|
||||
static uint16_t version_major, version_minor;
|
||||
|
||||
// Load the ROCr runtime library once per process and resolve the entry
// points used through the pfn_* pointers.
//
// The library is "libhsa-runtime64.so", prefixed verbatim with the value of
// the RCCL_ROCR_PATH environment variable when it is set (no separator is
// inserted, so the value needs its own trailing '/').
//
// Returns ncclSuccess once initialisation has completed, ncclSystemError if
// it failed now or on an earlier call. The first caller performs the work;
// concurrent callers yield until it has finished and report its outcome.
//
// Fix: the dlopen-failure warning passed the environment value to both "%s"
// conversions. It is NULL whenever RCCL_ROCR_PATH is unset (undefined
// behaviour for "%s"), and the message never showed the path that was
// actually tried. The warning now prints `path` and a null-safe value. The
// local holding the environment value is also renamed from the misleading
// `ncclCudaPath`.
ncclResult_t rocmLibraryInit(void) {
  hsa_status_t res;

  if (hsaState == hsaInitialized)
    return ncclSuccess;
  if (hsaState == hsaError)
    return ncclSystemError;

  if (__sync_bool_compare_and_swap(&hsaState, hsaUninitialized, hsaInitializing) == false) {
    // Another thread raced in front of us. Wait for it to be done.
    while (hsaState == hsaInitializing) sched_yield();
    return (hsaState == hsaInitialized) ? ncclSuccess : ncclSystemError;
  }

  /*
   * Load ROCr driver library
   */
  char path[1024];
  char *rocrPath = getenv("RCCL_ROCR_PATH");
  if (rocrPath == NULL)
    snprintf(path, sizeof(path), "%s", "libhsa-runtime64.so");
  else
    snprintf(path, sizeof(path), "%s%s", rocrPath, "libhsa-runtime64.so");

  hsaLib = dlopen(path, RTLD_LAZY);
  if (hsaLib == NULL) {
    WARN("Failed to find ROCm runtime library in %s (RCCL_ROCR_PATH=%s)", path, rocrPath ? rocrPath : "(null)");
    goto error;
  }

  /*
   * Load initial ROCr functions
   */

  pfn_hsa_init = (PFN_hsa_init) dlsym(hsaLib, "hsa_init");
  if (pfn_hsa_init == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_init");
    goto error;
  }

  pfn_hsa_system_get_info = (PFN_hsa_system_get_info) dlsym(hsaLib, "hsa_system_get_info");
  if (pfn_hsa_system_get_info == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_system_get_info");
    goto error;
  }

  pfn_hsa_status_string = (PFN_hsa_status_string) dlsym(hsaLib, "hsa_status_string");
  if (pfn_hsa_status_string == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_status_string");
    goto error;
  }

  // NOTE(review): these queries run before pfn_hsa_init() below. This
  // presumably relies on the runtime having been initialised already by the
  // HIP runtime in this process — confirm, or move the hsa_init call up.
  res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major);
  if (res != 0) {
    WARN("pfn_hsa_system_get_info failed with %d", res);
    goto error;
  }
  res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor);
  if (res != 0) {
    WARN("pfn_hsa_system_get_info failed with %d", res);
    goto error;
  }

  INFO(NCCL_INIT, "ROCr version %d.%d", version_major, version_minor);

  // No minimum ROCr version is enforced here.

  pfn_hsa_amd_portable_export_dmabuf = (PFN_hsa_amd_portable_export_dmabuf) dlsym(hsaLib, "hsa_amd_portable_export_dmabuf");
  if (pfn_hsa_amd_portable_export_dmabuf == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
    goto error;
  }

  /*
   * Required to initialize the ROCr Driver.
   * Multiple calls of hsa_init() will return immediately
   * without making any relevant change
   */
  pfn_hsa_init();

  hsaState = hsaInitialized;
  return ncclSuccess;

error:
  hsaState = hsaError;
  return ncclSystemError;
}
|
||||
|
||||
|
||||
@@ -59,15 +59,15 @@ ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void**
|
||||
|
||||
NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError);
|
||||
if (devShmPtr) {
|
||||
CUDACHECKGOTO(hipHostRegister(ptr, shmSize, hipHostRegisterMapped), res, cudaError);
|
||||
CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
|
||||
CUDACHECKGOTO(hipHostRegister(ptr, shmSize, hipHostRegisterMapped), res, hipError_t);
|
||||
CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError_t);
|
||||
}
|
||||
|
||||
*shmPtr = ptr;
|
||||
return ncclSuccess;
|
||||
sysError:
|
||||
WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmPath, shmSize);
|
||||
cudaError:
|
||||
hipError_t:
|
||||
if (fd != -1) close(fd);
|
||||
if (create) shm_unlink(shmPath);
|
||||
if (ptr != MAP_FAILED) munmap(ptr, shmSize);
|
||||
|
||||
Archivo normal → Archivo ejecutable
+24
-23
@@ -15,6 +15,9 @@
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <unordered_set>
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
|
||||
static std::vector<std::pair<int, std::unordered_set<std::string>>> clientPortPool;
|
||||
|
||||
/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
|
||||
@@ -337,9 +340,10 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
|
||||
#endif
|
||||
}
|
||||
|
||||
/* make all new sockets non-blocking */
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
if (sock->asyncFlag) {
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
}
|
||||
|
||||
// addr port should be 0 (Any port)
|
||||
SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
|
||||
@@ -378,7 +382,7 @@ static ncclResult_t getFdState(int fd, enum ncclSocketState* state) {
|
||||
SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
|
||||
}
|
||||
|
||||
if (ret == EINPROGRESS)
|
||||
if (ret == EINPROGRESS || ret == ECONNREFUSED)
|
||||
*state = ncclSocketConnecting;
|
||||
else if (ret == 0)
|
||||
*state = ncclSocketConnected;
|
||||
@@ -414,10 +418,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {
|
||||
|
||||
const int one = 1;
|
||||
SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
|
||||
|
||||
|
||||
/* support non-blocking socket; by default, the socket is non-blocking */
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
if (sock->asyncFlag) {
|
||||
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
|
||||
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
|
||||
}
|
||||
|
||||
/* const int bufsize = 128*1024;
|
||||
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
|
||||
@@ -458,31 +464,26 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {
|
||||
int timedout_retries = 0;
|
||||
int refused_retries = 0;
|
||||
retry:
|
||||
/* async connect; abort when error happens and abortFlag is present. */
|
||||
/* blocking/non-blocking connect() is determined by asyncFlag. */
|
||||
ret = connect(fd, &sock->addr.sa, salen);
|
||||
|
||||
if (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
|
||||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
|
||||
if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||
if (!sock->asyncFlag && (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
|
||||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES))) {
|
||||
if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
|
||||
usleep(SLEEP_INT);
|
||||
goto retry;
|
||||
} else if (errno == EINPROGRESS && !sock->asyncFlag) {
|
||||
enum ncclSocketState state;
|
||||
do {
|
||||
if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0);
|
||||
NCCLCHECK(getFdState(fd, &state));
|
||||
} while (state == ncclSocketConnecting);
|
||||
EQCHECK(state, ncclSocketError);
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
if (ret == 0 || (errno == EINPROGRESS && sock->asyncFlag)) {
|
||||
/* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again.
|
||||
* However, it can return EISCONN instead of success which indicates connection is built up in
|
||||
* background already. No need to call connect() again. */
|
||||
if (ret == 0 || ((errno == EINPROGRESS || errno == ECONNREFUSED) && sock->asyncFlag) || errno == EISCONN) {
|
||||
sock->fd = fd;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
}
|
||||
|
||||
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) {
|
||||
@@ -535,7 +536,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void*
|
||||
if (bytes == -1) {
|
||||
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
|
||||
WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
} else {
|
||||
bytes = 0;
|
||||
}
|
||||
@@ -555,7 +556,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
|
||||
if (closed) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,273 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "strongstream.h"
|
||||
#include "checks.h"
|
||||
#include "param.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Determine whether `stream` is currently being captured into a graph and
// describe that graph in `*graph`.
//
// When built with CUDART_VERSION >= 11030:
//  - driver older than 11.3: `*graph` is reset to "no graph" (nullptr /
//    ULLONG_MAX). A stream that is nevertheless being captured cannot be
//    supported and yields ncclInvalidUsage.
//  - otherwise: `graph->graph` / `graph->graphId` identify the capturing
//    graph, or are nullptr / ULLONG_MAX when the stream is not actively
//    capturing.
// In other builds `*graph` is left untouched.
//
// Returns ncclSuccess, ncclInvalidUsage as described above, or whatever
// CUDACHECK returns when a runtime call fails.
ncclResult_t ncclCudaGetCapturingGraph(
    struct ncclCudaGraph* graph, hipStream_t stream
  ) {
  #if CUDART_VERSION >= 11030
    // Driver version is queried once per thread; -1 means "not queried yet".
    thread_local int driver = -1;
    if (driver == -1) {
      CUDACHECK(cudaDriverGetVersion(&driver));
    }
    if (driver < 11030) {
      cudaStreamCaptureStatus status;
      unsigned long long gid;
      // Report "no graph" with the same sentinel id the newer-driver branch
      // uses, so callers never read an uninitialized graphId.
      graph->graph = nullptr;
      graph->graphId = ULLONG_MAX;
      CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid));
      if (status != cudaStreamCaptureStatusNone) {
        WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
        return ncclInvalidUsage;
      }
    } else {
      cudaStreamCaptureStatus status;
      unsigned long long gid;
      CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr));
      if (status != cudaStreamCaptureStatusActive) {
        // Not capturing: normalize the outputs to the "no graph" values.
        graph->graph = nullptr;
        gid = ULLONG_MAX;
      }
      graph->graphId = gid;
    }
  #endif
  return ncclSuccess;
}
|
||||
|
||||
// Arrange for `fn(arg)` to run when `graph` is destroyed, by wrapping it in a
// user object whose single reference is moved into the graph.
// Builds without CUDART_VERSION >= 11030 cannot do this and return
// ncclInvalidUsage.
ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, hipHostFn_t fn, void* arg) {
  #if CUDART_VERSION < 11030
    return ncclInvalidUsage;
  #else
    cudaUserObject_t cleanup;
    CUDACHECK(cudaUserObjectCreate(&cleanup, arg, fn, /*initialRefcount=*/1, cudaUserObjectNoDestructorSync));
    // The graph takes over our only reference (move semantics).
    CUDACHECK(cudaGraphRetainUserObject(graph.graph, cleanup, /*count=*/1, cudaGraphUserObjectMove));
    return ncclSuccess;
  #endif
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Initialize `ss`: create its non-blocking stream and its timing-disabled
// event, and (when graph support is compiled in) reset the graph bookkeeping.
// On failure nothing created here is left behind.
ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) {
  CUDACHECK(hipStreamCreateWithFlags(&ss->stream, hipStreamNonBlocking));
  // Create the event by hand rather than through CUDACHECK so the stream
  // created above can be released if this step fails.
  hipError_t eventStatus = hipEventCreateWithFlags(&ss->event, hipEventDisableTiming);
  if (eventStatus != hipSuccess) {
    (void)hipStreamDestroy(ss->stream); // Best effort; the event failure is what gets reported.
    CUDACHECK(eventStatus);
  }
  #if CUDART_VERSION >= 11030
    ss->node = nullptr;
    // 2^63-1; presumably the "never captured" value tested by
    // ncclStrongStreamEverCaptured -- confirm in strongstream.h.
    ss->graphId = (1ull<<(8*sizeof(long long)-1))-1;
    ss->eventIsLagging = 0;
  #endif
  return ncclSuccess;
}
|
||||
|
||||
// Release the resources owned by `ss`.
// ncclStrongStreamConstruct creates the event unconditionally (and the
// non-graph code paths in this file record and wait on it), so it has to be
// destroyed unconditionally as well; guarding the destroy with
// CUDART_VERSION leaked one event per strong stream on other builds.
ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) {
  CUDACHECK(hipEventDestroy(ss->event));
  CUDACHECK(hipStreamDestroy(ss->stream));
  return ncclSuccess;
}
|
||||
|
||||
// Tunable with default 1, read in this file through ncclParamGraphMixingSupport().
// Presumably backed by the NCCL_GRAPH_MIXING_SUPPORT environment variable --
// confirm against the NCCL_PARAM definition in param.h.
NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1)
|
||||
|
||||
// Prepare `ss` for work that will be issued either directly on its stream
// (graph.graph == nullptr) or into the capturing graph described by `graph`.
// Builds without CUDART_VERSION >= 11030 do nothing.
ncclResult_t ncclStrongStreamAcquire(
    struct ncclCudaGraph graph, struct ncclStrongStream* ss
  ) {
  #if CUDART_VERSION >= 11030
    const bool mixingEnabled = ncclParamGraphMixingSupport();

    if (graph.graph == nullptr) {
      // Uncaptured use: order the stream after the event, but only when
      // mixing is enabled and the stream has been captured before.
      if (mixingEnabled && ncclStrongStreamEverCaptured(ss)) {
        CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
        ss->eventIsLagging = 0;
      }
      return ncclSuccess;
    }

    // Already attached to this graph: keep the existing tip node.
    if (ss->graphId == graph.graphId) return ncclSuccess;

    if (mixingEnabled && ss->eventIsLagging) {
      // Can only be here if previous release was for uncaptured work that
      // elided updating the event because no capture had yet occurred.
      CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
      CUDACHECK(cudaEventRecord(ss->event, ss->stream));
    }
    ss->graphId = graph.graphId;
    ss->eventIsLagging = 0;
    // Root node of this stream's chain inside the new graph: an event wait
    // when mixing is enabled, otherwise a plain empty node.
    if (mixingEnabled) {
      CUDACHECK(cudaGraphAddEventWaitNode(&ss->node, graph.graph, nullptr, 0, ss->event));
    } else {
      CUDACHECK(cudaGraphAddEmptyNode(&ss->node, graph.graph, nullptr, 0));
    }
  #endif
  return ncclSuccess;
}
|
||||
|
||||
// Prepare `ss` for work issued directly on its stream, outside any graph.
// Unlike ncclStrongStreamAcquire this always marks the event as lagging.
// Builds without CUDART_VERSION >= 11030 do nothing.
ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) {
  #if CUDART_VERSION >= 11030
    if (ncclParamGraphMixingSupport()) {
      if (ncclStrongStreamEverCaptured(ss)) {
        CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
      }
    }
    // Assume the caller is going to add work to stream.
    ss->eventIsLagging = 1;
  #endif
  return ncclSuccess;
}
|
||||
|
||||
// Finish a batch of work on `ss`: when graph mixing is enabled and the event
// is lagging, bring the event up to date, either by recording it on the
// stream (uncaptured, and only if the stream was ever captured) or by
// appending an event-record node to the stream's chain in `graph`.
// Builds without CUDART_VERSION >= 11030 do nothing.
ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) {
  #if CUDART_VERSION >= 11030
    if (!ncclParamGraphMixingSupport() || !ss->eventIsLagging) return ncclSuccess;

    if (graph.graph != nullptr) {
      CUDACHECK(cudaGraphAddEventRecordNode(&ss->node, graph.graph, &ss->node, 1, ss->event));
      ss->eventIsLagging = 0;
    } else if (ncclStrongStreamEverCaptured(ss)) {
      CUDACHECK(cudaEventRecord(ss->event, ss->stream));
      ss->eventIsLagging = 0;
    }
    // Otherwise (uncaptured and never captured) the event stays lagging.
  #endif
  return ncclSuccess;
}
|
||||
|
||||
// Enqueue the host function `fn(arg)` on `ss`: directly on the stream when
// not capturing, or as a host node appended to the stream's chain in `graph`.
ncclResult_t ncclStrongStreamLaunchHost(
    struct ncclCudaGraph graph, struct ncclStrongStream* ss, hipHostFn_t fn, void* arg
  ) {
  #if CUDART_VERSION >= 11030
    if (graph.graph != nullptr) {
      cudaHostNodeParams hostParams;
      hostParams.fn = fn;
      hostParams.userData = arg;
      CUDACHECK(cudaGraphAddHostNode(&ss->node, graph.graph, &ss->node, 1, &hostParams));
    } else {
      CUDACHECK(cudaLaunchHostFunc(ss->stream, fn, arg));
    }
    // New work was added after the last event record.
    ss->eventIsLagging = 1;
  #else
    // A stream callback is used here instead of hipLaunchHostFunc.
    // NOTE(review): `fn` is cast from hipHostFn_t to hipStreamCallback_t; the
    // two are presumably different function signatures, so `fn` would be
    // invoked with stream-callback arguments -- confirm callers expect that.
    CUDACHECK(hipStreamAddCallback(ss->stream, (hipStreamCallback_t)fn, arg, 0));
  #endif
  return ncclSuccess;
}
|
||||
|
||||
// Launch kernel `fn` with the given geometry, arguments and dynamic shared
// memory on `ss`: directly on the stream when not capturing, or as a kernel
// node appended to the stream's chain in `graph`.
ncclResult_t ncclStrongStreamLaunchKernel(
    struct ncclCudaGraph graph, struct ncclStrongStream* ss,
    void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes
  ) {
  #if CUDART_VERSION >= 11030
    if (graph.graph != nullptr) {
      // Copy the current tip: ss->node is both the dependency and the output.
      cudaGraphNode_t previousTip = ss->node;
      cudaKernelNodeParams kernelParams;
      kernelParams.func = fn;
      kernelParams.gridDim = grid;
      kernelParams.blockDim = block;
      kernelParams.sharedMemBytes = sharedMemBytes;
      kernelParams.kernelParams = args;
      kernelParams.extra = nullptr;
      CUDACHECK(cudaGraphAddKernelNode(&ss->node, graph.graph, &previousTip, 1, &kernelParams));
    } else {
      CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream));
    }
    // New work was added after the last event record.
    ss->eventIsLagging = 1;
  #else
    CUDACHECK(hipLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream));
  #endif
  return ncclSuccess;
}
|
||||
|
||||
// Make strong stream `a` wait for the work currently on strong stream `b`.
// Uncaptured: record b's event if it is lagging, then have a's stream wait on
// it. Captured: replace a's tip with an empty node depending on both tips.
ncclResult_t ncclStrongStreamWaitStream(
    struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
  ) {
  #if CUDART_VERSION >= 11030
    if (graph.graph != nullptr) {
      cudaGraphNode_t tips[2] = {a->node, b->node};
      CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, tips, 2));
    } else {
      if (b->eventIsLagging) {
        // The flag is cleared before the record call is checked.
        b->eventIsLagging = 0;
        CUDACHECK(cudaEventRecord(b->event, b->stream));
      }
      CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0));
      a->eventIsLagging = 1;
    }
  #else
    CUDACHECK(hipEventRecord(b->event, b->stream));
    CUDACHECK(hipStreamWaitEvent(a->stream, b->event, 0));
  #endif
  return ncclSuccess;
}
|
||||
|
||||
// Make strong stream `a` wait for the work currently on the plain stream `b`.
// Uncaptured: a's own event is recorded on `b` and waited on by a's stream.
// Captured: `b` must be capturing into `graph` (else ncclInvalidUsage); a's
// tip is then made to depend on b's current capture dependencies.
ncclResult_t ncclStrongStreamWaitStream(
    struct ncclCudaGraph graph, struct ncclStrongStream* a, hipStream_t b
  ) {
  #if CUDART_VERSION >= 11030
    if (graph.graph == nullptr) {
      CUDACHECK(cudaEventRecord(a->event, b));
      CUDACHECK(cudaStreamWaitEvent(a->stream, a->event, 0));
      // We used a->event to record b so it no longer reflects anything about a.
      a->eventIsLagging = 1;
    } else {
      cudaStreamCaptureStatus captureStatus;
      unsigned long long captureId;
      cudaGraphNode_t const* deps;
      size_t depCount = 0;
      CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &captureStatus, &captureId, nullptr, &deps, &depCount));
      if (captureStatus != cudaStreamCaptureStatusActive || graph.graphId != captureId) {
        WARN("Stream is not being captured by the expected graph.");
        return ncclInvalidUsage;
      }
      // Nothing to add when b has no dependencies, or its only dependency is
      // already a's tip.
      bool alreadyOrdered = depCount == 0 || (depCount == 1 && deps[0] == a->node);
      if (!alreadyOrdered) {
        cudaGraphNode_t joined;
        if (depCount > 1) {
          // Collapse b's dependency set into a single node first.
          CUDACHECK(cudaGraphAddEmptyNode(&joined, graph.graph, deps, depCount));
        } else {
          joined = deps[0];
        }
        cudaGraphNode_t tips[2] = {a->node, joined};
        CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, tips, 2));
      }
      // a->eventIsLagging doesn't change since we are just updating the
      // dependencies of a->node.
    }
  #else
    CUDACHECK(hipEventRecord(a->event, b));
    CUDACHECK(hipStreamWaitEvent(a->stream, a->event, 0));
  #endif
  return ncclSuccess;
}
|
||||
|
||||
// Make the plain stream `a` wait for the work currently on strong stream `b`.
// Uncaptured: record b's event if it is lagging, then have `a` wait on it.
// Captured: add b's tip node to the capture dependencies of `a`.
ncclResult_t ncclStrongStreamWaitStream(
    struct ncclCudaGraph graph, hipStream_t a, struct ncclStrongStream* b
  ) {
  #if CUDART_VERSION >= 11030
    if (graph.graph != nullptr) {
      CUDACHECK(cudaStreamUpdateCaptureDependencies(a, &b->node, 1, cudaStreamAddCaptureDependencies));
    } else {
      if (b->eventIsLagging) {
        // The flag is cleared before the record call is checked.
        b->eventIsLagging = 0;
        CUDACHECK(cudaEventRecord(b->event, b->stream));
      }
      CUDACHECK(cudaStreamWaitEvent(a, b->event, 0));
    }
  #else
    CUDACHECK(hipEventRecord(b->event, b->stream));
    CUDACHECK(hipStreamWaitEvent(a, b->event, 0));
  #endif
  return ncclSuccess;
}
|
||||
|
||||
// Block the calling thread until the work on `ss` has completed.
// With graph support compiled in, the stream is first made to wait on the
// strong stream's event before it is synchronized.
ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) {
  #if CUDART_VERSION >= 11030
    CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
  #endif
  CUDACHECK(hipStreamSynchronize(ss->stream));
  return ncclSuccess;
}
|
||||
@@ -11,6 +11,8 @@
|
||||
#include "nvmlwrap.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// Get current Compute Capability
|
||||
int ncclCudaCompCap() {
|
||||
int cudaDev;
|
||||
@@ -192,3 +194,102 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// One ncclThreadSignal per thread, initialized from ncclThreadSignalStaticInitializer().
__thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer();
|
||||
|
||||
void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {
|
||||
// `me->hunks` points to the top of the stack non-empty hunks. Hunks above
|
||||
// this (reachable via `->above`) are empty.
|
||||
struct Hunk* top = me->topFrame.hunk;
|
||||
size_t mallocSize = 0;
|
||||
|
||||
// If we have lots of space left in hunk but that wasn't enough then we'll
|
||||
// allocate the object unhunked.
|
||||
if (me->topFrame.end - me->topFrame.bumper >= 8<<10)
|
||||
goto unhunked;
|
||||
|
||||
// If we have another hunk (which must be empty) waiting above this one and
|
||||
// the object fits then use that.
|
||||
if (top && top->above) {
|
||||
struct Hunk* top1 = top->above;
|
||||
uintptr_t uobj = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align);
|
||||
if (uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
|
||||
me->topFrame.hunk = top1;
|
||||
me->topFrame.bumper = uobj + size;
|
||||
me->topFrame.end = reinterpret_cast<uintptr_t>(top1) + top1->size;
|
||||
return reinterpret_cast<void*>(uobj);
|
||||
}
|
||||
}
|
||||
|
||||
{ // If the next hunk we're going to allocate wouldn't be big enough but the
|
||||
// Unhunk proxy fits in the current hunk then go allocate as unhunked.
|
||||
size_t nextSize = (top ? top->size : 0) + (64<<10);
|
||||
constexpr size_t maxAlign = 64;
|
||||
if (nextSize < sizeof(struct Hunk) + maxAlign + size) {
|
||||
uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
|
||||
if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
|
||||
goto unhunked;
|
||||
}
|
||||
|
||||
// At this point we must need another hunk, either to fit the object
|
||||
// itself or its Unhunk proxy.
|
||||
mallocSize = nextSize;
|
||||
INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
|
||||
struct Hunk *top1 = (struct Hunk*)malloc(mallocSize);
|
||||
if (top1 == nullptr) goto malloc_exhausted;
|
||||
top1->size = nextSize;
|
||||
top1->above = nullptr;
|
||||
if (top) top->above = top1;
|
||||
top = top1;
|
||||
me->topFrame.hunk = top;
|
||||
me->topFrame.end = reinterpret_cast<uintptr_t>(top) + nextSize;
|
||||
me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
|
||||
}
|
||||
|
||||
{ // Try to fit object in the new top hunk.
|
||||
uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align);
|
||||
if (uobj + size <= me->topFrame.end) {
|
||||
me->topFrame.bumper = uobj + size;
|
||||
return reinterpret_cast<void*>(uobj);
|
||||
}
|
||||
}
|
||||
|
||||
unhunked:
|
||||
{ // We need to allocate the object out-of-band and put an Unhunk proxy in-band
|
||||
// to keep track of it.
|
||||
uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
|
||||
Unhunk* proxy = reinterpret_cast<Unhunk*>(uproxy);
|
||||
me->topFrame.bumper = uproxy + sizeof(Unhunk);
|
||||
proxy->next = me->topFrame.unhunks;
|
||||
me->topFrame.unhunks = proxy;
|
||||
mallocSize = size;
|
||||
proxy->obj = malloc(mallocSize);
|
||||
INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
|
||||
if (proxy->obj == nullptr) goto malloc_exhausted;
|
||||
return proxy->obj;
|
||||
}
|
||||
|
||||
malloc_exhausted:
|
||||
WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
|
||||
abort();
|
||||
}
|
||||
|
||||
void ncclMemoryStackDestruct(struct ncclMemoryStack* me) {
|
||||
// Free unhunks first because both the frames and unhunk proxies lie within the hunks.
|
||||
struct ncclMemoryStack::Frame* f = &me->topFrame;
|
||||
while (f != nullptr) {
|
||||
struct ncclMemoryStack::Unhunk* u = f->unhunks;
|
||||
while (u != nullptr) {
|
||||
free(u->obj);
|
||||
u = u->next;
|
||||
}
|
||||
f = f->below;
|
||||
}
|
||||
// Free hunks
|
||||
struct ncclMemoryStack::Hunk* h = me->stub.above;
|
||||
while (h != nullptr) {
|
||||
struct ncclMemoryStack::Hunk *h1 = h->above;
|
||||
free(h);
|
||||
h = h1;
|
||||
}
|
||||
}
|
||||
|
||||
+14
-3
@@ -41,7 +41,8 @@ typedef enum { ncclSuccess = 0,
|
||||
ncclInternalError = 3,
|
||||
ncclInvalidArgument = 4,
|
||||
ncclInvalidUsage = 5,
|
||||
ncclNumResults = 6 } ncclResult_t;
|
||||
ncclRemoteError = 6,
|
||||
ncclNumResults = 7 } ncclResult_t;
|
||||
|
||||
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
|
||||
*
|
||||
@@ -135,11 +136,21 @@ ncclResult_t ncclCommAbort(ncclComm_t comm);
|
||||
ncclResult_t pncclCommAbort(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Returns a human-readable error message. */
|
||||
/*! @brief Returns a string for each error code. */
|
||||
const char* ncclGetErrorString(ncclResult_t result);
|
||||
/// @cond include_hidden
|
||||
const char* pncclGetErrorString(ncclResult_t result);
|
||||
/// @endcond
|
||||
|
||||
/*! @brief Checks whether the comm has encountered any asynchronous errors */
|
||||
/*! @brief Returns a human-readable message of the last error that occurred.
|
||||
* comm is currently unused and can be set to NULL
|
||||
*/
|
||||
const char* ncclGetLastError(ncclComm_t comm);
|
||||
/// @cond include_hidden
|
||||
/* Named to mirror ncclGetLastError, as every other pnccl* declaration in this
 * header mirrors its public counterpart; the previous spelling, pncclGetError,
 * matched no public function. NOTE(review): confirm this is the symbol the
 * library actually exports. */
const char* pncclGetLastError(ncclComm_t comm);
|
||||
/// @endcond
|
||||
|
||||
/* Checks whether the comm has encountered any asynchronous errors */
|
||||
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||
/// @cond include_hidden
|
||||
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
|
||||
|
||||
+192
-97
@@ -9,15 +9,16 @@
|
||||
//#include <sys/stat.h>
|
||||
//#include <unistd.h>
|
||||
|
||||
ncclNet_t *ncclNet;
|
||||
ncclCollNet_t *ncclCollNet;
|
||||
|
||||
static ncclNet_v5_t ncclNet_v4_as_v5;
|
||||
static ncclNet_v6_t ncclNet_v4_as_v6;
|
||||
static ncclNet_v6_t ncclNet_v5_as_v6;
|
||||
static ncclNet_v4_t *ncclNet_v4;
|
||||
static ncclCollNet_v5_t ncclCollNet_v4_as_v5;
|
||||
static ncclNet_v5_t *ncclNet_v5;
|
||||
static ncclCollNet_v6_t ncclCollNet_v4_as_v6;
|
||||
static ncclCollNet_v6_t ncclCollNet_v5_as_v6;
|
||||
static ncclCollNet_v4_t *ncclCollNet_v4;
|
||||
static ncclCollNet_v5_t *ncclCollNet_v5;
|
||||
|
||||
static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
|
||||
static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
|
||||
ncclNetProperties_v4_t p4;
|
||||
ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@@ -33,17 +34,17 @@ static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
|
||||
static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
|
||||
return ncclNet_v4->isend(sendComm, data, size, mhandle, request);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
|
||||
static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
|
||||
if (n == 0) return ncclSuccess;
|
||||
if (n != 1) return ncclInvalidArgument;
|
||||
return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
|
||||
static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
|
||||
if (n == 0) return ncclSuccess;
|
||||
if (n != 1) return ncclInvalidArgument;
|
||||
return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request);
|
||||
@@ -51,27 +52,51 @@ static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data,
|
||||
|
||||
// We use a wrapper around the v4 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclNet_v4->init(logfn));
|
||||
ncclNet_v4_as_v5.name = ncclNet_v4->name;
|
||||
ncclNet_v4_as_v5.devices = ncclNet_v4->devices;
|
||||
ncclNet_v4_as_v5.getProperties = ncclNet_v4_as_v5_getProperties;
|
||||
ncclNet_v4_as_v5.listen = ncclNet_v4->listen;
|
||||
ncclNet_v4_as_v5.connect = ncclNet_v4->connect;
|
||||
ncclNet_v4_as_v5.accept = ncclNet_v4->accept;
|
||||
ncclNet_v4_as_v5.regMr = ncclNet_v4->regMr;
|
||||
ncclNet_v4_as_v5.deregMr = ncclNet_v4->deregMr;
|
||||
ncclNet_v4_as_v5.isend = ncclNet_v4_as_v5_isend;
|
||||
ncclNet_v4_as_v5.irecv = ncclNet_v4_as_v5_irecv;
|
||||
ncclNet_v4_as_v5.iflush = ncclNet_v4_as_v5_iflush;
|
||||
ncclNet_v4_as_v5.test = ncclNet_v4->test;
|
||||
ncclNet_v4_as_v5.closeSend = ncclNet_v4->closeSend;
|
||||
ncclNet_v4_as_v5.closeRecv = ncclNet_v4->closeRecv;
|
||||
ncclNet_v4_as_v5.closeListen = ncclNet_v4->closeListen;
|
||||
ncclNet_v4_as_v6.name = ncclNet_v4->name;
|
||||
ncclNet_v4_as_v6.devices = ncclNet_v4->devices;
|
||||
ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties;
|
||||
ncclNet_v4_as_v6.listen = ncclNet_v4->listen;
|
||||
ncclNet_v4_as_v6.connect = ncclNet_v4->connect;
|
||||
ncclNet_v4_as_v6.accept = ncclNet_v4->accept;
|
||||
ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr;
|
||||
ncclNet_v4_as_v6.regMrDmaBuf = NULL;
|
||||
ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr;
|
||||
ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend;
|
||||
ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv;
|
||||
ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush;
|
||||
ncclNet_v4_as_v6.test = ncclNet_v4->test;
|
||||
ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend;
|
||||
ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv;
|
||||
ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
|
||||
// We use a wrapper around the v5 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclNet_v5->init(logfn));
|
||||
ncclNet_v5_as_v6.name = ncclNet_v5->name;
|
||||
ncclNet_v5_as_v6.devices = ncclNet_v5->devices;
|
||||
ncclNet_v5_as_v6.getProperties = ncclNet_v5->getProperties;
|
||||
ncclNet_v5_as_v6.listen = ncclNet_v5->listen;
|
||||
ncclNet_v5_as_v6.connect = ncclNet_v5->connect;
|
||||
ncclNet_v5_as_v6.accept = ncclNet_v5->accept;
|
||||
ncclNet_v5_as_v6.regMr = ncclNet_v5->regMr;
|
||||
ncclNet_v5_as_v6.regMrDmaBuf = NULL;
|
||||
ncclNet_v5_as_v6.deregMr = ncclNet_v5->deregMr;
|
||||
ncclNet_v5_as_v6.isend = ncclNet_v5->isend;
|
||||
ncclNet_v5_as_v6.irecv = ncclNet_v5->irecv;
|
||||
ncclNet_v5_as_v6.iflush = ncclNet_v5->iflush;
|
||||
ncclNet_v5_as_v6.test = ncclNet_v5->test;
|
||||
ncclNet_v5_as_v6.closeSend = ncclNet_v5->closeSend;
|
||||
ncclNet_v5_as_v6.closeRecv = ncclNet_v5->closeRecv;
|
||||
ncclNet_v5_as_v6.closeListen = ncclNet_v5->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
|
||||
ncclNetProperties_v4_t p4;
|
||||
ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4);
|
||||
if (ans != ncclSuccess) return ans;
|
||||
@@ -89,25 +114,58 @@ static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetPropertie
|
||||
|
||||
// We use a wrapper around the v4 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
|
||||
static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclCollNet_v4->init(logfn));
|
||||
ncclCollNet_v4_as_v5.name = ncclCollNet_v4->name;
|
||||
ncclCollNet_v4_as_v5.devices = ncclCollNet_v4->devices;
|
||||
ncclCollNet_v4_as_v5.getProperties = ncclCollNet_v4_as_v5_getProperties;
|
||||
ncclCollNet_v4_as_v5.listen = ncclCollNet_v4->listen;
|
||||
ncclCollNet_v4_as_v5.connect = ncclCollNet_v4->connect;
|
||||
ncclCollNet_v4_as_v5.reduceSupport = ncclCollNet_v4->reduceSupport;
|
||||
ncclCollNet_v4_as_v5.regMr = ncclCollNet_v4->regMr;
|
||||
ncclCollNet_v4_as_v5.deregMr = ncclCollNet_v4->deregMr;
|
||||
ncclCollNet_v4_as_v5.iallreduce = ncclCollNet_v4->iallreduce;
|
||||
ncclCollNet_v4_as_v5.iflush = ncclCollNet_v4->iflush;
|
||||
ncclCollNet_v4_as_v5.test = ncclCollNet_v4->test;
|
||||
ncclCollNet_v4_as_v5.closeColl = ncclCollNet_v4->closeColl;
|
||||
ncclCollNet_v4_as_v5.closeListen = ncclCollNet_v4->closeListen;
|
||||
ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
|
||||
ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices;
|
||||
ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties;
|
||||
ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen;
|
||||
ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect;
|
||||
ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport;
|
||||
ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr;
|
||||
ncclCollNet_v4_as_v6.regMrDmaBuf = NULL;
|
||||
ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr;
|
||||
ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce;
|
||||
ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush;
|
||||
ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test;
|
||||
ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl;
|
||||
ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
|
||||
// We use a wrapper around the v5 init to copy over the struct contents
|
||||
// post-init since they may not be initialized before hand.
|
||||
static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclCollNet_v5->init(logfn));
|
||||
ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
|
||||
ncclCollNet_v5_as_v6.devices = ncclCollNet_v5->devices;
|
||||
ncclCollNet_v5_as_v6.getProperties = ncclCollNet_v5->getProperties;
|
||||
ncclCollNet_v5_as_v6.listen = ncclCollNet_v5->listen;
|
||||
ncclCollNet_v5_as_v6.connect = ncclCollNet_v5->connect;
|
||||
ncclCollNet_v5_as_v6.reduceSupport = ncclCollNet_v5->reduceSupport;
|
||||
ncclCollNet_v5_as_v6.regMr = ncclCollNet_v5->regMr;
|
||||
ncclCollNet_v5_as_v6.regMrDmaBuf = NULL;
|
||||
ncclCollNet_v5_as_v6.deregMr = ncclCollNet_v5->deregMr;
|
||||
ncclCollNet_v5_as_v6.iallreduce = ncclCollNet_v5->iallreduce;
|
||||
ncclCollNet_v5_as_v6.iflush = ncclCollNet_v5->iflush;
|
||||
ncclCollNet_v5_as_v6.test = ncclCollNet_v5->test;
|
||||
ncclCollNet_v5_as_v6.closeColl = ncclCollNet_v5->closeColl;
|
||||
ncclCollNet_v5_as_v6.closeListen = ncclCollNet_v5->closeListen;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
|
||||
ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr };
|
||||
enum ncclNetState {
|
||||
ncclNetStateInit = 0,
|
||||
ncclNetStateEnabled = 1,
|
||||
ncclNetStateDisabled = 2
|
||||
};
|
||||
enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
|
||||
enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
|
||||
|
||||
ncclResult_t ncclNetPluginInit() {
|
||||
char ncclNetPluginName[128];
|
||||
const char* envPluginName = getenv("NCCL_NET_PLUGIN");
|
||||
if (envPluginName && strlen(envPluginName)) {
|
||||
@@ -126,67 +184,104 @@ static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
|
||||
} else {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
|
||||
}
|
||||
return;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
*net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
|
||||
if (*net == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v5 symbol.");
|
||||
ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
|
||||
if (ncclNet_v4 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol.");
|
||||
if (netPluginLib != nullptr) dlclose(netPluginLib);
|
||||
return;
|
||||
ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
|
||||
if (ncclNets[0] == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
|
||||
// Try v5 plugin
|
||||
ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
|
||||
if (ncclNet_v5 == nullptr) {
|
||||
ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
|
||||
if (ncclNet_v4 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5).");
|
||||
if (netPluginLib != nullptr) dlclose(netPluginLib);
|
||||
return ncclSuccess;
|
||||
}
|
||||
ncclNets[0] = &ncclNet_v4_as_v6;
|
||||
ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init;
|
||||
// Set the name right away to allow for NCCL_NET=... to work
|
||||
ncclNet_v4_as_v6.name = ncclNet_v4->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name);
|
||||
} else {
|
||||
ncclNets[0] = &ncclNet_v5_as_v6;
|
||||
ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init;
|
||||
// Set the name right away to allow for NCCL_NET=... to work
|
||||
ncclNet_v5_as_v6.name = ncclNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
|
||||
}
|
||||
*net = &ncclNet_v4_as_v5;
|
||||
ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init;
|
||||
}
|
||||
|
||||
// Check for CollNet
|
||||
*collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
|
||||
if (*collnet == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.");
|
||||
ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
|
||||
if (ncclCollNet_v4 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.");
|
||||
ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
|
||||
ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
|
||||
if (ncclCollNet_v5 == nullptr) {
|
||||
ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
|
||||
if (ncclCollNet_v4 == nullptr) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5).");
|
||||
} else {
|
||||
ncclCollNets[0] = &ncclCollNet_v4_as_v6;
|
||||
ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init;
|
||||
ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name);
|
||||
}
|
||||
} else {
|
||||
*collnet = &ncclCollNet_v4_as_v5;
|
||||
ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init;
|
||||
ncclCollNets[0] = &ncclCollNet_v5_as_v6;
|
||||
ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init;
|
||||
ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
|
||||
}
|
||||
}
|
||||
return;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetInit() {
|
||||
// Always initialize bootstrap network
|
||||
NCCLCHECK(bootstrapNetInit());
|
||||
// Report in `*state` whether network `i` is usable, initializing it on first
// use. A network is enabled only if its init() succeeds and devices() then
// succeeds with at least one device; otherwise it is disabled. The result is
// cached in ncclNetStates[i]. The whole check runs under netLock.
// Always returns ncclSuccess.
static ncclResult_t netGetState(int i, enum ncclNetState* state) {
  pthread_mutex_lock(&netLock);
  if (ncclNetStates[i] == ncclNetStateInit) {
    int ndev;
    // Short-circuit: devices() is only called after a successful init(), and
    // ndev is only read after a successful devices().
    bool usable = ncclNets[i]->init(ncclDebugLog) == ncclSuccess
               && ncclNets[i]->devices(&ndev) == ncclSuccess
               && ndev > 0;
    ncclNetStates[i] = usable ? ncclNetStateEnabled : ncclNetStateDisabled;
  }
  *state = ncclNetStates[i];
  pthread_mutex_unlock(&netLock);
  return ncclSuccess;
}
|
||||
|
||||
// Report in `*state` whether collective network `i` is usable, initializing
// it on first use. It is enabled only if its init() succeeds and devices()
// then succeeds with at least one device; otherwise it is disabled. The
// result is cached in ncclCollNetStates[i].
// The check-and-initialize sequence runs under netLock, exactly like
// netGetState: the state array and the plugin are shared globals, and without
// the lock two threads could both see ncclNetStateInit and call init() at once.
// Always returns ncclSuccess.
static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
  pthread_mutex_lock(&netLock);
  if (ncclCollNetStates[i] == ncclNetStateInit) {
    int ndev;
    if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
    else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled;
    else ncclCollNetStates[i] = ncclNetStateEnabled;
  }
  *state = ncclCollNetStates[i];
  pthread_mutex_unlock(&netLock);
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclNetInit(struct ncclComm* comm) {
|
||||
// Initialize main communication network
|
||||
ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
|
||||
ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr };
|
||||
initPlugin(&nets[0], &collNets[0]);
|
||||
char* netName = getenv("NCCL_NET");
|
||||
bool ok = false;
|
||||
|
||||
for (int i=0; i<3; i++) {
|
||||
if (nets[i] == nullptr) continue;
|
||||
if (netName && strcmp(netName, nets[i]->name) != 0) continue;
|
||||
if (ncclNets[i] == nullptr) continue;
|
||||
enum ncclNetState state;
|
||||
NCCLCHECK(netGetState(i, &state));
|
||||
if (state != ncclNetStateEnabled) continue;
|
||||
if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
|
||||
|
||||
// net plugin is already initialized
|
||||
int ndev;
|
||||
if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue;
|
||||
if (nets[i]->devices(&ndev) != ncclSuccess) continue;
|
||||
if (ndev <= 0) continue;
|
||||
ncclNet = nets[i];
|
||||
comm->ncclNet = ncclNets[i];
|
||||
ok = true;
|
||||
|
||||
if (collNets[i]) {
|
||||
do {
|
||||
if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break;
|
||||
if (collNets[i]->devices(&ndev) != ncclSuccess) break;
|
||||
if (ndev <= 0) break;
|
||||
ncclCollNet = collNets[i];
|
||||
} while(0);
|
||||
if (ncclCollNets[i]) {
|
||||
NCCLCHECK(collNetGetState(i, &state));
|
||||
if (state == ncclNetStateEnabled) {
|
||||
comm->ncclCollNet = ncclCollNets[i];
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -198,7 +293,7 @@ ncclResult_t ncclNetInit() {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
|
||||
constexpr int GPU_BUF_SIZE = 2*1024*1024;
|
||||
#if CUDART_VERSION >= 11030
|
||||
// In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
|
||||
@@ -213,12 +308,12 @@ ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
}
|
||||
#endif
|
||||
int netDevs;
|
||||
NCCLCHECK(ncclNetDevices(&netDevs));
|
||||
NCCLCHECK(ncclNetDevices(comm, &netDevs));
|
||||
*gdrSupport = 0;
|
||||
for (int dev=0; dev<netDevs; dev++) {
|
||||
// Find a net device which is GDR-capable
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(ncclNetGetProperties(dev, &props));
|
||||
NCCLCHECK(ncclNetGetProperties(comm, dev, &props));
|
||||
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
*gdrSupport = 1;
|
||||
@@ -232,34 +327,34 @@ ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
void* mHandle = NULL;
|
||||
ncclResult_t ret;
|
||||
ncclDebugNoWarn = NCCL_NET;
|
||||
NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
|
||||
NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
|
||||
while (sComm == NULL) {
|
||||
NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
|
||||
NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2);
|
||||
}
|
||||
while (rComm == NULL) {
|
||||
NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
|
||||
NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3);
|
||||
}
|
||||
CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
|
||||
if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
|
||||
NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
|
||||
NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
|
||||
NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
|
||||
if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
|
||||
NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
|
||||
NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
|
||||
NCCLCHECK(ncclNetDeregMr(comm, rComm, mHandle));
|
||||
*gdrSupport = 1;
|
||||
}
|
||||
ncclDebugNoWarn = 0;
|
||||
CUDACHECK(hipFree(gpuPtr));
|
||||
cleanup4:
|
||||
NCCLCHECK(ncclNetCloseRecv(rComm));
|
||||
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
|
||||
cleanup3:
|
||||
NCCLCHECK(ncclNetCloseSend(sComm));
|
||||
NCCLCHECK(ncclNetCloseSend(comm, sComm));
|
||||
cleanup2:
|
||||
NCCLCHECK(ncclNetCloseListen(lComm));
|
||||
NCCLCHECK(ncclNetCloseListen(comm, lComm));
|
||||
cleanup1:
|
||||
break;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
int ncclNetVersion() {
|
||||
return (ncclNet == &ncclNet_v4_as_v5) ? 4 : 5;
|
||||
int ncclNetVersion(struct ncclComm* comm) {
|
||||
return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 5 : 6);
|
||||
}
|
||||
|
||||
+110
-56
@@ -14,6 +14,8 @@
|
||||
#define ENABLE_TIMER 0
|
||||
#include "timer.h"
|
||||
|
||||
#include <sys/syscall.h>
|
||||
|
||||
enum { proxyRecv=0, proxySend=1 };
|
||||
|
||||
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
|
||||
@@ -350,10 +352,10 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector*
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) {
|
||||
static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) {
|
||||
if (peer < 0) return ncclSuccess;
|
||||
|
||||
struct ncclPeer* peerComm = channel->peers+peer;
|
||||
struct ncclChannelPeer* peerComm = channel->peers+peer;
|
||||
struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex;
|
||||
if (connector->transportComm == NULL) {
|
||||
WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank,
|
||||
@@ -362,35 +364,62 @@ static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, s
|
||||
}
|
||||
if (connector->transportComm->proxyProgress == NULL) return ncclSuccess;
|
||||
|
||||
NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
|
||||
if (justInquire) *justInquire = true;
|
||||
else {
|
||||
NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) {
|
||||
struct ncclChannel* channel = comm->channels+op->channelId;
|
||||
int pattern = op->pattern;
|
||||
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, op->connIndex));
|
||||
if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, op->connIndex));
|
||||
}
|
||||
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
|
||||
// Tree up
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0));
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0));
|
||||
}
|
||||
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
|
||||
// Tree down
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0));
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0));
|
||||
}
|
||||
if (pattern == ncclPatternCollTreeUpDown) {
|
||||
// CollTree up
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1)); // For CollTree up, we are using push
|
||||
// CollTree down
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0));
|
||||
// justInquire != nullptr means don't actually do anything, just assertain need of
|
||||
// ncclProxySaveOp for this op.
|
||||
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) {
|
||||
struct ncclChannel* channel = &comm->channels[op->channelId];
|
||||
if (justInquire) *justInquire = false;
|
||||
switch (op->pattern) {
|
||||
case ncclPatternRing:
|
||||
case ncclPatternRingTwice:
|
||||
case ncclPatternPipelineFrom:
|
||||
case ncclPatternPipelineTo: {
|
||||
struct ncclRing* ring = &channel->ring;
|
||||
if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) {
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, op->connIndex, justInquire));
|
||||
}
|
||||
if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) {
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, op->connIndex, justInquire));
|
||||
}
|
||||
} break;
|
||||
case ncclPatternTreeUp:
|
||||
case ncclPatternTreeDown:
|
||||
case ncclPatternTreeUpDown: {
|
||||
if (op->pattern != ncclPatternTreeDown) { // Tree up
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) {
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0, justInquire));
|
||||
}
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0, justInquire));
|
||||
}
|
||||
if (op->pattern != ncclPatternTreeUp) { // Tree down
|
||||
struct ncclTree* tree = &channel->tree;
|
||||
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) {
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0, justInquire));
|
||||
}
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire));
|
||||
}
|
||||
} break;
|
||||
case ncclPatternCollTreeUpDown: {
|
||||
// CollTree up
|
||||
NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1, justInquire)); // For CollTree up, we are using push
|
||||
// CollTree down
|
||||
NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0, justInquire));
|
||||
} break;
|
||||
case ncclPatternSend:
|
||||
case ncclPatternRecv: {
|
||||
if (op->root == comm->rank) return ncclSuccess;
|
||||
op->nsteps = DIVUP(op->nbytes, op->chunkSize);
|
||||
if (op->nsteps == 0) op->nsteps = 1;
|
||||
NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire));
|
||||
} break;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -406,26 +435,24 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
||||
op->chunkSteps = 1;
|
||||
op->protocol = NCCL_PROTO_SIMPLE;
|
||||
op->dtype = info->datatype;
|
||||
op->connIndex = info->connIndex;
|
||||
|
||||
int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR;
|
||||
int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
|
||||
if (info->comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR;
|
||||
info->chunkSize = stepSize;
|
||||
op->root = info->root;
|
||||
op->nbytes = info->count;
|
||||
if (info->root == -1) return ncclSuccess;
|
||||
|
||||
struct ncclPeer* peer = channel->peers + op->root;
|
||||
struct ncclChannelPeer* peer = channel->peers + op->root;
|
||||
|
||||
if (info->coll == ncclFuncSend) {
|
||||
op->pattern = ncclPatternSend;
|
||||
if (op->root != info->comm->rank && peer->send[info->connIndex].transportComm && peer->send[info->connIndex].transportComm->proxyProgress) {
|
||||
if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
|
||||
// Tune chunk size for the network
|
||||
if (info->count < stepSize) info->chunkSize /= 4;
|
||||
else if (info->count < 8*stepSize) info->chunkSize /= 2;
|
||||
}
|
||||
} else if (info->coll == ncclFuncRecv) {
|
||||
op->pattern = ncclPatternRecv;
|
||||
if (op->root != info->comm->rank && peer->recv[info->connIndex].transportComm && peer->recv[info->connIndex].transportComm->proxyProgress) {
|
||||
if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) {
|
||||
// Tune chunk size for the network
|
||||
if (info->count < stepSize) info->chunkSize /= 4;
|
||||
else if (info->count < 8*stepSize) info->chunkSize /= 2;
|
||||
@@ -441,22 +468,6 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Save the proxy op for a point-to-point send or receive on channel
// op->channelId. op->opCount is always set from the channel's work FIFO tail;
// nothing else happens when the peer (op->root) is this rank or when the
// pattern is neither ncclPatternRecv nor ncclPatternSend.
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) {
  struct ncclChannel* chan = &comm->channels[op->channelId];
  op->opCount = chan->workFifoTail-1;
  if (op->root == comm->rank) return ncclSuccess;

  const bool isRecv = op->pattern == ncclPatternRecv;
  if (!isRecv && op->pattern != ncclPatternSend) return ncclSuccess;

  // Both directions use the same step computation, with a minimum of one step.
  op->nsteps = DIVUP(op->nbytes, op->chunkSize);
  if (op->nsteps == 0) op->nsteps = 1;
  NCCLCHECK(SaveProxy(chan, isRecv ? proxyRecv : proxySend, op->root, op, op->connIndex));
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
|
||||
struct ncclProxyArgs* freeOp = *opPtr;
|
||||
struct ncclProxyArgs* next = freeOp->next;
|
||||
@@ -598,8 +609,48 @@ void ncclDumpProxyState(int signal) {
|
||||
dumpProxyState(ncclLastProxyState);
|
||||
}
|
||||
|
||||
NCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0);
|
||||
ncclResult_t ncclSetThreadContext(struct ncclComm* comm) {
|
||||
#if CUDART_VERSION >= 11030
|
||||
static int createThreadContext = -1;
|
||||
|
||||
if (createThreadContext == -1) {
|
||||
createThreadContext = ncclParamCreateThreadContext();
|
||||
if (createThreadContext) {
|
||||
if (CUPFN(cuCtxCreate_v3020) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr) {
|
||||
WARN("Unable to create thread context due to old driver, disabling.");
|
||||
createThreadContext = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (createThreadContext) {
|
||||
if (comm->proxyState.cudaCtx == NULL) {
|
||||
if (CUPFN(cuCtxCreate_v3020(&comm->proxyState.cudaCtx,
|
||||
CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) {
|
||||
WARN("Failed to create CUDA context on device %d", comm->cudaDev);
|
||||
createThreadContext = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
} else {
|
||||
if (CUPFN(cuCtxSetCurrent(comm->proxyState.cudaCtx)) != CUDA_SUCCESS) {
|
||||
WARN("Failed to set CUDA context on device %d", comm->cudaDev);
|
||||
return ncclUnhandledCudaError;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void* ncclProxyProgress(void *comm_) {
|
||||
struct ncclComm* comm = (struct ncclComm*)comm_;
|
||||
if (ncclSetThreadContext(comm) != ncclSuccess) {
|
||||
WARN("[Proxy Progress] Failed to set CUDA context on device %d", comm->cudaDev);
|
||||
} else if (hipSetDevice(comm->cudaDev) != hipSuccess) {
|
||||
WARN("[Proxy Progress] Failed to set CUDA device %d", comm->cudaDev);
|
||||
}
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
|
||||
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
|
||||
state->nextOps = -1;
|
||||
signal(SIGUSR1, ncclDumpProxyState);
|
||||
@@ -732,9 +783,9 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool,
|
||||
|
||||
static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
if (connection->send) {
|
||||
NCCLCHECK(ncclTransports[connection->transport].send.proxyFree(connection, comm));
|
||||
NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm));
|
||||
} else {
|
||||
NCCLCHECK(ncclTransports[connection->transport].recv.proxyFree(connection, comm));
|
||||
NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm));
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -778,7 +829,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
|
||||
NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
|
||||
NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int)));
|
||||
NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*)));
|
||||
struct ncclTransportComm* tcomm = send ? &ncclTransports[transport].send : &ncclTransports[transport].recv;
|
||||
struct ncclTransportComm* tcomm = send ? &ncclTransports[transport]->send : &ncclTransports[transport]->recv;
|
||||
// If we need proxy progress, map progress ops
|
||||
if (tcomm->proxyProgress) {
|
||||
char poolPath[] = "/dev/shm/nccl-XXXXXX";
|
||||
@@ -885,7 +936,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
|
||||
NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int)));
|
||||
connection->localRank = peer->localRank;
|
||||
NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*)));
|
||||
connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv;
|
||||
connection->tcomm = connection->send ? &ncclTransports[connection->transport]->send : &ncclTransports[connection->transport]->recv;
|
||||
// If we need proxy progress, let's allocate ops and start the thread
|
||||
if (connection->tcomm->proxyProgress) {
|
||||
NCCLCHECK(proxyProgressInit(comm));
|
||||
@@ -951,7 +1002,10 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p
|
||||
|
||||
void* ncclProxyService(void* _args) {
|
||||
struct ncclComm* comm = (struct ncclComm *) _args;
|
||||
if (hipSetDevice(comm->cudaDev) != hipSuccess) {
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
if (ncclSetThreadContext(comm) != ncclSuccess) {
|
||||
WARN("[Proxy Service] Failed to set CUDA context on device %d", comm->cudaDev);
|
||||
} else if (hipSetDevice(comm->cudaDev) != hipSuccess) {
|
||||
WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev);
|
||||
}
|
||||
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
|
||||
|
||||
+28
-33
@@ -11,16 +11,11 @@
|
||||
#define ENABLE_TIMER 0
|
||||
#include "timer.h"
|
||||
|
||||
extern struct ncclTransport p2pTransport;
|
||||
extern struct ncclTransport shmTransport;
|
||||
extern struct ncclTransport netTransport;
|
||||
extern struct ncclTransport collNetTransport;
|
||||
|
||||
struct ncclTransport ncclTransports[NTRANSPORTS] = {
|
||||
p2pTransport,
|
||||
shmTransport,
|
||||
netTransport,
|
||||
collNetTransport
|
||||
struct ncclTransport* ncclTransports[NTRANSPORTS] = {
|
||||
&p2pTransport,
|
||||
&shmTransport,
|
||||
&netTransport,
|
||||
&collNetTransport
|
||||
};
|
||||
|
||||
template <int type>
|
||||
@@ -37,10 +32,11 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
}
|
||||
bool xgmi;
|
||||
NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi));
|
||||
|
||||
for (int t=0; t<NTRANSPORTS; t++) {
|
||||
if (graph == NULL && connIndex == NCCL_CONN_IDX_P2P_NET && (t == TRANSPORT_SHM || (!xgmi && t == TRANSPORT_P2P))) continue;
|
||||
if (graph && n1 >= 0 && n2 >= 0 && t != TRANSPORT_NET) continue;
|
||||
struct ncclTransport *transport = ncclTransports+t;
|
||||
struct ncclTransport *transport = ncclTransports[t];
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
int ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
|
||||
@@ -55,18 +51,19 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
|
||||
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
|
||||
uint32_t mask = 1 << channel->id;
|
||||
struct ncclChannel* channel = &comm->channels[channelId];
|
||||
uint32_t mask = 1 << channelId;
|
||||
for (int i=0; i<nrecv; i++) {
|
||||
int peer = peerRecv[i];
|
||||
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
|
||||
comm->connectRecv[peer+comm->nRanks*connIndex] |= mask;
|
||||
comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
|
||||
}
|
||||
for (int i=0; i<nsend; i++) {
|
||||
int peer = peerSend[i];
|
||||
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
|
||||
comm->connectSend[peer+comm->nRanks*connIndex] |= mask;
|
||||
comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -82,17 +79,18 @@ void dumpData(struct ncclConnect* data, int ndata) {
|
||||
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
|
||||
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
|
||||
int highestType = TRANSPORT_P2P; // track highest transport type
|
||||
|
||||
hipStream_t transportSetupStream;
|
||||
CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
|
||||
int highestType = TRANSPORT_P2P; // track highest transport type
|
||||
|
||||
struct ncclConnect data[2*MAXCHANNELS];
|
||||
for (int i=1; i<comm->nRanks; i++) {
|
||||
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
|
||||
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
|
||||
int sendPeer = (comm->rank + i) % comm->nRanks;
|
||||
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*connIndex];
|
||||
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*connIndex];
|
||||
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
|
||||
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
|
||||
|
||||
struct ncclConnect* recvData = data;
|
||||
int sendChannels = 0, recvChannels = 0;
|
||||
@@ -137,7 +135,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
|
||||
NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
|
||||
CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
|
||||
CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
|
||||
}
|
||||
}
|
||||
TIME_STOP(3);
|
||||
@@ -147,11 +146,11 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
|
||||
NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
|
||||
CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
|
||||
}
|
||||
}
|
||||
TIME_STOP(4);
|
||||
comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
|
||||
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0;
|
||||
}
|
||||
CUDACHECK(hipStreamSynchronize(transportSetupStream));
|
||||
CUDACHECK(hipStreamDestroy(transportSetupStream));
|
||||
@@ -179,10 +178,6 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
// check if we can connect to collnet, whose root is the nranks-th rank
|
||||
struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
|
||||
peerInfo->rank = nranks;
|
||||
int support = 1;
|
||||
if (isMaster) {
|
||||
NCCLCHECK(collNetTransport.canConnect(&support, comm->topo, collNetGraph, myInfo, peerInfo));
|
||||
}
|
||||
|
||||
// send master receives connect info from peer recv master
|
||||
if (isMaster && type == collNetSend) {
|
||||
@@ -192,14 +187,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
}
|
||||
|
||||
// select
|
||||
struct ncclPeer* root = channel->peers+nranks;
|
||||
struct ncclChannelPeer* root = channel->peers+nranks;
|
||||
// connector index: 0 for recv, 1 for send
|
||||
struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
|
||||
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
|
||||
conn->transportComm = transportComm;
|
||||
// setup
|
||||
struct ncclConnect myConnect;
|
||||
if (isMaster && support) {
|
||||
if (isMaster) {
|
||||
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
|
||||
}
|
||||
// prepare connect handles
|
||||
@@ -229,11 +224,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
|
||||
}
|
||||
// connect
|
||||
if (isMaster && support) {
|
||||
if (isMaster) {
|
||||
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
|
||||
struct ncclPeer* devRoot = channel->devPeers+nranks;
|
||||
struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
|
||||
CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
|
||||
struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
|
||||
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
|
||||
CUDACHECKGOTO(hipMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice), res, cleanup);
|
||||
}
|
||||
// recv side sends connect info to send side
|
||||
if (isMaster && type == collNetRecv) {
|
||||
@@ -242,7 +237,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
|
||||
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
|
||||
}
|
||||
if (support) fail = 0;
|
||||
fail = 0;
|
||||
cleanup:
|
||||
if (allConnects != NULL) free(allConnects);
|
||||
if (masterConnects != NULL) free(masterConnects);
|
||||
@@ -271,7 +266,7 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
|
||||
// Free collNet resources
|
||||
for (int r=0; r<comm->nChannels; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
struct ncclPeer* peer = channel->peers+comm->nRanks;
|
||||
struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
struct ncclConnector* send = peer->send + b;
|
||||
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
|
||||
|
||||
+52
-26
@@ -108,7 +108,7 @@ struct sendResources {
|
||||
uint64_t step;
|
||||
struct reqSlot (*reqFifo)[NCCL_STEPS];
|
||||
int collNetRank;
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
volatile uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
};
|
||||
|
||||
struct recvResources {
|
||||
@@ -128,12 +128,12 @@ struct recvResources {
|
||||
uint64_t step;
|
||||
struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS];
|
||||
int collNetRank;
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
volatile uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
};
|
||||
|
||||
/* Determine if we can communicate with the peer */
|
||||
static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 1;
|
||||
// This transport cannot be used for p2p
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -157,7 +157,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -175,7 +175,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
|
||||
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -300,7 +300,7 @@ ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle
|
||||
comm->proxyState.progressState.collNet.resources = resources;
|
||||
}
|
||||
if (resources->collNetComms[netDev] == NULL)
|
||||
NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev));
|
||||
NCCLCHECK(collNetListen(comm, netDev, collNetHandle, resources->collNetListenComms+netDev));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -314,13 +314,13 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl
|
||||
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
|
||||
handlePtrs[i] = &(info->collNetHandle);
|
||||
}
|
||||
ncclResult_t ret = collNetConnect((void**)handlePtrs, nranks, rank,
|
||||
ncclResult_t ret = collNetConnect(comm, (void**)handlePtrs, nranks, rank,
|
||||
resources->collNetListenComms[netDev],
|
||||
resources->collNetComms+netDev);
|
||||
free(handlePtrs);
|
||||
if (ret == ncclSuccess) {
|
||||
// Close listen comm
|
||||
NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev]));
|
||||
NCCLCHECK(collNetCloseListen(comm, resources->collNetListenComms[netDev]));
|
||||
} else {
|
||||
resources->collNetListenComms[netDev] = NULL;
|
||||
}
|
||||
@@ -334,7 +334,7 @@ static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) {
|
||||
struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
|
||||
resources->commRefCount[netDev]--;
|
||||
if (resources->commRefCount[netDev] == 0) {
|
||||
NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev]));
|
||||
NCCLCHECK(collNetCloseColl(comm, resources->collNetComms[netDev]));
|
||||
}
|
||||
for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess;
|
||||
comm->proxyState.progressState.collNet.resources = NULL;
|
||||
@@ -450,9 +450,22 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
|
||||
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
|
||||
|
||||
NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
|
||||
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
if (resources->useGdr && comm->dmaBufSupport) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
|
||||
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
|
||||
(void)close(dmabuf_fd);
|
||||
} else // FALL-THROUGH to nv_peermem GDR path
|
||||
#endif
|
||||
{
|
||||
NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
|
||||
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
|
||||
}
|
||||
|
||||
*((struct connectMap**)respBuff) = &resources->map;
|
||||
return ncclSuccess;
|
||||
@@ -506,9 +519,22 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
|
||||
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
|
||||
|
||||
NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
|
||||
&resources->mhandles[NCCL_PROTO_SIMPLE]));
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
if (resources->useGdr && comm->dmaBufSupport) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
|
||||
&resources->mhandles[NCCL_PROTO_SIMPLE]));
|
||||
(void)close(dmabuf_fd);
|
||||
} else // FALL-THROUGH to nv_peermem GDR path
|
||||
#endif
|
||||
{
|
||||
NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
|
||||
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
|
||||
&resources->mhandles[NCCL_PROTO_SIMPLE]));
|
||||
}
|
||||
|
||||
// Pass info to send side
|
||||
info->reqFifo = resources->reqFifo;
|
||||
@@ -524,7 +550,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
|
||||
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->sendMhandles[p]) {
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p]));
|
||||
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p]));
|
||||
}
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
@@ -541,7 +567,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->mhandles[p]) {
|
||||
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p]));
|
||||
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
@@ -621,9 +647,9 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
args->idle = 0;
|
||||
//continue;
|
||||
// flush HDP if not done
|
||||
if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
|
||||
args->hdp_flushed = LOAD(recvTail);
|
||||
STORE(resources->curr_hdp_reg, 1);
|
||||
if (resources->curr_hdp_reg && args->hdp_flushed < *recvTail) {
|
||||
args->hdp_flushed = *recvTail;
|
||||
*resources->curr_hdp_reg = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -634,10 +660,10 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
|
||||
if (reqFifo[group][buffSlot].recvBuff != NULL) {
|
||||
int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot];
|
||||
int count = totalSize / ncclTypeSize(args->dtype);
|
||||
int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype);
|
||||
reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
|
||||
char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
|
||||
NCCLCHECK(collNetIallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
|
||||
NCCLCHECK(collNetIallreduce(comm, resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] == NULL) continue;
|
||||
|
||||
TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
|
||||
@@ -653,7 +679,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int done, size;
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||
NCCLCHECK(collNetTest((void*)(sub->requests[buffSlot]), &done, &size));
|
||||
NCCLCHECK(collNetTest(comm, (void*)(sub->requests[buffSlot]), &done, &size));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
|
||||
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
|
||||
@@ -744,7 +770,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int startChannel = group*COLLNET_GROUP_NSUBS;
|
||||
int offset;
|
||||
NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
|
||||
NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
|
||||
NCCLCHECK(collNetIflush(comm, resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
|
||||
}
|
||||
} else {
|
||||
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
|
||||
@@ -758,7 +784,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int group = s / COLLNET_GROUP_NSUBS;
|
||||
int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
|
||||
int done = 1;
|
||||
if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(sub->requests[buffSlot], &done, NULL));
|
||||
if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(comm, sub->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] flushed", sub->flushed, group, buffSlot);
|
||||
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
|
||||
|
||||
+83
-127
@@ -8,15 +8,11 @@
|
||||
#include "comm.h"
|
||||
#include "net.h"
|
||||
#include "graph.h"
|
||||
#include <sys/time.h>
|
||||
#include "proxy.h"
|
||||
#include "collectives.h"
|
||||
#include "gdrwrap.h"
|
||||
#include "shm.h"
|
||||
#include "profiler.h"
|
||||
#if defined(ENABLE_NPKIT)
|
||||
#include "npkit/npkit.h"
|
||||
#endif
|
||||
#include "graph.h"
|
||||
#include "graph/topo.h"
|
||||
|
||||
@@ -108,7 +104,7 @@ struct sendResources {
|
||||
void* mhandles[NCCL_NUM_PROTOCOLS];
|
||||
uint64_t step;
|
||||
uint64_t llLastCleaning;
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
volatile uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
};
|
||||
|
||||
struct recvResources {
|
||||
@@ -136,7 +132,7 @@ struct recvResources {
|
||||
void* mhandles[NCCL_NUM_PROTOCOLS];
|
||||
uint64_t step;
|
||||
uint64_t llLastCleaning;
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
volatile uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
};
|
||||
|
||||
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0);
|
||||
@@ -178,7 +174,6 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
|
||||
req.channelId = channelId;
|
||||
req.connIndex = connIndex;
|
||||
req.netDev = -1;
|
||||
req.curr_hdp_reg = 0;
|
||||
|
||||
int proxyRank = myInfo->rank;
|
||||
@@ -198,12 +193,10 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
|
||||
|
||||
if (proxyRank == myInfo->rank) {
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d",
|
||||
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
|
||||
} else {
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d",
|
||||
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
|
||||
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
|
||||
}
|
||||
*((int*)connectInfo) = proxyRank;
|
||||
@@ -222,7 +215,6 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
|
||||
req.channelId = channelId;
|
||||
req.connIndex = connIndex;
|
||||
req.netDev = -1;
|
||||
|
||||
// Use myInfo->rank as the receiver uses its own NIC
|
||||
int proxyRank = myInfo->rank;
|
||||
@@ -238,8 +230,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
|
||||
req.remoteRank = peerInfo->rank;
|
||||
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d",
|
||||
channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -448,7 +439,7 @@ static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, i
|
||||
static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) {
|
||||
int rank = comm->localRankToRank[connection->localRank];
|
||||
int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
|
||||
NCCLCHECK(sharedBuffersInit(comm, 1, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
|
||||
NCCLCHECK(sharedBuffersInit(comm, comm->hasFineGrain, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -470,7 +461,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
|
||||
resources->connIndex = req->connIndex;
|
||||
resources->curr_hdp_reg = req->curr_hdp_reg;
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
|
||||
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
|
||||
resources->maxRecvs = props.maxRecvs;
|
||||
|
||||
// We don't return any data
|
||||
@@ -496,11 +487,11 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
|
||||
resources->channelId = req->channelId;
|
||||
resources->connIndex = req->connIndex;
|
||||
ncclNetProperties_t props;
|
||||
NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
|
||||
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
|
||||
resources->maxRecvs = props.maxRecvs;
|
||||
|
||||
if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
|
||||
NCCLCHECK(ncclNetListen(req->netDev, respBuff, &resources->netListenComm));
|
||||
NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
|
||||
*done = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -527,15 +518,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
|
||||
}
|
||||
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
|
||||
if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId));
|
||||
if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId));
|
||||
resources->netSendComm = comms->sendComm[resources->channelId];
|
||||
if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
|
||||
} else {
|
||||
NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
|
||||
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
|
||||
}
|
||||
} else {
|
||||
// Connect to remote peer
|
||||
NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
|
||||
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
|
||||
connection->proxyAppendPtr = &connection->proxyAppend;
|
||||
}
|
||||
if (resources->netSendComm == NULL) {
|
||||
@@ -609,7 +600,31 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
|
||||
if (resources->buffers[p]) {
|
||||
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
||||
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
|
||||
(void)close(dmabuf_fd);
|
||||
} else // FALL-THROUGH to nv_peermem GDR path
|
||||
#else
|
||||
/* DMA-BUF support */
|
||||
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
||||
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
|
||||
int dmabuf_fd;
|
||||
uint64_t offset;
|
||||
CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
|
||||
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
|
||||
(void)close(dmabuf_fd);
|
||||
INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
|
||||
(const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset);
|
||||
} else // FALL-THROUGH to nv_peermem GDR path
|
||||
#endif
|
||||
{
|
||||
NCCLCHECK(ncclNetRegMr(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -643,15 +658,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
|
||||
}
|
||||
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
|
||||
if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId));
|
||||
if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId));
|
||||
resources->netRecvComm = comms->recvComm[resources->channelId];
|
||||
if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
|
||||
} else {
|
||||
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
|
||||
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
|
||||
}
|
||||
} else {
|
||||
// Connect to remote peer
|
||||
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
|
||||
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
|
||||
connection->proxyAppendPtr = &connection->proxyAppend;
|
||||
}
|
||||
if (resources->netRecvComm == NULL) {
|
||||
@@ -659,7 +674,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
return ncclSuccess;
|
||||
}
|
||||
*done = 1;
|
||||
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
|
||||
NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));
|
||||
|
||||
// Create structures
|
||||
struct connectMap* map = &resources->map;
|
||||
@@ -714,7 +729,31 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
|
||||
if (resources->buffers[p]) {
|
||||
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
|
||||
#if CUDA_VERSION >= 11070
|
||||
/* DMA-BUF support */
|
||||
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
||||
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) {
|
||||
int dmabuf_fd;
|
||||
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
|
||||
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
|
||||
(void)close(dmabuf_fd);
|
||||
} else // FALL-THROUGH to nv_peermem GDR path
|
||||
#else
|
||||
/* DMA-BUF support */
|
||||
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
|
||||
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
|
||||
int dmabuf_fd;
|
||||
uint64_t offset;
|
||||
CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
|
||||
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
|
||||
(void)close(dmabuf_fd);
|
||||
INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
|
||||
(const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset);
|
||||
} else // FALL-THROUGH to nv_peermem GDR path
|
||||
#endif
|
||||
{
|
||||
NCCLCHECK(ncclNetRegMr(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -732,7 +771,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
|
||||
}
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->buffers[p]) {
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p]));
|
||||
NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
@@ -748,12 +787,12 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
|
||||
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
|
||||
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
|
||||
comms->sendRefCount[resources->channelId]--;
|
||||
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId]));
|
||||
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
|
||||
} else {
|
||||
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
|
||||
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
|
||||
}
|
||||
} else {
|
||||
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
|
||||
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
|
||||
}
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
@@ -767,7 +806,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
}
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (resources->buffers[p]) {
|
||||
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p]));
|
||||
NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
|
||||
}
|
||||
}
|
||||
struct connectMapMem* mems = resources->map.mems;
|
||||
@@ -779,12 +818,12 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
|
||||
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
|
||||
comms->recvRefCount[resources->channelId]--;
|
||||
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId]));
|
||||
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
|
||||
} else {
|
||||
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
|
||||
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
|
||||
}
|
||||
} else {
|
||||
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
|
||||
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
|
||||
}
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
@@ -792,16 +831,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
|
||||
|
||||
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
static int g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
|
||||
static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt++;
|
||||
#endif
|
||||
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
for (int s=0; s<args->nsubs; s++) {
|
||||
struct ncclProxySubArgs* sub = args->subs+s;
|
||||
@@ -855,11 +885,6 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
|
||||
// We have something to receive, let's check if it's completely ready.
|
||||
int size = sizesFifo[buffSlot];
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
sub->npKitSizesFifo[buffSlot] = size;
|
||||
#endif
|
||||
|
||||
char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
|
||||
int ready = 1;
|
||||
if (p == NCCL_PROTO_LL128) {
|
||||
@@ -887,29 +912,13 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
}
|
||||
if (ready) {
|
||||
// flush HDP if not done
|
||||
if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
|
||||
args->hdp_flushed = LOAD(recvTail);
|
||||
STORE(resources->curr_hdp_reg, 1);
|
||||
if (resources->curr_hdp_reg && args->hdp_flushed < *recvTail) {
|
||||
args->hdp_flushed = *recvTail;
|
||||
*resources->curr_hdp_reg = 1;
|
||||
}
|
||||
// Data is ready, try to send.
|
||||
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
|
||||
NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
|
||||
if (sub->requests[buffSlot] != NULL) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_NET_SEND_ENTRY,
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt,
|
||||
#else
|
||||
size,
|
||||
#endif
|
||||
uint64_t(sub->requests+buffSlot)/sizeof(void*),
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
|
||||
sizesFifo[buffSlot] = -1;
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
@@ -926,24 +935,8 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
if (sub->done < sub->transmitted) {
|
||||
int done;
|
||||
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
|
||||
NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
|
||||
NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_NET_SEND_EXIT,
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt,
|
||||
#else
|
||||
sub->npKitSizesFifo[buffSlot],
|
||||
#endif
|
||||
uint64_t(sub->requests+buffSlot)/sizeof(void*),
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
|
||||
sub->done += args->sliceSteps;
|
||||
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
|
||||
@@ -969,11 +962,6 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
}
|
||||
|
||||
static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt++;
|
||||
#endif
|
||||
|
||||
if (args->state == ncclProxyOpReady) {
|
||||
// Initialize subs and group them by same recvComm.
|
||||
void* recvComm;
|
||||
@@ -1051,26 +1039,10 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
uint64_t step = subGroup->posted;
|
||||
struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
|
||||
void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
|
||||
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
|
||||
NCCLCHECK(ncclNetIrecv(comm, resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
|
||||
if (*requestPtr) {
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup+i;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_NET_RECV_ENTRY,
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt,
|
||||
#else
|
||||
sizes[i],
|
||||
#endif
|
||||
uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
sub->posted += args->sliceSteps;
|
||||
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
|
||||
}
|
||||
@@ -1089,29 +1061,13 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
int sizes[NCCL_PROXY_MAX_SUBS];
|
||||
void* mhandles[NCCL_PROXY_MAX_SUBS];
|
||||
for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
|
||||
NCCLCHECK(ncclNetTest(subGroup->requests[step%NCCL_STEPS], &done, sizes));
|
||||
NCCLCHECK(ncclNetTest(comm, subGroup->requests[step%NCCL_STEPS], &done, sizes));
|
||||
if (done) {
|
||||
int useGdr = 0;
|
||||
int totalSize = 0;
|
||||
for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
|
||||
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT)
|
||||
NpKit::CollectCpuEvent(
|
||||
NPKIT_EVENT_NET_RECV_EXIT,
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt,
|
||||
#else
|
||||
sizes[i],
|
||||
#endif
|
||||
uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
|
||||
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
|
||||
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
|
||||
g_npkit_net_poll_cnt = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
sub->received += args->sliceSteps;
|
||||
for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
|
||||
if (step < sub->nsteps) {
|
||||
@@ -1146,7 +1102,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
}
|
||||
}
|
||||
struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
|
||||
NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
|
||||
NCCLCHECK(ncclNetIflush(comm, resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
|
||||
}
|
||||
}
|
||||
args->idle = 0;
|
||||
@@ -1161,7 +1117,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
|
||||
uint64_t step = subGroup->transmitted;
|
||||
int done = 1;
|
||||
void* request = subGroup->requests[step%NCCL_STEPS];
|
||||
if (request) NCCLCHECK(ncclNetTest(request, &done, NULL));
|
||||
if (request) NCCLCHECK(ncclNetTest(comm, request, &done, NULL));
|
||||
if (done) {
|
||||
for (int i=0; i<subGroup->groupSize; i++) {
|
||||
struct ncclProxySubArgs* sub = subGroup + i;
|
||||
|
||||
+61
-21
@@ -296,6 +296,31 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Detect whether DMA-BUF support is present in the kernel
|
||||
// Returns :
|
||||
// ncclSuccess : DMA-BUF support is available
|
||||
// ncclSystemError : DMA-BUF is not supported by the kernel
|
||||
ncclResult_t ncclIbDmaBufSupport(int dev) {
|
||||
static int dmaBufSupported = -1;
|
||||
if (dmaBufSupported == -1) {
|
||||
ncclResult_t res;
|
||||
struct ibv_pd* pd;
|
||||
struct ibv_context* ctx;
|
||||
ctx = ncclIbDevs[dev].context;
|
||||
NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
|
||||
// Test kernel DMA-BUF support with a dummy call (fd=-1)
|
||||
(void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
|
||||
// ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP if not supported (EBADF otherwise)
|
||||
dmaBufSupported = (errno != EOPNOTSUPP) ? 1 : 0;
|
||||
NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
|
||||
}
|
||||
if (dmaBufSupported == 0) return ncclSystemError;
|
||||
return ncclSuccess;
|
||||
failure:
|
||||
dmaBufSupported = 0;
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
|
||||
memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
|
||||
return ncclSuccess;
|
||||
@@ -308,10 +333,11 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
|
||||
props->pciPath = ncclIbDevs[dev].pciPath;
|
||||
props->guid = ncclIbDevs[dev].guid;
|
||||
props->ptrSupport = NCCL_PTR_HOST;
|
||||
if (ncclIbGdrSupport(dev) != ncclSuccess) {
|
||||
INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
|
||||
} else {
|
||||
props->ptrSupport |= NCCL_PTR_CUDA;
|
||||
if (ncclIbGdrSupport(dev) == ncclSuccess) {
|
||||
props->ptrSupport |= NCCL_PTR_CUDA; // GDR support via nv_peermem
|
||||
}
|
||||
if (ncclIbDmaBufSupport(dev) == ncclSuccess) {
|
||||
props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF
|
||||
}
|
||||
props->speed = ncclIbDevs[dev].speed;
|
||||
props->latency = 0; // Not set
|
||||
@@ -568,6 +594,7 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
|
||||
static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
|
||||
memset(handle, 0, sizeof(struct ncclIbHandle));
|
||||
comm->dev = dev;
|
||||
comm->sock.asyncFlag = 1; /* nonblocking socket is required by network communication. */
|
||||
NCCLCHECK(GetSocketAddr(&comm->sock.addr));
|
||||
if (ncclParamIbSockServerPortReuse()) {
|
||||
// reuse the socket address and fd for listen system call
|
||||
@@ -614,7 +641,7 @@ ib_connect_check:
|
||||
/* expect user to call again */
|
||||
return ncclSuccess;
|
||||
} else if (conState == ncclSocketError) {
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
}
|
||||
|
||||
// IB Setup
|
||||
@@ -692,7 +719,6 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
|
||||
stage->comm = rComm;
|
||||
stage->state = ncclIbCommStateAccept;
|
||||
lComm->sock.asyncFlag = 1;
|
||||
rComm->sock.asyncFlag = 1;
|
||||
|
||||
ib_accept:
|
||||
NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
|
||||
@@ -846,7 +872,8 @@ ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
|
||||
|
||||
ncclResult_t ncclIbTest(void* request, int* done, int* size);
|
||||
|
||||
ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
|
||||
/* DMA-BUF support */
|
||||
ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
|
||||
static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
|
||||
assert(size > 0);
|
||||
|
||||
@@ -856,7 +883,7 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan
|
||||
struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
|
||||
struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
|
||||
uintptr_t addr = (uintptr_t)data & -pageSize;
|
||||
int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
|
||||
size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
|
||||
ncclResult_t res;
|
||||
pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
|
||||
for (int slot=0; /*true*/; slot++) {
|
||||
@@ -868,14 +895,20 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan
|
||||
// Deregister / register
|
||||
struct ibv_mr* mr;
|
||||
unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ;
|
||||
if (ncclIbRelaxedOrderingEnabled) {
|
||||
// Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning);
|
||||
if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING;
|
||||
if (fd != -1) {
|
||||
/* DMA-BUF support */
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, verbs->pd, offset, pages*pageSize, addr, fd, flags), res, returning);
|
||||
} else {
|
||||
if (ncclIbRelaxedOrderingEnabled) {
|
||||
// Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, addr, flags), res, returning);
|
||||
}
|
||||
else {
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
|
||||
}
|
||||
}
|
||||
else {
|
||||
NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
|
||||
}
|
||||
TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey);
|
||||
TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x fd %d", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey, fd);
|
||||
cache->population += 1;
|
||||
cache->slots[slot].addr = addr;
|
||||
cache->slots[slot].pages = pages;
|
||||
@@ -897,6 +930,10 @@ returning:
|
||||
return res;
|
||||
}
|
||||
|
||||
// Register a memory region that is not backed by a DMA-BUF descriptor.
// Forwards to ncclIbRegMrDmaBuf() with offset 0 and fd -1; fd == -1 is the
// value that makes that function take its regular (non DMA-BUF) registration
// path.
//   comm    : send or recv comm (only its verbs part is used by the callee)
//   data    : start of the buffer to register
//   size    : buffer size in bytes, must be strictly positive
//   type    : pointer type, passed through unchanged
//   mhandle : receives the memory handle
ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
  // Check the sign while the value is still an int: once widened to size_t a
  // negative size becomes a huge positive one and passes the callee's own
  // size assertion unnoticed.
  assert(size > 0);
  return ncclIbRegMrDmaBuf(comm, data, (size_t)size, type, 0ULL, -1, mhandle);
}
|
||||
|
||||
ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
|
||||
struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
|
||||
struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
|
||||
@@ -950,13 +987,16 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
|
||||
|
||||
// Write size as immediate data. In the case of multi-send, only write
|
||||
// 0 or 1 as size to indicate whether there was data sent or received.
|
||||
uint64_t immData = 0;
|
||||
uint32_t immData = 0;
|
||||
if (nreqs == 1) {
|
||||
immData = reqs[0]->send.size;
|
||||
} else {
|
||||
uint8_t* multiImmData = (uint8_t*)&immData;
|
||||
if (nreqs > 32) {
|
||||
WARN("Cannot store sizes of %d requests in a 32-bits field", nreqs);
|
||||
return ncclInternalError;
|
||||
}
|
||||
for (int r=0; r<nreqs; r++) {
|
||||
multiImmData[r] = reqs[r]->send.size ? 1 : 0;
|
||||
immData |= (reqs[r]->send.size ? 1 : 0) << r;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1231,7 +1271,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
char line[SOCKET_NAME_MAXLEN+1];
|
||||
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
|
||||
ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
}
|
||||
|
||||
struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff);
|
||||
@@ -1246,9 +1286,8 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
|
||||
if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError;
|
||||
if (req->nreqs > 1) {
|
||||
// In the case of a multi recv, we only set sizes to 0 or 1.
|
||||
uint8_t* sizes = (uint8_t*)&wc->imm_data;
|
||||
for (int i=0; i<req->nreqs; i++) {
|
||||
req->recv.sizes[i] |= sizes[i];
|
||||
req->recv.sizes[i] = (wc->imm_data >> i) & 0x1;
|
||||
}
|
||||
} else {
|
||||
req->recv.sizes[0] += wc->imm_data;
|
||||
@@ -1309,6 +1348,7 @@ ncclNet_t ncclNetIb = {
|
||||
ncclIbConnect,
|
||||
ncclIbAccept,
|
||||
ncclIbRegMr,
|
||||
ncclIbRegMrDmaBuf,
|
||||
ncclIbDeregMr,
|
||||
ncclIbIsend,
|
||||
ncclIbIrecv,
|
||||
|
||||
@@ -311,6 +311,7 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
|
||||
struct ncclSocketListenComm* comm;
|
||||
NCCLCHECK(ncclSocketNewListenComm(&comm));
|
||||
NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr));
|
||||
comm->sock.asyncFlag = 1;
|
||||
NCCLCHECK(ncclSocketListen(&comm->sock));
|
||||
memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
|
||||
NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
|
||||
@@ -359,7 +360,7 @@ socket_connect_check:
|
||||
/* expect user to call again */
|
||||
return ncclSuccess;
|
||||
} else if (conState == ncclSocketError) {
|
||||
return ncclSystemError;
|
||||
return ncclRemoteError;
|
||||
}
|
||||
stage->state = ncclSocketCommStateSend;
|
||||
|
||||
@@ -616,6 +617,7 @@ ncclNet_t ncclNetSocket = {
|
||||
ncclSocketConnect,
|
||||
ncclSocketAccept,
|
||||
ncclSocketRegMr,
|
||||
NULL, // No DMA-BUF support
|
||||
ncclSocketDeregMr,
|
||||
ncclSocketIsend,
|
||||
ncclSocketIrecv,
|
||||
|
||||
+240
-24
@@ -8,6 +8,7 @@
|
||||
#include "comm.h"
|
||||
#include "graph.h"
|
||||
#include "utils.h"
|
||||
#include "shm.h"
|
||||
#include "graph.h"
|
||||
#include "graph/topo.h"
|
||||
|
||||
@@ -20,6 +21,34 @@ struct p2pConnectInfo {
|
||||
int rank;
|
||||
int read;
|
||||
struct ncclP2pBuff p2pBuff;
|
||||
// Used by CE memcpy
|
||||
char shmName[7];
|
||||
int shmSize;
|
||||
};
|
||||
static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large");
|
||||
|
||||
struct p2pShm {
|
||||
struct ncclSendMem sendMem;
|
||||
struct ncclRecvMem recvMem;
|
||||
};
|
||||
struct p2pProxyInfo {
|
||||
// Shared memory between proxy and receiving GPU
|
||||
struct p2pShm* shm;
|
||||
struct p2pShm* devShm;
|
||||
char shmName[7];
|
||||
int shmSize;
|
||||
|
||||
// Intermediate step for sender
|
||||
struct ncclRecvMem* ceRecvMem;
|
||||
char* ceDevBuff;
|
||||
|
||||
// Receiver buffer
|
||||
char* recvFifo;
|
||||
|
||||
// Used by progress only
|
||||
uint64_t step;
|
||||
hipStream_t stream;
|
||||
hipEvent_t events[NCCL_STEPS];
|
||||
};
|
||||
static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large");
|
||||
|
||||
@@ -28,18 +57,22 @@ struct p2pSendResources {
|
||||
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
||||
void* sendMemIpc;
|
||||
void* recvMemIpc;
|
||||
struct p2pProxyInfo proxyInfo;
|
||||
};
|
||||
|
||||
// Resources attached to the receiving side of a P2P connection.
struct p2pRecvResources {
  struct ncclRecvMem* devMem;
  // IPC mappings; each one is closed with hipIpcCloseMemHandle when non-NULL.
  void* sendMemIpc;
  void* recvMemIpc;
  // Shared-memory segment of the CE memcpy path: opened with ncclShmOpen when
  // connecting and released with ncclShmClose when the connection is freed.
  struct p2pShm* shm;
  struct p2pShm* devShm;
  int shmSize;
};
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
|
||||
int busIdToCudaDev(int64_t busId) {
|
||||
static int busIdToCudaDev(int64_t busId) {
|
||||
int ndev;
|
||||
if (hipGetDeviceCount(&ndev) != hipSuccess)
|
||||
return -1;
|
||||
@@ -55,8 +88,13 @@ int busIdToCudaDev(int64_t busId) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Parameter "P2P_USE_CUDA_MEMCPY" (default 0): enables the copy-engine (CE)
// memcpy path of the P2P transport.
NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0);
// Cached value of the parameter above; assigned by initCeOperation().
static int useMemcpy = 0;
// Defined after p2pTransport, whose send callbacks it patches.
static void initCeOperation();
|
||||
|
||||
/* Determine if two peers can communicate through p2p */
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
initCeOperation();
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (!info1->hasFineGrain || !info2->hasFineGrain) {
|
||||
*ret = 0;
|
||||
@@ -74,7 +112,10 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
int intermediateRank;
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
|
||||
if (*ret == 0) return ncclSuccess;
|
||||
if (intermediateRank != -1) return ncclSuccess;
|
||||
if (intermediateRank != -1) {
|
||||
if (useMemcpy) *ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
|
||||
int cudaDev1 = busIdToCudaDev(info1->busId);
|
||||
@@ -94,7 +135,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
int p2p;
|
||||
if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) {
|
||||
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
|
||||
cudaDev1, info1->busId, cudaDev2, info2->busId);
|
||||
cudaDev1, info1->busId, cudaDev2, info2->busId);
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -188,6 +229,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
send->transportResources = resources;
|
||||
int useRead, intermediateRank;
|
||||
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
|
||||
if (useMemcpy) useRead = 0;
|
||||
|
||||
resources->next_hdp_reg = 0;
|
||||
bool isXGMI;
|
||||
@@ -214,14 +256,14 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
|
||||
if (intermediateRank == -1) {
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
|
||||
if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
|
||||
} else {
|
||||
send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
|
||||
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
|
||||
}
|
||||
} else {
|
||||
info->rank = intermediateRank;
|
||||
@@ -231,9 +273,15 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
}
|
||||
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
|
||||
if (useMemcpy) {
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
|
||||
info->shmSize = resources->proxyInfo.shmSize;
|
||||
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
|
||||
} else {
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
|
||||
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
|
||||
}
|
||||
|
||||
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -259,7 +307,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
|
||||
if (intermediateRank == -1) {
|
||||
info->rank = myInfo->rank;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
|
||||
if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
|
||||
} else {
|
||||
recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
|
||||
@@ -287,31 +335,61 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (info->read && p == NCCL_PROTO_SIMPLE) {
|
||||
/* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
|
||||
if (resources->devMem == NULL) return ncclInternalError; // We should not use read + memcpy
|
||||
send->conn.buffs[p] = (char*)(resources->devMem+1);
|
||||
} else {
|
||||
send->conn.buffs[p] = buff;
|
||||
buff += send->comm->buffSizes[p];
|
||||
}
|
||||
}
|
||||
send->conn.tail = &remDevMem->tail;
|
||||
send->conn.head = &resources->devMem->head;
|
||||
send->conn.ptrExchange = &resources->devMem->ptrExchange;
|
||||
send->conn.next_hdp_reg = resources->next_hdp_reg;
|
||||
send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
|
||||
|
||||
if (useMemcpy) {
|
||||
send->conn.tail = &resources->proxyInfo.ceRecvMem->tail;
|
||||
send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
|
||||
send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
|
||||
// Send SIMPLE buff to proxy, and replace it by local buffer
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
|
||||
send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
|
||||
} else {
|
||||
send->conn.tail = &remDevMem->tail;
|
||||
send->conn.head = &resources->devMem->head;
|
||||
send->conn.ptrExchange = &resources->devMem->ptrExchange;
|
||||
send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/* Connect/Recv from this peer */
|
||||
ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
|
||||
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
|
||||
struct ncclSendMem* remDevMem;
|
||||
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
|
||||
|
||||
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
|
||||
struct ncclSendMem* remDevMem = NULL;
|
||||
|
||||
if (useMemcpy) {
|
||||
char shmPath[PATH_MAX];
|
||||
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
|
||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
resources->shmSize = info->shmSize;
|
||||
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, 0));
|
||||
// Remove the file to ensure proper clean-up
|
||||
NCCLCHECK(ncclShmUnlink(shmPath));
|
||||
|
||||
recv->conn.tail = &resources->devShm->recvMem.tail;
|
||||
recv->conn.head = &resources->devShm->sendMem.head;
|
||||
} else {
|
||||
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
|
||||
|
||||
recv->conn.tail = &resources->devMem->tail;
|
||||
recv->conn.head = &remDevMem->head;
|
||||
recv->conn.ptrExchange = &remDevMem->ptrExchange;
|
||||
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
|
||||
}
|
||||
|
||||
char* buff = (char*)(resources->devMem+1);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (info->read && p == NCCL_PROTO_SIMPLE) {
|
||||
if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy
|
||||
/* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */
|
||||
recv->conn.buffs[p] = (char*)(remDevMem+1);
|
||||
} else {
|
||||
@@ -319,10 +397,6 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
|
||||
buff += recv->comm->buffSizes[p];
|
||||
}
|
||||
}
|
||||
recv->conn.tail = &resources->devMem->tail;
|
||||
recv->conn.head = &remDevMem->head;
|
||||
recv->conn.ptrExchange = &remDevMem->ptrExchange;
|
||||
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -338,11 +412,52 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
|
||||
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
|
||||
if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc));
|
||||
if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc));
|
||||
if (useMemcpy) {
|
||||
NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize));
|
||||
}
|
||||
free(resources);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
|
||||
// Proxy-side setup of a P2P send connection.
//
// CE memcpy path (useMemcpy != 0): allocates the proxy state, the intermediate
// device buffer, the shared-memory segment and the host-side ncclRecvMem, then
// copies the whole p2pProxyInfo into respBuff. No request payload is read.
//
// Default path: reads the buffer size (an int) from reqBuff, allocates that
// many bytes of device memory and fills the ncclP2pBuff in respBuff with the
// pointer and its IPC handle.
//
// Returns ncclInternalError when reqSize/respSize do not match the expected
// structure sizes; otherwise the first error reported by an allocation or HIP
// call. *done is set to 1 on success.
static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  if (useMemcpy) {
    // Validate the response size before acquiring anything, so a malformed
    // call does not leave device memory and a shm segment allocated.
    if (respSize != sizeof(struct p2pProxyInfo)) return ncclInternalError;

    struct p2pProxyInfo* proxyInfo;
    NCCLCHECK(ncclCalloc(&proxyInfo, 1));
    connection->transportResources = proxyInfo;

    NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, comm->buffSizes[NCCL_PROTO_SIMPLE], true));

    // An empty path asks ncclShmOpen to pick the name; it is written back into shmPath.
    char shmPath[PATH_MAX];
    shmPath[0] = '\0';
    proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
    NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1));
    TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
    // Keep only the part of the path that follows the "/dev/shm/nccl-" prefix.
    memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));

    NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));

    memcpy(respBuff, proxyInfo, sizeof(struct p2pProxyInfo));
  } else {
    if (reqSize != sizeof(int)) return ncclInternalError;
    int size = *((int*)reqBuff);
    if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
    struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
    NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, true));
    connection->transportResources = p2pBuff->directPtr;
    hipError_t res = hipIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
    if (res != hipSuccess) {
      WARN("hipIpcGetMemHandle failed : %s", hipGetErrorString(res));
      hipFree(p2pBuff->directPtr);
      // The device buffer is gone: drop the stale pointer so that
      // p2pSendProxyFree() does not hipFree() it a second time.
      connection->transportResources = NULL;
      // NOTE(review): p2pBuff aliases respBuff, which this function did not
      // allocate - confirm the caller expects it to be released here.
      free(p2pBuff);
      CUDACHECK(res);
    }
  }
  *done = 1;
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
|
||||
if (reqSize != sizeof(int)) return ncclInternalError;
|
||||
int size = *((int*)reqBuff);
|
||||
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
|
||||
@@ -360,15 +475,116 @@ static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
|
||||
// Proxy-side connect step of the CE memcpy send path.
// The request carries a single pointer: the receiver buffer the proxy will
// copy into. The function stores it, creates the non-blocking stream and the
// per-slot events used by the progress function, and points
// connection->proxyAppendPtr at connection->proxyAppend.
// Returns ncclInternalError if the request is not exactly one pointer wide.
static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  if (reqSize != sizeof(void*)) return ncclInternalError;

  struct p2pProxyInfo* info = (struct p2pProxyInfo*)connection->transportResources;
  info->recvFifo = *((char**)reqBuff);

  CUDACHECK(hipStreamCreateWithFlags(&info->stream, hipStreamNonBlocking));
  for (int slot = 0; slot < NCCL_STEPS; ++slot) {
    CUDACHECK(hipEventCreate(&info->events[slot]));
  }

  connection->proxyAppendPtr = &connection->proxyAppend;
  return ncclSuccess;
}
|
||||
|
||||
// Release the proxy-side resources of a P2P send connection.
// Without the CE memcpy path the only resource is the device buffer stored in
// connection->transportResources. With it, transportResources is a
// p2pProxyInfo whose shm segment, host memory, device buffer, stream and
// events are released in turn before the structure itself is freed; the first
// failing call aborts the sequence and its error is returned.
static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  if (!useMemcpy) {
    // Do not check return code as CUDA may have already shut down
    hipFree(connection->transportResources);
    return ncclSuccess;
  }

  struct p2pProxyInfo* info = (struct p2pProxyInfo*)connection->transportResources;
  NCCLCHECK(ncclShmClose(info->shm, info->devShm, info->shmSize));
  NCCLCHECK(ncclCudaHostFree(info->ceRecvMem));
  CUDACHECK(hipFree(info->ceDevBuff));
  CUDACHECK(hipStreamDestroy(info->stream));
  for (int slot = 0; slot < NCCL_STEPS; ++slot) {
    CUDACHECK(hipEventDestroy(info->events[slot]));
  }
  free(info);
  return ncclSuccess;
}
|
||||
|
||||
// Release the device buffer owned by the proxy side of a P2P recv connection.
// Always reports success.
static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  void* devBuff = connection->transportResources;
  // Do not check return code as CUDA may have already shut down
  (void)hipFree(devBuff);
  return ncclSuccess;
}
|
||||
|
||||
// Progress function of the send proxy for the CE memcpy path.
//
// On the first call for an operation (state Ready) every sub-operation gets
// its base step and zeroed counters. While in state Progress, each call does
// at most two things per sub-operation:
//   1. if the GPU has produced the next step, enqueue an asynchronous
//      device-to-device copy from the intermediate buffer to the receiver
//      buffer and record the slot's event;
//   2. if the oldest in-flight copy has completed, advance `done` and publish
//      the new tail in the shared-memory recvMem.
// Protocols other than SIMPLE are completed immediately. The operation
// switches to state None once every sub-operation has finished.
static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int s = 0; s < args->nsubs; ++s) {
      struct ncclProxySubArgs* sub = &args->subs[s];
      struct p2pProxyInfo* info = (struct p2pProxyInfo*)sub->connection->transportResources;
      // Round to next multiple of chunkSteps
      sub->base = ROUNDUP(info->step, args->chunkSteps);
      sub->done = 0;
      sub->transmitted = 0;
      sub->posted = 0;
    }
    args->state = ncclProxyOpProgress;
  }

  args->idle = 1;
  if (args->state != ncclProxyOpProgress) return ncclSuccess;

  int protocol = args->protocol;
  int stepSize = comm->buffSizes[protocol] / NCCL_STEPS;
  for (int s = 0; s < args->nsubs; ++s) {
    struct ncclProxySubArgs* sub = &args->subs[s];
    struct p2pProxyInfo* info = (struct p2pProxyInfo*)sub->connection->transportResources;

    if (protocol != NCCL_PROTO_SIMPLE) { // Only Simple uses hipMemcpy
      info->step = sub->base + sub->nsteps;
      args->done++;
      continue;
    }

    // Stage 1: start a copy if fewer than NCCL_STEPS steps are in flight and
    // there are still steps left to transmit.
    if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
      int slot = (sub->base + sub->transmitted) % NCCL_STEPS;
      volatile int* sizesFifo = info->ceRecvMem->sizesFifo;
      volatile uint64_t* recvTail = &info->ceRecvMem->tail;
      // Check GPU has sent everything
      if (*recvTail > sub->base + sub->transmitted) {
        int nbytes = sizesFifo[slot];
        int offset = slot * stepSize;
        CUDACHECK(hipMemcpyAsync(info->recvFifo + offset, info->ceDevBuff + offset, nbytes, hipMemcpyDeviceToDevice, info->stream));
        CUDACHECK(hipEventRecord(info->events[slot], info->stream));
        sub->transmitted += args->sliceSteps;
      }
    }

    // Stage 2: nothing in flight, nothing to complete.
    if (sub->done >= sub->transmitted) continue;

    int slot = (sub->base + sub->done) % NCCL_STEPS;
    hipError_t status = hipEventQuery(info->events[slot]);
    // "Not ready" only means the copy is still running; anything else that is
    // not success is a real error.
    if (status != hipErrorNotReady) CUDACHECK(status);
    if (status == hipSuccess) {
      sub->done += args->sliceSteps;
      // Notify SHM
      info->shm->recvMem.tail = sub->base + sub->done;
    }
    if (sub->done == sub->nsteps) {
      info->step = sub->base + sub->nsteps;
      args->done++;
    }
  }

  if (args->done == args->nsubs) {
    args->state = ncclProxyOpNone;
  }
  return ncclSuccess;
}
|
||||
|
||||
struct ncclTransport p2pTransport = {
|
||||
"P2P",
|
||||
p2pCanConnect,
|
||||
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL },
|
||||
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }
|
||||
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL },
|
||||
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL }
|
||||
};
|
||||
|
||||
static void initCeOperation() {
|
||||
static int init = 0;
|
||||
if (!init) {
|
||||
useMemcpy = ncclParamP2pUseCudaMemcpy();
|
||||
if (useMemcpy) {
|
||||
p2pTransport.send.proxyConnect = p2pSendProxyConnect;
|
||||
p2pTransport.send.proxyProgress = p2pSendProxyProgress;
|
||||
}
|
||||
init = 1;
|
||||
}
|
||||
}
|
||||
|
||||
+266
-20
@@ -31,11 +31,21 @@ struct shmRecvResources {
|
||||
struct ncclRecvMem* devHostMem;
|
||||
};
|
||||
|
||||
#define SHM_SEND_SIDE 1
|
||||
#define SHM_RECV_SIDE 2
|
||||
NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
|
||||
NCCL_PARAM(ShmUseCudaMemcpy, "SHM_USE_CUDA_MEMCPY", 0);
|
||||
NCCL_PARAM(ShmMemcpyMode, "SHM_MEMCPY_MODE", SHM_SEND_SIDE); // 1 is sender-side, 2 is receiver-side, 3 is both
|
||||
static int useMemcpySend = 0;
|
||||
static int useMemcpyRecv = 0;
|
||||
NCCL_PARAM(ShmLocality, "SHM_LOCALITY", SHM_RECV_SIDE); // 1 is sender-size, 2 is receiver-size
|
||||
static int shmLocality = 0;
|
||||
static void initCeOperation();
|
||||
|
||||
/* Determine two peers can communicate with SHM */
|
||||
ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 0;
|
||||
initCeOperation();
|
||||
|
||||
if (ncclParamShmDisable() == 1) return ncclSuccess;
|
||||
|
||||
@@ -55,7 +65,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
#define MAX_SHM_NAME_LEN 1024
|
||||
|
||||
/* Create and return connect structures for this peer to connect to me */
|
||||
ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
|
||||
static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
|
||||
struct shmSendResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
send->transportResources = resources;
|
||||
@@ -65,17 +75,20 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
|
||||
char shmPath[PATH_MAX];
|
||||
shmPath[0] = '\0';
|
||||
info->shmSize = resources->shmSize = sizeof(struct ncclSendMem);
|
||||
int shmSize = sizeof(struct ncclSendMem);
|
||||
if (shmLocality == SHM_SEND_SIDE) {
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
|
||||
}
|
||||
info->shmSize = resources->shmSize = shmSize;
|
||||
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
||||
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory comm %p nRanks %02d",
|
||||
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm, comm->nRanks);
|
||||
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via SHM/%s/%s comm %p nRanks %02d", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct", comm, comm->nRanks);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
|
||||
static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
|
||||
struct shmRecvResources* resources;
|
||||
NCCLCHECK(ncclCalloc(&resources, 1));
|
||||
recv->transportResources = resources;
|
||||
@@ -86,7 +99,9 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
char shmPath[PATH_MAX];
|
||||
shmPath[0] = '\0';
|
||||
int shmSize = sizeof(struct ncclRecvMem);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
|
||||
if (shmLocality == SHM_RECV_SIDE) {
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
|
||||
}
|
||||
info->shmSize = resources->shmSize = shmSize;
|
||||
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
|
||||
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
@@ -95,8 +110,21 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// State exchanged with, and then owned by, the SHM proxy connection when a
// memcpy-based (copy-engine) path is enabled.
//
// NOTE: field order matters. shmSendConnect / shmRecvConnect build this struct
// with a positional initializer { ceRecvMem, devFifo, shmFifo, sendMem, recvMem }.
//
//   ceRecvMem : allocated with ncclCudaHostCalloc by the proxy connect handlers;
//               its tail / sizesFifo are what the connector is pointed at afterwards.
//   devFifo   : allocated with ncclCudaCalloc (buffSizes[NCCL_PROTO_SIMPLE] bytes);
//               replaces conn.buffs[NCCL_PROTO_SIMPLE] after the proxy call.
//   shmFifo   : the original conn.buffs[NCCL_PROTO_SIMPLE] pointer passed in by the
//               caller; the other end of the hipMemcpyAsync in the progress functions.
//   sendMem / recvMem : host pointers passed in by the caller (hostMem / remHostMem,
//               in an order that depends on the side).
//   step, stream, events : bookkeeping used by the proxy progress functions.
struct shmProxyInfo {
|
||||
struct ncclRecvMem* ceRecvMem;
|
||||
char* devFifo;
|
||||
char* shmFifo;
|
||||
struct ncclSendMem* sendMem;
|
||||
struct ncclRecvMem* recvMem;
|
||||
|
||||
// used by progress only
|
||||
uint64_t step;
|
||||
hipStream_t stream;
|
||||
hipEvent_t events[NCCL_STEPS];
|
||||
};
|
||||
|
||||
/* Connect to this peer */
|
||||
ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
|
||||
static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
|
||||
// Setup device pointers
|
||||
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
|
||||
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
|
||||
@@ -109,19 +137,29 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn
|
||||
// Remove the file to ensure proper clean-up
|
||||
NCCLCHECK(ncclShmUnlink(shmPath));
|
||||
|
||||
send->transportResources = resources;
|
||||
int offset = 0;
|
||||
char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
send->conn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset;
|
||||
offset += send->comm->buffSizes[p];
|
||||
send->conn.buffs[p] = buff;
|
||||
buff += send->comm->buffSizes[p];
|
||||
}
|
||||
send->conn.tail = &resources->devRemHostMem->tail;
|
||||
|
||||
send->conn.head = &resources->devHostMem->head;
|
||||
|
||||
if (useMemcpyRecv) {
|
||||
send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
|
||||
}
|
||||
if (useMemcpySend) {
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
|
||||
struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
|
||||
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
|
||||
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
|
||||
send->conn.tail = &proxyInfo.ceRecvMem->tail;
|
||||
send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
|
||||
static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
|
||||
// Setup device pointers
|
||||
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
|
||||
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
|
||||
@@ -132,18 +170,26 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
|
||||
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
|
||||
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
|
||||
NCCLCHECK(ncclShmUnlink(shmPath));
|
||||
recv->conn.head = &resources->devRemHostMem->head;
|
||||
|
||||
int offset = 0;
|
||||
char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
recv->conn.buffs[p] = (char*)(resources->devHostMem+1) + offset;
|
||||
offset += recv->comm->buffSizes[p];
|
||||
recv->conn.buffs[p] = buff;
|
||||
buff += recv->comm->buffSizes[p];
|
||||
}
|
||||
recv->conn.head = &resources->devRemHostMem->head;
|
||||
recv->conn.tail = &resources->devHostMem->tail;
|
||||
|
||||
if (useMemcpyRecv) {
|
||||
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
|
||||
struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
|
||||
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
|
||||
recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
|
||||
recv->conn.tail = &proxyInfo.ceRecvMem->tail;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t shmSendFree(struct ncclConnector* send) {
|
||||
static ncclResult_t shmSendFree(struct ncclConnector* send) {
|
||||
struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
|
||||
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
|
||||
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
|
||||
@@ -151,7 +197,7 @@ ncclResult_t shmSendFree(struct ncclConnector* send) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t shmRecvFree(struct ncclConnector* recv) {
|
||||
static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
|
||||
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
|
||||
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
|
||||
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
|
||||
@@ -159,9 +205,209 @@ ncclResult_t shmRecvFree(struct ncclConnector* recv) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
/*
 * Proxy-side connect handler for the SHM send path (installed as
 * shmTransport.send.proxyConnect by initCeOperation()).
 *
 * connection        : proxy connection; on success its transportResources points
 *                     at the newly allocated struct shmProxyInfo.
 * comm              : communicator, used for buffSizes[NCCL_PROTO_SIMPLE].
 * reqBuff/reqSize   : a struct shmProxyInfo filled in by the caller; copied verbatim.
 * respBuff/respSize : receives the updated struct shmProxyInfo (devFifo and
 *                     ceRecvMem now set).
 * done              : not written by this handler.
 *
 * Returns ncclInternalError when either buffer size differs from
 * sizeof(struct shmProxyInfo); otherwise ncclSuccess, or whatever error
 * NCCLCHECK / CUDACHECK propagate.
 */
static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  // Validate both message sizes before allocating anything, so a malformed
  // request/response pair cannot leak the proxy state, device FIFO, stream or events.
  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;

  struct shmProxyInfo* proxyInfo;
  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  memcpy(proxyInfo, reqBuff, reqSize);

  // NOTE(review): a failure in any of the calls below still returns without
  // releasing what was allocated before it.
  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE]));
  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
  CUDACHECK(hipStreamCreateWithFlags(&proxyInfo->stream, hipStreamNonBlocking));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(hipEventCreate(proxyInfo->events+i));
  }

  connection->proxyAppendPtr = &connection->proxyAppend;
  connection->transportResources = proxyInfo;

  // Hand the filled-in descriptor back to the caller.
  memcpy(respBuff, proxyInfo, respSize);
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Proxy-side connect handler for the SHM receive path (installed as
 * shmTransport.recv.proxyConnect by initCeOperation()).
 *
 * connection        : proxy connection; on success its transportResources points
 *                     at the newly allocated struct shmProxyInfo.
 * comm              : communicator, used for buffSizes[NCCL_PROTO_SIMPLE].
 * reqBuff/reqSize   : a struct shmProxyInfo filled in by the caller; copied verbatim.
 * respBuff/respSize : receives the updated struct shmProxyInfo (devFifo and
 *                     ceRecvMem now set).
 * done              : not written by this handler.
 *
 * Returns ncclInternalError when either buffer size differs from
 * sizeof(struct shmProxyInfo); otherwise ncclSuccess, or whatever error
 * NCCLCHECK / CUDACHECK propagate.
 */
static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  // Validate both message sizes before allocating anything, so a malformed
  // request/response pair cannot leak the proxy state, device FIFO, stream or events.
  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;

  struct shmProxyInfo* proxyInfo;
  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  memcpy(proxyInfo, reqBuff, reqSize);

  // NOTE(review): a failure in any of the calls below still returns without
  // releasing what was allocated before it.
  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE]));
  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
  CUDACHECK(hipStreamCreateWithFlags(&proxyInfo->stream, hipStreamNonBlocking));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(hipEventCreate(proxyInfo->events+i));
  }

  connection->proxyAppendPtr = &connection->proxyAppend;
  connection->transportResources = proxyInfo;

  // Hand the filled-in descriptor back to the caller.
  memcpy(respBuff, proxyInfo, respSize);
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Releases everything shmSendProxyConnect attached to the proxy connection:
 * the copy stream, the device FIFO, the pinned ceRecvMem, the per-step events
 * and the shmProxyInfo itself.
 *
 * Safe to call when no resources were attached (transportResources == NULL);
 * the pointer is cleared afterwards so a second call is a no-op.
 * Returns ncclSuccess, or the error propagated by CUDACHECK / NCCLCHECK.
 */
static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
  // Nothing to release if the connect step never attached its state.
  if (resources == NULL) return ncclSuccess;

  CUDACHECK(hipStreamDestroy(resources->stream));
  CUDACHECK(hipFree(resources->devFifo));
  NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(hipEventDestroy(resources->events[i]));
  }
  free(resources);
  // Do not leave a dangling pointer behind on the connection.
  connection->transportResources = NULL;
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Releases everything shmRecvProxyConnect attached to the proxy connection:
 * the copy stream, the device FIFO, the pinned ceRecvMem, the per-step events
 * and the shmProxyInfo itself.
 *
 * Safe to call when no resources were attached (transportResources == NULL);
 * the pointer is cleared afterwards so a second call is a no-op.
 * Returns ncclSuccess, or the error propagated by CUDACHECK / NCCLCHECK.
 */
static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
  // Nothing to release if the connect step never attached its state.
  if (resources == NULL) return ncclSuccess;

  CUDACHECK(hipStreamDestroy(resources->stream));
  CUDACHECK(hipFree(resources->devFifo));
  NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(hipEventDestroy(resources->events[i]));
  }
  free(resources);
  // Do not leave a dangling pointer behind on the connection.
  connection->transportResources = NULL;
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Send-side proxy progress for the SHM memcpy path.
 *
 * On the first call for an operation (state == ncclProxyOpReady) every sub
 * gets its base step (resources->step rounded up to a multiple of chunkSteps)
 * and its counters reset. While in ncclProxyOpProgress, for each sub:
 *   - when ceRecvMem->tail shows a new step, copy that slot from devFifo to
 *     shmFifo (device-to-host, async), record the slot's event and publish
 *     the slot size in recvMem->sizesFifo;
 *   - when the oldest outstanding event has completed, advance `done` and
 *     publish the new position in recvMem->tail.
 * Protocols other than NCCL_PROTO_SIMPLE are completed immediately.
 * args->idle is set to 1 on every call.
 */
static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int i = 0; i < args->nsubs; i++) {
      struct ncclProxySubArgs* sub = &args->subs[i];
      struct shmProxyInfo* info = (struct shmProxyInfo*)(sub->connection->transportResources);
      // Start from the next multiple of chunkSteps
      sub->base = ROUNDUP(info->step, args->chunkSteps);
      sub->posted = sub->transmitted = sub->done = 0;
    }
    args->state = ncclProxyOpProgress;
  }

  args->idle = 1;
  if (args->state != ncclProxyOpProgress) return ncclSuccess;

  int proto = args->protocol;
  int stepSize = comm->buffSizes[proto] / NCCL_STEPS;
  for (int i = 0; i < args->nsubs; i++) {
    struct ncclProxySubArgs* sub = &args->subs[i];
    struct shmProxyInfo* info = (struct shmProxyInfo*)(sub->connection->transportResources);

    if (proto != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
      info->step = sub->base + sub->nsteps;
      args->done++;
      continue;
    }

    // Stage 1: launch the copy for the next step, if the window allows it.
    if (sub->transmitted < sub->done + NCCL_STEPS &&
        sub->transmitted < sub->nsteps) {
      int slot = (sub->base+sub->transmitted)%NCCL_STEPS;
      volatile int* gpuSizes = info->ceRecvMem->sizesFifo;
      volatile uint64_t* gpuTail = &info->ceRecvMem->tail;
      // Has the GPU produced this step yet?
      if ((*gpuTail > sub->base+sub->transmitted)) {
        int size = gpuSizes[slot];
        const int offset = slot*stepSize;
        CUDACHECK(hipMemcpyAsync(info->shmFifo+offset, info->devFifo+offset, size, hipMemcpyDeviceToHost, info->stream));
        CUDACHECK(hipEventRecord(info->events[slot], info->stream));
        info->recvMem->sizesFifo[slot] = size;
        __sync_synchronize(); // make sure sizesFifo is visible
        sub->transmitted += args->sliceSteps;
      }
    }

    // Stage 2: retire the oldest in-flight copy once its event has fired.
    if (sub->done < sub->transmitted) {
      int slot = (sub->base+sub->done)%NCCL_STEPS;
      hipError_t status = hipEventQuery(info->events[slot]);
      if (status != hipErrorNotReady) {
        CUDACHECK(status);
      }
      if (status == hipSuccess) {
        sub->done += args->sliceSteps;
        // Publish the new tail through shared memory
        info->recvMem->tail = sub->base + sub->done;
      }
      if (sub->done == sub->nsteps) {
        info->step = sub->base + sub->nsteps;
        args->done++;
      }
    }
  }

  if (args->done == args->nsubs) {
    args->state = ncclProxyOpNone;
  }
  return ncclSuccess;
}
|
||||
|
||||
/*
 * Receive-side proxy progress for the SHM memcpy path.
 *
 * On the first call for an operation (state == ncclProxyOpReady) every sub
 * gets its base step (resources->step rounded up to a multiple of chunkSteps)
 * and its counters reset. While in ncclProxyOpProgress, for each sub:
 *   - when recvMem->tail shows a new step, copy that slot from shmFifo to
 *     devFifo (host-to-device, async) and record the slot's event;
 *   - when the oldest outstanding event has completed, advance `done` and
 *     publish the new position in ceRecvMem->tail.
 * Protocols other than NCCL_PROTO_SIMPLE are completed immediately.
 * args->idle is set to 1 on every call.
 */
static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
  if (args->state == ncclProxyOpReady) {
    for (int i = 0; i < args->nsubs; i++) {
      struct ncclProxySubArgs* sub = &args->subs[i];
      struct shmProxyInfo* info = (struct shmProxyInfo*)(sub->connection->transportResources);
      // Start from the next multiple of chunkSteps
      sub->base = ROUNDUP(info->step, args->chunkSteps);
      sub->posted = sub->transmitted = sub->done = 0;
    }
    args->state = ncclProxyOpProgress;
  }

  args->idle = 1;
  if (args->state != ncclProxyOpProgress) return ncclSuccess;

  int proto = args->protocol;
  int stepSize = comm->buffSizes[proto] / NCCL_STEPS;
  for (int i = 0; i < args->nsubs; i++) {
    struct ncclProxySubArgs* sub = &args->subs[i];
    struct shmProxyInfo* info = (struct shmProxyInfo*)(sub->connection->transportResources);

    if (proto != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
      info->step = sub->base + sub->nsteps;
      args->done++;
      continue;
    }

    // Stage 1: launch the copy for the next step, if the window allows it.
    if (sub->transmitted < sub->done + NCCL_STEPS &&
        sub->transmitted < sub->nsteps) {
      int slot = (sub->base+sub->transmitted)%NCCL_STEPS;
      volatile int* shmSizes = info->recvMem->sizesFifo;
      volatile uint64_t* shmTail = &info->recvMem->tail;
      // Has this step arrived in shared memory yet?
      if ((*shmTail > sub->base+sub->transmitted)) {
        int size = shmSizes[slot];
        const int offset = slot*stepSize;
        CUDACHECK(hipMemcpyAsync(info->devFifo+offset, info->shmFifo+offset, size, hipMemcpyHostToDevice, info->stream));
        CUDACHECK(hipEventRecord(info->events[slot], info->stream));
        sub->transmitted += args->sliceSteps;
      }
    }

    // Stage 2: retire the oldest in-flight copy once its event has fired.
    if (sub->done < sub->transmitted) {
      int slot = (sub->base+sub->done)%NCCL_STEPS;
      hipError_t status = hipEventQuery(info->events[slot]);
      if (status != hipErrorNotReady) {
        CUDACHECK(status);
      }
      if (status == hipSuccess) {
        sub->done += args->sliceSteps;
        // Publish the new tail for the GPU
        info->ceRecvMem->tail = sub->base + sub->done;
      }
      if (sub->done == sub->nsteps) {
        info->step = sub->base + sub->nsteps;
        args->done++;
      }
    }
  }

  if (args->done == args->nsubs) {
    args->state = ncclProxyOpNone;
  }
  return ncclSuccess;
}
|
||||
|
||||
struct ncclTransport shmTransport = {
|
||||
"SHM",
|
||||
shmCanConnect,
|
||||
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL },
|
||||
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
static void initCeOperation() {
|
||||
static int init = 0;
|
||||
if (!init) {
|
||||
useMemcpySend = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 1);
|
||||
useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2);
|
||||
if (useMemcpySend) {
|
||||
shmTransport.send.proxyConnect = shmSendProxyConnect;
|
||||
shmTransport.send.proxyFree = shmSendProxyFree;
|
||||
shmTransport.send.proxyProgress = shmSendProxyProgress;
|
||||
}
|
||||
if (useMemcpyRecv) {
|
||||
shmTransport.recv.proxyConnect = shmRecvProxyConnect;
|
||||
shmTransport.recv.proxyFree = shmRecvProxyFree;
|
||||
shmTransport.recv.proxyProgress = shmRecvProxyProgress;
|
||||
}
|
||||
shmLocality = ncclParamShmLocality();
|
||||
if (shmLocality != SHM_SEND_SIDE && shmLocality != SHM_RECV_SIDE) {
|
||||
WARN("Ignoring SHM locality, must be 1 (sender side) or 2 (receiver side, default)");
|
||||
shmLocality = SHM_RECV_SIDE;
|
||||
}
|
||||
init = 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ endif
|
||||
HIPCC = $(HIP_PATH)/bin/hipcc
|
||||
|
||||
EXE = topo_expl
|
||||
CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -I/opt/rocm/rocm_smi/include/ -DTOPO_EXPL -DENABLE_TRACE
|
||||
CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE
|
||||
|
||||
files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/misc/param.cc \
|
||||
../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc ../../src/graph/rome_models.cc
|
||||
|
||||
@@ -27,6 +27,7 @@ struct allGather3Data_t{
|
||||
struct ncclGraphInfo ring;
|
||||
struct ncclGraphInfo collNet;
|
||||
struct ncclTopoRanks topoRanks;
|
||||
bool pivotA2AEnabled;
|
||||
};
|
||||
|
||||
void initCollNet();
|
||||
|
||||
@@ -179,10 +179,10 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
|
||||
|
||||
if (proxyRank == myInfo->rank) {
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
|
||||
} else {
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
|
||||
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
|
||||
}
|
||||
*((int*)connectInfo) = proxyRank;
|
||||
@@ -205,7 +205,7 @@ ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
|
||||
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
|
||||
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
|
||||
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
|
||||
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -250,8 +250,9 @@ struct ncclTransport collNetTransport = {
|
||||
{ collNetRecvSetup, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
struct ncclTransport ncclTransports[NTRANSPORTS] = {
|
||||
p2pTransport,
|
||||
shmTransport,
|
||||
netTransport,
|
||||
struct ncclTransport* ncclTransports[] = {
|
||||
&p2pTransport,
|
||||
&shmTransport,
|
||||
&netTransport,
|
||||
&collNetTransport,
|
||||
};
|
||||
|
||||
@@ -49,6 +49,8 @@ THE SOFTWARE.
|
||||
#include "graph.h"
|
||||
|
||||
NodeModel *node_model;
|
||||
extern ncclNet_t* ncclNet;
|
||||
|
||||
|
||||
char* getCmdOption(char ** begin, char ** end, const std::string & option) {
|
||||
char ** itr = std::find(begin, end, option);
|
||||
@@ -216,14 +218,12 @@ int main(int argc,char* argv[])
|
||||
comm[i].nRanks = nranks;
|
||||
NCCLCHECK(ncclCalloc(&comm[i].connectSend, NCCL_MAX_CONNS*comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm[i].connectRecv, NCCL_MAX_CONNS*comm->nRanks));
|
||||
comm[i].p2pSendCount = comm[i].p2pRecvCount = 0;
|
||||
NCCLCHECK(ncclCalloc(&comm[i].p2pSends, comm->nRanks));
|
||||
NCCLCHECK(ncclCalloc(&comm[i].p2pRecvs, comm->nRanks));
|
||||
node_model = network.GetNode(i);
|
||||
assert(node_model!=0);
|
||||
comm[i].busId = node_model->getGpuBusId(i);
|
||||
comm[i].topo = node_model->getSystem(i);
|
||||
comm[i].peerInfo = peerInfo;
|
||||
comm[i].ncclNet = ncclNet;
|
||||
// Mark channels as non initialized.
|
||||
for (int c=0; c<MAXCHANNELS; c++) comm[i].channels[c].id = -1;
|
||||
NCCLCHECK(fillInfo(&comm[i], comm[i].peerInfo+comm[i].rank, 0));
|
||||
@@ -272,8 +272,6 @@ int main(int argc,char* argv[])
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
free(comm[i].connectSend);
|
||||
free(comm[i].connectRecv);
|
||||
free(comm[i].p2pSends);
|
||||
free(comm[i].p2pRecvs);
|
||||
}
|
||||
|
||||
free(treeGraph);
|
||||
|
||||
+154
-78
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -216,20 +216,19 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
|
||||
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
|
||||
comm->channels[channelId].peers[peer].recv + connIndex;
|
||||
|
||||
// handle intra-node network connections
|
||||
int n1 = -1, n2 = -1;
|
||||
if (connIndex == NCCL_CONN_IDX_P2P_NET) {
|
||||
NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, comm->rank, graph, channelId, (type == 1) ? 1 : 0, &n1));
|
||||
NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, peer, graph, channelId, (type == 1) ? 0 : 1, &n2));
|
||||
}
|
||||
|
||||
bool xgmi;
|
||||
NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi));
|
||||
|
||||
for (int t=0; t<NTRANSPORTS; t++) {
|
||||
if (graph == NULL && connIndex == NCCL_CONN_IDX_P2P_NET && (t == TRANSPORT_SHM || (!xgmi && t == TRANSPORT_P2P))) continue;
|
||||
if (graph && n1 >= 0 && n2 >= 0 && t != TRANSPORT_NET) continue;
|
||||
struct ncclTransport *transport = ncclTransports+t;
|
||||
struct ncclTransport *transport = ncclTransports[t];
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
int ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
|
||||
@@ -244,18 +243,19 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
|
||||
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
|
||||
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
|
||||
uint32_t mask = 1 << channel->id;
|
||||
struct ncclChannel* channel = &comm->channels[channelId];
|
||||
uint32_t mask = 1 << channelId;
|
||||
for (int i=0; i<nrecv; i++) {
|
||||
int peer = peerRecv[i];
|
||||
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
|
||||
comm->connectRecv[peer+comm->nRanks*connIndex] |= mask;
|
||||
comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
|
||||
}
|
||||
for (int i=0; i<nsend; i++) {
|
||||
int peer = peerSend[i];
|
||||
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
|
||||
comm->connectSend[peer+comm->nRanks*connIndex] |= mask;
|
||||
comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -271,17 +271,18 @@ void dumpData(struct ncclConnect* data, int ndata) {
|
||||
|
||||
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
|
||||
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
|
||||
int highestType = TRANSPORT_P2P; // track highest transport type
|
||||
|
||||
//hipStream_t transportSetupStream;
|
||||
//CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
|
||||
int highestType = TRANSPORT_P2P; // track highest transport type
|
||||
|
||||
struct ncclConnect data[2*MAXCHANNELS];
|
||||
for (int i=1; i<comm->nRanks; i++) {
|
||||
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
|
||||
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
|
||||
int sendPeer = (comm->rank + i) % comm->nRanks;
|
||||
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*connIndex];
|
||||
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*connIndex];
|
||||
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
|
||||
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
|
||||
|
||||
struct ncclConnect* recvData = data;
|
||||
int sendChannels = 0, recvChannels = 0;
|
||||
@@ -319,7 +320,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
|
||||
//NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
//CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
|
||||
//CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
|
||||
}
|
||||
}
|
||||
for (int c=0; c<MAXCHANNELS; c++) {
|
||||
@@ -327,10 +328,10 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
|
||||
//NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
|
||||
conn->connected = 1;
|
||||
//CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
|
||||
//CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
|
||||
}
|
||||
}
|
||||
comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
|
||||
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0;
|
||||
}
|
||||
//CUDACHECK(hipStreamSynchronize(transportSetupStream));
|
||||
//CUDACHECK(hipStreamDestroy(transportSetupStream));
|
||||
@@ -357,10 +358,6 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
// check if we can connect to collnet, whose root is the nranks-th rank
|
||||
struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
|
||||
peerInfo->rank = nranks;
|
||||
int support = 1;
|
||||
if (isMaster) {
|
||||
NCCLCHECK(collNetTransport.canConnect(&support, comm->topo, collNetGraph, myInfo, peerInfo));
|
||||
}
|
||||
|
||||
// send master receives connect info from peer recv master
|
||||
if (isMaster && type == collNetSend) {
|
||||
@@ -370,14 +367,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
}
|
||||
|
||||
// select
|
||||
struct ncclPeer* root = channel->peers+nranks;
|
||||
struct ncclChannelPeer* root = channel->peers+nranks;
|
||||
// connector index: 0 for recv, 1 for send
|
||||
struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
|
||||
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
|
||||
conn->transportComm = transportComm;
|
||||
// setup
|
||||
struct ncclConnect myConnect;
|
||||
if (isMaster && support) {
|
||||
if (isMaster) {
|
||||
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
|
||||
}
|
||||
// prepare connect handles
|
||||
@@ -407,11 +404,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
//if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
|
||||
}
|
||||
// connect
|
||||
if (isMaster && support) {
|
||||
if (isMaster) {
|
||||
//NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
|
||||
struct ncclPeer* devRoot = channel->devPeers+nranks;
|
||||
struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
|
||||
//CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
|
||||
struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
|
||||
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
|
||||
//CUDACHECKGOTO(hipMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice), res, cleanup);
|
||||
}
|
||||
// recv side sends connect info to send side
|
||||
if (isMaster && type == collNetRecv) {
|
||||
@@ -420,7 +417,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
|
||||
//NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
|
||||
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
|
||||
}
|
||||
if (support) fail = 0;
|
||||
fail = 0;
|
||||
cleanup:
|
||||
if (allConnects != NULL) free(allConnects);
|
||||
if (masterConnects != NULL) free(masterConnects);
|
||||
@@ -449,21 +446,24 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
|
||||
// Free collNet resources
|
||||
for (int r=0; r<comm->nChannels; r++) {
|
||||
struct ncclChannel* channel = comm->channels+r;
|
||||
struct ncclPeer* peer = channel->peers+comm->nRanks;
|
||||
struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
struct ncclConnector* send = peer->send + b;
|
||||
//if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
|
||||
//if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
|
||||
send->transportResources = NULL; // avoid double free
|
||||
}
|
||||
for (int b=0; b<NCCL_MAX_CONNS; b++) {
|
||||
struct ncclConnector* recv = peer->recv + b;
|
||||
//if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
|
||||
//if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
|
||||
recv->transportResources = NULL; // avoid double free
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Tunable read via rcclParamP2pNetDisable(). In this file a value of 0 lets
// same-node P2P over network be enabled (comm->p2pNet) and lets the intranode
// NET ring connections be set up (comm->useIntraNet); any other value logs that
// the feature was force disabled.
// NOTE(review): the third argument is presumably the default value and the
// environment variable presumably carries an RCCL_ prefix - confirm against the
// RCCL_PARAM macro definition.
RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);
|
||||
// Tunable read via rcclParamPivotAlltoallEnable(). It is ANDed with
// comm->topo->pivotA2AEnabled when filling this rank's allGather3Data entry, so
// pivot all-to-all is only advertised when both the topology and this
// parameter allow it.
RCCL_PARAM(PivotAlltoallEnable, "PIVOT_ALLTOALL_ENABLE", 0);
|
||||
|
||||
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
|
||||
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
|
||||
// We use 2 AllGathers
|
||||
@@ -499,12 +499,15 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
comm->topo->nRanks = comm->nRanks;
|
||||
// init netGdrLevel
|
||||
comm->topo->netGdrLevel = -2;
|
||||
// init Pivot A2A related fields
|
||||
comm->topo->pivotA2AEnabled = false;
|
||||
comm->topo->pivotA2ANumBiRings = 0;
|
||||
// Compute paths between GPUs and NICs
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
|
||||
// Remove inaccessible GPUs and unused NICs
|
||||
NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
|
||||
// Recompute paths after trimming
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
|
||||
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
|
||||
// Init search
|
||||
NCCLCHECK(ncclTopoSearchInit(comm->topo));
|
||||
// Print final topology
|
||||
@@ -571,39 +574,31 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
{ // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
|
||||
CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
|
||||
if (comm->localRanks == comm->nRanks && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)
|
||||
{
|
||||
if (hasPeerAccess)
|
||||
{
|
||||
if (intraProcRanks == nranks)
|
||||
cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
|
||||
else
|
||||
cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
|
||||
}
|
||||
|
||||
// For now, only enable clique-based kernels on nodes where all GPUs are XGMI connected
|
||||
if (!allXgmi && !rcclParamCliqueIgnoreTopo())
|
||||
{
|
||||
INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)");
|
||||
cliqueMode = CliqueManager::CLIQUE_DISABLED;
|
||||
}
|
||||
}
|
||||
comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
|
||||
NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
|
||||
} // [/RCCL]
|
||||
#endif
|
||||
|
||||
if (comm->rank == ncclParamGraphDumpFileRank()) {
|
||||
struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
|
||||
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
|
||||
}
|
||||
|
||||
// Determine local CollNet support before all-gather
|
||||
if (ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
|
||||
if (collNetSupport(comm)) {
|
||||
char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
|
||||
if (collNetEnable != NULL) {
|
||||
INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
|
||||
if (strcmp(collNetEnable, "1") == 0) {
|
||||
comm->collNetSupport = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (comm->collNetSupport == 1 && collNetGraph.nChannels <= 0) comm->collNetSupport = 0;
|
||||
|
||||
if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
|
||||
if (rcclParamP2pNetDisable() == 0) {
|
||||
if (!(comm->topo->type & RCCL_TOPO_FORCE_INTRA)) comm->p2pNet = 1;
|
||||
INFO(NCCL_INIT, "RCCL enabled same node P2P over network");
|
||||
}
|
||||
else
|
||||
INFO(NCCL_INIT, "RCCL force disabled same node P2P over network");
|
||||
}
|
||||
// AllGather3 - begin
|
||||
#if 0
|
||||
struct ncclGraphInfo {
|
||||
@@ -624,6 +619,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
struct ncclGraphInfo ring;
|
||||
struct ncclGraphInfo collNet;
|
||||
struct ncclTopoRanks topoRanks;
|
||||
bool pivotA2AEnabled;
|
||||
} *allGather3Data;
|
||||
|
||||
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
|
||||
@@ -666,6 +662,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
|
||||
allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
|
||||
allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
|
||||
allGather3Data[rank].collNetSupport = comm->collNetSupport;
|
||||
allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
|
||||
|
||||
comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count)
|
||||
? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels;
|
||||
@@ -758,6 +755,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
|
||||
collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
|
||||
comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
|
||||
comm->topo->pivotA2AEnabled = comm->topo->pivotA2AEnabled && allGather3Data[i].pivotA2AEnabled;
|
||||
}
|
||||
|
||||
comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
|
||||
@@ -818,16 +816,16 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
|
||||
if (ringGraph.nIntraChannels) {
|
||||
if (ringGraph.nIntraChannels && rcclParamP2pNetDisable() == 0) {
|
||||
comm->useIntraNet = 1;
|
||||
// Connect NET for intranode use
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
|
||||
}
|
||||
@@ -838,8 +836,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
|
||||
INFO(NCCL_INIT, "Connected all trees");
|
||||
@@ -861,7 +859,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
for (int h=0; h<nHeads; h++) {
|
||||
const int head = heads[h];
|
||||
collNetSetupFail = ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetRecv);
|
||||
if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend);
|
||||
collNetSetupFail += ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend);
|
||||
}
|
||||
// Verify CollNet setup across ranks after trying the first channel
|
||||
if (c == 0) {
|
||||
@@ -876,12 +874,12 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
int highestTransportType0, highestTransportType1;
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelRecv = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup);
|
||||
for (int c=0; c<comm->nChannels; c++) {
|
||||
struct ncclChannel* channelSend = comm->channels+c;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup);
|
||||
|
||||
@@ -919,6 +917,52 @@ collnet_cleanup:
|
||||
// Compute nChannels per peer for p2p
|
||||
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
|
||||
#if 0
|
||||
do { // Setup p2p structures in comm->tasks
|
||||
struct ncclTasks* tasks = &comm->tasks;
|
||||
int nRanks = comm->nRanks;
|
||||
int node = comm->node;
|
||||
int nNodes = comm->nNodes;
|
||||
struct ncclNodeRanks *nodeRanks = comm->nodeRanks;
|
||||
int localRank = comm->localRank;
|
||||
tasks->peers = ncclMemoryStackAlloc<ncclTasks::Peer>(&comm->memPermanent, nRanks);
|
||||
tasks->p2pSendOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
|
||||
tasks->p2pRecvOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
|
||||
int s=0, r=0;
|
||||
// schedule delta 0, +1, -1, +2, -2, ...
|
||||
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
|
||||
for (int d=0; d <= nNodes/4; d++) {
|
||||
int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
|
||||
int index = 0;
|
||||
int delta = deltas[index];
|
||||
sched_delta:
|
||||
int recvNode = (node+nNodes-delta)%nNodes;
|
||||
int sendNode = (node+delta)%nNodes;
|
||||
int steps = comm->maxLocalRanks;
|
||||
for (int step=0; step < steps; step++) {
|
||||
int recvIndex = (localRank-step+steps)%steps;
|
||||
if (recvIndex < nodeRanks[recvNode].localRanks) {
|
||||
tasks->p2pRecvOrder[r] = nodeRanks[recvNode].localRankToRank[recvIndex];
|
||||
r++;
|
||||
}
|
||||
int sendIndex = (localRank+step)%steps;
|
||||
if (sendIndex < nodeRanks[sendNode].localRanks) {
|
||||
tasks->p2pSendOrder[s] = nodeRanks[sendNode].localRankToRank[sendIndex];
|
||||
s++;
|
||||
}
|
||||
}
|
||||
index++;
|
||||
if (index == 1 && deltas[1] == deltas[0]) index++;
|
||||
if (index == 2 && deltas[2] == deltas[0]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[2]) index++;
|
||||
if (index == 3 && deltas[3] == deltas[1]) index++;
|
||||
if (index < 4) {
|
||||
delta = deltas[index];
|
||||
goto sched_delta;
|
||||
}
|
||||
}
|
||||
assert(s == nRanks && r == nRanks);
|
||||
} while (0);
|
||||
|
||||
if (ncclParamNvbPreconnect()) {
|
||||
// Connect p2p when using NVB path
|
||||
int nvbNpeers;
|
||||
@@ -926,18 +970,17 @@ collnet_cleanup:
|
||||
NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
|
||||
for (int r=0; r<nvbNpeers; r++) {
|
||||
int peer = nvbPeers[r];
|
||||
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
|
||||
int channelId;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
|
||||
comm->connectRecv[peer] |= (1<<channelId);
|
||||
NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId));
|
||||
if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
|
||||
comm->connectSend[peer] |= (1<<channelId);
|
||||
}
|
||||
}
|
||||
delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
|
||||
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
|
||||
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
|
||||
if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
|
||||
comm->connectSend[peer] |= (1<<channelId);
|
||||
NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId));
|
||||
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
|
||||
comm->connectRecv[peer] |= (1<<channelId);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -947,18 +990,17 @@ collnet_cleanup:
|
||||
#endif
|
||||
// Connect to local net proxy
|
||||
struct ncclProxyConnector proxyConn;
|
||||
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, comm->rank, &proxyConn.localRank));
|
||||
//NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn));
|
||||
//NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
|
||||
|
||||
// Then to remote ones when using PXN
|
||||
if (ncclPxnDisable() == 0) {
|
||||
if (ncclPxnDisable(comm) == 0) {
|
||||
int nranks;
|
||||
int* pxnPeers;
|
||||
NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks));
|
||||
for (int r=0; r<nranks; r++) {
|
||||
//NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn));
|
||||
//NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
|
||||
// NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
|
||||
}
|
||||
free(pxnPeers);
|
||||
}
|
||||
@@ -973,6 +1015,10 @@ collnet_cleanup:
|
||||
if (intraProcRanks == 0) intraProcRank0 = i;
|
||||
if (i == rank) intraProcRank = intraProcRanks;
|
||||
intraProcRanks++;
|
||||
if (intraProcRank0 == rank && rank != i) {
|
||||
comm->peerInfo[i].comm->intraNext = comm->intraNext;
|
||||
comm->intraNext = comm->peerInfo[i].comm;
|
||||
}
|
||||
}
|
||||
}
|
||||
TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
|
||||
@@ -983,14 +1029,40 @@ collnet_cleanup:
|
||||
intraProcRank, intraProcRanks, intraProcRank0);
|
||||
return ncclInternalError;
|
||||
}
|
||||
//NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm));
|
||||
struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
|
||||
assert(intraProcRank==0 ? comm==comm0 : true);
|
||||
comm->intraComm0 = comm0;
|
||||
comm->intraRefs = intraProcRank==0 ? intraProcRanks : 0;
|
||||
comm->intraRank = intraProcRank;
|
||||
comm->intraRanks = intraProcRanks;
|
||||
comm->intraBarrierPhase = 0;
|
||||
comm->intraBarrierCounter = 0;
|
||||
comm->intraBarrierGate = 0;
|
||||
} while(0);
|
||||
|
||||
#if 0
|
||||
if (comm->intraRank == 0) { // Load ncclParamLaunchMode
|
||||
char* str = getenv("NCCL_LAUNCH_MODE");
|
||||
enum ncclLaunchMode mode, modeOld;
|
||||
if (str && strcasecmp(str, "GROUP") == 0) {
|
||||
mode = ncclLaunchModeGroup;
|
||||
} else {
|
||||
mode = ncclLaunchModeParallel;
|
||||
}
|
||||
// In theory we could be racing with other communicators not associated with
|
||||
// this one if the user is connecting to multiple ncclUniqueId's concurrently.
|
||||
modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
|
||||
if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
|
||||
INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
|
||||
}
|
||||
}
|
||||
|
||||
/* Local intra-node barrier */
|
||||
//NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
|
||||
|
||||
// Unlink proxy shm to make sure it will be properly cleaned up.
|
||||
//NCCLCHECK(ncclProxyShmUnlink(comm));
|
||||
NCCLCHECK(ncclProxyShmUnlink(comm));
|
||||
#endif
|
||||
|
||||
// We should have allocated all buffers, collective fifos, ... we can
|
||||
// restore the affinity.
|
||||
@@ -1013,3 +1085,7 @@ ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* d
|
||||
// Stub implementation: returns ncclSuccess unconditionally and ignores srcDev
// and dstDev.
// NOTE(review): rsmi_type, hops and bw are never written here, so a caller that
// reads them after a successful return sees whatever they held before the
// call - confirm that callers of this stub pre-initialize them.
ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *bw) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Returns the constant 4; the comm argument is not used.
// NOTE(review): presumably the network plugin API version this build assumes -
// confirm against the callers of ncclNetVersion.
int ncclNetVersion(struct ncclComm* comm) {
|
||||
return 4;
|
||||
}
|
||||
|
||||
Referencia en una nueva incidencia
Block a user