Merge remote-tracking branch 'nccl/master' into develop

Este commit está contenido en:
Wenkai Du
2022-09-09 01:20:52 +00:00
Se han modificado 90 ficheros con 5517 adiciones y 3115 borrados
+2 -7
Ver fichero
@@ -175,12 +175,6 @@ set(CC_SOURCES
src/collectives/all_to_all_api.cc
src/collectives/all_to_allv_api.cc
src/channel.cc
#src/clique/CliqueManager.cc # RCCL
#src/clique/HandleCache.cc # RCCL
#src/clique/HandleShm.cc # RCCL
#src/clique/Hash.cc # RCCL
#src/clique/MsgQueue.cc # RCCL
#src/clique/ShmObject.cc # RCCL
src/misc/argcheck.cc
src/misc/nvmlwrap_stub.cc
src/misc/utils.cc
@@ -193,6 +187,8 @@ set(CC_SOURCES
src/misc/signals.cc # RCCL
src/misc/socket.cc
src/misc/param.cc
src/misc/rocmwrap.cc
src/misc/strongstream.cc
src/transport/coll_net.cc
src/transport/net.cc
src/transport/net_ib.cc
@@ -208,7 +204,6 @@ set(CC_SOURCES
src/enqueue.cc
${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
foreach(filename ${CC_SOURCES})
list(APPEND CPP_SOURCES ${filename})
endforeach(filename)
Archivo normal → Archivo ejecutable
Ver fichero
+2 -2
Ver fichero
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 12
NCCL_PATCH := 12
NCCL_MINOR := 13
NCCL_PATCH := 4
NCCL_SUFFIX :=
PKG_REVISION := 1
+2 -1
Ver fichero
@@ -10,7 +10,8 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \
misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc \
misc/cudawrap.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc \
misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc misc/param.cc misc/strongstream.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
+4 -1
Ver fichero
@@ -106,6 +106,7 @@ static void *bootstrapRoot(void* args) {
do {
struct ncclSocket sock;
sock.abortFlag = NULL;
/* bootstrap root thread always uses blocking ncclSocketAccept. */
NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
close(sock.fd);
@@ -140,6 +141,7 @@ static void *bootstrapRoot(void* args) {
int next = (r+1) % nranks;
struct ncclSocket sock;
sock.abortFlag = NULL;
sock.asyncFlag = 0;
memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress));
NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
@@ -289,7 +291,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
TRACE(NCCL_INIT, "rank %d nranks %d virtualId %d", rank, nranks, virtualId);
return ncclSuccess;
}
@@ -324,6 +326,7 @@ ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int s
struct bootstrapState* state = (struct bootstrapState*)commState;
struct ncclSocket sock;
sock.abortFlag = state->abortFlag;
sock.asyncFlag = 0;
memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketConnect(&sock));
NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
+24 -46
Ver fichero
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,75 +8,54 @@
#include "param.h"
#include "gdrwrap.h"
// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory
NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1);
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
struct ncclChannel* channel = comm->channels+channelid;
ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
struct ncclChannel* channel = &comm->channels[channelId];
if (channel->id != -1) return ncclSuccess;
channel->id = channelid;
// Ring index to user rank table.
NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
int nRanks = comm->nRanks;
channel->id = channelId;
channel->workFifoSent = 0;
// Communication structures with peers.
NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
for (size_t i=0; i<comm->nRanks+1; ++i) {
for (int b=0; b<NCCL_MAX_CONNS; b++) {
channel->peers[i].send[b].comm = comm;
channel->peers[i].recv[b].comm = comm;
NCCLCHECK(ncclStrongStreamAcquireUncaptured(&comm->deviceStream));
// The extra one (nRanks+1) is for the collnet root (i.e. network)
channel->peers = ncclMemoryStackAlloc<struct ncclChannelPeer>(&comm->memPermanent, nRanks+1);
NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nRanks+1, comm->deviceStream.stream));
ncclCommPushCudaFree(comm, channel->devPeers);
channel->ring.userRanks = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, comm->deviceStream.stream));
ncclCommPushCudaFree(comm, channel->devRingUserRanks);
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNull(), &comm->deviceStream));
for (int r=0; r < nRanks+1; ++r) {
for (int b=0; b < NCCL_MAX_CONNS; b++) {
channel->peers[r].send[b].comm = comm;
channel->peers[r].recv[b].comm = comm;
}
}
// Per-channel operation list.
NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) {
// GDRCOPY support
// We allocate a workFifo in GDR mapped CUDA memory
// But we still allocate the Host workFifo so that we
// can copy the work elements to CUDA memory on kernel launch
NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc));
} else {
// The device workFifo is the Host one
channel->workFifoDev = channel->workFifo;
}
return ncclSuccess;
}
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
if (channel->id == -1) return ncclSuccess;
// Operation list
NCCLCHECK(ncclCudaHostFree(channel->workFifo));
if (channel->gdrMemDesc) {
// GDRCOPY support
NCCLCHECK(ncclGdrCudaFree(channel->gdrMemDesc));
}
// Free Ring index to rank tables
free(channel->ring.userRanks);
CUDACHECK(hipFree(channel->ring.devUserRanks));
// Free transport proxy resources
// Note: free all send resources first due to CollNet arrangement
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
struct ncclChannelPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
}
}
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
struct ncclChannelPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
}
}
// Free the peer structures.
CUDACHECK(hipFree(channel->devPeers));
free(channel->peers);
return ncclSuccess;
}
+1 -1
Ver fichero
@@ -10,7 +10,7 @@ include ../../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../../build)
OBJDIR := $(BUILDDIR)/obj/collectives/device
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu alltoall_pivot.cu
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu
LIBSRCFILES += functions.cu
+2 -2
Ver fichero
@@ -13,11 +13,11 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem->channel.ring;
const int *ringRanks = ring->devUserRanks;
const int *ringRanks = ring->userRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
+9 -11
Ver fichero
@@ -8,7 +8,6 @@
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
//#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
@@ -18,7 +17,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem->channel.ring;
@@ -187,11 +186,6 @@ namespace {
}
#endif
// Make final copy from buffer to dest.
chunk = modRanks(ringIx + 1);
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
@@ -200,6 +194,10 @@ namespace {
}
#endif
// Make final copy from buffer to dest.
chunk = modRanks(ringIx + 1);
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
prims.directRecv(offset, nelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
@@ -223,7 +221,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem->channel.tree;
@@ -375,7 +373,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem->channel.tree;
@@ -600,9 +598,9 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
const int hasUp = (tree->up[0] >= 0) ? 1 : 0;
const int hasDn = (tree->down[0] >= 0) ? 1 : 0;
const int nThreadsScatter = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0);
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0);
const int nThreadsBcast = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 1*COLLNET_COPY_THREADS);
const int nThreadsReduce = args->header.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
const int tidStartBcast = nThreadsGather;
const int tidStartScatter = tidStartBcast + nThreadsBcast;
const int tidStartReduce = tidStartScatter + nThreadsScatter;
+1 -1
Ver fichero
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
IMPL_COLL_ALLTOALL_PIVOT(AllToAllPivot);
IMPL_COLL_F(AllToAllPivot);
+4 -4
Ver fichero
@@ -12,7 +12,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nranks = ncclShmem->comm.nRanks;
const ncclRing *ring = &ncclShmem->channel.ring;
@@ -29,11 +29,11 @@ namespace {
const ssize_t prims_size = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLTOALL_PIVOT_CHUNKSTEPS : 1));
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0, args->connIndex << 16);
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0);
for (int num_hops = 0; num_hops <= nranks / 2; num_hops++) {
const int src_rank = ring->devUserRanks[(nranks - num_hops) % nranks];
const int dst_rank = ring->devUserRanks[num_hops];
const int src_rank = ring->userRanks[(nranks - num_hops) % nranks];
const int dst_rank = ring->userRanks[num_hops];
const ssize_t send_offset =
dst_rank * num_elems * elem_size + chunk_offset +
(src_rank == dst_rank ? pivot_direction * chunk_size / 2 : 0);
+3 -3
Ver fichero
@@ -12,7 +12,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem->channel.ring;
@@ -20,8 +20,8 @@ namespace {
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int rank = ring->userRanks[0];
const int nextRank = ring->userRanks[1];
const int root = args->root;
T *inputBuf = (T*)args->sendbuff;
+192 -197
Ver fichero
@@ -10,7 +10,6 @@
#include "collectives.h"
#include "devcomm.h"
#include "op128.h"
#define COLL_UNROLL 2
#define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree
@@ -320,154 +319,71 @@ class ncclFunction {
};
#ifdef ENABLE_COLLTRACE
#define traceColl(elem,launch_type) \
#define traceColl(launch_type) { \
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
shmem.comm.collTrace[pos].bid = blockIdx.x; \
shmem.comm.collTrace[pos].funcIndex = shmem.work.header.funcIndex; \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (shmem.comm.collTrace[pos].data_0)); \
if (elem.header.type == ncclWorkTypeP2p) { \
struct ncclWorkElemP2p *p2pElems = (struct ncclWorkElemP2p *)&elem; \
shmem.comm.collTrace[pos].p2p[0].connIndex = p2pElems[0].connIndex; \
shmem.comm.collTrace[pos].p2pOpCount[0] = p2pElems[0].opCount; \
shmem.comm.collTrace[pos].p2p[0].ngroups = p2pElems[0].ngroups; \
shmem.comm.collTrace[pos].p2p[0].nWarps = p2pElems[0].nWarps; \
shmem.comm.collTrace[pos].p2p[0].warpStart = p2pElems[0].warpStart; \
shmem.comm.collTrace[pos].p2p[0].peer = (uint16_t)(p2pElems[0].peer); \
shmem.comm.collTrace[pos].p2p[1].connIndex = p2pElems[1].connIndex; \
shmem.comm.collTrace[pos].p2pOpCount[1] = p2pElems[1].opCount; \
shmem.comm.collTrace[pos].p2p[1].ngroups = p2pElems[1].ngroups; \
shmem.comm.collTrace[pos].p2p[1].nWarps = p2pElems[1].nWarps; \
shmem.comm.collTrace[pos].p2p[1].warpStart = p2pElems[1].warpStart; \
shmem.comm.collTrace[pos].p2p[1].peer = (uint16_t)(p2pElems[1].peer); \
shmem.comm.collTrace[pos].type = (ncclCollTraceP2pElemType|launch_type); \
} else { \
shmem.comm.collTrace[pos].opCount = elem.opCount; \
shmem.comm.collTrace[pos].coll.nWarps = elem.header.nWarps; \
shmem.comm.collTrace[pos].coll.bid = elem.bid; \
shmem.comm.collTrace[pos].coll.nChannels = elem.nChannels; \
shmem.comm.collTrace[pos].type = (ncclCollTraceCollElemType|launch_type); \
}
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->bid = blockIdx.x; \
collTrace->funcIndex = shmem.work.header.funcIndex; \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0)); \
if (shmem.work.header.type == ncclWorkTypeP2p) { \
struct ncclWorkElemP2p *p2pElems = shmem.work.p2pElems; \
collTrace->p2p[0].connIndex = 0; \
collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \
collTrace->p2p[0].nWarps = p2pElems[0].nWarps; \
collTrace->p2p[0].warpStart = p2pElems[0].warpStart; \
collTrace->p2p[0].peer = p2pElems[0].p2pType == ncclWorkP2pTypeRecv ? (uint16_t)(p2pElems[0].peer) : -1; \
collTrace->p2p[1].connIndex = 0; \
collTrace->p2pOpCount[1] = p2pElems[1].opCount; \
collTrace->p2p[1].ngroups = p2pElems[1].ngroups; \
collTrace->p2p[1].nWarps = p2pElems[1].nWarps; \
collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \
collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
} else if (shmem.work.header.type == ncclWorkTypeColl) { \
struct ncclWorkElem *elems = shmem.work.elems; \
collTrace->opCount = elems[0].opCount; \
collTrace->coll.nWarps = elems[0].nWarps; \
collTrace->coll.bid = elems[0].bid; \
collTrace->coll.nChannels = elems[0].nChannels; \
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
} \
}
#define traceKernelLaunch(elem,firstLaunch) { \
traceColl(elem,(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType)); \
#define traceKernelLaunch(firstLaunch) { \
traceColl(firstLaunch?ncclCollTraceKernelLaunchType:ncclCollTraceCollLaunchType); \
}
#define traceKernelEnd() { \
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
shmem.comm.collTrace[pos].bid = bid; \
shmem.comm.collTrace[pos].type = ncclCollTraceKernelEndType; \
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->bid = blockIdx.x; \
collTrace->type = ncclCollTraceKernelEndType; \
}
#define traceAbort() { \
uint32_t pos = __atomic_fetch_add(shmem.comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
shmem.comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
shmem.comm.collTrace[pos].bid = bid; \
shmem.comm.collTrace[pos].type = ncclCollTraceAbortType; \
struct ncclCollTrace* collTrace = shmem.comm.collTrace+pos; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->bid = blockIdx.x; \
collTrace->type = ncclCollTraceAbortType; \
}
// traceData(int16_t data2, uint32_t data4, uint64_t data8_0, uint64_t data8_1)
#define traceData(data2, data4, data8_0, data8_1) { \
uint32_t pos = atomicAdd(ncclShmem->comm.collTraceTail, 1)%COLLTRACE_NUM_ITEMS; \
ncclShmem->comm.collTrace[pos].bid = blockIdx.x; \
ncclShmem->comm.collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \
ncclShmem->comm.collTrace[pos].funcIndex = data2; \
ncclShmem->comm.collTrace[pos].data_0 = data4; \
ncclShmem->comm.collTrace[pos].opCount = data8_0; \
ncclShmem->comm.collTrace[pos].data_1 = data8_1; \
ncclShmem->comm.collTrace[pos].type = ncclCollTraceDataType; \
uint32_t pos = __atomic_fetch_add(ncclShmem->comm.collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem->comm.collTrace+pos; \
collTrace->bid = blockIdx.x; \
collTrace->timeStamp = __builtin_amdgcn_s_memrealtime(); \
collTrace->funcIndex = data2; \
collTrace->data_0 = data4; \
collTrace->opCount = data8_0; \
collTrace->data_1 = data8_1; \
collTrace->type = ncclCollTraceDataType; \
}
#else
#define traceKernelLaunch()
#define traceAbort()
#define traceData(data2, data4, data8_0, data8_1)
#endif
#ifdef ENABLE_PROFILING
#define __insert_timestamp(line_num) do { \
if (shmem.prof.count < PROFILE_NUM_ITEMS) { \
shmem.prof.elem[shmem.prof.count].line = line_num; \
shmem.prof.elem[shmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
shmem.prof.count++; \
} \
} while(0);
#else
#define __insert_timestamp(line_num)
#endif
// Copy src to dst and fill extra size with zeroes
template<typename Tdst, typename Tsrc>
__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) {
static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0,
"copyToShmem needs sizes which are multiple of 16B");
static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small");
static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle");
uint64_t *d = reinterpret_cast<uint64_t*>(dst);
uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
uint64_t *shmemPtr = d;
int offset = 2*tid;
uint64_t v0, v1;
if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) {
v0 = v1 = 0ULL;
} else {
v0 = s[offset] ; v1 = s[offset+1];
}
if (offset < sizeof(Tdst)/sizeof(uint64_t)) {
shmemPtr[offset] = v0; shmemPtr[offset+1] = v1;
}
}
template<typename T>
__device__ int copyToShmem(T *dst, T const *src, int turn=0) {
static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh");
uint64_t *d = reinterpret_cast<uint64_t*>(dst);
uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
int t = threadIdx.x - turn;
if (t < 0) t += blockDim.x;
int n = sizeof(T)/sizeof(uint64_t);
int delta = (n + WARP_SIZE-1) & -WARP_SIZE; // round up to warp lane 0
if (delta < blockDim.x) {
turn += delta;
if (turn >= blockDim.x) turn -= blockDim.x;
}
else
turn = 0;
n -= t;
d += t;
s += t;
#pragma unroll
for (int i=0; i < divUp(sizeof(T), WARP_SIZE*sizeof(uint64_t)); i++) {
if (n > 0) {
*d = *s;
d += blockDim.x;
s += blockDim.x;
n -= blockDim.x;
}
}
return turn;
}
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWorkElement {
__device__ void run(ncclWorkElem*) {
// Put NOT IMPLEMENTED behavior here.
}
};
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWork {
// This __forceinline__ is necessary. The compiler was inserting a function call
// here from the LL ncclKernel.
__device__ __forceinline__ void run(ncclWork *w) {
int wid = threadIdx.x / WARP_SIZE;
int inc = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1;
#pragma unroll 1
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) {
if (wid < w->header.nWarps)
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(&w->elems[e]);
}
}
};
struct ncclShmemGroup {
ncclConnInfo *recvConns[NCCL_MAX_DIRECT_ARITY];
@@ -484,18 +400,67 @@ struct ncclShmemData {
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
};
uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
struct ncclDevComm comm;
struct ncclChannel channel;
uint64_t pad[2];
struct ncclWork work;
int channelId;
alignas(16) struct ncclDevComm comm;
alignas(16) struct ncclDevChannel channel;
alignas(16) struct ncclWork work;
#ifdef ENABLE_PROFILING
struct ncclProf prof;
#endif
};
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
// __insert_timestamp(line_num): profiling hook.
// When ENABLE_PROFILING is defined, appends one record (line_num plus the value
// returned by __builtin_amdgcn_s_memrealtime()) to shmem.prof.elem and bumps
// shmem.prof.count; once PROFILE_NUM_ITEMS records are stored further calls are
// silently dropped. A variable named `shmem` must be in scope at the use site.
// When ENABLE_PROFILING is not defined the macro expands to nothing.
// The expansion deliberately does NOT end in ';' so that the caller's own
// semicolon completes the statement and the macro is safe inside if/else.
#ifdef ENABLE_PROFILING
#define __insert_timestamp(line_num) do { \
  if (shmem.prof.count < PROFILE_NUM_ITEMS) { \
    shmem.prof.elem[shmem.prof.count].line = (line_num); \
    shmem.prof.elem[shmem.prof.count].timeStamp = __builtin_amdgcn_s_memrealtime(); \
    shmem.prof.count++; \
  } \
} while(0)
#else
#define __insert_timestamp(line_num)
#endif
// Copy `bytes` bytes of 16-byte aligned data from src to dst, one 16-byte piece
// per thread. Thread `tid` owns the piece at byte offset 16*tid and does nothing
// when that offset is at or past `bytes`, so the caller must provide at least
// (bytes+15)/16 threads. Each piece is moved as the two members of a ulong2.
inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int bytes) {
  int const byteOffset = tid*16;
  if (byteOffset >= bytes) return;  // this thread has no piece to copy
  ulong2* from = (ulong2*)((char const*)src + byteOffset);
  ulong2* to = (ulong2*)((char*)dst + byteOffset);
  to->x = from->x;
  to->y = from->y;
}
// Primary RunWorkElement template, parameterized on (Fn, T, RedOp, Algo, Proto).
// Its run() takes a single work element and has an empty body; any real work
// comes from specializations of this template defined elsewhere.
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWorkElement {
__device__ void run(ncclWorkElem*) {
// Put NOT IMPLEMENTED behavior here.
}
};
// Runs RunWorkElement<Fn, T, RedOp, Algo, Proto> on each used element stored in
// one ncclWork, visiting the elements in storage order.
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWork {
// This __forceinline__ is necessary. The compiler was inserting a function call
// here from the LL ncclKernel.
__device__ __forceinline__ void run(ncclWork *w) {
// Warp index of the calling thread within its block.
int wid = threadIdx.x / WARP_SIZE;
// For ncclWorkTypeRegColl the walk starts at the ncclWorkElem embedded in
// w->regElems[0]; for every other type it starts at w->elems[0].
ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0];
// Byte distance between consecutive records of the selected kind.
int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem);
// Keep going while a whole record still fits before the end of *w and the
// current element is flagged as used.
#pragma unroll 1
while ((char*)we + stride <= (char*)(w+1) && we->isUsed) {
// Only threads whose warp index is below the element's nWarps take part.
if (wid < we->nWarps) {
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(we);
}
// Step to the next record by raw byte stride.
we = (ncclWorkElem*)((char*)we + stride);
}
}
};
static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) {
if (we->isUsed && we->redOpArgIsPtr) {
/* redOpArg is a pointer to the scalar value, so we'll dereference it
* here so that redOpArg holds the bits of the scalar going forward.
* The tricky thing is we don't know its type T since that's encoded in
@@ -518,10 +483,10 @@ static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
extern __device__ struct ncclShmemData *ncclShmem;
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE, bool USING_LL128>
__device__ void ncclKernel(struct ncclDevComm* comm) {
__device__ void ncclKernel(
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
) {
int tid = threadIdx.x;
int nthreads = blockDim.x;
int bid = blockIdx.x;
__shared__ struct ncclShmemData shmem;
ncclShmem = &shmem;
if (tid == 0) {
@@ -529,47 +494,72 @@ __device__ void ncclKernel(struct ncclDevComm* comm) {
shmem.groups[i].barrier = 0;
for (auto j = 0; j < NCCL_MAX_GROUPS; j++) shmem.groups[i].barrier_next[j] = 0;
}
}
__syncthreads();
}
// To map blockId to channelId, we need the n'th set bit of channelMask which
// is the inverse of counting the number of set bits among the first n.
if (tid < WARP_SIZE) {
int x = tid;
if (channelMask & (1ull<<x)) {
int y = __popcll(channelMask & ((1ull<<x)-1));
if (blockIdx.x == y) shmem.channelId = x;
}
if (32 < MAXCHANNELS) {
x = 32 + tid;
if (channelMask & (1ull<<x)) {
int y = __popcll(channelMask & ((1ull<<x)-1));
if (blockIdx.x == y) shmem.channelId = x;
}
}
}
__syncthreads(); // publish shmem.channelId
int channelId = shmem.channelId;
int turn = copyToShmem(&shmem.comm, comm);
if (true) {
void *dst, *src;
int bytes;
// Use first 3 warps to load comm, channel, and work into shmem
switch (tid/WARP_SIZE) {
case 0:
dst = &shmem.comm;
src = comm;
bytes = sizeof(ncclDevComm);
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
break;
case 1:
// Get address of channel without incurring indirect load from ncclDevComm::channels
dst = &shmem.channel;
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
bytes = sizeof(ncclDevChannel);
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
break;
case 2:
dst = &shmem.work;
src = workHead + blockIdx.x;
bytes = sizeof(ncclWork);
static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
break;
default:
bytes = 0;
break;
}
copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
}
__syncthreads(); // publish shmem
#ifdef ENABLE_PROFILING
if (tid == 0) {
shmem.prof.count = 0;
shmem.prof.seq = shmem.comm.devProf[bid].seq;
shmem.prof.seq = shmem.comm.devProf[blockIdx.x].seq;
}
#endif
if (tid == 0) __insert_timestamp(__LINE__);
// get address of channel without incurring indirect load from ncclDevCom::channels
ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid];
turn = copyToShmem(&shmem.channel, channel, turn);
__syncthreads(); // publish ncclShmem
if (tid == 0) __insert_timestamp(__LINE__);
if (tid == 0) __insert_timestamp(__LINE__);
ncclWork *workFifoHost = shmem.channel.workFifo;
ncclWork *workFifoDev = shmem.channel.workFifoDev;
int workFifoIx = shmem.channel.index;
bool firstLaunch = true;
if (COLLTRACE && tid == 0) traceKernelLaunch(true);
while (true) {
copyToShmem(&shmem.work, &workFifoDev[workFifoIx], tid, nthreads);
if (tid == 0) __insert_timestamp(__LINE__);
{ // Check whether the last operation was aborted and make sure all threads exit
int aborted = tid == 0 ? *comm->abortFlag : 0;
if (__any(aborted)) { // publish shmem.work
if (COLLTRACE && tid == 0) traceAbort();
break;
}
if (tid == 0)
workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused;
// Notify host that all fifo reads are complete.
if (tid == 0 && shmem.work.header.isLast && shmem.work.header.inFifo) {
*shmem.channel.workFifoDone = shmem.work.header.doneAcks;
}
if (tid == 0) __insert_timestamp(__LINE__);
workFifoIx = (workFifoIx + 1)%NCCL_MAX_OPS;
if (tid == 0)
channel->index = workFifoIx; // write back to real channel, not shmem shadow
__syncwarp();
if (shmem.work.header.type == ncclWorkTypeColl) {
@@ -579,52 +569,57 @@ __device__ void ncclKernel(struct ncclDevComm* comm) {
}
__syncthreads();
if (COLLTRACE && tid == 0) {
traceKernelLaunch(shmem.work.elems[0],firstLaunch);
firstLaunch = false;
#pragma unroll 1
for(int e=1; e < NCCL_MAX_WORK_ELEMENTS && shmem.work.elems[e].header.type != ncclWorkTypeUnused; e ++) {
traceColl(shmem.work.elems[e], 0);
if (tid == 0) __insert_timestamp(__LINE__);
if (shmem.work.header.funcIndex == FnIndex) {
RunWork<Fn, T, RedOp, Algo, Proto>().run(&shmem.work);
} else {
NCCL_CALL_FUNCTIONS<USING_LL128>(shmem.work.header.funcIndex);
}
int workIxNext = shmem.work.header.workNext;
__syncthreads();
if (shmem.work.header.isLast) break;
copyToShmem16(tid, &shmem.work, workHead + workIxNext, sizeof(ncclWork));
{ // Check whether the last operation was aborted and make sure all threads exit
int aborted = tid == 0 ? *comm->abortFlag : 0;
if (__any(aborted)) { // publish shmem.work
traceAbort();
break;
}
}
if (tid == 0) __insert_timestamp(__LINE__);
if (shmem.work.header.funcIndex == FnIndex)
RunWork<Fn, T, RedOp, Algo, Proto>().run(&shmem.work);
else
NCCL_CALL_FUNCTIONS<USING_LL128>(shmem.work.header.funcIndex);
if (shmem.work.header.isLast) break;
__syncthreads();
if (COLLTRACE && tid == 0) traceColl(false);
}
if (COLLTRACE && tid == 0) traceKernelEnd()
if (COLLTRACE && tid == 0) traceKernelEnd();
#ifdef ENABLE_PROFILING
if (shmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
__syncthreads();
copyToShmem(shmem.comm.devProf+MAXCHANNELS*shmem.prof.seq+blockIdx.x, &shmem.prof);
if (tid == 0) shmem.comm.devProf[bid].seq++;
copyToShmem16(tid, shmem.comm.devProf+MAXCHANNELS*shmem.prof.seq+blockIdx.x, &shmem.prof, sizeof(struct ncclProf));
if (tid == 0) shmem.comm.devProf[blockIdx.x].seq++;
}
#endif
}
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, false>(comm); \
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, false>(comm, channelMask, workHead); \
} \
\
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, false>(comm); \
__global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, false>(comm, channelMask, workHead); \
} \
\
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, true>(comm); \
__global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false, true>(comm, channelMask, workHead); \
} \
\
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, true>(comm); \
__global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true, true>(comm, channelMask, workHead); \
}
// Examples : AllReduce, RING, LL, Sum, uint8
@@ -683,7 +678,7 @@ __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, dev
IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
// AllToAll Pivot primitive only has one function.
#define IMPL_COLL_ALLTOALL_PIVOT(func) \
#define IMPL_COLL_F(func) \
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t);
#endif
+5 -6
Ver fichero
@@ -23,7 +23,7 @@ __device__ struct ncclShmemData* ncclShmem;
NCCL_FUNC5(func, RING, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET, devredop, type, nullify)
#if defined(RCCL_BFLOAT16)
#if defined(__CUDA_BF16_TYPES_EXIST__)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
@@ -35,7 +35,7 @@ __device__ struct ncclShmemData* ncclShmem;
NCCL_FUNC4(func, devredop, half, nullForFloat), \
NCCL_FUNC4(func, devredop, float, nullForFloat), \
NCCL_FUNC4(func, devredop, double, nullForFloat), \
NCCL_FUNC4(func, devredop, rccl_bfloat16, nullForFloat)
NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat)
#define NCCL_FUNCS3B(func, devredop) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
@@ -89,13 +89,12 @@ __device__ struct ncclShmemData* ncclShmem;
NCCL_FUNCS3B(func, Sum)
// Must be consistent with the ncclFuncSet enum
__device__ ncclKern_t ncclFuncs[2+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
__device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if __CUDA_ARCH__
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
NCCL_FUNC_NAME(AllToAllPivot, RING, SIMPLE, Sum, int8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t),
@@ -105,8 +104,8 @@ __device__ ncclKern_t ncclFuncs[2+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedO
NCCL_ONERANK_REDUCE_NAME(PreMulSum, half),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, float),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, double),
#if defined(RCCL_BFLOAT16)
NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16),
#if defined(__CUDA_BF16_TYPES_EXIST__)
NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16),
#endif
NCCL_FUNCS2B(Broadcast),
NCCL_FUNCS2A(Reduce),
+1 -1
Ver fichero
@@ -17,7 +17,7 @@ namespace {
int tid = threadIdx.x;
int tn = blockDim.x;
#pragma unroll 1
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) {
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) {
ncclWorkElem *we = &w->elems[e];
intptr_t eltN = we->count;
int bid = we->bid;
-4
Ver fichero
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,8 +7,6 @@
#ifndef OP128_H_
#define OP128_H_
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
: "=l"(v0), "=l"(v1) : "l"(ptr));
@@ -67,6 +64,5 @@ inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1
v0 = tmp8[0];
v1 = tmp8[1];
}
#endif
#endif
-1
Ver fichero
@@ -155,5 +155,4 @@ struct PrimitivesWithoutDirect {
#include "prims_simple.h"
#include "prims_ll.h"
#include "prims_ll128.h"
#endif
+4 -4
Ver fichero
@@ -183,7 +183,7 @@ private:
template<int BeginIx>
__device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) {
#pragma unroll
#pragma unroll 1
for (int i=BeginIx; i < MaxRecv; i++) {
if (i < fan.nrecv()) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
@@ -412,7 +412,7 @@ private:
}
if (RECV) {
data = !SRC ? peerData : MULTI<RedOp,T>()(redOp, peerData, data);
#pragma unroll MaxRecv
#pragma unroll 1
for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
peerData = readLLFinish(offset, line, i);
data = MULTI<RedOp,T>()(redOp, peerData, data);
@@ -502,11 +502,11 @@ private:
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv);
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend);
loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
+14 -4
Ver fichero
@@ -5,11 +5,12 @@
* See LICENSE.txt for license information
************************************************************************/
#include "op128.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> {
@@ -53,6 +54,15 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
uint64_t* barriers;
uint64_t* barrier_next;
#if defined(ENABLE_NPKIT)
public:
int npKitCtxIdx = 0;
uint64_t npKitDataProcessEntryTime = 0;
uint64_t npKitDataProcessExitTime = 0;
uint64_t npKitDataProcessTotalTime = 0;
private:
#endif
inline __device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (nthreads != WARP_SIZE)
@@ -405,11 +415,11 @@ public:
auto *channel = &ncclShmem->channel;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->devPeers[recvPeers[nrecv]].recv->conn, nrecv);
loadRecvConn(&channel->peers[recvPeers[nrecv]].recv[0], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->devPeers[sendPeers[nsend]].send->conn, nsend);
loadSendConn(&channel->peers[sendPeers[nsend]].send[0], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
+9 -10
Ver fichero
@@ -50,7 +50,6 @@ class Primitives<
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
uint64_t* barriers;
uint64_t* barrier_next;
const uint64_t opCount;
uint32_t* next_hdp_reg;
#if defined(ENABLE_NPKIT)
@@ -377,6 +376,7 @@ private:
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
subBarrier();
#pragma unroll 1
// Loop over peers
for (int j=0; j<fan.nsend(); j++) {
int i = (j+shift)%fan.nsend();
int peerOffset = i*peerElem;
@@ -423,9 +423,9 @@ private:
}
}
__device__ __forceinline__ void loadRecvConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) {
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
if (flags & (RoleWaitRecv|RolePostRecv)) {
auto *conn = &peer->recv[connIndex].conn;
auto *conn = &peer->recv[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostRecv) {
@@ -463,14 +463,14 @@ private:
}
}
__device__ __forceinline__ void loadSendConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) {
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
if (flags & (RoleWaitSend|RolePostSend)) {
auto *conn = &peer->send[connIndex].conn;
auto *conn = &peer->send[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostSend) {
connStepPtr = conn->tail;
next_hdp_reg = conn->next_hdp_reg;
next_hdp_reg = conn->next_hdp_reg;
}
if (flags & RoleWaitSend) {
ncclShmem->groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
@@ -513,8 +513,7 @@ private:
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr
):
tid(tid),
stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)),
opCount(ncclShmem->work.elems[0].opCount) {
stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
// For send operations, we need an extra warp to overlap the threadfence and the copy
this->nthreads = nthreads;
@@ -552,8 +551,8 @@ private:
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
loadRecvConn(&ncclShmem->channel.devPeers[peer], connIndex, e);
loadSendConn(&ncclShmem->channel.devPeers[peer], connIndex, e);
loadRecvConn(&ncclShmem->channel.peers[peer], connIndex, e);
loadSendConn(&ncclShmem->channel.peers[peer], connIndex, e);
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
}
+2 -2
Ver fichero
@@ -13,7 +13,7 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem->channel.ring;
@@ -23,7 +23,7 @@ namespace {
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
const int rank = ncclShmem->comm.rank;
const int prevRank = ring->devUserRanks[nranks-1];
const int prevRank = ring->userRanks[nranks-1];
const int root = args->root;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+2 -2
Ver fichero
@@ -13,11 +13,11 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->header.nWarps*WARP_SIZE;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem->channel.ring;
int const *ringRanks = ring->devUserRanks;
int const *ringRanks = ring->userRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
+19 -16
Ver fichero
@@ -15,6 +15,8 @@
template<typename T, typename RedOp>
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
size_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
#if defined(ENABLE_NPKIT)
bool isNpKitThread = (tid == 0);
@@ -38,34 +40,35 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
if (args->peer == ncclShmem->comm.rank) {
struct ncclWorkElemP2p* recvArgs = args-1;
if (args->buff != recvArgs->buff) {
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
if (buff != recvBuff) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count);
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&buff, 1, (T**)&recvBuff, count);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, args->count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, __builtin_amdgcn_s_memrealtime(),
ncclShmem->comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
@@ -73,11 +76,10 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
}
} else {
using Proto = ProtoSimple<1, 1>;
ssize_t const count = args->count;
int const chunkSize = args->chunkSize/sizeof(T);
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
(tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group);
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group);
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
@@ -93,9 +95,9 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
}
#endif
ssize_t offset = 0;
size_t offset = 0;
do {
int nelem = min(chunkSize, count-offset);
int nelem = min(size_t(chunkSize), count-offset);
prims.directSend(offset, offset, nelem);
offset += nelem;
} while(offset < count);
@@ -133,11 +135,12 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
if (args->peer != ncclShmem->comm.rank) {
using Proto = ProtoSimple<1, 1>;
ssize_t const count = args->count;
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
int const chunkSize = args->chunkSize/sizeof(T);
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
(tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group);
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group);
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
@@ -153,9 +156,9 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
}
#endif
ssize_t offset = 0;
size_t offset = 0;
do {
int nelem = min(chunkSize, count-offset);
int nelem = min(size_t(chunkSize), count-offset);
prims.directRecv(offset, nelem);
offset += nelem;
} while(offset < count);
@@ -182,11 +185,11 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
#define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
args += group;
if (args->header.type == ncclWorkTypeUnused) return;
tid -= args->warpStart * WARP_SIZE;
int nthreads = args->nWarps * WARP_SIZE;
group |= (args->connIndex<<16); // Used to select connIndex 1
if (args->p2pType == ncclWorkP2pTypeUnused) return;
if (tid >= nthreads || args->peer == -1) return;
if ((group%2) == 0) {
runRecv(tid, nthreads, group, args);
+59 -39
Ver fichero
@@ -9,29 +9,37 @@
#include "nccl_net.h"
#include <stdlib.h>
#include <stdarg.h>
#include <sys/syscall.h>
int ncclDebugLevel = -1;
static int pid = -1;
static char hostname[1024];
thread_local int ncclDebugNoWarn = 0;
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
FILE *ncclDebugFile = stdout;
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
std::chrono::steady_clock::time_point ncclEpoch;
static __thread int tid = -1;
void ncclDebugInit() {
pthread_mutex_lock(&ncclDebugLock);
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
const char* nccl_debug = getenv("NCCL_DEBUG");
int tempNcclDebugLevel = -1;
if (nccl_debug == NULL) {
ncclDebugLevel = NCCL_LOG_NONE;
tempNcclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
ncclDebugLevel = NCCL_LOG_VERSION;
tempNcclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
ncclDebugLevel = NCCL_LOG_WARN;
tempNcclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
ncclDebugLevel = NCCL_LOG_INFO;
tempNcclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
ncclDebugLevel = NCCL_LOG_ABORT;
tempNcclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
ncclDebugLevel = NCCL_LOG_TRACE;
tempNcclDebugLevel = NCCL_LOG_TRACE;
}
/* Parse the NCCL_DEBUG_SUBSYS env var
@@ -65,6 +73,8 @@ void ncclDebugInit() {
mask = NCCL_ENV;
} else if (strcasecmp(subsys, "ALLOC") == 0) {
mask = NCCL_ALLOC;
} else if (strcasecmp(subsys, "CALL") == 0) {
mask = NCCL_CALL;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL;
}
@@ -76,12 +86,16 @@ void ncclDebugInit() {
free(ncclDebugSubsys);
}
// Cache pid and hostname
getHostName(hostname, 1024, '.');
pid = getpid();
/* Parse and expand the NCCL_DEBUG_FILE path and
* then create the debug file. But don't bother unless the
* NCCL_DEBUG level is > VERSION
*/
const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
int c = 0;
char debugFn[PATH_MAX+1] = "";
char *dfn = debugFn;
@@ -95,12 +109,10 @@ void ncclDebugInit() {
*dfn++ = '%';
break;
case 'h': // %h = hostname
char hostname[1024];
getHostName(hostname, 1024, '.');
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
break;
default: // Echo everything we don't understand
*dfn++ = '%';
@@ -111,15 +123,15 @@ void ncclDebugInit() {
*dfn = '\0';
if (debugFn[0] != '\0') {
FILE *file = fopen(debugFn, "w");
if (file != NULL) {
if (file != nullptr) {
setbuf(file, nullptr); // disable buffering
ncclDebugFile = file;
}
}
}
#ifdef ENABLE_TRACE
ncclEpoch = std::chrono::high_resolution_clock::now();
#endif
ncclEpoch = std::chrono::steady_clock::now();
__atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
pthread_mutex_unlock(&ncclDebugLock);
}
@@ -128,45 +140,53 @@ void ncclDebugInit() {
* they can share the debugging mechanisms and output files
*/
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (ncclDebugLevel == -1) ncclDebugInit();
if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit();
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
// Save the last error (WARN) as a human readable string
if (level == NCCL_LOG_WARN) {
pthread_mutex_lock(&ncclDebugLock);
va_list vargs;
va_start(vargs, fmt);
(void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs);
va_end(vargs);
pthread_mutex_unlock(&ncclDebugLock);
}
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
// Gather the rank information. This can take > 1us so we want to make sure
// we only do it when needed.
char hostname[1024];
getHostName(hostname, 1024, '.');
if (tid == -1) {
tid = syscall(SYS_gettid);
}
int cudaDev;
hipGetDevice(&cudaDev);
int pid = getpid();
int tid = syscall(SYS_gettid);
if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
hipGetDevice(&cudaDev);
}
char buffer[1024];
size_t len = 0;
pthread_mutex_lock(&ncclDebugLock);
if (level == NCCL_LOG_WARN)
len = snprintf(buffer, sizeof(buffer),
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
else if (level == NCCL_LOG_INFO)
len = snprintf(buffer, sizeof(buffer),
"%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
#ifdef ENABLE_TRACE
else if (level == NCCL_LOG_TRACE) {
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
if (level == NCCL_LOG_WARN) {
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
hostname, pid, tid, cudaDev, filefunc, line);
} else if (level == NCCL_LOG_INFO) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
} else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid);
} else if (level == NCCL_LOG_TRACE) {
auto delta = std::chrono::steady_clock::now() - ncclEpoch;
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
len = snprintf(buffer, sizeof(buffer),
"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line);
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ",
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
}
#endif
if (len) {
va_list vargs;
va_start(vargs, fmt);
(void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
va_end(vargs);
fprintf(ncclDebugFile,"%s\n", buffer);
fflush(ncclDebugFile);
buffer[len++] = '\n';
fwrite(buffer, 1, len, ncclDebugFile);
}
pthread_mutex_unlock(&ncclDebugLock);
}
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
+1027 -942
Ver fichero
La diferencia del archivo ha sido suprimido porque es demasiado grande Cargar Diff
+12 -13
Ver fichero
@@ -448,10 +448,10 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
// remote proxies without risking deadlocks
int ncclPxnDisable() {
int ncclPxnDisable(struct ncclComm* comm) {
static int pxnDisable = -1;
if (pxnDisable == -1) {
if (ncclNetVersion() == 4) {
if (comm && ncclNetVersion(comm) == 4) {
INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
pxnDisable = 1;
} else {
@@ -490,7 +490,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
return ncclSuccess;
}
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) {
// Precompute paths between GPUs/NICs.
// Remove everything in case we're re-computing
@@ -518,16 +518,16 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
}
}
if (peerInfos == NULL) continue;
if (comm == NULL) continue;
// Remove GPUs we can't talk to because of containers.
struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank[0];
struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank[0];
for (int p=0; p<system->nodes[GPU].count; p++) {
if (p == g) continue;
struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank[0];
struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank[0];
int shm;
NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
int p2p;
NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
if (shm == 0 && p2p == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
@@ -543,7 +543,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
for (int g=0; g<system->nodes[GPU].count; g++) {
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
if (ncclPxnDisable(comm) != 1 && gpu->paths[NET][n].type > PATH_PXB) {
int pxnGpu = -1;
for (int p=0; p<system->nodes[GPU].count; p++) {
@@ -556,7 +556,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
pxnGpu = p;
int netDev;
NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank[0], &netDev));
// To ensure proper balancing, use preferably a local GPU which advertised that NIC as its preferred one.
if (netDev == netNode->id) break;
@@ -602,8 +601,8 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
}
for (int j=0; j<gpu->gpu.nRanksPerGpu; j++ ) {
if (gpu->gpu.rank[j] == comm->rank) {
myDomain = domains[g];
break;
myDomain = domains[g];
break;
}
}
}
@@ -768,7 +767,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
// We want to spread channels used when there aren't many and progressively
// fill the whole space of nChannels. To do so we mirror the bits in the
// nChannels space.
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
for (int c=0; c<comm->p2pnChannels; c++) {
int mirror = 0;
for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
comm->p2pChannels[c] = mirror;
+8 -4
Ver fichero
@@ -275,8 +275,8 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
for (int i=0; i<ngpus; i++) {
for (int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
if (system->nodes[GPU].nodes[i].gpu.rank[j] == nextRank) {
*g = i;
return ncclSuccess;
*g = i;
return ncclSuccess;
}
}
}
@@ -1103,10 +1103,14 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG
NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev));
*proxyRank = rank;
int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel();
int pxnLevel = ncclPxnDisable(comm) == 1 ? 0 : ncclParamP2pPxnLevel();
// See whether we can use the remote rank preferred device.
if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) {
int netDev = comm->peerInfo[peerRank].netDev;
// Find local NIC number close to local cudaDev
int cudaDev = comm->peerInfo[peerRank].cudaDev;
int localRank;
if (ncclTopoDevToRank(comm->topo, cudaDev, &localRank) != ncclSuccess) return ncclSuccess;
int netDev = comm->peerInfo[localRank].netDev;
int n;
// Check that device exists on our node
if (ncclParamCrossNic() == 0) {
+13 -9
Ver fichero
@@ -724,11 +724,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
int netDevCount = 0;
if (collNetSupport()) {
NCCLCHECK(collNetDevices(&netDevCount));
if (collNetSupport(comm)) {
NCCLCHECK(collNetDevices(comm, &netDevCount));
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(n, &props));
NCCLCHECK(collNetGetProperties(comm, n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@@ -737,16 +737,18 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
}
}
if (netDevCount == 0) {
NCCLCHECK(ncclNetDevices(&netDevCount));
NCCLCHECK(ncclNetDevices(comm, &netDevCount));
}
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(n, &props));
NCCLCHECK(ncclNetGetProperties(comm, n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
@@ -756,7 +758,9 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
}
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
@@ -903,8 +907,8 @@ ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int*
for (int g=0; g<system->nodes[GPU].count; g++) {
for ( int j=0; j<system->nodes[GPU].nodes[g].gpu.nRanksPerGpu; j++ ){
if (system->nodes[GPU].nodes[g].gpu.rank[j] == rank) {
*localRank = g;
return ncclSuccess;
*localRank = g;
return ncclSuccess;
}
}
}
+14 -3
Ver fichero
@@ -198,20 +198,31 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank,
for (int i=0; i<system->nodes[GPU].count; i++) {
for (int j=0; j<system->nodes[GPU].nodes[i].gpu.nRanksPerGpu; j++ ) {
if (system->nodes[GPU].nodes[i].gpu.rank[j] == rank) {
*index = i;
return ncclSuccess;
*index = i;
return ncclSuccess;
}
}
}
return ncclInternalError;
}
// Finds the GPU node whose `gpu.dev` equals `dev` and stores the first rank
// recorded for that GPU (`gpu.rank[0]`) in `*rank`.
//
// `*rank` is set to -1 before the search, so it is still -1 when no GPU
// matches. Returns ncclSuccess on a match and ncclInternalError otherwise.
static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, int* rank) {
  *rank = -1;
  int g = 0;
  // Advance until we run out of GPU nodes or land on the requested device.
  while (g < system->nodes[GPU].count && system->nodes[GPU].nodes[g].gpu.dev != dev) {
    g++;
  }
  if (g >= system->nodes[GPU].count) {
    return ncclInternalError;
  }
  *rank = system->nodes[GPU].nodes[g].gpu.rank[0];
  return ncclSuccess;
}
// Returns XGMI speed in GB/s: MI200_XGMI_WIDTH when `gcn` is 910 and
// VEGA_XGMI_WIDTH for every other value.
static float ncclTopoXGMISpeed(int gcn) {
  if (gcn == 910) {
    return MI200_XGMI_WIDTH;
  }
  return VEGA_XGMI_WIDTH;
}
#define ncclGetKernelIndex(p_comm) \
(((p_comm)->topo->ll128Enabled ? 1 : 0)*2 + ((p_comm)->hostDevComm.collTraceThread ? 1 : 0))
(((p_comm)->topo->ll128Enabled ? 1 : 0)*2 + ((p_comm)->collTraceThread ? 1 : 0))
#endif
+3 -3
Ver fichero
@@ -235,11 +235,11 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
#else
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
+212 -396
Ver fichero
@@ -11,446 +11,262 @@
#include "transport.h"
#include "channel.h"
#define MAX_ASYNC_OPS 128
thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
thread_local int ncclGroupIndex = 0;
thread_local int ncclGroupMode = 0;
thread_local ncclResult_t ncclGroupError = ncclSuccess;
extern struct allocationTracker allocTracker[];
__thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting
__thread ncclResult_t ncclGroupError = ncclSuccess;
__thread struct ncclComm* ncclGroupCommHead = nullptr;
__thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr;
__thread struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> ncclAsyncJobs;
// Tells whether the calling thread is currently inside a group, i.e. whether
// its thread-local `ncclGroupMode` nesting counter is positive.
bool ncclAsyncMode() {
  const bool insideGroup = (ncclGroupMode > 0);
  return insideGroup;
}
// Folds `ret` into the thread-local `ncclGroupError` and returns `ret`
// unchanged so the call can wrap a return expression.
//
// The recorded error is overwritten unless an error is already recorded and
// `ret` is ncclSuccess; in other words a success never clears an earlier
// failure, while a new failure always replaces the recorded one.
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret) {
  const bool keepRecordedError = (ncclGroupError != ncclSuccess) && (ret == ncclSuccess);
  if (!keepRecordedError) {
    ncclGroupError = ret;
  }
  return ret;
}
// Saved arguments of one deferred communicator-init request.
// ncclAsyncInit() fills these fields in and ncclAsyncThreadMain() later calls
// `func(newcomm, ndev, commId, myrank, cudaDev, virtualId)` with them.
struct ncclInitArgs {
ncclInitFunc_t func; // init routine invoked by ncclAsyncThreadMain()
int cudaDev;
ncclComm_t* newcomm; // output slot handed to `func`
int ndev;
ncclUniqueId commId; // stored by value (memcpy in ncclAsyncInit())
int myrank;
int virtualId;
};
// Per-communicator record for collective/p2p work registered inside a group.
struct ncclCollArgs {
ncclComm_t comm; // communicator registered through ncclAsyncColl()
uint16_t connIndex; // connection index that ncclAsyncThreadPreconnect() passes to ncclTransportP2pSetup()
};
// Discriminator telling which member of the union in struct ncclAsyncArgs is
// in use.
enum ncclAsyncFuncType {
ASYNC_FUNC_INVALID = 0, // value found in a zero-filled slot
ASYNC_FUNC_INIT = 1, // slot filled by ncclAsyncInit(); use `init`
ASYNC_FUNC_COLL = 2, // slot filled by ncclAsyncColl(); use `coll`
};
// One entry of the thread-local ncclGroupArgs array: a tagged union holding
// either a deferred init request or a collective/p2p record.
struct ncclAsyncArgs {
ncclResult_t ret; // result of the deferred work; read after the worker thread is joined
enum ncclAsyncFuncType funcType; // selects the active union member below
union {
ncclCollArgs coll; // valid when funcType == ASYNC_FUNC_COLL
ncclInitArgs init; // valid when funcType == ASYNC_FUNC_INIT
};
};
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
// pthread entry point for a deferred communicator initialisation.
//
// `args_` points at a struct ncclAsyncArgs whose `init` member was filled in
// by ncclAsyncInit(). The saved init routine is invoked with the saved
// arguments and the same pointer is handed back as the thread result.
//
// NOTE(review): the local must stay named `args` -- NCCLCHECKTHREAD is a
// project macro defined elsewhere and appears to reference it; confirm before
// renaming.
void* ncclAsyncThreadMain(void* args_) {
  struct ncclAsyncArgs* args = static_cast<struct ncclAsyncArgs*>(args_);
  NCCLCHECKTHREAD(args->init.func(
      args->init.newcomm,
      args->init.ndev,
      args->init.commId,
      args->init.myrank,
      args->init.cudaDev,
      args->init.virtualId));
  return args;
}
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev, int virtualId) {
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
return ncclAsyncErrCheck(ncclInvalidUsage);
ncclResult_t ncclAsyncLaunch(
struct ncclAsyncJob* job,
ncclResult_t(*func)(struct ncclAsyncJob*),
void(*undo)(struct ncclAsyncJob*),
void(*destructor)(void*)
) {
if (0 == ncclGroupDepth) {
ncclResult_t res = func(job);
if (res != ncclSuccess && undo) undo(job);
if (destructor) destructor(job);
return res;
} else {
job->func = func;
job->undo = undo;
job->destructor = destructor;
ncclIntruQueueEnqueue(&ncclAsyncJobs, job);
return ncclSuccess;
}
int index = ncclGroupIndex++;
struct ncclAsyncArgs* args = ncclGroupArgs+index;
args->funcType = ASYNC_FUNC_INIT;
args->init.func = func;
args->init.cudaDev = cudaDev;
args->init.newcomm = newcomm;
args->init.ndev = ndev;
memcpy(&args->init.commId, &commId, sizeof(commId));
args->init.myrank = myrank;
args->init.virtualId = virtualId;
return ncclSuccess;
}
ncclResult_t ncclAsyncColl(ncclComm_t comm) {
struct ncclAsyncArgs* args = ncclGroupArgs;
for (int i=0; i<ncclGroupIndex; i++) {
if (args->coll.comm == comm) return ncclSuccess;
args++;
void* ncclAsyncJobMain(void* arg) {
struct ncclAsyncJob* job = (struct ncclAsyncJob*)arg;
job->result = job->func(job);
if (job->result != ncclSuccess) {
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, job->result);
}
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
return ncclAsyncErrCheck(ncclInvalidUsage);
}
ncclGroupIndex++;
args->funcType = ASYNC_FUNC_COLL;
args->coll.comm = comm;
return ncclSuccess;
return arg;
}
NCCL_API(ncclResult_t, ncclGroupStart);
ncclResult_t ncclGroupStart() {
NVTX3_FUNC_RANGE_IN(nccl_domain);
if (ncclGroupMode == 0) {
memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
}
ncclGroupMode++;
NCCLCHECK(ncclGroupStartInternal());
TRACE_CALL("ncclGroupStart()");
return ncclSuccess;
}
// Describes one Send of `count` ncclInt8 elements from `buff` to `peer` on
// `comm->userStream`, tags it with the channel, op count and connection index
// supplied by the caller, and passes it to ncclSetupP2pKernel().
//
// Returns ncclSuccess, or whatever error NCCLCHECK propagates from
// ncclSetupP2pKernel().
static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
  // Positional initialiser: the field order is dictated by struct ncclInfo,
  // which is declared outside this section of the file.
  struct ncclInfo info = {
    ncclFuncSend,
    "Send",
    NULL,
    buff,
    count,
    ncclInt8,
    ncclSum,
    peer,
    comm,
    comm->userStream,
    /* Args */
    1,
    1
  };
  info.connIndex = connIndex;
  info.opCount = opCount;
  info.channelId = channelId;
  NCCLCHECK(ncclSetupP2pKernel(&info));
  return ncclSuccess;
}
// Describes one Recv of `count` ncclInt8 elements into `buff` from `peer` on
// `comm->userStream`, tags it with the channel, op count and connection index
// supplied by the caller, and passes it to ncclSetupP2pKernel().
//
// Returns ncclSuccess, or whatever error NCCLCHECK propagates from
// ncclSetupP2pKernel().
static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff, uint64_t opCount, uint16_t connIndex) {
  // Positional initialiser: the field order is dictated by struct ncclInfo,
  // which is declared outside this section of the file.
  struct ncclInfo info = {
    ncclFuncRecv,
    "Recv",
    NULL,
    buff,
    count,
    ncclInt8,
    ncclSum,
    peer,
    comm,
    comm->userStream,
    /* Args */
    1,
    1
  };
  info.connIndex = connIndex;
  info.opCount = opCount;
  info.channelId = channelId;
  NCCLCHECK(ncclSetupP2pKernel(&info));
  return ncclSuccess;
}
// pthread entry point that runs P2P transport setup for one communicator.
//
// `args_` points at a struct ncclAsyncArgs whose `coll` member names the
// communicator and the connection index to set up. The thread selects the
// communicator's device, applies its CPU affinity mask when that mask is
// non-empty, runs ncclTransportP2pSetup() and returns the same pointer.
//
// NOTE(review): the local must stay named `args` -- CUDACHECKTHREAD and
// NCCLCHECKTHREAD are project macros defined elsewhere and appear to
// reference it; confirm before renaming.
void* ncclAsyncThreadPreconnect(void* args_) {
  struct ncclAsyncArgs* args = static_cast<struct ncclAsyncArgs*>(args_);
  struct ncclComm* comm = args->coll.comm;
  CUDACHECKTHREAD(hipSetDevice(comm->cudaDev));
  if (CPU_COUNT(&comm->cpuAffinity) != 0) {
    // Result deliberately ignored, as in the original code.
    sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
  }
  NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, args->coll.connIndex));
  return args;
}
// Picks the per-channel chunk size for a p2p transfer of `totalSize` bytes.
//
// Starts from `minChannels` channels with a chunk of at least `minSize`, then
// keeps doubling the channel count while the chunk is still larger than
// `maxSize` and the count has not passed `maxChannels/2`. The final chunk is
// adjusted with ALIGN_SIZE against `minSize`.
//
// NOTE(review): DIVUP and ALIGN_SIZE are project macros defined elsewhere;
// presumably ceiling division and round-up-to-multiple -- confirm.
static size_t getP2pChunkSize(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) {
  size_t chunk = std::max(minSize, DIVUP(totalSize, minChannels));
  for (int channels = minChannels; chunk > maxSize && channels <= maxChannels/2; ) {
    channels *= 2;
    chunk = DIVUP(totalSize, channels);
  }
  ALIGN_SIZE(chunk, minSize);
  return chunk;
}
RCCL_PARAM(P2pNetThreshold, "P2P_NET_THRESHOLD", 131072);
NCCL_API(ncclResult_t, ncclGroupEnd);
ncclResult_t ncclGroupEnd() {
NVTX3_FUNC_RANGE_IN(nccl_domain);
if (ncclGroupMode == 0) {
NCCLCHECK(ncclGroupEndInternal());
TRACE_CALL("ncclGroupEnd()");
return ncclSuccess;
}
// Async job that runs P2P preconnect for a single communicator.
// `base` must remain the first member: ncclPreconnectFunc() receives a
// `struct ncclAsyncJob*` and casts it straight back to this type.
struct ncclPreconnectJob {
struct ncclAsyncJob base; // generic job header (func/undo/destructor are set by the code that enqueues the job)
struct ncclComm* comm; // communicator whose connections are set up
};
// Job body for a struct ncclPreconnectJob.
//
// Selects the communicator's device, applies its CPU affinity mask when that
// mask is non-empty, then runs ncclTransportP2pSetup() for connection index 1
// and, when `comm->p2pNet` is set, again for NCCL_CONN_IDX_P2P_NET.
//
// `job_` must point at the `base` member of a struct ncclPreconnectJob.
// Returns ncclSuccess, or the first error propagated by CUDACHECK/NCCLCHECK.
ncclResult_t ncclPreconnectFunc(struct ncclAsyncJob* job_) {
  struct ncclComm* const comm = reinterpret_cast<struct ncclPreconnectJob*>(job_)->comm;
  CUDACHECK(hipSetDevice(comm->cudaDev));
  if (CPU_COUNT(&comm->cpuAffinity) != 0) {
    // Result deliberately ignored, as in the original code.
    sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
  }
  NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
  if (!comm->p2pNet) {
    return ncclSuccess;
  }
  NCCLCHECK(ncclTransportP2pSetup(comm, NULL, NCCL_CONN_IDX_P2P_NET));
  return ncclSuccess;
}
// Launches the pending kernel plans of every communicator on the `groupNext`
// list that starts at `head`.
//
// The list is processed clique by clique, a clique being a run of consecutive
// communicators that share the same `intraComm0`. For each clique:
//   1. every member is prepared with ncclLaunchPrepare() (and, in group
//      launch mode, enters the intra-process barrier);
//   2. the members are visited in rounds, each round launching at most one
//      plan per member, until a round reports that nothing is left; in that
//      final round ncclLaunchFinish() is called for each member.
//
// Returns ncclSuccess, or the first error encountered. On error the function
// jumps to `failure` and returns immediately; it performs no cleanup itself.
static ncclResult_t doLaunches(struct ncclComm* head) {
  ncclResult_t result = ncclSuccess;
  struct ncclComm* cliqueHead = head;
  struct ncclComm* cliqueNextHead;
  bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup;
  // This outer loop iterates over cliques of comms which are siblings of the
  // same global entity. We calculate a clique as all comms which have the same
  // `intraComm0` value.
  do {
    struct ncclComm* comm = cliqueHead;
    bool capturingYes = false, capturingNo = false;
    do {
      (ncclCudaGraphValid(comm->tasks.capturingGraph) ? capturingYes : capturingNo) = true;
      CUDACHECKGOTO(hipSetDevice(comm->cudaDev), result, failure);
      NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure);
      if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
      comm = comm->groupNext;
      // Membership is tested against the head of the clique currently being
      // scanned. Comparing against the first clique's `intraComm0` (taken once
      // from `head`) would split every later clique into single-comm cliques.
    } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
    cliqueNextHead = comm;

    if (capturingYes && capturingNo) {
      // We have entered barriers but are aborting without leaving them. Thus
      // these comms are permanently trashed. We need a good mechanism for
      // tracking and reporting that.
      WARN("Either none or all communicators in a ncclGroup() can be CUDA graph captured.");
      result = ncclInvalidUsage;
      goto failure;
    }

    while (true) { // Iterate rounds of launches for clique.
      bool moreRounds;
      comm = cliqueHead;
      do { // Iterate clique members.
        // Saved up front so the walk does not depend on `comm->groupNext`
        // after the launch calls below.
        struct ncclComm* next = comm->groupNext;
        if (useBarrier) {
          // Barrier reduction result tells us if this was the final round.
          moreRounds = 0 != ncclCommIntraBarrierOut(comm);
        } else {
          moreRounds = comm->unlaunchedPlansHead != nullptr;
        }
        if (moreRounds) {
          // Pop next unlaunched kernel
          struct ncclKernelPlan* plan = comm->unlaunchedPlansHead;
          if (plan != nullptr) {
            comm->unlaunchedPlansHead = plan->next;
            CUDACHECKGOTO(hipSetDevice(comm->cudaDev), result, failure);
            NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure);
            NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
          }
          // Barrier reduction input indicates if we require further rounds.
          if (useBarrier) ncclCommIntraBarrierIn(comm, comm->unlaunchedPlansHead != nullptr ? 1 : 0);
          if (plan != nullptr) {
            NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure);
          }
        } else { // Final round.
          CUDACHECKGOTO(hipSetDevice(comm->cudaDev), result, failure);
          NCCLCHECKGOTO(ncclLaunchFinish(comm), result, failure);
        }
        comm = next;
      } while (comm != cliqueNextHead);
      if (!moreRounds) break;
    }
    cliqueHead = cliqueNextHead;
  } while (cliqueHead != nullptr);
failure:
  return result;
}
ncclResult_t ncclGroupEndInternal() {
if (ncclGroupDepth == 0) {
WARN("ncclGroupEnd: not in a group call.");
return ncclInvalidUsage;
}
ncclGroupMode--;
if (ncclGroupMode > 0) return ncclSuccess;
ncclGroupDepth--;
if (ncclGroupDepth > 0) return ncclSuccess;
int savedDev;
CUDACHECK(hipGetDevice(&savedDev));
int activeThreads = 0;
int doneArray[MAX_ASYNC_OPS];
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
ncclResult_t ret = ncclGroupError;
int usingCudaGraphAll = -1;
hipGraph_t* graphs = NULL;
if (ret != ncclSuccess) goto group_cleanup;
bool jobsDone = false;
if (ret != ncclSuccess) goto failure;
/* Launch async ncclCommInitRank */
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_INIT) {
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
activeThreads++;
doneArray[i] = 0;
}
}
/* For init, since we use threads, we just wait for threads to complete */
while (activeThreads) {
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
if (err == EBUSY) continue;
if (err != 0) ret = ncclSystemError;
if (args->ret != ncclSuccess) ret = args->ret;
doneArray[i] = 1;
activeThreads--;
}
}
if (ncclGroupCommPreconnectHead != nullptr) {
struct ncclComm* comm = ncclGroupCommPreconnectHead;
do {
struct ncclPreconnectJob* job;
NCCLCHECK(ncclCalloc(&job, 1));
job->base.func = ncclPreconnectFunc;
job->base.undo = nullptr;
job->base.destructor = free;
job->comm = comm;
ncclIntruQueueEnqueue(&ncclAsyncJobs, &job->base);
struct ncclComm* next = comm->preconnectNext;
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
comm = next;
} while (comm != nullptr);
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[1]) {
args->coll.connIndex = 1;
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
}
}
if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
struct ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs);
do {
pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job);
job = job->next;
} while (job != nullptr);
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[1]) {
int err = pthread_join(ncclGroupThreads[i], NULL);
job = ncclIntruQueueHead(&ncclAsyncJobs);
do {
int err = pthread_join(job->thread, nullptr);
if (err != 0) {
WARN("Error waiting for pthread_join : %s", strerror(errno));
return ncclSystemError;
ret = ncclSystemError;
}
INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
NCCLCHECKGOTO(args->ret, ret, end);
args->coll.comm->connect[1] = 0;
}
if (ret == ncclSuccess && job->result != ncclSuccess) ret = job->result;
job = job->next;
} while (job != nullptr);
jobsDone = true;
if (ret != ncclSuccess) goto failure;
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
args->coll.connIndex = NCCL_CONN_IDX_P2P_NET;
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
}
if (ncclGroupCommHead != nullptr) {
NCCLCHECKGOTO(doLaunches(ncclGroupCommHead), ret, failure);
do {
struct ncclComm* comm = ncclGroupCommHead;
struct ncclComm* next = comm->groupNext;
ncclGroupCommLeave(comm);
ncclGroupCommHead = next;
} while (ncclGroupCommHead != nullptr);
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET]) {
int err = pthread_join(ncclGroupThreads[i], NULL);
if (err != 0) {
WARN("Error waiting for pthread_join : %s", strerror(errno));
return ncclSystemError;
if (false) {
failure:
struct ncclComm* comm = ncclGroupCommHead;
while (comm != nullptr) {
struct ncclComm* next = comm->groupNext;
ncclGroupCommLeave(comm); // overwrites comm->groupNext
// We don't know if preconnect succeeded or happened at all, so clear
// the flags that let `taskAppend()` skip over checking if preconnect
// is needed.
comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
for (int i=0; i < comm->nRanks; i++) {
comm->tasks.peers[i].sendSeen = false;
comm->tasks.peers[i].recvSeen = false;
comm->connectSend[i] = 0;
comm->connectRecv[i] = 0;
comm->connectSend[i+comm->nRanks*NCCL_CONN_IDX_P2P_NET] = 0;
comm->connectRecv[i+comm->nRanks*NCCL_CONN_IDX_P2P_NET] = 0;
}
INFO(NCCL_INIT, "comm %p rank %d total %ld bytes - P2P NET preconnect COMPLETE", args->coll.comm, args->coll.comm->rank, allocTracker[args->coll.comm->cudaDev].totalAllocSize);
NCCLCHECKGOTO(args->ret, ret, end);
args->coll.comm->connect[NCCL_CONN_IDX_P2P_NET] = 0;
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
struct ncclComm* comm = args->coll.comm;
int node = comm->node;
int nNodes = comm->nNodes;
int localRank = comm->localRank;
// Compute how much to split operations
// Natural step size matching buffer steps.
ssize_t stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
// Try to use all channels
int nChannelsMax = comm->p2pnChannelsPerPeer;
int nChannelsMin = nChannelsMax;
// Try to use all channels, but one channel per operation.
//while (nChannelsMin*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels) && nChannelsMin > 1) nChannelsMin /= 2;
// Avoid overloading channels with 8+ operations as we lose the sync warp, hence a bit of bandwidth.
//while (nChannelsMax*comm->nRanks > std::max(comm->nChannels, comm->p2pnChannels)*4 && nChannelsMax > 1) nChannelsMax /= 2;
while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
// schedule delta 0, +1, -1, +2, -2, ...
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
for (int d=0; d<=nNodes/4; d++) {
int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
int index = 0;
int delta = deltas[index];
sched_delta:
uint32_t recvNode = (node+nNodes-delta)%nNodes;
uint32_t sendNode = (node+delta)%nNodes;
int steps = comm->maxLocalRanks;
for (int s=0; s<steps; s++) {
int recvIndex = (localRank-s+steps)%steps;
int recvPeer = recvIndex<comm->nodeRanks[recvNode].localRanks ? comm->nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
int sendIndex = (localRank+s)%steps;
int sendPeer = sendIndex<comm->nodeRanks[sendNode].localRanks ? comm->nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
struct ncclP2Pinfo* recv = recvPeer != -1 && comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL;
struct ncclP2Pinfo* send = sendPeer != -1 && comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL;
if (recv != NULL || send != NULL) {
ssize_t totRecvBytes = -1, totSendBytes = -1;
if (recv != NULL) totRecvBytes = recv->nbytes;
if (send != NULL) totSendBytes = send->nbytes;
if (recv) comm->p2pRecvCount--;
if (send) comm->p2pSendCount--;
if (recvPeer == comm->rank) { // Check self send/recv
if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; }
if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; }
if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; }
}
void* recvBuff = recv ? recv->buff : NULL;
void* sendBuff = send ? send->buff : NULL;
// After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL.
if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle();
if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle();
ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
uint16_t sendIdx = 1, recvIdx = 1;
if(comm->p2pNet && totSendBytes > rcclParamP2pNetThreshold())
sendIdx = NCCL_CONN_IDX_P2P_NET;
if(comm->p2pNet && totRecvBytes > rcclParamP2pNetThreshold())
recvIdx = NCCL_CONN_IDX_P2P_NET;
ssize_t sendOffset = 0;
ssize_t recvOffset = 0;
int sendRemaining = 1, recvRemaining = 1;
int chunk = 0;
do {
int channelId;
// Shuffle channels with s intra-node, and delta inter-node. Inter-node, make sure
// to use multiple channels to guarantee progress on all ranks from the same node.
ssize_t recvbytes = totRecvBytes-recvOffset;
ssize_t sendbytes = totSendBytes-sendOffset;
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
// 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
// (total size == 0), otherwise set size to -1.
if (sendbytes < 0 || (sendbytes == 0 && totSendBytes != 0)) send = NULL;
if (recvbytes < 0 || (recvbytes == 0 && totRecvBytes != 0)) recv = NULL;
if (send || recv) {
if (recv) {
NCCLCHECK(ncclChannelCompute(comm, recvPeer, chunk%comm->p2pnChannelsPerPeer, ncclFuncRecv, &channelId));
}
else
recvPeer = -1;
if (send) {
NCCLCHECK(ncclChannelCompute(comm, sendPeer, chunk%comm->p2pnChannelsPerPeer, ncclFuncSend, &channelId));
}
else
sendPeer = -1;
NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, recv ? ((char*)recvBuff)+recvOffset : NULL, recv ? recv->opCount : 0, recvIdx), ret, group_cleanup);
NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, send ? ((char*)sendBuff)+sendOffset : NULL, send ? send->opCount : 0, sendIdx), ret, group_cleanup);
}
recvOffset += recvChunkSize;
sendOffset += sendChunkSize;
chunk++;
} while (sendRemaining || recvRemaining);
comm->unlaunchedPlansHead = nullptr;
// Reclaim abandoned kernel plan memory. Note ncclWork structs were already
// reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`.
while (!ncclIntruQueueEmpty(&comm->planQueue)) {
struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planQueue);
// Persistent plans will be reclaimed via the callbackQueue when the
// graph drops its UserObject reference.
if (!plan->persistent) {
for (int c=0; c < MAXCHANNELS; c++) {
while (!ncclIntruQueueEmpty(&plan->channels[c].proxyOpQueue)) {
struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->channels[c].proxyOpQueue);
ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop);
}
}
index++;
if (index == 1 && deltas[1] == deltas[0]) index++;
if (index == 2 && deltas[2] == deltas[0]) index++;
if (index == 3 && deltas[3] == deltas[2]) index++;
if (index == 3 && deltas[3] == deltas[1]) index++;
if (index < 4) {
delta = deltas[index];
goto sched_delta;
}
ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan);
}
}
// Reset comm->tasks to empty.
comm->tasks.nTasksColl = 0;
comm->tasks.nTasksP2p = 0;
comm->tasks.streams = nullptr;
ncclIntruQueueConstruct(&comm->tasks.collQueue);
comm->tasks.collBytesTotal = 0;
for (int i=0; i < comm->nRanks; i++) {
ncclIntruQueueConstruct(&comm->tasks.peers[i].sendQueue);
ncclIntruQueueConstruct(&comm->tasks.peers[i].recvQueue);
}
comm = next;
}
}
/* Collectives are done in three steps :
* 0. Save kernels previously enqueued. Compute channel, algo, proto, etc.
* 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
* 2. Barrier Wait. No CUDA call is permitted
* 3. Enqueue Events. CUDA event wait/enqueue.
* This is needed because step 2 cannot call any CUDA primitive, otherwise if
* cudaFree happens between 1 and 3, it could block that CUDA call and
* prevent some ranks from launching their network threads, which would
* prevent the NCCL call from completing, blocking the cudaFree call.
*/
// Check whether we are in cuda graph mode
NCCLCHECK(ncclCalloc(&graphs, ncclGroupIndex));
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
ncclComm_t comm = args->coll.comm;
NCCLCHECKGOTO(ncclGetCudaGraph(comm, graphs+i), ret, group_cleanup);
if (usingCudaGraphAll == -1) {
usingCudaGraphAll = comm->usingCudaGraph;
} else if (usingCudaGraphAll != comm->usingCudaGraph) {
WARN("Illegal to have some communicators in graph mode while others not");
ret = ncclInvalidUsage;
goto group_cleanup;
}
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
ncclComm_t comm = args->coll.comm;
NCCLCHECKGOTO(ncclSetupAsyncKernels(comm), ret, group_cleanup);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
if (args->coll.comm->userStream == hipStreamDefault/* ||
args->coll.comm->userStream == hipStreamPerThread ||
args->coll.comm->userStream == hipStreamLegacy*/)
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
if (usingCudaGraphAll == 1) {
NCCLCHECKGOTO(ncclCudaGraphHostSetup(args->coll.comm, graphs[i]), ret, end);
} else {
ncclEnqueueHostSetup<0>(args->coll.comm->enqueueInfo);
}
NCCLCHECKGOTO(ncclLaunchBarrier(args->coll.comm), ret, end);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
NCCLCHECKGOTO(ncclLaunchKernel(args->coll.comm), ret, end);
}
}
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
if (args->coll.comm->userStream == hipStreamDefault/* ||
args->coll.comm->userStream == hipStreamPerThread ||
args->coll.comm->userStream == hipStreamLegacy*/)
CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end);
NCCLCHECKGOTO(ncclRecordEvents(args->coll.comm), ret, end);
NCCLCHECKGOTO(ncclLaunchReset(args->coll.comm), ret, end);
}
while (!ncclIntruQueueEmpty(&ncclAsyncJobs)) {
struct ncclAsyncJob* job = ncclIntruQueueDequeue(&ncclAsyncJobs);
if (ret != ncclSuccess && jobsDone && job->undo) job->undo(job);
if (job->destructor) job->destructor((void*)job);
}
goto end;
group_cleanup:
if (ret != ncclSuccess) {
// At least one call in the group failed. Since we want to make that group
// an atomic operation, we need to cancel all operations.
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_INIT) {
if (args->init.newcomm) ncclCommDestroy(*args->init.newcomm);
*args->init.newcomm = NULL;
} else {
struct ncclComm* comm = args->coll.comm;
// Reset aggregation counters
comm->asyncOpCount = 0;
comm->asyncTotalSize = 0;
// Dequeue p2p lists
if (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
for (int peer=0; peer<comm->nRanks; peer++) {
if (comm->p2pSends[peer]) comm->p2pSends[peer]->recycle();
if (comm->p2pRecvs[peer]) comm->p2pRecvs[peer]->recycle();
}
comm->p2pSendCount = comm->p2pRecvCount = 0;
}
ncclLaunchReset(comm);
}
}
}
end:
ncclGroupError = ncclSuccess;
ncclGroupIndex = 0;
ncclGroupCommHead = nullptr;
ncclGroupCommPreconnectHead = nullptr;
CUDACHECK(hipSetDevice(savedDev)); // do other clean-ups first before calling hipSetDevice, because this call can fail too
if (graphs) free(graphs);
return ret;
}
Archivo normal → Archivo ejecutable
Ver fichero
+123 -39
Ver fichero
@@ -11,28 +11,40 @@
#include "nccl.h"
#include "checks.h"
#include "align.h"
#include "utils.h"
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include "rccl_vars.h"
uint64_t clockNano(); // from utils.h with which we have a circular dependency
template <typename T>
static ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
CUDACHECK(hipHostMalloc(ptr, nelem*sizeof(T), hipHostMallocMapped));
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
ncclResult_t result = ncclSuccess;
uint64_t time = 0;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
time = clockNano();
CUDACHECKGOTO(hipHostMalloc(ptr, nelem*sizeof(T), hipHostMallocMapped), result, finish);
time = clockNano() - time;
memset(*ptr, 0, nelem*sizeof(T));
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
return ncclSuccess;
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p seconds: hipHostAlloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
finish:
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
return result;
}
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(hipHostFree(ptr));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
@@ -46,7 +58,7 @@ static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc,
#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
template <typename T>
static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
if (nelem < oldNelem) return ncclInternalError;
if (nelem == oldNelem) return ncclSuccess;
@@ -78,54 +90,126 @@ static_assert(sizeof(struct allocationTracker) == 64, "allocationTracker must be
extern struct allocationTracker allocTracker[];
template <typename T>
static ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
// Need async stream for P2P pre-connect + CUDA Graph
static bool streamCreated = false;
static hipStream_t stream;
if (rcclParamEnableHipGraph() && !streamCreated)
{
// Create stream only once to avoid performance penalty
CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
streamCreated = true;
}
ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
ncclResult_t result = ncclSuccess;
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
*ptr = nullptr;
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
uint64_t time = clockNano();
if (isFineGrain)
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
else
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
time = clockNano() - time;
finish:
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
return result;
}
#define ncclCudaMalloc(...) ncclCudaMallocDebug( __FILE__, __LINE__, __VA_ARGS__)
if (rcclParamEnableHipGraph()) {
CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream));
CUDACHECK(hipStreamSynchronize(stream));
// NOTE: Currently the re-used stream is not destroyed
//CUDACHECK(hipStreamDestroy(stream));
} else {
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
CUDACHECK(hipStreamSynchronize(NULL));
}
INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr);
// Allocates nelem elements of T on the current device and zero-fills them.
//
// The memory comes from hipMalloc, or from hipExtMallocWithFlags with
// hipDeviceMallocFinegrained when isFineGrain is set. The zero-fill runs on a
// temporary non-blocking stream that is synchronized and destroyed before
// returning. The allocation is then counted in allocTracker for the current
// device (only when the device index is below MAX_ALLOC_TRACK_NGPU).
//
// filefunc/line identify the call site for the NCCL_ALLOC log line.
//
// The thread's stream capture mode is exchanged for relaxed mode on entry and
// exchanged back at `finish`; every failure after the first exchange goes
// through `finish` so the mode is always restored and the temporary stream is
// never leaked.
//
// *ptr is reset to nullptr on entry, so it is still null if the allocation
// itself fails. Returns ncclSuccess or the error recorded by the failing check.
template <typename T>
ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
  ncclResult_t result = ncclSuccess;
  uint64_t time0=0, time1=0, time2=0;
  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
  // Need a side stream so as not to interfere with graph capture.
  hipStream_t stream;
  bool streamLive = false;  // true while `stream` exists and still has to be destroyed
  int dev = -1;
  *ptr = nullptr;
  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
  time0 = clockNano();
  CUDACHECKGOTO(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking), result, finish);
  streamLive = true;
  time1 = clockNano();
  if (isFineGrain) {
    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
  } else {
    CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
  }
  time2 = clockNano();
  CUDACHECKGOTO(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
  CUDACHECKGOTO(hipStreamSynchronize(stream), result, finish);
  // Cleared before the call so a failed destroy is reported once and not retried below.
  streamLive = false;
  CUDACHECKGOTO(hipStreamDestroy(stream), result, finish);
  CUDACHECKGOTO(hipGetDevice(&dev), result, finish);
  if (dev < MAX_ALLOC_TRACK_NGPU) {
    __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
    __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_RELAXED);
  }
  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipStreamCreateWithFlags=%g hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time1-time0)/1.e9, double(time2-time1)/1.e9);
finish:
  // Best-effort cleanup on the error path; the original error in `result` is kept.
  if (streamLive) (void)hipStreamDestroy(stream);
  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
  return result;
}
#define ncclCudaCalloc(...) ncclCudaCallocDebug(__FILE__, __LINE__, __VA_ARGS__)
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
return ncclSuccess;
ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, bool isFineGrain = false) {
  // Allocates nelem elements of T (hipMalloc, or hipExtMallocWithFlags with
  // hipDeviceMallocFinegrained when isFineGrain is set) and enqueues a
  // zero-fill on the caller's `stream` without waiting for it. The allocation
  // is counted in allocTracker when the current device index is below
  // MAX_ALLOC_TRACK_NGPU. filefunc/line identify the call site for logging.
  //
  // The thread's stream capture mode is exchanged for relaxed mode on entry;
  // every later failure jumps to `finish` so the mode is exchanged back.
  // *ptr is reset to nullptr on entry, so it is still null if the allocation
  // itself fails. Returns ncclSuccess or the error recorded by the failing check.
  ncclResult_t result = ncclSuccess;
  uint64_t time = 0;
  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
  int dev = -1;
  *ptr = nullptr;
  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
  time = clockNano();
  if (isFineGrain) {
    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
  } else {
    CUDACHECKGOTO(hipMalloc(ptr, nelem*sizeof(T)), result, finish);
  }
  time = clockNano() - time;
  CUDACHECKGOTO(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
  // Must not return directly here: the capture mode still has to be restored.
  CUDACHECKGOTO(hipGetDevice(&dev), result, finish);
  if (dev < MAX_ALLOC_TRACK_NGPU) {
    __atomic_fetch_add(&allocTracker[dev].totalAlloc, 1, __ATOMIC_RELAXED);
    __atomic_fetch_add(&allocTracker[dev].totalAllocSize, nelem*sizeof(T), __ATOMIC_RELAXED);
  }
  INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p seconds: hipMalloc=%g", filefunc, line, nelem*sizeof(T), *ptr, double(time)/1.e9);
finish:
  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
  return result;
}
#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__FILE__, __LINE__, __VA_ARGS__)
// Copies nelem elements of T from src to dst and waits for the copy to finish.
//
// The copy is issued through ncclCudaMemcpyAsync on a temporary non-blocking
// stream, which is synchronized and then destroyed. The thread's stream
// capture mode is exchanged for relaxed mode on entry and exchanged back at
// `finish`. If the copy or the synchronize fails, the temporary stream is
// still destroyed (best effort) so it does not leak.
//
// Returns ncclSuccess or the error recorded by the failing check.
template <typename T>
ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
  ncclResult_t result = ncclSuccess;
  hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
  // Need a side stream so as not to interfere with graph capture.
  hipStream_t stream;
  bool streamLive = false;  // true while `stream` exists and still has to be destroyed
  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
  CUDACHECKGOTO(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking), result, finish);
  streamLive = true;
  NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish);
  CUDACHECKGOTO(hipStreamSynchronize(stream), result, finish);
  // Cleared before the call so a failed destroy is reported once and not retried below.
  streamLive = false;
  CUDACHECKGOTO(hipStreamDestroy(stream), result, finish);
finish:
  // Error path only: release the stream, keeping the original error in `result`.
  if (streamLive) (void)hipStreamDestroy(stream);
  CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
  return result;
}
// Enqueues a copy of nelem elements of T from src to dst on `stream` using
// hipMemcpyDefault; does not wait for completion. The thread's stream capture
// mode is exchanged for relaxed mode around the call and exchanged back
// afterwards, including when the copy call fails.
template <typename T>
ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, hipStream_t stream) {
  hipStreamCaptureMode captureMode = hipStreamCaptureModeRelaxed;
  ncclResult_t status = ncclSuccess;
  const size_t nbytes = nelem*sizeof(T);
  CUDACHECK(hipThreadExchangeStreamCaptureMode(&captureMode));
  CUDACHECKGOTO(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream), status, restore);
restore:
  CUDACHECK(hipThreadExchangeStreamCaptureMode(&captureMode));
  return status;
}
// Releases device memory with hipFree. The thread's stream capture mode is
// exchanged for relaxed mode around the call and exchanged back afterwards,
// including when hipFree fails.
template <typename T>
ncclResult_t ncclCudaFree(T* ptr) {
  hipStreamCaptureMode captureMode = hipStreamCaptureModeRelaxed;
  ncclResult_t status = ncclSuccess;
  CUDACHECK(hipThreadExchangeStreamCaptureMode(&captureMode));
  CUDACHECKGOTO(hipFree(ptr), status, restore);
restore:
  CUDACHECK(hipThreadExchangeStreamCaptureMode(&captureMode));
  return status;
}
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
// allocated on separate pages as those pages will be marked DONTFORK
// and if they are shared, that could cause a crash in a child process
static ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) {
size_t page_size = sysconf(_SC_PAGESIZE);
void* p;
int size_aligned = ROUNDUP(size, page_size);
-1
Ver fichero
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
+2 -1
Ver fichero
@@ -31,7 +31,8 @@ static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int
}
// Derives a channel index from `base` and `channelInc` using comm's
// p2pChannels table and stores it in *channelId, reduced modulo
// comm->p2pnChannels. Always returns ncclSuccess.
static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
  // Earlier formula, kept for reference only (its result was immediately overwritten):
  //   *channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
  *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels;
  return ncclSuccess;
}
+4 -4
Ver fichero
@@ -10,7 +10,7 @@
#include "debug.h"
// Check CUDA calls
// Check CUDA RT calls
#define CUDACHECK(cmd) do { \
hipError_t err = cmd; \
if( err != hipSuccess ) { \
@@ -143,9 +143,9 @@
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
} while (!(cond));
#define NCCLCHECKTHREAD(a) do { \
if ((args->ret = (a)) != ncclSuccess) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
#define NCCLCHECKTHREAD(a, args) do { \
if (((args)->ret = (a)) != ncclSuccess) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
return args; \
} \
} while(0)
+17 -16
Ver fichero
@@ -10,25 +10,26 @@
#include "nccl.h"
#include "nccl_net.h"
extern ncclCollNet_t* ncclCollNet;
typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
// Translation to external API
static const char* collNetName() { return ncclCollNet->name; }
static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
static ncclResult_t collNetIflush(void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; }
static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
/* DMA-BUF support */
static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; }
static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; }
#endif
+4 -4
Ver fichero
@@ -47,10 +47,10 @@ struct ncclDevRedOpFull {
/* Declare all collective operations */
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm); \
extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm);
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_LL128(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_LL128_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
#define CONCAT(a,b) a##b
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
+197 -78
Ver fichero
@@ -10,25 +10,13 @@
#include "transport.h"
#include "p2p.h"
// [RCCL]
//#include "clique/CliqueManager.h"
// [/RCCL]
// Convert volatile access to atomic
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
#else
#define LOAD(VAR) *(VAR)
#define STORE(DST, SRC) *(DST) = (SRC)
#endif
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define HIPRT_CB
#else
#include "collectives.h"
#include "proxy.h"
#include "strongstream.h"
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define HIPRT_CB
#else
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
@@ -77,8 +65,6 @@ struct ncclRecvMem {
};
};
typedef hipError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*);
enum helperThreadState {ThreadStart, ThreadStop};
#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)
@@ -104,15 +90,87 @@ struct ncclNodeRanks {
int* localRankToRank;
};
struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
// One entry in a singly linked list of destructors. ncclComm keeps the head of
// such a list in destructorHead, described there as the destructors to run
// when the comm is destructed.
struct ncclDestructor {
struct ncclDestructor* next; // next entry in the list
void* obj; // object attached to this entry — presumably what fn cleans up; confirm against callers
ncclResult_t(*fn)(struct ncclDestructor* me); // callback; receives this entry itself
};
// Intrusive list node for a callback queued on a communicator's callbackQueue.
// ncclCommPollCallbacks() dequeues these and calls fn(comm, cb) on each one;
// fn is allowed to reclaim the memory of cb.
struct ncclCommCallback {
struct ncclCommCallback* next; // link used by the queue
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); // invoked with the owning comm and this node
};
// Per-channel state; ncclComm holds an array of MAXCHANNELS of these.
// NOTE(review): the dev* members look like device-side counterparts of the
// preceding host members — confirm against the code that fills them in.
struct ncclChannel {
struct ncclChannelPeer* peers;
struct ncclDevChannelPeer* devPeers;
struct ncclRing ring;
int* devRingUserRanks;
struct ncclTree tree;
struct ncclDirect collTree;
int id; // index of this channel
uint32_t workFifoSent; // last used work index+1
uint64_t p2pOpCount;
};
// Intrusive singly linked list node carrying one ncclWork. Linked through
// `next` by ncclKernelPlan::Channel::workQueue.
struct ncclWorkList {
struct ncclWorkList* next;
struct ncclWork work;
};
// Intrusive singly linked list node carrying one untyped pointer. Linked
// through `next` by ncclKernelPlan::ipcMemQueue.
struct ncclPointerList {
struct ncclPointerList* next;
void *ptr;
};
// Description of one kernel launch for a communicator: the kernel function,
// which channels take part, and the per-channel queues of work and proxy ops.
// Plans are linked through `next`; ncclComm keeps them in planQueue.
struct ncclKernelPlan {
// A kernel plan is also a callback that reclaims itself. Hence this must
// be the first member.
struct ncclCommCallback reclaimer;
struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup
struct ncclComm* comm;
struct ncclKernelPlan* next;
bool persistent; // aka captured in a graph
void *kernelFn;
int channelUbound; // only channels c < channelUbound are present
int channelCount; // number of channels present
uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask)
bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
int threadPerBlock;
// workHeap fields are null until uploadWorkFifo() or preparePersistentKernel()
struct ncclWork* workHead;
int collOpCount; // zero based for this plan
struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
// Per-channel bookkeeping, one slot per possible channel.
struct Channel {
int nWork;
union {
int nWorkElem; // used for coll and reg coll
int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1
};
size_t collBytes;
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
} channels[MAXCHANNELS];
};
struct ncclComm {
struct ncclMemoryStack memPermanent, memScoped;
// List of destructors to run when comm is destructed
struct ncclDestructor* destructorHead;
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
void* bootstrap;
// Bitmasks for ncclTransportP2pSetup
int connect[NCCL_MAX_CONNS];
uint32_t* connectSend;
uint32_t* connectRecv;
@@ -135,19 +193,13 @@ struct ncclComm {
// localRanks and localRanktoRank for all nodes
struct ncclNodeRanks* nodeRanks;
enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
hipStream_t userStream;
bool userStreamSet;
hipEvent_t doneEvent;
hipEvent_t intDoneEvent;
bool checkPointers;
bool dmaBufSupport;
// Counter for tracking CUDA launches (P2P and collectives included)
uint64_t opCount;
// Collective operation counter
uint64_t collOpCount;
// P2P operation counter
uint64_t p2pOpCount;
// Channels for collectives
int nChannels;
@@ -165,10 +217,6 @@ struct ncclComm {
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
hipStream_t groupStream;
// Whether there has been a fatal error in this communicator.
ncclResult_t fatalError;
@@ -178,26 +226,33 @@ struct ncclComm {
// Flags for enable P2P NET
uint32_t p2pNet;
uint32_t useIntraNet;
bool hasFineGrain;
// Device side of the communicator
struct ncclDevComm *devComm;
// Host copy of the devComm (to free CUDA allocs)
struct ncclDevComm hostDevComm;
// Device side of the communicator (for cudaFree's)
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
// Operation pool.
int workFifoDepth; // size of workFifoHeap[], power of 2
struct ncclWork* workFifoHeap;
struct ncclWork* devWorkFifoHeap;
void* workFifoHeapGdrHandle;
// Work completion notification
uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.
// Intra-process sync
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head
int intraRefs; // reference count from intra-process comms (zero if not leader else intraRanks)
int intraRank;
int intraRanks;
int* intraBarrier;
int intraPhase;
// Storage for deferred intra-process launch
hipLaunchParams * intraParams;
hipLaunchParams *myParams;
pthread_t* intraThreads;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
void* argsptrs[1];
uint32_t intraBarrierPhase;
char intraPad1[64 - sizeof(uint64_t)];
uint64_t intraBarrierCounter; // only used if this is intraComm0
char intraPad2[64 - sizeof(uint64_t)];
uint64_t intraBarrierGate; // only used if this is intraComm0
struct ncclProxyState proxyState;
@@ -205,44 +260,108 @@ struct ncclComm {
int collNetSupport;
int intraHighestTransportType;
// Store info of async operations
struct ncclInfo* asyncOps;
int asyncOpCount;
size_t asyncTotalSize;
ssize_t channelSize;
int lastChannel;
enum { ROUND_ROBIN, SHORTEST_QUEUE } asyncAllocMode;
size_t channelSize; // User requested work size (bytes) for channel partitions
//list of async p2p operation queued in a group semantics
ncclP2Plist** p2pSends;
ncclP2Plist** p2pRecvs;
int p2pSendCount;
int p2pRecvCount;
// Internal streams
struct ncclStrongStream deviceStream, hostStream;
// [RCCL]
//CliqueManager* cliqueManager; // CliqueManager handles pointer collection / distribution for clique-based kernels
//int rootPid; // Process ID of root
// [/RCCL]
// Store info for cudaGraph
int usingCudaGraph; // Only use it during capture time, not launch time
struct ncclQueueInfo* enqueueInfo;
int nQueueInfoCreated;
int nQueueInfoDestroyed;
hipGraphNode_t lastSetupNode;
unsigned long long lastCudaGraphId;
int driverVersion;
pfn_cuMemGetAddressRange_t pfnCuMemGetAddressRange;
pthread_t graphHelperThread;
struct ncclGraphHelperResources* graphHelperResources;
int disableGraphHelper;
int graphRegister;
// pools backed by comm->memPermanent
struct ncclMemoryPool memPool_ncclProxyOp;
struct ncclMemoryPool memPool_ncclKernelPlan;
struct ncclMemoryPool memPool_ncclPointerList;
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
// this comm is not yet in a group.
struct ncclComm* groupNext;
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
struct ncclComm* preconnectNext;
int persistentRefs; // number of persistent plan-lists capturing this comm
struct ncclTasks tasks;
// user-created reduction ops
int userRedOpCapacity, userRedOpFreeHead;
ncclUserRedOp *userRedOps;
// Queue of things for the main thread to do
struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
// List of kernel plans built from tasks.
struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
// First of the unlaunched kernels in `planQueue`
struct ncclKernelPlan* unlaunchedPlansHead;
hipEvent_t doneEvent;
hipStream_t lastStream;
#ifdef ENABLE_COLLTRACE
struct ncclCollTrace* collTrace;
volatile uint32_t *collTraceTail;
pthread_t collTraceThread;
volatile bool collTraceExit;
#endif
};
// Set to true during an `atexit()` handler. We use this to intentionally leak
// unfreed CUDA resources when cleaning up after return of `main()` to avoid
// CUDA calls after CUDA runtime teardown.
extern bool ncclMainExited;
enum ncclLaunchMode {
ncclLaunchModeInvalid=0,
ncclLaunchModeParallel,
ncclLaunchModeGroup
};
extern enum ncclLaunchMode ncclParamLaunchMode;
void ncclCommPushFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle);
// Drains comm->callbackQueue without waiting and runs each dequeued callback
// in list order. Stops and returns the error as soon as a callback fails;
// otherwise returns ncclSuccess.
inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm) {
  struct ncclCommCallback* pending = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, /*waitSome=*/false);
  for (struct ncclCommCallback *cur = pending, *following = nullptr; cur != nullptr; cur = following) {
    // Read the link first: the callback may reclaim the memory of `cur`.
    following = cur->next;
    NCCLCHECK(cur->fn(comm, cur));
  }
  return ncclSuccess;
}
// Arrival half of the intra-process barrier: contributes the value x and, if
// this caller is the last of comm->intraRanks to arrive, opens the gate.
//
// The 64-bit gate word carries the sum of contributed x values in its upper
// 32 bits and the phase bit in bit 0. The value written is the flipped copy of
// comm->intraBarrierPhase, which ncclCommIntraBarrierOut() then waits for.
inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) {
int phase = comm->intraBarrierPhase;
if (comm->intraRanks == 1) {
// Release everyone (just me).
// Written to comm's own gate, while the Out side reads intraComm0's gate.
// NOTE(review): presumably intraComm0 == comm when intraRanks == 1 — confirm.
comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1);
} else {
struct ncclComm* comm0 = comm->intraComm0;
// One atomic add both counts this arrival (low 32 bits, +1) and accumulates x (high 32 bits).
uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE);
// Low 32 bits equal to intraRanks means this caller is the last one to arrive.
if (uint32_t(count) == uint32_t(comm->intraRanks)) {
// Reset.
__atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED);
// Release everyone.
// count>>32<<32 clears the arrival count and keeps only the accumulated sum.
__atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE);
}
}
}
// Departure half of the intra-process barrier.
// Flips comm->intraBarrierPhase, then waits until bit 0 of intraComm0's gate
// word matches the new phase.
// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x)
inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) {
struct ncclComm* comm0 = comm->intraComm0;
comm->intraBarrierPhase ^= 1;
uint32_t phase = comm->intraBarrierPhase;
uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
// Gate not open yet: poll it until its phase bit matches.
if ((gate & 1) != phase) {
uint64_t t0 = clockNano();
do {
// Spin vigorously for first 5us.
// After that, yield the CPU on every iteration before re-reading the gate.
if (clockNano()-t0 >= 5*1000) sched_yield();
gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
} while ((gate & 1) != phase);
}
// The gate loads above are relaxed; this fence supplies the acquire ordering
// when other ranks took part. Skipped for the single-rank case.
if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE);
// Upper 32 bits of the gate hold the accumulated sum.
return gate>>32;
}
// Scrambles the bits of non-builtin values of ncclRedOp_t according to the
// communicator memory address. Used to catch bugs so that integer handles
// associated with this communicator won't collide with handles of other
+3
Ver fichero
@@ -37,7 +37,9 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
case ncclUint8:
return 1;
case ncclFloat16:
#if defined(RCCL_BFLOAT16)
case ncclBfloat16:
#endif
return 2;
case ncclInt32:
case ncclUint32:
@@ -54,6 +56,7 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
#include "debug.h"
#include "checks.h"
#include "rocmwrap.h"
#include "alloc.h"
#include "utils.h"
#include "param.h"
+88
Ver fichero
@@ -0,0 +1,88 @@
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CUDAWRAP_H_
#define NCCL_CUDAWRAP_H_
#include <cuda.h>
#if CUDART_VERSION >= 11030
#include <cudaTypedefs.h>
#else
typedef CUresult (CUDAAPI *PFN_cuInit)(unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion)(int *driverVersion);
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
#endif
#define CUPFN(symbol) pfn_##symbol
// Check CUDA PFN driver calls
#define CUCHECK(cmd) do { \
CUresult err = pfn_##cmd; \
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
WARN("Cuda failure '%s'", errStr); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUCHECKGOTO(cmd, res, label) do { \
CUresult err = pfn_##cmd; \
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
WARN("Cuda failure '%s'", errStr); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
// Report failure but clear error and continue
#define CUCHECKIGNORE(cmd) do { \
CUresult err = pfn_##cmd; \
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr); \
} \
} while(false)
// Variant of CUCHECK for thread entry points: calls pfn_<cmd>, and on failure
// logs the error code, stores ncclUnhandledCudaError in (args)->ret and
// returns args from the enclosing function. `args` is parenthesized so that an
// expression argument (e.g. &state) expands correctly, matching NCCLCHECKTHREAD.
#define CUCHECKTHREAD(cmd, args) do { \
  CUresult err = pfn_##cmd; \
  if (err != CUDA_SUCCESS) { \
    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \
    (args)->ret = ncclUnhandledCudaError; \
    return (args); \
  } \
} while(0)
#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
#if CUDART_VERSION >= 11030
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet);
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange);
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate_v3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent);
#if CUDA_VERSION >= 11070
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support
#endif
#endif
/* CUDA Driver functions loaded with dlsym() */
DECLARE_CUDA_PFN_EXTERN(cuInit);
DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion);
DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress);
ncclResult_t cudaLibraryInit(void);
#endif
+5 -3
Ver fichero
@@ -10,8 +10,8 @@
#include "nccl_net.h"
#include <stdio.h>
#include <chrono>
#include <type_traits>
#include <sys/syscall.h>
#include <limits.h>
#include <string.h>
#include <pthread.h>
@@ -21,7 +21,7 @@
extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern pthread_mutex_t ncclDebugLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
@@ -29,13 +29,15 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
// Let code temporarily downgrade WARN into INFO
extern thread_local int ncclDebugNoWarn;
extern char ncclLastError[];
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)
#ifdef ENABLE_TRACE
#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
extern std::chrono::high_resolution_clock::time_point ncclEpoch;
extern std::chrono::steady_clock::time_point ncclEpoch;
#else
#define TRACE(...)
#endif
+87 -88
Ver fichero
@@ -15,9 +15,6 @@
#include "npkit/npkit_struct.h"
#endif
#include <stdint.h>
// [RCCL] Support for clique-based kernels
//#include "clique/CliqueCommon.h"
// [/RCCL]
#define NCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
@@ -33,7 +30,6 @@ extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_LL 0
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_CLIQUE 1 // [RCCL] Clique takes up same protocol as unused LL128
#define NCCL_PROTO_SIMPLE 2
extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
@@ -83,10 +79,6 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
#define NCCL_LL128_MAX_NTHREADS 256
#define NCCL_LL128_ELEMS_PER_THREAD 28
// Receiving from up to 3 sources is more compute intensive than sending
// to 3 dests. Use 70% for reduce and 30% for bcast.
#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
@@ -145,7 +137,6 @@ struct ncclRing {
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int* devUserRanks;
int index; // This rank's index in the ring
};
@@ -171,7 +162,7 @@ struct ncclDirect {
#define NCCL_CONN_IDX_P2P_NET 2
#define NCCL_MAX_CONNS 3
struct ncclPeer {
struct ncclChannelPeer {
struct ncclConnector send[NCCL_MAX_CONNS];
struct ncclConnector recv[NCCL_MAX_CONNS];
};
@@ -185,31 +176,43 @@ struct ncclDevComm;
/* Make sure to adjust padding at the end of ncclWorkElem. */
#define NCCL_WORK_SIZE 256
enum ncclWorkElemType : uint8_t {
enum ncclWorkType : uint8_t {
ncclWorkTypeUnused=0,
ncclWorkTypeColl=1,
ncclWorkTypeP2p=2,
ncclWorkTypeRegColl=3
};
enum ncclWorkElemSubType : uint8_t {
ncclWorkSubTypeUnused =0,
ncclWorkSubTypeSend,
ncclWorkSubTypeRecv
enum ncclWorkP2PType : uint8_t {
ncclWorkP2pTypeUnused=0,
ncclWorkP2pTypeSend,
ncclWorkP2pTypeRecv
};
struct ncclWorkElemHeader {
struct ncclWorkHeader {
union {
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
};
uint16_t funcIndex;
enum ncclWorkElemType type;
uint8_t nWarps:5;
uint8_t isLast:1;
uint8_t isLast:1; // last work for this kernel
uint8_t inFifo:1; // is this work in the fifo
enum ncclWorkType type;
};
struct ncclWorkElem {
struct ncclWorkElemHeader header;
uint8_t regUsed;
union {
uint8_t flagBits;
struct {
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, pad_0:1, nWarps:4;
};
};
uint8_t direct;
uint8_t redOpArgIsPtr;
uint8_t pad_0;
uint8_t bid;
uint8_t nChannels;
struct {
uint32_t root:30;
uint32_t connIndex:2;
};
const void * sendbuff;
void * recvbuff;
@@ -221,29 +224,40 @@ struct ncclWorkElem {
// Instead, it needs the number of bidirectional rings.
size_t pivotA2ANumBiRings;
};
uint32_t root;
uint8_t bid;
uint8_t nChannels;
uint16_t connIndex;
uint64_t redOpArg;
uint64_t opCount;
};
static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWorkElem size must be a multiple of ncclWork size");
static_assert((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem) == 4, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 4");
#define NCCL_MAX_WORK_ELEMENTS 1
struct ncclWorkElemP2p {
struct ncclWorkElemHeader header;
int32_t peer;
void* buff;
size_t count;
struct {
int32_t peer:30;
uint32_t connIndex:2;
};
union {
uint16_t flagBits;
struct {
enum ncclWorkP2PType p2pType:4;
uint16_t nWarps:4;
uint16_t warpStart:4;
uint16_t ngroups:4;
};
};
uint16_t opCount;
// Important not to use any fields with greater than 4-byte alignment since
// we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
// there were 8-byte fields.
//void* buff;
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
//size_t count;
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
int chunkSize;
uint8_t ngroups:4;
uint8_t warpStart:4;
uint8_t nWarps:4;
enum ncclWorkElemSubType subType:4;
uint16_t opCount:12;
uint16_t connIndex:4;
};
static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWorkElemP2p size must be a multiple of ncclWork size");
static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) == 8, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 8");
#define NCCL_MAX_WORK_ELEMENTS_P2P 2
struct ncclWorkElemReg {
struct ncclWorkElem elem;
@@ -251,56 +265,31 @@ struct ncclWorkElemReg {
void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
};
static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size");
static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size");
#define NCCL_MAX_WORK_ELEMENTS 1
#define NCCL_MAX_WORK_ELEMENTS_P2P 2
#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg))
#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg))
static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 1");
// Number of named barriers supported by CUDA
#define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE)
struct ncclWork {
struct ncclWorkHeader header;
union {
char pad[NCCL_WORK_SIZE];
struct ncclWorkElemHeader header;
char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)];
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
};
};
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE");
static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0");
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned");
struct ncclChannel {
union {
struct {
struct ncclRing ring;
struct ncclTree tree;
struct ncclDirect collTree;
int id;
// Communication structures
struct ncclPeer* peers;
struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclWork* workFifo;
int workCount;
size_t totalSize;
uint64_t workFifoTail; // Only used by CPU
uint16_t index; // Only used by GPU
// GDRCOPY support
struct ncclWork* workFifoGdr;
struct ncclWork* workFifoDev;
void* gdrMemDesc;
};
int data[0x80];
};
struct ncclDevChannelPeer {
// Stripped version of ncclChannelPeer where we only keep the ncclConnInfo
// instead of the full ncclConnector.
struct ncclConnInfo send[NCCL_MAX_CONNS];
struct ncclConnInfo recv[NCCL_MAX_CONNS];
};
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
@@ -361,38 +350,48 @@ static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must
#define COLLTRACE_NUM_ITEMS 8192
#endif
struct alignas(16) ncclDevChannel {
struct ncclDevChannelPeer *peers;
struct ncclRing ring;
struct ncclTree tree;
struct ncclDirect collTree;
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
};
struct ncclDevComm {
int rank;
int nRanks;
int buffSizes[NCCL_NUM_PROTOCOLS];
// Operation list for aggregation
int workFifoDepth;
struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
volatile uint32_t* abortFlag;
// Channels, device side
struct ncclChannel* channels;
struct ncclDevChannel* channels/*[MAXCHANNELS]*/;
#if defined(ENABLE_NPKIT)
NpKitEventCollectContext* npKitEventCollectContexts;
uint64_t* cpuTimestamp;
#endif
#ifdef ENABLE_PROFILING
// Profiling counters
struct ncclProf* devProf;
#endif
#ifdef ENABLE_COLLTRACE
struct ncclCollTrace* collTrace;
uint32_t collTraceHead, *collTraceTail;
volatile uint32_t *collTraceTail;
pthread_t collTraceThread;
bool collTraceExit;
#endif
#ifdef ENABLE_PROFILING
struct ncclProf* devProf;
#endif
};
struct ncclDevCommAndChannels {
ncclDevComm comm;
ncclChannel channels[MAXCHANNELS];
struct alignas(16) ncclDevCommAndChannels {
struct ncclDevComm comm;
struct ncclDevChannel channels[MAXCHANNELS];
};
#endif
+6 -113
Ver fichero
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,6 +10,7 @@
#include "comm.h"
#include "group.h"
#include "collectives.h"
#include "utils.h"
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
@@ -19,117 +19,10 @@ size_t ncclKernMaxLocalSize();
size_t ncclKernLocalSize(int i);
ncclResult_t ncclKernSetSharedMemoryCarveout(int carveOut);
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm);
ncclResult_t ncclLaunchKernel(ncclComm_t comm);
ncclResult_t ncclRecordEvents(struct ncclComm* comm);
ncclResult_t ncclLaunchReset(ncclComm_t comm);
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info);
ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm);
template<int USING_CUDA_GRAPH>
void HIPRT_CB ncclEnqueueHostSetup(void* arg);
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, hipGraph_t* graph);
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, hipGraph_t graph);
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
struct ncclBuffRegInfo {
void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS];
void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS];
void* sendbuffs[NCCL_MAX_LOCAL_RANKS];
void* recvbuffs[NCCL_MAX_LOCAL_RANKS];
int nBuffs;
};
// Enqueue information (for kernel and proxy) for each operation
struct ncclQueueElem {
struct ncclWork work;
struct ncclProxyOp proxyOp;
struct ncclBuffRegInfo buffRegInfo;
};
typedef ncclRecyclableList<struct ncclQueueElem> ncclQueueElemList;
// Structure passed to CUDA graph
// Created by ncclCreateQueueInfo(), reset by ncclResetQueueInfo() and released
// by ncclDestroyQueueInfo().
struct ncclQueueInfo {
ncclComm_t comm; // Communicator this queue info belongs to (set at creation)
int maxChannels; // Dynamic version of gridDim
ncclResult_t ret; // Return value of host setup call
int nRegBuffs; // When 0, ncclDestroyQueueInfo() skips handing IPC bases to the helper thread
ncclQueueElemList* elemList; // Allocated with new at creation, recycled on reset, deleted on destroy
};
// Allocate one ncclQueueInfo through ncclCalloc, bind it to `comm`, give it a
// fresh element list and bump the communicator's nQueueInfoCreated counter.
//   eqInfo: out parameter receiving the new object.
//   comm:   owning communicator; dereferenced, so it must not be NULL.
// Returns ncclSuccess; an allocation failure is handled by NCCLCHECK
// (presumably an early return of the error code - macro defined elsewhere).
static ncclResult_t ncclCreateQueueInfo(struct ncclQueueInfo** eqInfo, ncclComm_t comm) {
  NCCLCHECK(ncclCalloc(eqInfo, 1));
  struct ncclQueueInfo* info = *eqInfo;
  info->comm = comm;
  info->elemList = new ncclQueueElemList();
  info->comm->nQueueInfoCreated++;
  return ncclSuccess;
}
// Reset element queue
// Clears the scalar bookkeeping fields and recycles the element list so the
// same ncclQueueInfo can be reused.
// Returns ncclInternalError when eqInfo is NULL, ncclSuccess otherwise.
static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) {
  if (!eqInfo) return ncclInternalError;
  eqInfo->nRegBuffs = 0;
  eqInfo->ret = ncclSuccess;
  eqInfo->maxChannels = 0;
  eqInfo->elemList->recycle();
  return ncclSuccess;
}
// Destroy enqueue info space
// used by both CUDA graph and non CUDA graph
//
// ptr: a struct ncclQueueInfo* passed as void*; NULL is a no-op.
//
// Before the object is freed, every non-NULL send/recv base pointer recorded in
// the queued elements is appended to the IPC pool (a ring of
// NCCL_IPC_POOL_SIZE slots) owned by the communicator's graph helper
// resources, and the helper thread is signalled if anything was appended.
// The hand-off is skipped when there are no helper resources, no helper thread,
// or no registered buffers. Finally the element list and the info object are
// released and comm->nQueueInfoDestroyed is incremented.
static void ncclDestroyQueueInfo(void* ptr) {
  if (ptr == NULL) return;
  struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr;
  // Keep the communicator pointer: eqInfo is freed before the counter update.
  struct ncclComm* comm = eqInfo->comm;
  // Close IPC mem handles for registered buffers
  struct ncclQueueElem* eqElem = eqInfo->elemList->begin();
#if 0
  // Ideally, the deregistration should happen here
  // but currently the destroy function of CUDA objects does not allow CUDA API calls
  while (eqElem != NULL) {
    for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
      if (i == eqInfo->comm->localRank) continue;
      CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i]));
      CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i]));
    }
    eqElem = eqInfo->elemList->getNext();
  }
#else
  // Instead, we push these pointers to a pool owned by ncclComm
  // and ask a helper thread to close mem handles
  struct ncclGraphHelperResources* res = comm->graphHelperResources;
  int ipcTailOld = 0;
  if (res == NULL || (!comm->graphHelperThread) || eqInfo->nRegBuffs == 0) goto skip;

  pthread_mutex_lock(&res->threadLock);
  ipcTailOld = res->ipcTail;
  while (eqElem != NULL) {
    for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
      if (eqElem->buffRegInfo.sendbuffsBase[i] != NULL) {
        res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.sendbuffsBase[i];
        res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE;
      }
      if (eqElem->buffRegInfo.recvbuffsBase[i] != NULL) {
        res->ipcBases[res->ipcTail] = eqElem->buffRegInfo.recvbuffsBase[i];
        res->ipcTail = (res->ipcTail+1)%NCCL_IPC_POOL_SIZE;
      }
    }
    eqElem = eqInfo->elemList->getNext();
  }
  if (res->ipcTail != ipcTailOld) {
    res->threadState = ThreadStart;
    // ipcTail is a ring index, so a plain subtraction goes negative once the
    // tail wraps past the end of the pool; take the distance modulo the pool size.
    TRACE(NCCL_COLL, "CUDA Graph destroy function signaling helper thread with %d IPC handles",
          (int)((res->ipcTail - ipcTailOld + NCCL_IPC_POOL_SIZE) % NCCL_IPC_POOL_SIZE));
    pthread_cond_signal(&res->threadCond);
  }
  pthread_mutex_unlock(&res->threadLock);
#endif

skip:
  delete eqInfo->elemList;
  free(eqInfo);
  comm->nQueueInfoDestroyed++;
  return;
}
#endif // End include guard
+1 -1
Ver fichero
@@ -263,7 +263,7 @@ static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
CUDACHECK(hipFree(md->gdrDevMem));
CUDACHECK(cudaFree(md->gdrDevMem));
free(md);
return ncclSuccess;
Archivo normal → Archivo ejecutable
Ver fichero
+2 -2
Ver fichero
@@ -24,7 +24,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm);
void ncclTopoFree(struct ncclTopoSystem* system);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
@@ -37,7 +37,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int ne
#define MAX_XGMI_INTER_GPUS 4
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
int ncclPxnDisable();
int ncclPxnDisable(struct ncclComm* comm);
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
+70 -7
Ver fichero
@@ -11,15 +11,78 @@
#include "nccl.h"
#include "comm.h"
bool ncclAsyncMode();
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
void ncclGroupCommJoin(struct ncclComm* comm);
void ncclGroupCommPreconnect(struct ncclComm* comm);
void ncclGroupCommLeave(struct ncclComm* comm);
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev, int virtualId);
// Descriptor of one asynchronous job. The three callbacks have the same
// signatures as the func/undo/destructor arguments of ncclAsyncLaunch().
struct ncclAsyncJob {
struct ncclAsyncJob* next; // Link to another job (presumably the next queued job - confirm in group.cc)
pthread_t thread; // pthread handle associated with this job
ncclResult_t result; // Result code stored for this job
ncclResult_t(*func)(struct ncclAsyncJob*); // Job body; receives the job itself
void(*undo)(struct ncclAsyncJob*); // Counterpart of func (presumably rolls the job back on failure - confirm)
void(*destructor)(void*); // Cleanup callback taking an untyped pointer
};
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev, int virtualId);
ncclResult_t ncclAsyncLaunch(
struct ncclAsyncJob* job,
ncclResult_t(*func)(struct ncclAsyncJob*),
void(*undo)(struct ncclAsyncJob*),
void(*destructor)(void*)
);
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclGroupStartInternal();
ncclResult_t ncclGroupEndInternal();
////////////////////////////////////////////////////////////////////////////////
extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
extern __thread ncclResult_t ncclGroupError;
extern __thread struct ncclComm* ncclGroupCommHead;
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
// Enter one more nesting level of ncclGroupStart on the calling thread.
// Always returns ncclSuccess.
inline ncclResult_t ncclGroupStartInternal() {
  ncclGroupDepth += 1;
  return ncclSuccess;
}
// Record `ret` as the calling thread's group error while inside a group
// (ncclGroupDepth > 0). A failing `ret` always replaces the stored value; a
// successful one only does so if no error is stored yet, so an earlier error
// is not cleared. `ret` is handed back unchanged so the call can wrap an
// expression.
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
  // Outside of a group there is nothing to record.
  if (ncclGroupDepth <= 0) return ret;

  const bool shouldStore = (ret != ncclSuccess) || (ncclGroupError == ncclSuccess);
  if (shouldStore) ncclGroupError = ret;
  return ret;
}
// Add comm to this thread's group
// A groupNext value of 0x1 is the marker written by ncclGroupCommLeave(); any
// other value means the comm is already linked and the call does nothing.
inline void ncclGroupCommJoin(struct ncclComm* comm) {
  struct ncclComm* const notInGroup = reinterpret_cast<struct ncclComm*>(0x1);
  if (comm->groupNext != notInGroup) return;

  // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
  // the user's program order yet ensures siblings occur consecutively. This
  // is required by doLaunches() in "group.cc".
  struct ncclComm** link = &ncclGroupCommHead;
  for (; *link != nullptr; link = &(*link)->groupNext) {
    // Stop in front of the first comm sharing our intraComm0 (a sibling).
    if (comm->intraComm0 == (*link)->intraComm0) break;
  }
  comm->groupNext = *link;
  *link = comm;

  // Comms get a new memory stack scope upon joining. Each task batched for
  // this comm is allocated there.
  ncclMemoryStackPush(&comm->memScoped);
}
// Add comm to this thread's group needing preconnect
// The comm is pushed at the head of ncclGroupCommPreconnectHead, but only when
// its preconnectNext still holds the 0x1 marker; otherwise nothing happens.
inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
  struct ncclComm* const notQueued = reinterpret_cast<struct ncclComm*>(0x1);
  if (comm->preconnectNext != notQueued) return;

  comm->preconnectNext = ncclGroupCommPreconnectHead;
  ncclGroupCommPreconnectHead = comm;
}
// Comm has left group
// Restores the 0x1 marker in groupNext (the value ncclGroupCommJoin() tests
// for) and pops the memory stack scope pushed when the comm joined.
inline void ncclGroupCommLeave(struct ncclComm* comm) {
  struct ncclComm* const notInGroup = reinterpret_cast<struct ncclComm*>(0x1);
  comm->groupNext = notInGroup;
  ncclMemoryStackPop(&comm->memScoped);
}
ncclResult_t ncclAsyncColl(ncclComm_t comm);
#endif
+3
Ver fichero
@@ -1067,6 +1067,9 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
/* DMA-BUF support */
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
+63 -2
Ver fichero
@@ -11,6 +11,9 @@
#include "nccl.h"
#include "devcomm.h"
#include "collectives.h"
#include "core.h"
#include "utils.h"
#include "strongstream.h"
typedef enum : uint8_t {
ncclPatternRing,
@@ -53,8 +56,66 @@ struct ncclInfo {
int nchunksPerLoop;
int chunkSize;
int channelId;
uint16_t connIndex;
uint64_t opCount;
};
// Fill in the fields of `info` that are derived from count/datatype/coll.
//   - nBytes becomes count * ncclTypeSize(datatype).
//   - AllGather, Broadcast and AllToAllPivot are re-expressed as byte
//     operations: count takes the byte size and datatype becomes ncclInt8.
//   - AllGather and ReduceScatter then scale nBytes by nRanks, because their
//     count is per rank.
// Always returns ncclSuccess.
inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
  const bool isAllGather = (info->coll == ncclFuncAllGather);
  const bool treatAsBytes = isAllGather
                         || info->coll == ncclFuncBroadcast
                         || info->coll == ncclFuncAllToAllPivot;
  const bool countIsPerRank = isAllGather || info->coll == ncclFuncReduceScatter;

  info->nBytes = info->count * ncclTypeSize(info->datatype);
  if (treatAsBytes) {
    info->count = info->nBytes;
    info->datatype = ncclInt8;
  }
  if (countIsPerRank) {
    info->nBytes *= nRanks; // count is per rank
  }
  return ncclSuccess;
}
// One pending collective operation. Instances are chained through `next`,
// which is the link member used by ncclTasks::collQueue.
struct ncclTaskColl {
struct ncclTaskColl* next; // Intrusive queue link
ncclFunc_t func; // Which collective this task is
void const* sendbuff; // Source buffer (read-only)
void* recvbuff; // Destination buffer
size_t count; // Element count (NOTE(review): elements vs. bytes depends on the caller - confirm)
int root; // Root rank argument of the collective
ncclDataType_t datatype; // Element type
ncclDevRedOpFull op; // Reduction operator descriptor
int chunkSteps, sliceSteps; // Step counts (presumably per chunk / per slice - confirm in enqueue.cc)
};
// One pending point-to-point operation. Instances are chained through `next`,
// which is the link member used by the per-peer sendQueue/recvQueue in ncclTasks.
struct ncclTaskP2p {
ncclTaskP2p *next; // Intrusive queue link
void *buff; // User buffer of the transfer
size_t bytes; // Size of the transfer in bytes
// Stateful chunk index. If a p2p gets "cut" over two plans this keeps track
// of where it left off.
int chunk;
};
struct ncclCudaStreamList {
struct ncclCudaStreamList *next;
hipStream_t stream;
};
struct ncclTasks {
struct Peer {
bool sendSeen, recvSeen;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
};
struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
size_t collBytesTotal;
struct Peer* peers/*[nRanks]*/;
int *p2pSendOrder/*[nRanks]*/, *p2pRecvOrder/*[nRanks]*/;
int nTasksColl, nTasksP2p;
// The list of user streams aggregated over all tasks present.
struct ncclCudaStreamList* streams;
// Keep track of the number of user streams
int numStreams;
// The most recent user stream. Ignored if streams==nullptr
hipStream_t streamRecent;
// The graph capturing all user streams or invalid if none. Thus we restrict the
// user that all streams must be captured in the same graph or not captured
// at all. Technically we could probably relax this, but that would mean
// collecting a different `ncclTasks` per graph and one for non-graph.
struct ncclCudaGraph capturingGraph;
};
#endif
+107 -14
Ver fichero
@@ -14,12 +14,13 @@
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
#define NCCL_PTR_DMABUF 0x4
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 8
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
@@ -28,15 +29,15 @@ typedef struct {
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
}ncclNetProperties_v5_t;
}ncclNetProperties_v6_t;
typedef ncclNetProperties_v5_t ncclNetProperties_t;
typedef ncclNetProperties_v6_t ncclNetProperties_t;
typedef struct {
// Name of the network (mainly for logs)
@@ -46,7 +47,103 @@ typedef struct {
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;
typedef ncclNet_v6_t ncclNet_t;
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
// Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters capable of doing collective operations.
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Create a group for collective operations. handles have been created
// using listen() above. rank indicates caller's rank in the collective network.
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
// Returns whether a reduction operation on a data type is supported.
// 1 for supported, 0 otherwise.
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
// Performs an asynchronous allreduce operation on the collective group.
// May return request == NULL if the call cannot be performed (or would block).
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v6_t;
typedef ncclCollNet_v6_t ncclCollNet_t;
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6
// v5 struct for backwards compatibility
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
@@ -83,10 +180,7 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v5_t;
typedef ncclNet_v5_t ncclNet_t;
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5
// v5 struct for backwards compatibility
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
@@ -96,7 +190,7 @@ typedef struct {
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
@@ -125,10 +219,7 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v5_t;
typedef ncclCollNet_v5_t ncclCollNet_t;
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5
// v4 struct for backwards compatibility
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
@@ -140,6 +231,7 @@ typedef struct {
int maxComms; // Maximum number of comms we can create
} ncclNetProperties_v4_t;
// v4 struct for backwards compatibility
typedef struct {
// Name of the network (mainly for logs)
const char* name;
@@ -179,6 +271,7 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v4_t;
// v4 struct for backwards compatibility
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
+22 -19
Ver fichero
@@ -9,33 +9,36 @@
#include "nccl.h"
#include "nccl_net.h"
#include "comm.h"
#include "checks.h"
extern ncclNet_t* ncclNet;
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
ncclResult_t ncclNetInit();
int ncclNetVersion();
ncclResult_t ncclNetPluginInit();
ncclResult_t ncclNetInit(struct ncclComm* comm);
int ncclNetVersion(struct ncclComm* comm);
// Translation to external API
static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
static const char* ncclNetName(struct ncclComm* comm) { return comm->ncclNet->name; }
static ncclResult_t ncclNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t ncclNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t ncclNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(struct ncclComm* comm, int dev, void* handle, void** sendComm) { NCCLCHECK(comm->ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(struct ncclComm* comm, void* listenComm, void** recvComm) { NCCLCHECK(comm->ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetRegMr(struct ncclComm* comm, void* netComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclNet->regMr(netComm, data, size, type, mhandle)); return ncclSuccess; }
/* DMA-BUF support */
static ncclResult_t ncclNetRegMrDmaBuf(struct ncclComm* comm, void* netComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclNet->regMrDmaBuf(netComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetDeregMr(struct ncclComm* comm, void* netComm, void* mhandle) { NCCLCHECK(comm->ncclNet->deregMr(netComm, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetIsend(struct ncclComm* comm, void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(comm->ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
static ncclResult_t ncclNetIrecv(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetIflush(struct ncclComm* comm, void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(comm->ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
static ncclResult_t ncclNetTest(struct ncclComm* comm, void* request, int* done, int* sizes) { NCCLCHECK(comm->ncclNet->test(request, done, sizes)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(struct ncclComm* comm, void* sendComm) { NCCLCHECK(comm->ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(struct ncclComm* comm, void* recvComm) { NCCLCHECK(comm->ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclNet->closeListen(listenComm)); return ncclSuccess; }
// Test whether the current GPU supports GPU Direct RDMA.
ncclResult_t ncclGpuGdrSupport(int* gdrSupport);
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket;
+15 -15
Ver fichero
@@ -8,7 +8,7 @@
#include "nvToolsExt.h"
#include "cuda.h"
#include "hip/hip_runtime.h"
#ifndef NVTOOLSEXT_CUDA_V3
#define NVTOOLSEXT_CUDA_V3
@@ -42,10 +42,10 @@ extern "C" {
*/
typedef enum nvtxResourceCUDAType_t
{
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* hipDevice_t */
NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* hipCtx_t */
NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* hipStream_t */
NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* hipEvent_t */
} nvtxResourceCUDAType_t;
@@ -59,8 +59,8 @@ typedef enum nvtxResourceCUDAType_t
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
@@ -73,16 +73,16 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* na
*
* \par Example:
* \code
* CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
* if ( CUDA_SUCCESS != status )
* hipError_t status = hipCtxCreate( &cuContext, 0, cuDevice );
* if ( hipSuccess != status )
* goto Error;
* nvtxNameCuContext(cuContext, "CTX_NAME");
* \endcode
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
@@ -95,8 +95,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t*
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
@@ -109,8 +109,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* na
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(hipEvent_t event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
+8 -8
Ver fichero
@@ -8,8 +8,8 @@
#include "nvToolsExt.h"
#include "cuda.h"
#include "driver_types.h"
#include "hip/hip_runtime.h"
#include "hip/driver_types.h"
#ifndef NVTOOLSEXT_CUDART_V3
#define NVTOOLSEXT_CUDART_V3
@@ -44,8 +44,8 @@ extern "C" {
typedef enum nvtxResourceCUDARTType_t
{
NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* hipStream_t */
NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* hipEvent_t */
} nvtxResourceCUDARTType_t;
@@ -73,8 +73,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_t* name);
/** @} */
/* ------------------------------------------------------------------------- */
@@ -87,8 +87,8 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar
*
* \version \NVTX_VERSION_1
* @{ */
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(hipEvent_t event, const wchar_t* name);
/** @} */
/** @} */ /* END RESOURCE_NAMING */
@@ -16,10 +16,10 @@ extern "C" {
typedef void (NVTX_API * nvtxNameCudaDeviceA_impl_fntype)(int device, const char* name);
typedef void (NVTX_API * nvtxNameCudaDeviceW_impl_fntype)(int device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(cudaStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(cudaStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(cudaEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(cudaEvent_t event, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaStreamA_impl_fntype)(hipStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCudaStreamW_impl_fntype)(hipStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCudaEventA_impl_fntype)(hipEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCudaEventW_impl_fntype)(hipEvent_t event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name)
{
@@ -39,7 +39,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name)
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(hipStream_t stream, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaStreamA_impl_fntype local = (nvtxNameCudaStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr;
@@ -48,7 +48,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char*
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(hipStream_t stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaStreamW_impl_fntype local = (nvtxNameCudaStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr;
@@ -57,7 +57,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(hipEvent_t event, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaEventA_impl_fntype local = (nvtxNameCudaEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr;
@@ -66,7 +66,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* na
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(hipEvent_t event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCudaEventW_impl_fntype local = (nvtxNameCudaEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr;
+16 -16
Ver fichero
@@ -15,16 +15,16 @@
extern "C" {
#endif /* __cplusplus */
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(CUdevice device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(CUdevice device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(CUcontext context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(CUcontext context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(CUstream stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(CUstream stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(CUevent event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(CUevent event, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuDeviceA_impl_fntype)(hipDevice_t device, const char* name);
typedef void (NVTX_API * nvtxNameCuDeviceW_impl_fntype)(hipDevice_t device, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuContextA_impl_fntype)(hipCtx_t context, const char* name);
typedef void (NVTX_API * nvtxNameCuContextW_impl_fntype)(hipCtx_t context, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuStreamA_impl_fntype)(hipStream_t stream, const char* name);
typedef void (NVTX_API * nvtxNameCuStreamW_impl_fntype)(hipStream_t stream, const wchar_t* name);
typedef void (NVTX_API * nvtxNameCuEventA_impl_fntype)(hipEvent_t event, const char* name);
typedef void (NVTX_API * nvtxNameCuEventW_impl_fntype)(hipEvent_t event, const wchar_t* name);
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(hipDevice_t device, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuDeviceA_impl_fntype local = (nvtxNameCuDeviceA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr;
@@ -33,7 +33,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name)
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(hipDevice_t device, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuDeviceW_impl_fntype local = (nvtxNameCuDeviceW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr;
@@ -42,7 +42,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* na
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(hipCtx_t context, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuContextA_impl_fntype local = (nvtxNameCuContextA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr;
@@ -51,7 +51,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* na
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(hipCtx_t context, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuContextW_impl_fntype local = (nvtxNameCuContextW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr;
@@ -60,7 +60,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t*
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(hipStream_t stream, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuStreamA_impl_fntype local = (nvtxNameCuStreamA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr;
@@ -69,7 +69,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name)
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(hipStream_t stream, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuStreamW_impl_fntype local = (nvtxNameCuStreamW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr;
@@ -78,7 +78,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* na
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(hipEvent_t event, const char* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuEventA_impl_fntype local = (nvtxNameCuEventA_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr;
@@ -87,7 +87,7 @@ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name)
#endif /*NVTX_DISABLE*/
}
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name)
NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(hipEvent_t event, const wchar_t* name)
{
#ifndef NVTX_DISABLE
nvtxNameCuEventW_impl_fntype local = (nvtxNameCuEventW_impl_fntype)NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr;
+1 -1
Ver fichero
@@ -18,7 +18,7 @@
/* ------ Dependency-free types binary-compatible with real types ------- */
/* In order to avoid having the NVTX core API headers depend on non-NVTX
* headers like cuda.h, NVTX defines binary-compatible types to use for
* headers like hip/hip_runtime.h, NVTX defines binary-compatible types to use for
* safely making the initialization versions of all NVTX functions without
* needing to have definitions for the real types. */
-17
Ver fichero
@@ -9,21 +9,4 @@
#ifndef NCCL_P2P_H_
#define NCCL_P2P_H_
// One recorded point-to-point entry; the fields are filled in by
// ncclSaveP2pInfo from its arguments of the same names.
struct ncclP2Pinfo {
// Buffer pointer as passed to ncclSaveP2pInfo.
void* buff;
// Byte count as passed to ncclSaveP2pInfo (signed).
ssize_t nbytes;
// Operation counter value as passed to ncclSaveP2pInfo.
uint64_t opCount;
};
typedef ncclRecyclableList<struct ncclP2Pinfo> ncclP2Plist;
// Append one entry to the p2p list referenced by `p2p`, creating the list on
// first use (the caller's pointer is updated through the reference).
//   p2p     - list pointer, may be NULL on entry
//   buff    - stored in the new entry's `buff`
//   nBytes  - stored in the new entry's `nbytes`
//   opCount - stored in the new entry's `opCount`
// Returns ncclSuccess once the entry is filled in; the getNewElem() call is
// wrapped in NCCLCHECK (defined elsewhere).
static ncclResult_t ncclSaveP2pInfo(ncclP2Plist* &p2p, void* buff, ssize_t nBytes, uint64_t opCount) {
  if (!p2p) {
    p2p = new ncclP2Plist();
  }
  struct ncclP2Pinfo* entry;
  NCCLCHECK(p2p->getNewElem(&entry));
  entry->opCount = opCount;
  entry->nbytes = nBytes;
  entry->buff = buff;
  return ncclSuccess;
}
#endif
+18 -10
Ver fichero
@@ -26,18 +26,26 @@ struct ncclProxyOp {
int channelId;
int nsteps;
ssize_t nbytes;
int root;
struct {
int root:30;
uint32_t connIndex:2;
};
int next;
uint64_t opCount;
int sliceSteps;
int chunkSteps;
int chunkSize;
ncclDataType_t dtype;
ncclRedOp_t redOp;
ncclPattern_t pattern; // uint8_t
uint8_t /*ncclDataType_t*/ dtype;
uint8_t /*ncclDevRedOp_t*/ redOp;
uint8_t /*ncclPattern_t*/ pattern;
uint8_t protocol;
uint16_t connIndex;
union {
uint64_t unused;
// For use by enqueue.cc
struct ncclProxyOp *enqNext;
};
};
static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");
@@ -73,9 +81,9 @@ struct ncclProxyArgs {
int sliceSteps;
int chunkSteps;
int chunkSize;
ncclDataType_t dtype;
ncclRedOp_t redOp;
ncclPattern_t pattern;
uint8_t /*ncclDataType_t*/ dtype;
uint8_t /*ncclDevRedOp_t*/ redOp;
uint8_t /*ncclPattern_t*/ pattern;
uint8_t protocol;
int state;
char* sharedBuff[NCCL_STEPS];
@@ -164,6 +172,7 @@ struct ncclProxyState {
pthread_t thread;
struct ncclSocket* listenSock;
int stop;
hipCtx_t cudaCtx;
// Used by main thread
union ncclSocketAddress* peerAddresses;
@@ -193,9 +202,8 @@ enum proxyMode {
proxyTo = 2
};
ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks);
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire);
ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp);
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp);
ncclResult_t ncclProxyStart(struct ncclComm* comm);
ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
+73
Ver fichero
@@ -0,0 +1,73 @@
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ROCMWRAP_H_
#define NCCL_ROCMWRAP_H_
#include <hsa/hsa.h>
typedef hsa_status_t (*PFN_hsa_init)();
typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, void* value);
typedef hsa_status_t (*PFN_hsa_status_string)(hsa_status_t status, const char ** status_string);
typedef hsa_status_t (*PFN_hsa_amd_portable_export_dmabuf)(const void* ptr, size_t size, int* dmabuf, uint64_t* offset);
#define CUPFN(symbol) pfn_##symbol
// Check a ROCr (HSA) call made through its dlsym()-loaded pointer: `cmd` is
// written as `symbol(args)` and token-pasted into `pfn_symbol(args)`. On any
// status other than HSA_STATUS_SUCCESS the ROCr status string is logged with
// WARN and the enclosing function returns ncclUnhandledCudaError.
#define CUCHECK(cmd) do { \
  hsa_status_t err = pfn_##cmd; \
  if( err != HSA_STATUS_SUCCESS ) { \
    /* hsa_status_string() can itself fail (e.g. unrecognized status) and is */ \
    /* not guaranteed to write its output then; never hand garbage to %s.    */ \
    const char *errStr = nullptr; \
    if (pfn_hsa_status_string(err, &errStr) != HSA_STATUS_SUCCESS || errStr == nullptr) errStr = "unknown error"; \
    WARN("ROCr failure '%s'", errStr); \
    return ncclUnhandledCudaError; \
  } \
} while(false)
// Same check as a returning ROCr call check, but instead of returning it
// stores ncclUnhandledCudaError into `res` and jumps to `label`, so the caller
// can run its cleanup code. `cmd` is written as `symbol(args)` and token-pasted
// into `pfn_symbol(args)`.
#define CUCHECKGOTO(cmd, res, label) do { \
  hsa_status_t err = pfn_##cmd; \
  if( err != HSA_STATUS_SUCCESS ) { \
    /* hsa_status_string() can itself fail (e.g. unrecognized status) and is */ \
    /* not guaranteed to write its output then; never hand garbage to %s.    */ \
    const char *errStr = nullptr; \
    if (pfn_hsa_status_string(err, &errStr) != HSA_STATUS_SUCCESS || errStr == nullptr) errStr = "unknown error"; \
    WARN("ROCr failure '%s'", errStr); \
    res = ncclUnhandledCudaError; \
    goto label; \
  } \
} while(false)
// Report a failed ROCr call and continue: the status string is logged with
// INFO(NCCL_ALL, ...) together with file and line, and nothing is returned or
// propagated to the caller. `cmd` is written as `symbol(args)` and
// token-pasted into `pfn_symbol(args)`.
#define CUCHECKIGNORE(cmd) do { \
  hsa_status_t err = pfn_##cmd; \
  if( err != HSA_STATUS_SUCCESS ) { \
    /* hsa_status_string() can itself fail (e.g. unrecognized status) and is */ \
    /* not guaranteed to write its output then; never hand garbage to %s.    */ \
    const char *errStr = nullptr; \
    if (pfn_hsa_status_string(err, &errStr) != HSA_STATUS_SUCCESS || errStr == nullptr) errStr = "unknown error"; \
    INFO(NCCL_ALL,"%s:%d ROCr failure '%s'", __FILE__, __LINE__, errStr); \
  } \
} while(false)
// ROCr call check for use inside a thread entry function that returns its
// `args` pointer: on failure the numeric status is logged with
// INFO(NCCL_INIT, ...), `args->ret` is set to ncclUnhandledCudaError and `args`
// is returned. `cmd` is written as `symbol(args)` and token-pasted into
// `pfn_symbol(args)`.
#define CUCHECKTHREAD(cmd, args) do { \
  hsa_status_t err = pfn_##cmd; \
  if (HSA_STATUS_SUCCESS != err) { \
    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \
    args->ret = ncclUnhandledCudaError; \
    return args; \
  } \
} while(0)
#define DECLARE_ROCM_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol
DECLARE_ROCM_PFN_EXTERN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
/* ROCr Driver functions loaded with dlsym() */
DECLARE_ROCM_PFN_EXTERN(hsa_init);
DECLARE_ROCM_PFN_EXTERN(hsa_system_get_info);
DECLARE_ROCM_PFN_EXTERN(hsa_status_string);
ncclResult_t rocmLibraryInit(void);
#endif
Archivo normal → Archivo ejecutable
Ver fichero
+142
Ver fichero
@@ -0,0 +1,142 @@
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_STRONGSTREAM_H_
#define NCCL_STRONGSTREAM_H_
#include "nccl.h"
#include "checks.h"
#include <stdint.h>
/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes
* easily.
*/
// Value-type handle describing a graph. Both members are compiled in only when
// CUDART_VERSION >= 11030; otherwise the struct has no members and the helper
// functions below fall back to constant answers.
struct ncclCudaGraph {
#if CUDART_VERSION >= 11030
// Underlying graph handle; nullptr in the value returned by ncclCudaGraphNull().
cudaGraph_t graph;
// Identifier compared by ncclCudaGraphSame(); ULLONG_MAX in the null value.
uint64_t graphId;
#endif
};
inline struct ncclCudaGraph ncclCudaGraphNull() {
struct ncclCudaGraph tmp;
#if CUDART_VERSION >= 11030
tmp.graph = nullptr;
tmp.graphId = ULLONG_MAX;
#endif
return tmp;
}
// True when `graph` carries a non-null graph handle. Always false when the
// graph members are compiled out (CUDART_VERSION < 11030).
inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) {
  bool valid = false;
#if CUDART_VERSION >= 11030
  valid = (nullptr != graph.graph);
#endif
  return valid;
}
// True when `a` and `b` carry the same graphId. When the graph members are
// compiled out (CUDART_VERSION < 11030) every pair compares as the same.
inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) {
  bool same = true;
#if CUDART_VERSION >= 11030
  same = (a.graphId == b.graphId);
#endif
  return same;
}
ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, hipStream_t stream);
ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, hipHostFn_t fn, void* arg);
/* ncclStrongStream: An abstraction over CUDA streams that do not lose their
* identity while being captured. Regular streams have the deficiency that the
* captured form of a stream in one graph launch has no relation to the
* uncaptured stream or to the captured form in other graph launches. This makes
* streams unfit for the use of serializing access to a persistent resource.
* Strong streams have been introduced to address this need.
*
* Constraints of using strong streams:
*
* - Operations that enqueue work to the strong stream need to be enclosed by
* ncclStrongStream[Acquire/Release] pairs. Acquire/release act like fences,
* the strong stream is not stateful so there is no harm in redundant acquire
* or releases.
*
* - An {Acquire; ...; Release} sequence must not be concurrent with any
* other operations against the strong stream including graph launches which
* reference this stream.
*
* - All strong stream functions take a "graph" parameter which must reference
* the currently capturing graph, or null if none.
*/
struct ncclStrongStream;
ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss);
ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss);
// Has this strong stream ever been captured in a graph.
bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss);
// Acquire-fence the strong stream.
ncclResult_t ncclStrongStreamAcquire(
struct ncclCudaGraph graph, struct ncclStrongStream* ss
);
// Acquire-fence the strong stream assuming no graph is capturing. This permits
// the caller to enqueue directly to the `ss->stream` member using native CUDA
// calls. Strong stream must be released via:
// ncclStrongStreamRelease(ncclCudaGraphNull(), ss);
ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss);
// Release-fence of the strong stream.
ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss);
// Add a host launch to the stream.
ncclResult_t ncclStrongStreamLaunchHost(
struct ncclCudaGraph graph, struct ncclStrongStream* ss,
hipHostFn_t fn, void* arg
);
// Add a kernel launch to the stream.
ncclResult_t ncclStrongStreamLaunchKernel(
struct ncclCudaGraph graph, struct ncclStrongStream* ss,
void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes
);
// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired.
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
);
// `b` must be capturing within `graph`.
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, struct ncclStrongStream* a, hipStream_t b
);
// `a` must be capturing within `graph`.
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, hipStream_t a, struct ncclStrongStream* b
);
// Synchronization does not need the strong stream to be acquired.
ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss);
////////////////////////////////////////////////////////////////////////////////
// Concrete layout of a strong stream. `stream` and `event` are always present;
// the graph-related members are compiled in only when CUDART_VERSION >= 11030.
struct ncclStrongStream {
// Native stream; per the API comments callers may enqueue to it directly
// after ncclStrongStreamAcquireUncaptured().
hipStream_t stream;
// Event owned by this strong stream (its exact role is implemented elsewhere).
hipEvent_t event;
#if CUDART_VERSION >= 11030
cudaGraphNode_t node; // null if never captured, otherwise never null again
// 63-bit graph id packed with a 1-bit flag; NOTE(review): flag presumably
// marks `event` as behind the captured state - confirm in strongstream.cc.
uint64_t graphId:63, eventIsLagging:1;
#endif
};
// True once the strong stream has a non-null graph node, i.e. it has been
// captured at least once. Always false when the graph members are compiled
// out (CUDART_VERSION < 11030).
inline bool ncclStrongStreamEverCaptured(struct ncclStrongStream* ss) {
  bool captured = false;
#if CUDART_VERSION >= 11030
  captured = (nullptr != ss->node);
#endif
  return captured;
}
#endif
+7 -2
Ver fichero
@@ -21,7 +21,12 @@
#include "proxy.h"
extern struct ncclTransport ncclTransports[];
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
extern struct ncclTransport netTransport;
extern struct ncclTransport collNetTransport;
extern struct ncclTransport* ncclTransports[];
// Forward declarations
struct ncclRing;
@@ -66,7 +71,7 @@ struct ncclTransport {
struct ncclTransportComm recv;
};
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL);
enum { collNetRecv=0, collNetSend=1 };
+433 -64
Ver fichero
@@ -8,8 +8,12 @@
#define NCCL_UTILS_H_
#include "nccl.h"
#include "alloc.h"
#include "checks.h"
#include <stdint.h>
#include <time.h>
#include <sched.h>
#include <new>
int ncclCudaCompCap();
@@ -38,81 +42,446 @@ static long log2i(long n) {
return l;
}
// Recyclable list that avoids frequent malloc/free
// Read CLOCK_MONOTONIC and return the reading as a single nanosecond count
// (seconds scaled to nanoseconds plus the nanosecond remainder).
inline uint64_t clockNano() {
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);
  const uint64_t nanosPerSecond = 1000*1000*1000;
  return uint64_t(now.tv_sec)*nanosPerSecond + now.tv_nsec;
}
////////////////////////////////////////////////////////////////////////////////
// Atomically add one to the reference count at `refs` using relaxed ordering.
// The updated value is not reported to the caller.
template<typename Int>
inline void ncclAtomicRefCountIncrement(Int* refs) {
  (void)__atomic_add_fetch(refs, 1, __ATOMIC_RELAXED);
}
// Atomically subtract one from the reference count at `refs` with
// acquire-release ordering and return the count that remains afterwards.
template<typename Int>
inline Int ncclAtomicRefCountDecrement(Int* refs) {
  const Int remaining = __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL);
  return remaining;
}
////////////////////////////////////////////////////////////////////////////////
/* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that
* granularity of LIFO is not per object, instead frames containing many objects
* are pushed and popped. Therefore deallocation is extremely cheap since it's
* done at the frame granularity.
*
* The initial state of the stack is with one frame, the "nil" frame, which
* cannot be popped. Therefore objects allocated in the nil frame cannot be
* deallocated sooner than stack destruction.
*/
struct ncclMemoryStack;
void ncclMemoryStackConstruct(struct ncclMemoryStack* me);
void ncclMemoryStackDestruct(struct ncclMemoryStack* me);
void ncclMemoryStackPush(struct ncclMemoryStack* me);
void ncclMemoryStackPop(struct ncclMemoryStack* me);
template<typename T>
struct ncclListElem {
T data;
struct ncclListElem* next;
T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1);
////////////////////////////////////////////////////////////////////////////////
/* ncclMemoryPool: A free-list of same-sized allocations. It is invalid for
* a pool instance to ever hold objects whose types have differing
* (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by
* a backing `ncclMemoryStack` passed during Alloc(). If memory
* backing any currently held object is deallocated then it is an error to do
* anything other than reconstruct it, after which it is a valid empty pool.
*/
struct ncclMemoryPool;
// Equivalent to zero-initialization
void ncclMemoryPoolConstruct(struct ncclMemoryPool* me);
template<typename T>
T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing);
template<typename T>
void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj);
void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from);
////////////////////////////////////////////////////////////////////////////////
/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer
* field is given via the `next` template argument.
*
* Example:
* struct Foo {
* struct Foo *next1, *next2; // can be a member of two lists at once
* };
* ncclIntruQueue<Foo, &Foo::next1> list1;
* ncclIntruQueue<Foo, &Foo::next2> list2;
*/
template<typename T, T *T::*next>
struct ncclIntruQueue;
template<typename T, T *T::*next>
void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x);
template<typename T, T *T::*next>
T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me);
template<typename T, T *T::*next>
void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *memPool);
////////////////////////////////////////////////////////////////////////////////
/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex"
* and "cond" fields are part of the public interface.
*/
// Pairs one pthread mutex with one pthread condition variable. Both members
// are meant to be used directly by callers.
struct ncclThreadSignal {
// Mutex half of the pair.
pthread_mutex_t mutex;
// Condition-variable half of the pair; presumably waited on while holding
// `mutex` - confirm against the call sites.
pthread_cond_t cond;
};
template<typename T>
class ncclRecyclableList {
private:
struct ncclListElem<T>* head;
struct ncclListElem<T>* tail;
struct ncclListElem<T>* cursor;
int n;
// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}
constexpr ncclThreadSignal ncclThreadSignalStaticInitializer();
public:
ncclRecyclableList() {
tail = cursor = head = NULL;
n = 0;
}
void ncclThreadSignalConstruct(struct ncclThreadSignal* me);
void ncclThreadSignalDestruct(struct ncclThreadSignal* me);
int count() const { return n; }
// A convenience instance per-thread.
extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance;
// Get a new element from the list and return pointer
ncclResult_t getNewElem(T** dataOut) {
if (tail != NULL) {
*dataOut = &tail->data;
memset(*dataOut, 0, sizeof(T));
} else {
NCCLCHECK(ncclCalloc(&tail, 1));
*dataOut = &tail->data;
cursor = head = tail;
}
if (tail->next == NULL) {
NCCLCHECK(ncclCalloc(&tail->next, 1));
}
tail = tail->next;
n += 1;
return ncclSuccess;
}
////////////////////////////////////////////////////////////////////////////////
T* begin() {
if (head == NULL || head == tail) return NULL;
cursor = head->next;
return &head->data;
}
template<typename T, T *T::*next>
struct ncclIntruQueueMpsc;
// Get next element from the list during an iteration
T* getNext() {
// tail always points to the next element to be enqueued
// hence does not contain valid data
if (cursor == NULL || cursor == tail) return NULL;
T* rv = &cursor->data;
cursor = cursor->next;
return rv;
}
template<typename T, T *T::*next>
void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T,next>* me);
template<typename T, T *T::*next>
bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T,next>* me);
// Enqueue element. Returns true if queue is not abandoned. Even if queue is
// abandoned the element is still enqueued, so the caller needs to make
// arrangements for the queue to be tended.
template<typename T, T *T::*next>
bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc<T,next>* me, T* x);
// Dequeue all elements at a glance. If there aren't any and `waitSome` is
// true then this call will wait until it can return a non empty list.
template<typename T, T *T::*next>
T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc<T,next>* me, bool waitSome);
// Dequeue all elements and set queue to abandoned state.
template<typename T, T *T::*next>
T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc<T,next>* me);
T* peakNext() {
if (cursor == NULL || cursor == tail) return NULL;
return &cursor->data;
}
////////////////////////////////////////////////////////////////////////////////
// Recycle the list without freeing the space
void recycle() {
tail = cursor = head;
n = 0;
}
struct ncclMemoryStack {
struct Hunk {
struct Hunk* above; // reverse stack pointer
size_t size; // size of this allocation (including this header struct)
};
struct Unhunk { // proxy header for objects allocated out-of-hunk
struct Unhunk* next;
void* obj;
};
struct Frame {
struct Hunk* hunk; // top of non-empty hunks
uintptr_t bumper, end; // points into top hunk
struct Unhunk* unhunks;
struct Frame* below;
};
~ncclRecyclableList() {
while (head != NULL) {
struct ncclListElem<T>* temp = head;
head = head->next;
free(temp);
}
}
static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align);
static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align);
struct Hunk stub;
struct Frame topFrame;
};
inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) {
me->stub.above = nullptr;
me->stub.size = 0;
me->topFrame.hunk = &me->stub;
me->topFrame.bumper = 0;
me->topFrame.end = 0;
me->topFrame.unhunks = nullptr;
me->topFrame.below = nullptr;
}
// Bump-allocate `size` bytes aligned to `align` from the top frame. The
// rounding uses mask arithmetic, so `align` is assumed to be a power of two.
// If the aligned request does not fit before `topFrame.end`, the request is
// handed to allocateSpilled() (defined elsewhere) and the bump pointer is left
// for that function to manage. The returned memory is not zeroed here.
inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) {
  const uintptr_t mask = uintptr_t(align) - 1;
  const uintptr_t start = (me->topFrame.bumper + mask) & ~mask;
  if (__builtin_expect(start + size > me->topFrame.end, false)) {
    return allocateSpilled(me, size, align);
  }
  me->topFrame.bumper = start + size;
  return reinterpret_cast<void*>(start);
}
// Allocate room for `n` objects of type T from the stack (aligned for T) and
// zero-fill it. No constructors are run; the caller gets zeroed storage typed
// as T*.
template<typename T>
inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) {
  const size_t bytes = n*sizeof(T);
  void* raw = ncclMemoryStack::allocate(me, bytes, alignof(T));
  memset(raw, 0, bytes);
  return static_cast<T*>(raw);
}
inline void ncclMemoryStackPush(struct ncclMemoryStack* me) {
using Frame = ncclMemoryStack::Frame;
Frame tmp = me->topFrame;
Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame));
*snapshot = tmp; // C++ struct assignment
me->topFrame.unhunks = nullptr;
me->topFrame.below = snapshot;
}
inline void ncclMemoryStackPop(struct ncclMemoryStack* me) {
ncclMemoryStack::Unhunk* un = me->topFrame.unhunks;
while (un != nullptr) {
free(un->obj);
un = un->next;
}
me->topFrame = *me->topFrame.below; // C++ struct assignment
}
////////////////////////////////////////////////////////////////////////////////
// Free list of cells. ncclMemoryPoolAlloc() pops from `head` (or carves a new
// cell out of a backing ncclMemoryStack); ncclMemoryPoolFree() pushes back.
struct ncclMemoryPool {
// Link stored at the start of a cell while it sits on the free list.
struct Cell {
Cell *next;
};
// Storage that can hold either a free-list link or Size bytes aligned to Align.
template<int Size, int Align>
union CellSized {
Cell cell;
alignas(Align) char space[Size];
};
struct Cell* head;
struct Cell* tail; // meaningful only when head != nullptr
};
// Initialize an empty pool. Only `head` is written; `tail` is left as-is.
inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) {
me->head = nullptr;
}
// Obtain zero-filled storage for one T. A cell is reused from the pool's free
// list when available; otherwise a new cell is carved from `backing`.
template<typename T>
inline T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing) {
  typedef ncclMemoryPool::Cell Cell;
  typedef ncclMemoryPool::CellSized<sizeof(T), alignof(T)> Slot;
  Cell* cell = me->head;
  if (__builtin_expect(cell != nullptr, true)) {
    me->head = cell->next;
  } else {
    // Raw allocate(): it does not zero, and the memset below does that anyway.
    cell = (Cell*)ncclMemoryStack::allocate(backing, sizeof(Slot), alignof(Slot));
  }
  memset(cell, 0, sizeof(T));
  return reinterpret_cast<T*>(cell);
}
template<typename T>
inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) {
using Cell = ncclMemoryPool::Cell;
Cell* cell = reinterpret_cast<Cell*>(obj);
cell->next = me->head;
if (me->head == nullptr) me->tail = cell;
me->head = cell;
}
inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) {
if (from->head != nullptr) {
from->tail->next = me->head;
if (me->head == nullptr) me->tail = from->tail;
me->head = from->head;
from->head = nullptr;
}
}
////////////////////////////////////////////////////////////////////////////////
// Singly linked intrusive FIFO. `next` is a pointer-to-member naming the link
// field inside T, so the queue itself stores only the two end pointers.
template<typename T, T *T::*next>
struct ncclIntruQueue {
T *head, *tail;
};
// Initialize the queue to empty (both end pointers null).
template<typename T, T *T::*next>
inline void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me) {
me->head = nullptr;
me->tail = nullptr;
}
// True when the queue has no head element.
template<typename T, T *T::*next>
inline bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me) {
return me->head == nullptr;
}
// Returns the front element without removing it (nullptr when empty).
template<typename T, T *T::*next>
inline T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me) {
return me->head;
}
// Returns the stored tail pointer without removing anything.
template<typename T, T *T::*next>
inline T* ncclIntruQueueTail(ncclIntruQueue<T,next> *me) {
return me->tail;
}
// Append `x` at the back of the queue. x's link is cleared first.
template<typename T, T *T::*next>
inline void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x) {
  x->*next = nullptr;
  if (me->head) {
    me->tail->*next = x;  // chain after the current last element
  } else {
    me->head = x;         // queue was empty: x is also the front
  }
  me->tail = x;
}
// Remove and return the front element. The queue must not be empty: the head
// is dereferenced without a null check (see ncclIntruQueueTryDequeue).
template<typename T, T *T::*next>
inline T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me) {
  T *front = me->head;
  T *rest = front->*next;
  me->head = rest;
  if (rest == nullptr) me->tail = nullptr;
  return front;
}
// Remove and return the front element, or return nullptr (leaving the queue
// untouched) when the queue is empty.
template<typename T, T *T::*next>
inline T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me) {
  T *front = me->head;
  if (front == nullptr) return nullptr;
  me->head = front->*next;
  if (me->head == nullptr) me->tail = nullptr;
  return front;
}
// Empty the queue and hand every element back to `pool`.
//
// me:   queue to drain; left with head == tail == nullptr.
// pool: memory pool receiving each element via ncclMemoryPoolFree().
//
// Each element itself is released (not its successor), so the first element
// is returned to the pool and nullptr is never passed to ncclMemoryPoolFree().
template<typename T, T *T::*next>
void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *pool) {
  T *cur = me->head;
  me->head = nullptr;
  me->tail = nullptr;
  while (cur != nullptr) {
    // Read the link before freeing: ncclMemoryPoolFree() overwrites the start
    // of the object with its own free-list pointer.
    T *following = cur->*next;
    ncclMemoryPoolFree(pool, cur);
    cur = following;
  }
}
////////////////////////////////////////////////////////////////////////////////
// Compile-time initializer for a ncclThreadSignal, built from
// PTHREAD_MUTEX_INITIALIZER and PTHREAD_COND_INITIALIZER.
constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() {
return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER};
}
// Runtime initialization of the mutex and condition variable with default
// attributes. The pthread return codes are not checked.
inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) {
pthread_mutex_init(&me->mutex, nullptr);
pthread_cond_init(&me->cond, nullptr);
}
// Destroy the mutex and condition variable. The pthread return codes are not checked.
inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) {
pthread_mutex_destroy(&me->mutex);
pthread_cond_destroy(&me->cond);
}
////////////////////////////////////////////////////////////////////////////////
// Intrusive queue accessed with atomics (presumably multi-producer /
// single-consumer, per the name). `tail` holds either a pointer to the last
// enqueued element or one of three sentinels used by the functions below:
//   0x0 = empty, 0x1 = consumer is waiting, 0x2 = abandoned.
template<typename T, T *T::*next>
struct ncclIntruQueueMpsc {
T* head;
uintptr_t tail;
// Signal the waiting consumer sleeps on; written by the consumer before it
// publishes the 0x1 sentinel and read by a producer that observes 0x1.
struct ncclThreadSignal* waiting;
};
// Initialize to the empty state: no head, tail sentinel 0x0, no waiter.
template<typename T, T *T::*next>
void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc<T,next>* me) {
me->head = nullptr;
me->tail = 0x0;
me->waiting = nullptr;
}
// True when `tail` currently holds a sentinel (<= 0x2) rather than an element
// pointer. Uses a relaxed load, so the answer is only a snapshot.
template<typename T, T *T::*next>
bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc<T,next>* me) {
return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2;
}
// Append `x` to the queue.
// Returns false when the previous tail value was the abandoned sentinel (0x2),
// true otherwise; `x` is linked into the queue in either case.
template<typename T, T *T::*next>
bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc<T,next>* me, T* x) {
__atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED);
// Publish x as the new tail; the value swapped out says where x must be linked.
uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast<uintptr_t>(x), __ATOMIC_ACQ_REL);
T* prev = reinterpret_cast<T*>(utail);
// A sentinel (<= 0x2) means there was no previous element, so x becomes the head.
T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next);
__atomic_store_n(prevNext, x, __ATOMIC_RELAXED);
if (utail == 0x1) { // waiting
__atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting
// This lock/unlock is essential to ensure we don't race ahead of the consumer
// and signal the cond before they begin waiting on it.
struct ncclThreadSignal* waiting = me->waiting;
pthread_mutex_lock(&waiting->mutex);
pthread_mutex_unlock(&waiting->mutex);
pthread_cond_broadcast(&waiting->cond);
}
return utail != 0x2; // not abandoned
}
// Detach and return the whole list currently in the queue (linked via `next`).
// waitSome == false: returns nullptr immediately when no head is visible.
// waitSome == true:  spins for roughly the first 10us, then sleeps on
//                    ncclThreadSignalLocalInstance until a head appears.
template<typename T, T *T::*next>
T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc<T,next>* me, bool waitSome) {
T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
if (head == nullptr) {
if (!waitSome) return nullptr;
uint64_t t0 = clockNano();
bool sleeping = false;
do {
if (clockNano()-t0 >= 10*1000) { // spin for first 10us
struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance;
pthread_mutex_lock(&waitSignal->mutex);
// On the first attempt tail must still be "empty" (0x0); after a wakeup
// without a visible head it is expected to still be "waiting" (0x1).
uintptr_t expected = sleeping ? 0x1 : 0x0;
uintptr_t desired = 0x1;
me->waiting = waitSignal; // release done by successful compare exchange
// A failed exchange just falls through to re-check head and loop again.
if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
sleeping = true;
pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex);
}
pthread_mutex_unlock(&waitSignal->mutex);
}
head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
} while (head == nullptr);
}
// Take ownership: clear head, then reset tail to "empty" and capture the old tail.
__atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL);
T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);
T *x = head;
// Walk to the captured tail. A producer swaps the tail before it stores the
// predecessor's next link, so a null link here means "not linked yet": wait for it.
while (x != tail) {
T *x1;
int spins = 0;
while (true) {
x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
if (x1 != nullptr) break;
if (++spins == 1024) { spins = 1024-1; sched_yield(); }
}
x = x1;
}
return head;
}
// Mark the queue abandoned (tail sentinel 0x2) and return whatever it held.
//
// Returns nullptr when the queue was empty, otherwise the head of the detached
// list (linked via `next`). After this call ncclIntruQueueMpscEnqueue() reports
// false to producers.
template<typename T, T *T::*next>
T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc<T,next>* me) {
  uintptr_t expected = 0x0;
  // Strong compare-exchange on purpose: this is a single attempt, not a retry
  // loop. A spurious failure of a weak exchange on an empty queue would send us
  // into the branch below, which spins until a head shows up — forever if no
  // producer ever enqueues.
  if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) {
    return nullptr;
  }

  // tail was not "empty": some producer has swapped itself in and will store
  // the head shortly. Wait until the head is visible.
  int spins = 0;
  T* head;
  while (true) {
    head = __atomic_load_n(&me->head, __ATOMIC_RELAXED);
    if (head != nullptr) break;
    if (++spins == 1024) { spins = 1024-1; sched_yield(); }
  }

  // Take ownership of the list and publish the abandoned sentinel.
  __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED);
  uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL);
  T* tail = utail <= 0x2 ? nullptr : reinterpret_cast<T*>(utail);

  // Walk to the captured tail, waiting for producers that have swapped the
  // tail but not yet stored their predecessor's next link.
  T *x = head;
  while (x != tail) {
    T *x1;
    spins = 0;
    while (true) {
      x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED);
      if (x1 != nullptr) break;
      if (++spins == 1024) { spins = 1024-1; sched_yield(); }
    }
    x = x1;
  }
  return head;
}
#endif
+421 -312
Ver fichero
La diferencia del archivo ha sido suprimido porque es demasiado grande Cargar Diff
+1 -6
Ver fichero
@@ -45,12 +45,7 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
return ncclInvalidArgument;
}
// Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P/AllToAllPivot calls to chars.
info->nBytes = info->count * ncclTypeSize(info->datatype);
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAllPivot) {
info->count = info->nBytes;
info->datatype = ncclInt8;
}
if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
NCCLCHECK(ncclInfoSetDerived(info, info->comm->nRanks));
if (info->op < 0 || ncclMaxRedOp < info->op) {
WARN("%s : invalid reduction operation %d", info->opName, info->op);
+163
Ver fichero
@@ -0,0 +1,163 @@
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "debug.h"
#include "cudawrap.h"
#include <dlfcn.h>
// Defines a function pointer named pfn_<symbol> of type PFN_<symbol>, initially null.
#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
#if CUDART_VERSION >= 11030
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
DECLARE_CUDA_PFN(cuDeviceGet);
DECLARE_CUDA_PFN(cuDeviceGetAttribute);
DECLARE_CUDA_PFN(cuGetErrorString);
DECLARE_CUDA_PFN(cuGetErrorName);
/* enqueue.cc */
DECLARE_CUDA_PFN(cuMemGetAddressRange);
/* proxy.cc */
DECLARE_CUDA_PFN(cuCtxCreate_v3020);
DECLARE_CUDA_PFN(cuCtxDestroy);
DECLARE_CUDA_PFN(cuCtxSetCurrent);
#if CUDA_VERSION >= 11070
/* transport/collNet.cc/net.cc*/
DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support
#endif
#endif
/* CUDA Driver functions loaded with dlsym() */
DECLARE_CUDA_PFN(cuInit);
DECLARE_CUDA_PFN(cuDriverGetVersion);
DECLARE_CUDA_PFN(cuGetProcAddress);
// One-shot initialization state machine driven by cudaLibraryInit().
static enum { cudaUninitialized, cudaInitializing, cudaInitialized, cudaError } cudaState = cudaUninitialized;
// Oldest driver version cudaLibraryInit() accepts.
#define CUDA_DRIVER_MIN_VERSION 11030
// dlopen() handle for the driver library and the version it reported.
static void *cudaLib;
static int cudaDriverVersion;
#if CUDART_VERSION >= 11030
/*
Load the CUDA symbols
*/
// Resolve the versioned driver entry points through pfn_cuGetProcAddress,
// requesting the version stored in cudaDriverVersion.
// LOAD_SYM(symbol, ignore): a failed lookup is fatal (WARN + return
// ncclSystemError) when `ignore` is 0 and silently tolerated when it is 1.
// Returns ncclSuccess (as int) when every required symbol was resolved.
static int cudaPfnFuncLoader(void) {
CUresult res;
#define LOAD_SYM(symbol, ignore) do { \
res = pfn_cuGetProcAddress(#symbol, (void **) (&pfn_##symbol), cudaDriverVersion, 0); \
if (res != 0) { \
if (!ignore) { \
WARN("Retrieve %s version %d failed with %d", #symbol, cudaDriverVersion, res); \
return ncclSystemError; } \
} } while(0)
// Required symbols.
LOAD_SYM(cuGetErrorString, 0);
LOAD_SYM(cuGetErrorName, 0);
LOAD_SYM(cuDeviceGet, 0);
LOAD_SYM(cuDeviceGetAttribute, 0);
// Optional symbols: their pfn_ pointer is not checked here after a failed lookup.
LOAD_SYM(cuMemGetAddressRange, 1);
LOAD_SYM(cuCtxCreate_v3020, 1);
LOAD_SYM(cuCtxDestroy, 1);
LOAD_SYM(cuCtxSetCurrent, 1);
#if CUDA_VERSION >= 11070
LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support
#endif
return ncclSuccess;
}
#endif
/*
 * Load the CUDA driver library and resolve the entry points NCCL uses.
 *
 * The first caller performs the load; callers that lose the race spin with
 * sched_yield() until it finishes and then report its outcome. The outcome is
 * sticky: once the state is cudaInitialized or cudaError, later calls return
 * immediately.
 *
 * The library name is "libcuda.so" unless NCCL_CUDA_PATH is set, in which case
 * its value is prepended verbatim (no path separator is inserted).
 *
 * Returns ncclSuccess when the driver was loaded and initialized, and
 * ncclSystemError otherwise: library or symbol missing, version query failed,
 * or driver older than CUDA_DRIVER_MIN_VERSION.
 */
ncclResult_t cudaLibraryInit(void) {
  CUresult res;

  if (cudaState == cudaInitialized)
    return ncclSuccess;
  if (cudaState == cudaError)
    return ncclSystemError;

  if (__sync_bool_compare_and_swap(&cudaState, cudaUninitialized, cudaInitializing) == false) {
    // Another thread raced in front of us. Wait for it to be done.
    while (cudaState == cudaInitializing) sched_yield();
    return (cudaState == cudaInitialized) ? ncclSuccess : ncclSystemError;
  }

  /*
   * Load CUDA driver library
   */
  char path[1024];
  char *ncclCudaPath = getenv("NCCL_CUDA_PATH");
  if (ncclCudaPath == NULL)
    snprintf(path, 1024, "%s", "libcuda.so");
  else
    snprintf(path, 1024, "%s%s", ncclCudaPath, "libcuda.so");

  cudaLib = dlopen(path, RTLD_LAZY);
  if (cudaLib == NULL) {
    // Report the path actually handed to dlopen(), and never pass NULL to %s
    // (the variable is NULL whenever NCCL_CUDA_PATH is unset).
    WARN("Failed to find CUDA library in %s (NCCL_CUDA_PATH=%s)", path, ncclCudaPath ? ncclCudaPath : "(unset)");
    goto error;
  }

  /*
   * Load initial CUDA functions
   */
  pfn_cuInit = (PFN_cuInit) dlsym(cudaLib, "cuInit");
  if (pfn_cuInit == NULL) {
    WARN("Failed to load CUDA missing symbol cuInit");
    goto error;
  }

  pfn_cuDriverGetVersion = (PFN_cuDriverGetVersion) dlsym(cudaLib, "cuDriverGetVersion");
  if (pfn_cuDriverGetVersion == NULL) {
    WARN("Failed to load CUDA missing symbol cuDriverGetVersion");
    goto error;
  }

  res = pfn_cuDriverGetVersion(&cudaDriverVersion);
  if (res != 0) {
    WARN("cuDriverGetVersion failed with %d", res);
    goto error;
  }

  INFO(NCCL_INIT, "cudaDriverVersion %d", cudaDriverVersion);

  if (cudaDriverVersion < CUDA_DRIVER_MIN_VERSION) {
    // WARN("CUDA Driver version found is %d. Minimum requirement is %d", cudaDriverVersion, CUDA_DRIVER_MIN_VERSION);
    // Silently ignore version check mismatch for backwards compatibility
    goto error;
  }

  pfn_cuGetProcAddress = (PFN_cuGetProcAddress) dlsym(cudaLib, "cuGetProcAddress");
  if (pfn_cuGetProcAddress == NULL) {
    WARN("Failed to load CUDA missing symbol cuGetProcAddress");
    goto error;
  }

  /*
   * Required to initialize the CUDA Driver.
   * Multiple calls of cuInit() will return immediately
   * without making any relevant change
   */
  pfn_cuInit(0);

#if CUDART_VERSION >= 11030
  if (cudaPfnFuncLoader()) {
    WARN("CUDA some PFN functions not found in the library");
    goto error;
  }
#endif

  cudaState = cudaInitialized;
  return ncclSuccess;

error:
  cudaState = cudaError;
  return ncclSystemError;
}
+1 -1
Ver fichero
@@ -57,7 +57,7 @@ ncclResult_t wrap_gdr_symbols(void) {
if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) {
// Another thread raced in front of us. Wait for it to be done.
while (gdrState == gdrInitializing) pthread_yield();
while (gdrState == gdrInitializing) sched_yield();
return (gdrState == gdrInitialized) ? ncclSuccess : ncclSystemError;
}
+20 -3
Ver fichero
@@ -30,6 +30,8 @@ struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
/* DMA-BUF support */
struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
@@ -49,7 +51,7 @@ ncclResult_t wrap_ibv_symbols(void) {
if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) {
// Another thread raced in front of us. Wait for it to be done.
while (ibvState == ibvInitializing) pthread_yield();
while (ibvState == ibvInitializing) sched_yield();
return (ibvState == ibvInitialized) ? ncclSuccess : ncclSystemError;
}
@@ -98,6 +100,8 @@ ncclResult_t wrap_ibv_symbols(void) {
LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
// Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
// Cherry-pick the ibv_reg_dmabuf_mr API from IBVERBS 1.12
LOAD_SYM_VERSION(ibvhandle, "ibv_reg_dmabuf_mr", ibv_internal_reg_dmabuf_mr, "IBVERBS_1.12");
LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
@@ -126,6 +130,7 @@ teardown:
ibv_internal_dealloc_pd = NULL;
ibv_internal_reg_mr = NULL;
ibv_internal_reg_mr_iova2 = NULL;
ibv_internal_reg_dmabuf_mr = NULL;
ibv_internal_dereg_mr = NULL;
ibv_internal_create_cq = NULL;
ibv_internal_destroy_cq = NULL;
@@ -259,7 +264,7 @@ ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or
}
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
IBV_PTR_CHECK(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
}
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
@@ -275,7 +280,19 @@ ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void
return ncclInternalError;
}
if (ret == NULL) { return ncclSuccess; } // Assume dummy call
IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
}
/* DMA-BUF support */
// Checked wrapper around the dynamically loaded ibv_reg_dmabuf_mr entry point.
// The resulting memory region is stored through `ret`; the function pointer
// check, error reporting and the return value all come from
// IBV_PTR_CHECK_ERRNO (defined elsewhere — presumably it returns an
// ncclResult_t and logs errno on a NULL result; confirm against the macro).
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
IBV_PTR_CHECK_ERRNO(ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
}
// Unchecked pass-through to ibv_reg_dmabuf_mr: forwards the arguments and
// returns the library's result as-is, or NULL when the symbol was not loaded.
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
  if (ibv_internal_reg_dmabuf_mr != NULL) {
    return ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
  }
  return NULL;
}
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+119
Ver fichero
@@ -0,0 +1,119 @@
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "debug.h"
#include "rocmwrap.h"
#include <dlfcn.h>
// Defines a function pointer named pfn_<symbol> of type PFN_<symbol>, initially null.
#define DECLARE_ROCM_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support
/* ROCr Driver functions loaded with dlsym() */
DECLARE_ROCM_PFN(hsa_init);
DECLARE_ROCM_PFN(hsa_system_get_info);
DECLARE_ROCM_PFN(hsa_status_string);
// One-shot initialization state machine driven by rocmLibraryInit().
static enum { hsaUninitialized, hsaInitializing, hsaInitialized, hsaError } hsaState = hsaUninitialized;
// dlopen() handle for the runtime library and the version it reported.
static void *hsaLib;
static uint16_t version_major, version_minor;
/*
 * Load the ROCr (HSA) runtime library and resolve the entry points RCCL uses.
 *
 * The first caller performs the load; callers that lose the race spin with
 * sched_yield() until it finishes and then report its outcome. The outcome is
 * sticky: once the state is hsaInitialized or hsaError, later calls return
 * immediately.
 *
 * The library name is "libhsa-runtime64.so" unless RCCL_ROCR_PATH is set, in
 * which case its value is prepended verbatim (no path separator is inserted).
 *
 * Returns ncclSuccess when the runtime was loaded and initialized, and
 * ncclSystemError otherwise: library or any of the looked-up symbols missing
 * (including hsa_amd_portable_export_dmabuf), or a version query failure.
 */
ncclResult_t rocmLibraryInit(void) {
  hsa_status_t res;

  if (hsaState == hsaInitialized)
    return ncclSuccess;
  if (hsaState == hsaError)
    return ncclSystemError;

  if (__sync_bool_compare_and_swap(&hsaState, hsaUninitialized, hsaInitializing) == false) {
    // Another thread raced in front of us. Wait for it to be done.
    while (hsaState == hsaInitializing) sched_yield();
    return (hsaState == hsaInitialized) ? ncclSuccess : ncclSystemError;
  }

  /*
   * Load ROCr driver library
   */
  char path[1024];
  char *rocrPath = getenv("RCCL_ROCR_PATH");
  if (rocrPath == NULL)
    snprintf(path, 1024, "%s", "libhsa-runtime64.so");
  else
    snprintf(path, 1024, "%s%s", rocrPath, "libhsa-runtime64.so");

  hsaLib = dlopen(path, RTLD_LAZY);
  if (hsaLib == NULL) {
    // Report the path actually handed to dlopen(), and never pass NULL to %s
    // (the variable is NULL whenever RCCL_ROCR_PATH is unset).
    WARN("Failed to find ROCm runtime library in %s (RCCL_ROCR_PATH=%s)", path, rocrPath ? rocrPath : "(unset)");
    goto error;
  }

  /*
   * Load initial ROCr functions
   */
  pfn_hsa_init = (PFN_hsa_init) dlsym(hsaLib, "hsa_init");
  if (pfn_hsa_init == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_init");
    goto error;
  }

  pfn_hsa_system_get_info = (PFN_hsa_system_get_info) dlsym(hsaLib, "hsa_system_get_info");
  if (pfn_hsa_system_get_info == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_system_get_info");
    goto error;
  }

  pfn_hsa_status_string = (PFN_hsa_status_string) dlsym(hsaLib, "hsa_status_string");
  if (pfn_hsa_status_string == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_status_string");
    goto error;
  }

  res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major);
  if (res != 0) {
    WARN("pfn_hsa_system_get_info failed with %d", res);
    goto error;
  }
  res = pfn_hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor);
  if (res != 0) {
    WARN("pfn_hsa_system_get_info failed with %d", res);
    goto error;
  }

  INFO(NCCL_INIT, "ROCr version %d.%d", version_major, version_minor);

  //if (hsaDriverVersion < ROCR_DRIVER_MIN_VERSION) {
    // WARN("ROCr Driver version found is %d. Minimum requirement is %d", hsaDriverVersion, ROCR_DRIVER_MIN_VERSION);
    // Silently ignore version check mismatch for backwards compatibility
    //goto error;
  //}

  pfn_hsa_amd_portable_export_dmabuf = (PFN_hsa_amd_portable_export_dmabuf) dlsym(hsaLib, "hsa_amd_portable_export_dmabuf");
  if (pfn_hsa_amd_portable_export_dmabuf == NULL) {
    WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
    goto error;
  }

  /*
   * Required to initialize the ROCr Driver.
   * Multiple calls of hsa_init() will return immediately
   * without making any relevant change
   */
  pfn_hsa_init();

  hsaState = hsaInitialized;
  return ncclSuccess;

error:
  hsaState = hsaError;
  return ncclSystemError;
}
+3 -3
Ver fichero
@@ -59,15 +59,15 @@ ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void**
NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError);
if (devShmPtr) {
CUDACHECKGOTO(hipHostRegister(ptr, shmSize, hipHostRegisterMapped), res, cudaError);
CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
CUDACHECKGOTO(hipHostRegister(ptr, shmSize, hipHostRegisterMapped), res, hipError_t);
CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError_t);
}
*shmPtr = ptr;
return ncclSuccess;
sysError:
WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmPath, shmSize);
cudaError:
hipError_t:
if (fd != -1) close(fd);
if (create) shm_unlink(shmPath);
if (ptr != MAP_FAILED) munmap(ptr, shmSize);
Archivo normal → Archivo ejecutable
Ver fichero
+24 -23
Ver fichero
@@ -15,6 +15,9 @@
#include <vector>
#include <utility>
#include <unordered_set>
#include <unistd.h>
#include <sys/syscall.h>
static std::vector<std::pair<int, std::unordered_set<std::string>>> clientPortPool;
/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
@@ -337,9 +340,10 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
#endif
}
/* make all new sockets non-blocking */
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
if (sock->asyncFlag) {
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
}
// addr port should be 0 (Any port)
SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
@@ -378,7 +382,7 @@ static ncclResult_t getFdState(int fd, enum ncclSocketState* state) {
SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
}
if (ret == EINPROGRESS)
if (ret == EINPROGRESS || ret == ECONNREFUSED)
*state = ncclSocketConnecting;
else if (ret == 0)
*state = ncclSocketConnected;
@@ -414,10 +418,12 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {
const int one = 1;
SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
/* support non-blocking socket; by default, the socket is non-blocking */
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
if (sock->asyncFlag) {
EQCHECK(flags = fcntl(fd, F_GETFL), -1);
SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
}
/* const int bufsize = 128*1024;
SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
@@ -458,31 +464,26 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock, int portReuse) {
int timedout_retries = 0;
int refused_retries = 0;
retry:
/* async connect; abort when error happens and abortFlag is present. */
/* blocking/non-blocking connect() is determined by asyncFlag. */
ret = connect(fd, &sock->addr.sa, salen);
if (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
if (!sock->asyncFlag && (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
(errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES))) {
if (errno == ECONNREFUSED && refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
usleep(SLEEP_INT);
goto retry;
} else if (errno == EINPROGRESS && !sock->asyncFlag) {
enum ncclSocketState state;
do {
if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0);
NCCLCHECK(getFdState(fd, &state));
} while (state == ncclSocketConnecting);
EQCHECK(state, ncclSocketError);
ret = 0;
}
if (ret == 0 || (errno == EINPROGRESS && sock->asyncFlag)) {
/* If connect() fails with errno == EAGAIN/EINPROGRESS/ETIMEDOUT, we may want to try connect again.
* However, it can return EISCONN instead of success which indicates connection is built up in
* background already. No need to call connect() again. */
if (ret == 0 || ((errno == EINPROGRESS || errno == ECONNREFUSED) && sock->asyncFlag) || errno == EISCONN) {
sock->fd = fd;
return ncclSuccess;
}
WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
return ncclSystemError;
return ncclRemoteError;
}
ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) {
@@ -535,7 +536,7 @@ static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void*
if (bytes == -1) {
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
return ncclSystemError;
return ncclRemoteError;
} else {
bytes = 0;
}
@@ -555,7 +556,7 @@ ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int
if (closed) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0));
return ncclSystemError;
return ncclRemoteError;
}
return ncclSuccess;
}
+273
Ver fichero
@@ -0,0 +1,273 @@
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "strongstream.h"
#include "checks.h"
#include "param.h"
////////////////////////////////////////////////////////////////////////////////
// Determine whether `stream` is currently being captured into a graph and
// describe that graph in `*graph`.
// When CUDART_VERSION < 11030 the body is compiled out: `*graph` is not
// written and ncclSuccess is returned.
// Returns ncclInvalidUsage when the stream is capturing but the installed
// driver is older than 11030.
ncclResult_t ncclCudaGetCapturingGraph(
struct ncclCudaGraph* graph, hipStream_t stream
) {
#if CUDART_VERSION >= 11030
// Driver version is queried once per thread and cached.
thread_local int driver = -1;
if (driver == -1) {
CUDACHECK(cudaDriverGetVersion(&driver));
}
if (driver < 11030) {
// Old driver: only the capture status can be queried; capturing is rejected.
cudaStreamCaptureStatus status;
unsigned long long gid;
graph->graph = nullptr;
CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid));
if (status != cudaStreamCaptureStatusNone) {
WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
return ncclInvalidUsage;
}
} else {
cudaStreamCaptureStatus status;
unsigned long long gid;
CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr));
if (status != cudaStreamCaptureStatusActive) {
// Not capturing: report "no graph" with the all-ones id.
graph->graph = nullptr;
gid = ULLONG_MAX;
}
graph->graphId = gid;
}
#endif
return ncclSuccess;
}
// Attach a user object to `graph` whose destroy callback is `fn(arg)`;
// ownership of the object's single reference is moved to the graph.
// Returns ncclInvalidUsage when built with CUDART_VERSION < 11030.
ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, hipHostFn_t fn, void* arg) {
#if CUDART_VERSION >= 11030
cudaUserObject_t object;
CUDACHECK(cudaUserObjectCreate(
&object, arg, fn, /*initialRefcount=*/1, cudaUserObjectNoDestructorSync
));
// Hand over ownership to CUDA Graph
CUDACHECK(cudaGraphRetainUserObject(graph.graph, object, 1, cudaGraphUserObjectMove));
return ncclSuccess;
#else
return ncclInvalidUsage;
#endif
}
////////////////////////////////////////////////////////////////////////////////
// Create the non-blocking stream and the timing-disabled event owned by `ss`.
// With CUDART_VERSION >= 11030 the graph bookkeeping is also reset: no node,
// not lagging, and graphId set to the largest positive signed 64-bit value.
// NOTE(review): if the event creation fails, the already-created stream is
// not destroyed before the error is returned — confirm whether callers clean up.
ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) {
CUDACHECK(hipStreamCreateWithFlags(&ss->stream, hipStreamNonBlocking));
CUDACHECK(hipEventCreateWithFlags(&ss->event, hipEventDisableTiming));
#if CUDART_VERSION >= 11030
ss->node = nullptr;
ss->graphId = (1ull<<(8*sizeof(long long)-1))-1;
ss->eventIsLagging = 0;
#endif
return ncclSuccess;
}
// Release the resources created by ncclStrongStreamConstruct().
//
// The event is created unconditionally there, so it is destroyed
// unconditionally here; guarding the destroy with CUDART_VERSION would leak
// the event in builds where that guard is false.
// Returns ncclSuccess, or the error produced by CUDACHECK on failure.
ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) {
  CUDACHECK(hipEventDestroy(ss->event));
  CUDACHECK(hipStreamDestroy(ss->stream));
  return ncclSuccess;
}
NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1)
// Prepare `ss` for work issued either directly on its stream
// (graph.graph == nullptr) or as nodes of the capturing graph.
// Compiled to a no-op when CUDART_VERSION < 11030.
ncclResult_t ncclStrongStreamAcquire(
struct ncclCudaGraph graph, struct ncclStrongStream* ss
) {
#if CUDART_VERSION >= 11030
bool mixing = ncclParamGraphMixingSupport();
if (graph.graph == nullptr) {
// Uncaptured use: order the stream after the event if the stream has ever been captured.
if (mixing && ncclStrongStreamEverCaptured(ss)) {
CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
ss->eventIsLagging = 0;
}
} else {
// First acquire for this particular graph: start a fresh node chain.
if (ss->graphId != graph.graphId) {
if (mixing && ss->eventIsLagging) {
// Can only be here if previous release was for uncaptured work that
// elided updating the event because no capture had yet occurred.
CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
CUDACHECK(cudaEventRecord(ss->event, ss->stream));
}
ss->graphId = graph.graphId;
ss->eventIsLagging = 0;
// The chain's root is an event-wait node when mixing is enabled, otherwise an empty node.
if (mixing) {
CUDACHECK(cudaGraphAddEventWaitNode(&ss->node, graph.graph, nullptr, 0, ss->event));
} else {
CUDACHECK(cudaGraphAddEmptyNode(&ss->node, graph.graph, nullptr, 0));
}
}
}
#endif
return ncclSuccess;
}
// Acquire `ss` for direct (non-graph) use: wait on the event if the stream has
// ever been captured and mixing support is on, then mark the event as lagging.
// Compiled to a no-op when CUDART_VERSION < 11030.
ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) {
#if CUDART_VERSION >= 11030
bool mixing = ncclParamGraphMixingSupport();
if (mixing && ncclStrongStreamEverCaptured(ss)) {
CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
}
ss->eventIsLagging = 1; // Assume the caller is going to add work to stream.
#endif
return ncclSuccess;
}
// Finish a period of use started by an acquire. When mixing support is on and
// the event is lagging, the event is brought up to date: recorded on the
// stream (uncaptured, and only if the stream was ever captured) or appended to
// the graph as an event-record node after ss->node.
// Compiled to a no-op when CUDART_VERSION < 11030.
ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) {
#if CUDART_VERSION >= 11030
bool mixing = ncclParamGraphMixingSupport();
if (mixing && ss->eventIsLagging) {
if (graph.graph == nullptr) {
if (ncclStrongStreamEverCaptured(ss)) {
CUDACHECK(cudaEventRecord(ss->event, ss->stream));
ss->eventIsLagging = 0;
}
} else {
CUDACHECK(cudaGraphAddEventRecordNode(&ss->node, graph.graph, &ss->node, 1, ss->event));
ss->eventIsLagging = 0;
}
}
#endif
return ncclSuccess;
}
// Schedule the host function `fn(arg)` on `ss`: directly on the stream when
// not capturing, otherwise as a host node appended after ss->node.
ncclResult_t ncclStrongStreamLaunchHost(
struct ncclCudaGraph graph, struct ncclStrongStream* ss, hipHostFn_t fn, void* arg
) {
#if CUDART_VERSION >= 11030
if (graph.graph == nullptr) {
CUDACHECK(cudaLaunchHostFunc(ss->stream, fn, arg));
} else {
cudaHostNodeParams p;
p.fn = fn;
p.userData = arg;
CUDACHECK(cudaGraphAddHostNode(&ss->node, graph.graph, &ss->node, 1, &p));
}
ss->eventIsLagging = 1;
#else
// NOTE(review): `fn` is cast from hipHostFn_t to hipStreamCallback_t. If the
// two signatures differ (a host function taking only the user pointer versus a
// stream callback taking stream, status and user pointer), `arg` will not
// arrive as fn's first parameter — confirm against the HIP headers.
//CUDACHECK(hipLaunchHostFunc(ss->stream, fn, arg));
CUDACHECK(hipStreamAddCallback(ss->stream, (hipStreamCallback_t)fn, arg, 0));
#endif
return ncclSuccess;
}
// Launch kernel `fn` with the given grid/block/args/shared-memory size on
// `ss`: directly on the stream when not capturing, otherwise as a kernel node
// that depends on the current ss->node and then becomes the new ss->node.
ncclResult_t ncclStrongStreamLaunchKernel(
struct ncclCudaGraph graph, struct ncclStrongStream* ss,
void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes
) {
#if CUDART_VERSION >= 11030
if (graph.graph == nullptr) {
CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream));
} else {
// Copy the dependency first: ss->node is also the output argument below.
cudaGraphNode_t tip = ss->node;
cudaKernelNodeParams p;
p.func = fn;
p.gridDim = grid;
p.blockDim = block;
p.kernelParams = args;
p.sharedMemBytes = sharedMemBytes;
p.extra = nullptr;
CUDACHECK(cudaGraphAddKernelNode(&ss->node, graph.graph, &tip, 1, &p));
}
ss->eventIsLagging = 1;
#else
CUDACHECK(hipLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->stream));
#endif
return ncclSuccess;
}
// Make strong stream `a` wait for the work currently on strong stream `b`.
// Uncaptured: record b's event if it is lagging, then have a's stream wait on it.
// Captured: replace a->node with an empty node depending on both a->node and b->node.
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b
) {
#if CUDART_VERSION >= 11030
if (graph.graph == nullptr) {
if (b->eventIsLagging) {
b->eventIsLagging = 0;
CUDACHECK(cudaEventRecord(b->event, b->stream));
}
CUDACHECK(cudaStreamWaitEvent(a->stream, b->event, 0));
a->eventIsLagging = 1;
} else {
cudaGraphNode_t pair[2] = {a->node, b->node};
CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2));
}
#else
// Without graph support the event is always re-recorded before waiting.
CUDACHECK(hipEventRecord(b->event, b->stream));
CUDACHECK(hipStreamWaitEvent(a->stream, b->event, 0));
#endif
return ncclSuccess;
}
// Make strong stream `a` wait for the work currently on plain stream `b`.
// Uncaptured: a's own event is recorded on `b` and a's stream waits on it.
// Captured: `b` must be capturing into the same graph (else ncclInvalidUsage);
// a->node is replaced by an empty node joining a->node with b's current
// capture dependencies.
ncclResult_t ncclStrongStreamWaitStream(
struct ncclCudaGraph graph, struct ncclStrongStream* a, hipStream_t b
) {
#if CUDART_VERSION >= 11030
if (graph.graph == nullptr) {
CUDACHECK(cudaEventRecord(a->event, b));
CUDACHECK(cudaStreamWaitEvent(a->stream, a->event, 0));
// We used a->event to record b so it no longer reflects anything about a.
a->eventIsLagging = 1;
} else {
cudaStreamCaptureStatus status;
unsigned long long gid1;
cudaGraphNode_t const* deps;
size_t depN = 0;
CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &gid1, nullptr, &deps, &depN));
if (status != cudaStreamCaptureStatusActive || graph.graphId != gid1) {
WARN("Stream is not being captured by the expected graph.");
return ncclInvalidUsage;
}
// Nothing to add when b has no dependencies or its only dependency is already a->node.
if (depN > 0 && (depN > 1 || deps[0] != a->node)) {
cudaGraphNode_t tie;
if (depN == 1) {
tie = deps[0];
} else {
// Collapse several dependencies into one empty node first.
CUDACHECK(cudaGraphAddEmptyNode(&tie, graph.graph, deps, depN));
}
cudaGraphNode_t pair[2] = {a->node, tie};
CUDACHECK(cudaGraphAddEmptyNode(&a->node, graph.graph, pair, 2));
}
// a->eventIsLagging doesn't change since we are just updating the
// dependencies of a->node.
}
#else
CUDACHECK(hipEventRecord(a->event, b));
CUDACHECK(hipStreamWaitEvent(a->stream, a->event, 0));
#endif
return ncclSuccess;
}
// Order the plain stream `a` after the work currently on strong stream `b`.
ncclResult_t ncclStrongStreamWaitStream(
    struct ncclCudaGraph graph, hipStream_t a, struct ncclStrongStream* b
  ) {
#if CUDART_VERSION >= 11030
  if (graph.graph != nullptr) {
    // Capturing: add b's tip node to the capture dependencies of `a`.
    CUDACHECK(cudaStreamUpdateCaptureDependencies(a, &b->node, 1, cudaStreamAddCaptureDependencies));
    return ncclSuccess;
  }
  // Refresh b's event only if work was added since it was last recorded.
  if (b->eventIsLagging) {
    b->eventIsLagging = 0;
    CUDACHECK(cudaEventRecord(b->event, b->stream));
  }
  CUDACHECK(cudaStreamWaitEvent(a, b->event, 0));
#else
  CUDACHECK(hipEventRecord(b->event, b->stream));
  CUDACHECK(hipStreamWaitEvent(a, b->event, 0));
#endif
  return ncclSuccess;
}
// Block the calling host thread until the work on `ss->stream` has completed.
// Returns ncclSuccess, or whatever CUDACHECK returns when a runtime call fails.
ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) {
#if CUDART_VERSION >= 11030
// Make the stream wait on its own event before synchronizing.
// NOTE(review): presumably ss->event can cover work (e.g. graph launches) that
// was not enqueued directly on ss->stream — confirm against the event's users.
CUDACHECK(cudaStreamWaitEvent(ss->stream, ss->event, 0));
#endif
CUDACHECK(hipStreamSynchronize(ss->stream));
return ncclSuccess;
}
+101
Ver fichero
@@ -11,6 +11,8 @@
#include "nvmlwrap.h"
#include <hip/hip_runtime.h>
#include <stdlib.h>
// Get current Compute Capability
int ncclCudaCompCap() {
int cudaDev;
@@ -192,3 +194,102 @@ bool matchIfList(const char* string, int port, struct netIf* ifList, int listSiz
}
return false;
}
__thread struct ncclThreadSignal ncclThreadSignalLocalInstance = ncclThreadSignalStaticInitializer();
// Allocate `size` bytes aligned to `align` for the memory stack `me` by one of:
//   1. moving up to an already-allocated empty hunk above the current one,
//   2. malloc'ing a new, larger hunk and bumping inside it, or
//   3. malloc'ing the object on its own ("unhunked") and recording it with an
//      in-band Unhunk proxy so that it can be freed later.
// Presumably this is the slow path taken when the current hunk cannot serve
// the request — confirm against the caller.
// Returns a pointer to the storage. Never returns null: if malloc fails the
// function logs a warning and calls abort().
void* ncclMemoryStack::allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align) {
// `me->topFrame.hunk` points to the top of the stack of non-empty hunks. Hunks
// above this (reachable via `->above`) are empty.
struct Hunk* top = me->topFrame.hunk;
// Size of the most recent malloc request; reported if malloc fails.
size_t mallocSize = 0;
// If we have lots of space left in hunk but that wasn't enough then we'll
// allocate the object unhunked.
if (me->topFrame.end - me->topFrame.bumper >= 8<<10)
goto unhunked;
// If we have another hunk (which must be empty) waiting above this one and
// the object fits then use that.
if (top && top->above) {
struct Hunk* top1 = top->above;
// First suitably aligned address after the hunk header.
uintptr_t uobj = (reinterpret_cast<uintptr_t>(top1) + sizeof(struct Hunk) + align-1) & -uintptr_t(align);
if (uobj + size <= reinterpret_cast<uintptr_t>(top1) + top1->size) {
me->topFrame.hunk = top1;
me->topFrame.bumper = uobj + size;
me->topFrame.end = reinterpret_cast<uintptr_t>(top1) + top1->size;
return reinterpret_cast<void*>(uobj);
}
}
{ // If the next hunk we're going to allocate wouldn't be big enough but the
// Unhunk proxy fits in the current hunk then go allocate as unhunked.
// Each new hunk is 64 KiB larger than the current top hunk.
size_t nextSize = (top ? top->size : 0) + (64<<10);
constexpr size_t maxAlign = 64;
if (nextSize < sizeof(struct Hunk) + maxAlign + size) {
uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
if (uproxy + sizeof(struct Unhunk) <= me->topFrame.end)
goto unhunked;
}
// At this point we must need another hunk, either to fit the object
// itself or its Unhunk proxy.
mallocSize = nextSize;
INFO(NCCL_ALLOC, "%s:%d memory stack hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
struct Hunk *top1 = (struct Hunk*)malloc(mallocSize);
if (top1 == nullptr) goto malloc_exhausted;
top1->size = nextSize;
top1->above = nullptr;
// NOTE(review): this overwrites `top->above` unconditionally. If an empty hunk
// was already linked above `top` but was too small for this object, it looks
// like that hunk becomes unreachable from the chain — confirm whether this
// path can occur and whether the old hunk is freed elsewhere.
if (top) top->above = top1;
top = top1;
me->topFrame.hunk = top;
me->topFrame.end = reinterpret_cast<uintptr_t>(top) + nextSize;
me->topFrame.bumper = reinterpret_cast<uintptr_t>(top) + sizeof(struct Hunk);
}
{ // Try to fit object in the new top hunk.
uintptr_t uobj = (me->topFrame.bumper + align-1) & -uintptr_t(align);
if (uobj + size <= me->topFrame.end) {
me->topFrame.bumper = uobj + size;
return reinterpret_cast<void*>(uobj);
}
}
unhunked:
{ // We need to allocate the object out-of-band and put an Unhunk proxy in-band
// to keep track of it.
uintptr_t uproxy = (me->topFrame.bumper + alignof(Unhunk)-1) & -uintptr_t(alignof(Unhunk));
Unhunk* proxy = reinterpret_cast<Unhunk*>(uproxy);
me->topFrame.bumper = uproxy + sizeof(Unhunk);
// Push the proxy onto the current frame's list of unhunked objects.
proxy->next = me->topFrame.unhunks;
me->topFrame.unhunks = proxy;
mallocSize = size;
proxy->obj = malloc(mallocSize);
INFO(NCCL_ALLOC, "%s:%d memory stack non-hunk malloc(%llu)", __FILE__, __LINE__, (unsigned long long)mallocSize);
if (proxy->obj == nullptr) goto malloc_exhausted;
return proxy->obj;
}
malloc_exhausted:
// Out of memory is treated as fatal: report the failed request size and abort.
WARN("%s:%d Unrecoverable error detected: malloc(size=%llu) returned null.", __FILE__, __LINE__, (unsigned long long)mallocSize);
abort();
}
void ncclMemoryStackDestruct(struct ncclMemoryStack* me) {
// Free unhunks first because both the frames and unhunk proxies lie within the hunks.
struct ncclMemoryStack::Frame* f = &me->topFrame;
while (f != nullptr) {
struct ncclMemoryStack::Unhunk* u = f->unhunks;
while (u != nullptr) {
free(u->obj);
u = u->next;
}
f = f->below;
}
// Free hunks
struct ncclMemoryStack::Hunk* h = me->stub.above;
while (h != nullptr) {
struct ncclMemoryStack::Hunk *h1 = h->above;
free(h);
h = h1;
}
}
+14 -3
Ver fichero
@@ -41,7 +41,8 @@ typedef enum { ncclSuccess = 0,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclNumResults = 6 } ncclResult_t;
ncclRemoteError = 6,
ncclNumResults = 7 } ncclResult_t;
/*! @brief Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
*
@@ -135,11 +136,21 @@ ncclResult_t ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);
/// @endcond
/*! @brief Returns a human-readable error message. */
/*! @brief Returns a string for each error code. */
const char* ncclGetErrorString(ncclResult_t result);
/// @cond include_hidden
const char* pncclGetErrorString(ncclResult_t result);
/// @endcond
/*! @brief Checks whether the comm has encountered any asynchronous errors */
/*! @brief Returns a human-readable message of the last error that occurred.
* comm is currently unused and can be set to NULL
*/
const char* ncclGetLastError(ncclComm_t comm);
/// @cond include_hidden
const char* pncclGetError(ncclComm_t comm);
/// @endcond
/* Checks whether the comm has encountered any asynchronous errors */
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
/// @cond include_hidden
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+192 -97
Ver fichero
@@ -9,15 +9,16 @@
//#include <sys/stat.h>
//#include <unistd.h>
ncclNet_t *ncclNet;
ncclCollNet_t *ncclCollNet;
static ncclNet_v5_t ncclNet_v4_as_v5;
static ncclNet_v6_t ncclNet_v4_as_v6;
static ncclNet_v6_t ncclNet_v5_as_v6;
static ncclNet_v4_t *ncclNet_v4;
static ncclCollNet_v5_t ncclCollNet_v4_as_v5;
static ncclNet_v5_t *ncclNet_v5;
static ncclCollNet_v6_t ncclCollNet_v4_as_v6;
static ncclCollNet_v6_t ncclCollNet_v5_as_v6;
static ncclCollNet_v4_t *ncclCollNet_v4;
static ncclCollNet_v5_t *ncclCollNet_v5;
static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
static ncclResult_t ncclNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
ncclNetProperties_v4_t p4;
ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4);
if (ans != ncclSuccess) return ans;
@@ -33,17 +34,17 @@ static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5
return ncclSuccess;
}
static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
static ncclResult_t ncclNet_v4_as_v6_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
return ncclNet_v4->isend(sendComm, data, size, mhandle, request);
}
static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
static ncclResult_t ncclNet_v4_as_v6_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
if (n == 0) return ncclSuccess;
if (n != 1) return ncclInvalidArgument;
return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request);
}
static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
static ncclResult_t ncclNet_v4_as_v6_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
if (n == 0) return ncclSuccess;
if (n != 1) return ncclInvalidArgument;
return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request);
@@ -51,27 +52,51 @@ static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data,
// We use a wrapper around the v4 init to copy over the struct contents
// post-init since they may not be initialized before hand.
static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
static ncclResult_t ncclNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
NCCLCHECK(ncclNet_v4->init(logfn));
ncclNet_v4_as_v5.name = ncclNet_v4->name;
ncclNet_v4_as_v5.devices = ncclNet_v4->devices;
ncclNet_v4_as_v5.getProperties = ncclNet_v4_as_v5_getProperties;
ncclNet_v4_as_v5.listen = ncclNet_v4->listen;
ncclNet_v4_as_v5.connect = ncclNet_v4->connect;
ncclNet_v4_as_v5.accept = ncclNet_v4->accept;
ncclNet_v4_as_v5.regMr = ncclNet_v4->regMr;
ncclNet_v4_as_v5.deregMr = ncclNet_v4->deregMr;
ncclNet_v4_as_v5.isend = ncclNet_v4_as_v5_isend;
ncclNet_v4_as_v5.irecv = ncclNet_v4_as_v5_irecv;
ncclNet_v4_as_v5.iflush = ncclNet_v4_as_v5_iflush;
ncclNet_v4_as_v5.test = ncclNet_v4->test;
ncclNet_v4_as_v5.closeSend = ncclNet_v4->closeSend;
ncclNet_v4_as_v5.closeRecv = ncclNet_v4->closeRecv;
ncclNet_v4_as_v5.closeListen = ncclNet_v4->closeListen;
ncclNet_v4_as_v6.name = ncclNet_v4->name;
ncclNet_v4_as_v6.devices = ncclNet_v4->devices;
ncclNet_v4_as_v6.getProperties = ncclNet_v4_as_v6_getProperties;
ncclNet_v4_as_v6.listen = ncclNet_v4->listen;
ncclNet_v4_as_v6.connect = ncclNet_v4->connect;
ncclNet_v4_as_v6.accept = ncclNet_v4->accept;
ncclNet_v4_as_v6.regMr = ncclNet_v4->regMr;
ncclNet_v4_as_v6.regMrDmaBuf = NULL;
ncclNet_v4_as_v6.deregMr = ncclNet_v4->deregMr;
ncclNet_v4_as_v6.isend = ncclNet_v4_as_v6_isend;
ncclNet_v4_as_v6.irecv = ncclNet_v4_as_v6_irecv;
ncclNet_v4_as_v6.iflush = ncclNet_v4_as_v6_iflush;
ncclNet_v4_as_v6.test = ncclNet_v4->test;
ncclNet_v4_as_v6.closeSend = ncclNet_v4->closeSend;
ncclNet_v4_as_v6.closeRecv = ncclNet_v4->closeRecv;
ncclNet_v4_as_v6.closeListen = ncclNet_v4->closeListen;
return ncclSuccess;
}
static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
// Init wrapper for a v5 net plugin exposed through the v6 interface. The v5
// struct's members may not be populated until its own init() has run, so the
// plugin is initialized first and the members are copied into the v6 shim
// afterwards. v5 has no DMA-BUF registration, so regMrDmaBuf is set to NULL.
static ncclResult_t ncclNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclNet_v5->init(logfn));
  const ncclNet_v5_t* v5 = ncclNet_v5;
  ncclNet_v6_t* v6 = &ncclNet_v5_as_v6;
  v6->name = v5->name;
  v6->devices = v5->devices;
  v6->getProperties = v5->getProperties;
  v6->listen = v5->listen;
  v6->connect = v5->connect;
  v6->accept = v5->accept;
  v6->regMr = v5->regMr;
  v6->regMrDmaBuf = NULL;
  v6->deregMr = v5->deregMr;
  v6->isend = v5->isend;
  v6->irecv = v5->irecv;
  v6->iflush = v5->iflush;
  v6->test = v5->test;
  v6->closeSend = v5->closeSend;
  v6->closeRecv = v5->closeRecv;
  v6->closeListen = v5->closeListen;
  return ncclSuccess;
}
static ncclResult_t ncclCollNet_v4_as_v6_getProperties(int dev, ncclNetProperties_v6_t* props) {
ncclNetProperties_v4_t p4;
ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4);
if (ans != ncclSuccess) return ans;
@@ -89,25 +114,58 @@ static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetPropertie
// We use a wrapper around the v4 init to copy over the struct contents
// post-init since they may not be initialized before hand.
static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
static ncclResult_t ncclCollNet_v4_as_v6_init(ncclDebugLogger_t logfn) {
NCCLCHECK(ncclCollNet_v4->init(logfn));
ncclCollNet_v4_as_v5.name = ncclCollNet_v4->name;
ncclCollNet_v4_as_v5.devices = ncclCollNet_v4->devices;
ncclCollNet_v4_as_v5.getProperties = ncclCollNet_v4_as_v5_getProperties;
ncclCollNet_v4_as_v5.listen = ncclCollNet_v4->listen;
ncclCollNet_v4_as_v5.connect = ncclCollNet_v4->connect;
ncclCollNet_v4_as_v5.reduceSupport = ncclCollNet_v4->reduceSupport;
ncclCollNet_v4_as_v5.regMr = ncclCollNet_v4->regMr;
ncclCollNet_v4_as_v5.deregMr = ncclCollNet_v4->deregMr;
ncclCollNet_v4_as_v5.iallreduce = ncclCollNet_v4->iallreduce;
ncclCollNet_v4_as_v5.iflush = ncclCollNet_v4->iflush;
ncclCollNet_v4_as_v5.test = ncclCollNet_v4->test;
ncclCollNet_v4_as_v5.closeColl = ncclCollNet_v4->closeColl;
ncclCollNet_v4_as_v5.closeListen = ncclCollNet_v4->closeListen;
ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
ncclCollNet_v4_as_v6.devices = ncclCollNet_v4->devices;
ncclCollNet_v4_as_v6.getProperties = ncclCollNet_v4_as_v6_getProperties;
ncclCollNet_v4_as_v6.listen = ncclCollNet_v4->listen;
ncclCollNet_v4_as_v6.connect = ncclCollNet_v4->connect;
ncclCollNet_v4_as_v6.reduceSupport = ncclCollNet_v4->reduceSupport;
ncclCollNet_v4_as_v6.regMr = ncclCollNet_v4->regMr;
ncclCollNet_v4_as_v6.regMrDmaBuf = NULL;
ncclCollNet_v4_as_v6.deregMr = ncclCollNet_v4->deregMr;
ncclCollNet_v4_as_v6.iallreduce = ncclCollNet_v4->iallreduce;
ncclCollNet_v4_as_v6.iflush = ncclCollNet_v4->iflush;
ncclCollNet_v4_as_v6.test = ncclCollNet_v4->test;
ncclCollNet_v4_as_v6.closeColl = ncclCollNet_v4->closeColl;
ncclCollNet_v4_as_v6.closeListen = ncclCollNet_v4->closeListen;
return ncclSuccess;
}
static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
// Init wrapper for a v5 collnet plugin exposed through the v6 interface. The
// v5 struct's members may not be populated until its own init() has run, so
// the plugin is initialized first and the members are copied into the v6 shim
// afterwards. v5 has no DMA-BUF registration, so regMrDmaBuf is set to NULL.
static ncclResult_t ncclCollNet_v5_as_v6_init(ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclCollNet_v5->init(logfn));
  const ncclCollNet_v5_t* v5 = ncclCollNet_v5;
  ncclCollNet_v6_t* v6 = &ncclCollNet_v5_as_v6;
  v6->name = v5->name;
  v6->devices = v5->devices;
  v6->getProperties = v5->getProperties;
  v6->listen = v5->listen;
  v6->connect = v5->connect;
  v6->reduceSupport = v5->reduceSupport;
  v6->regMr = v5->regMr;
  v6->regMrDmaBuf = NULL;
  v6->deregMr = v5->deregMr;
  v6->iallreduce = v5->iallreduce;
  v6->iflush = v5->iflush;
  v6->test = v5->test;
  v6->closeColl = v5->closeColl;
  v6->closeListen = v5->closeListen;
  return ncclSuccess;
}
static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr };
enum ncclNetState {
ncclNetStateInit = 0,
ncclNetStateEnabled = 1,
ncclNetStateDisabled = 2
};
enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
ncclResult_t ncclNetPluginInit() {
char ncclNetPluginName[128];
const char* envPluginName = getenv("NCCL_NET_PLUGIN");
if (envPluginName && strlen(envPluginName)) {
@@ -126,67 +184,104 @@ static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
} else {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
}
return;
return ncclSuccess;
}
*net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
if (*net == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v5 symbol.");
ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
if (ncclNet_v4 == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol.");
if (netPluginLib != nullptr) dlclose(netPluginLib);
return;
ncclNets[0] = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6");
if (ncclNets[0] == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
// Try v5 plugin
ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
if (ncclNet_v5 == nullptr) {
ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
if (ncclNet_v4 == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (v4 or v5).");
if (netPluginLib != nullptr) dlclose(netPluginLib);
return ncclSuccess;
}
ncclNets[0] = &ncclNet_v4_as_v6;
ncclNet_v4_as_v6.init = ncclNet_v4_as_v6_init;
// Set the name right away to allow for NCCL_NET=... to work
ncclNet_v4_as_v6.name = ncclNet_v4->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v4)", ncclNets[0]->name);
} else {
ncclNets[0] = &ncclNet_v5_as_v6;
ncclNet_v5_as_v6.init = ncclNet_v5_as_v6_init;
// Set the name right away to allow for NCCL_NET=... to work
ncclNet_v5_as_v6.name = ncclNet_v5->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name);
}
*net = &ncclNet_v4_as_v5;
ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init;
}
// Check for CollNet
*collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
if (*collnet == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.");
ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
if (ncclCollNet_v4 == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.");
ncclCollNets[0] = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6");
if (ncclCollNets[0] == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
if (ncclCollNet_v5 == nullptr) {
ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
if (ncclCollNet_v4 == nullptr) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5).");
} else {
ncclCollNets[0] = &ncclCollNet_v4_as_v6;
ncclCollNet_v4_as_v6.init = ncclCollNet_v4_as_v6_init;
ncclCollNet_v4_as_v6.name = ncclCollNet_v4->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v4)", ncclCollNets[0]->name);
}
} else {
*collnet = &ncclCollNet_v4_as_v5;
ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init;
ncclCollNets[0] = &ncclCollNet_v5_as_v6;
ncclCollNet_v5_as_v6.init = ncclCollNet_v5_as_v6_init;
ncclCollNet_v5_as_v6.name = ncclCollNet_v5->name;
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded coll plugin %s (v5)", ncclCollNets[0]->name);
}
}
return;
return ncclSuccess;
}
ncclResult_t ncclNetInit() {
// Always initialize bootstrap network
NCCLCHECK(bootstrapNetInit());
// Report whether net backend `i` is usable, initializing it on first use.
// A backend is enabled only if its init() succeeds and devices() succeeds and
// reports at least one device; the verdict is cached in ncclNetStates[i].
// The check and the cache update are done under netLock.
static ncclResult_t netGetState(int i, enum ncclNetState* state) {
  pthread_mutex_lock(&netLock);
  if (ncclNetStates[i] == ncclNetStateInit) {
    int ndev;
    // Short-circuit evaluation: devices() is only queried if init() succeeded.
    bool usable = ncclNets[i]->init(ncclDebugLog) == ncclSuccess
               && ncclNets[i]->devices(&ndev) == ncclSuccess
               && ndev > 0;
    ncclNetStates[i] = usable ? ncclNetStateEnabled : ncclNetStateDisabled;
  }
  *state = ncclNetStates[i];
  pthread_mutex_unlock(&netLock);
  return ncclSuccess;
}
// Report whether collnet backend `i` is usable, initializing it on first use.
// A backend is enabled only if its init() succeeds and devices() succeeds and
// reports at least one device; the verdict is cached in ncclCollNetStates[i].
// The lazy initialization runs under netLock, as in netGetState, so that
// concurrent callers cannot both see the Init state and run init() twice or
// race on the cached state.
static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
  pthread_mutex_lock(&netLock);
  if (ncclCollNetStates[i] == ncclNetStateInit) {
    int ndev;
    if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
    else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled;
    else ncclCollNetStates[i] = ncclNetStateEnabled;
  }
  *state = ncclCollNetStates[i];
  pthread_mutex_unlock(&netLock);
  return ncclSuccess;
}
ncclResult_t ncclNetInit(struct ncclComm* comm) {
// Initialize main communication network
ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr };
initPlugin(&nets[0], &collNets[0]);
char* netName = getenv("NCCL_NET");
bool ok = false;
for (int i=0; i<3; i++) {
if (nets[i] == nullptr) continue;
if (netName && strcmp(netName, nets[i]->name) != 0) continue;
if (ncclNets[i] == nullptr) continue;
enum ncclNetState state;
NCCLCHECK(netGetState(i, &state));
if (state != ncclNetStateEnabled) continue;
if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
// net plugin is already initialized
int ndev;
if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue;
if (nets[i]->devices(&ndev) != ncclSuccess) continue;
if (ndev <= 0) continue;
ncclNet = nets[i];
comm->ncclNet = ncclNets[i];
ok = true;
if (collNets[i]) {
do {
if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break;
if (collNets[i]->devices(&ndev) != ncclSuccess) break;
if (ndev <= 0) break;
ncclCollNet = collNets[i];
} while(0);
if (ncclCollNets[i]) {
NCCLCHECK(collNetGetState(i, &state));
if (state == ncclNetStateEnabled) {
comm->ncclCollNet = ncclCollNets[i];
}
}
break;
}
@@ -198,7 +293,7 @@ ncclResult_t ncclNetInit() {
return ncclSuccess;
}
ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
constexpr int GPU_BUF_SIZE = 2*1024*1024;
#if CUDART_VERSION >= 11030
// In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
@@ -213,12 +308,12 @@ ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
}
#endif
int netDevs;
NCCLCHECK(ncclNetDevices(&netDevs));
NCCLCHECK(ncclNetDevices(comm, &netDevs));
*gdrSupport = 0;
for (int dev=0; dev<netDevs; dev++) {
// Find a net device which is GDR-capable
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(dev, &props));
NCCLCHECK(ncclNetGetProperties(comm, dev, &props));
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
*gdrSupport = 1;
@@ -232,34 +327,34 @@ ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
void* mHandle = NULL;
ncclResult_t ret;
ncclDebugNoWarn = NCCL_NET;
NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
NCCLCHECKGOTO(ncclNetListen(comm, dev, &handle, &lComm), ret, cleanup1);
while (sComm == NULL) {
NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
NCCLWAITGOTO(ncclNetConnect(comm, dev, &handle, &sComm), sComm != NULL, comm->abortFlag, ret, cleanup2);
}
while (rComm == NULL) {
NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
NCCLWAITGOTO(ncclNetAccept(comm, lComm, &rComm), rComm != NULL, comm->abortFlag, ret, cleanup3);
}
CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
if (ncclNetRegMr(comm, sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(ncclNetDeregMr(comm, sComm, mHandle));
NCCLCHECK(ncclNetRegMr(comm, rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
NCCLCHECK(ncclNetDeregMr(comm, rComm, mHandle));
*gdrSupport = 1;
}
ncclDebugNoWarn = 0;
CUDACHECK(hipFree(gpuPtr));
cleanup4:
NCCLCHECK(ncclNetCloseRecv(rComm));
NCCLCHECK(ncclNetCloseRecv(comm, rComm));
cleanup3:
NCCLCHECK(ncclNetCloseSend(sComm));
NCCLCHECK(ncclNetCloseSend(comm, sComm));
cleanup2:
NCCLCHECK(ncclNetCloseListen(lComm));
NCCLCHECK(ncclNetCloseListen(comm, lComm));
cleanup1:
break;
}
return ncclSuccess;
}
int ncclNetVersion() {
return (ncclNet == &ncclNet_v4_as_v5) ? 4 : 5;
int ncclNetVersion(struct ncclComm* comm) {
return (comm->ncclNet == &ncclNet_v4_as_v6) ? 4 : ((comm->ncclNet == &ncclNet_v5_as_v6) ? 5 : 6);
}
+110 -56
Ver fichero
@@ -14,6 +14,8 @@
#define ENABLE_TIMER 0
#include "timer.h"
#include <sys/syscall.h>
enum { proxyRecv=0, proxySend=1 };
static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
@@ -350,10 +352,10 @@ ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector*
return ncclSuccess;
}
static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) {
static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) {
if (peer < 0) return ncclSuccess;
struct ncclPeer* peerComm = channel->peers+peer;
struct ncclChannelPeer* peerComm = channel->peers+peer;
struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex;
if (connector->transportComm == NULL) {
WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank,
@@ -362,35 +364,62 @@ static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, s
}
if (connector->transportComm->proxyProgress == NULL) return ncclSuccess;
NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
if (justInquire) *justInquire = true;
else {
NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
}
return ncclSuccess;
}
ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) {
struct ncclChannel* channel = comm->channels+op->channelId;
int pattern = op->pattern;
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
struct ncclRing* ring = &channel->ring;
if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, op->connIndex));
if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, op->connIndex));
}
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
// Tree up
struct ncclTree* tree = &channel->tree;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0));
NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0));
}
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
// Tree down
struct ncclTree* tree = &channel->tree;
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0));
NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0));
}
if (pattern == ncclPatternCollTreeUpDown) {
// CollTree up
NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1)); // For CollTree up, we are using push
// CollTree down
NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0));
// Register proxy operation `op` with every peer connection its communication
// pattern uses on channel `op->channelId`, by calling SaveProxy once per peer.
// justInquire != nullptr means don't actually do anything, just ascertain need of
// ncclProxySaveOp for this op. In that mode *justInquire is reset to false on
// entry and the same pointer is forwarded to every SaveProxy call.
// Returns ncclSuccess, or the first error propagated by NCCLCHECK.
ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) {
struct ncclChannel* channel = &comm->channels[op->channelId];
if (justInquire) *justInquire = false;
switch (op->pattern) {
case ncclPatternRing:
case ncclPatternRingTwice:
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo: {
// Ring-like patterns: receive from the previous rank, send to the next one,
// each only when NeedProxy says this rank takes part in that direction.
struct ncclRing* ring = &channel->ring;
if (NeedProxy(proxyRecv, op->pattern, op->root, ring, comm->nRanks)) {
NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, op->connIndex, justInquire));
}
if (NeedProxy(proxySend, op->pattern, op->root, ring, comm->nRanks)) {
NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, op->connIndex, justInquire));
}
} break;
case ncclPatternTreeUp:
case ncclPatternTreeDown:
case ncclPatternTreeUpDown: {
// TreeUpDown runs both halves below; TreeUp and TreeDown run one each.
if (op->pattern != ncclPatternTreeDown) { // Tree up
struct ncclTree* tree = &channel->tree;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) {
NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0, justInquire));
}
NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0, justInquire));
}
if (op->pattern != ncclPatternTreeUp) { // Tree down
struct ncclTree* tree = &channel->tree;
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) {
NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0, justInquire));
}
NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0, justInquire));
}
} break;
case ncclPatternCollTreeUpDown: {
// CollTree up
NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1, justInquire)); // For CollTree up, we are using push
// CollTree down
NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0, justInquire));
} break;
case ncclPatternSend:
case ncclPatternRecv: {
// Point-to-point: `op->root` is used as the peer rank; a transfer whose peer
// is this rank itself needs no proxy.
if (op->root == comm->rank) return ncclSuccess;
// Number of chunk-sized steps, with a minimum of one step.
// Note: op->nsteps is written here even when justInquire is non-null.
op->nsteps = DIVUP(op->nbytes, op->chunkSize);
if (op->nsteps == 0) op->nsteps = 1;
NCCLCHECK(SaveProxy(channel, op->pattern == ncclPatternSend ? proxySend : proxyRecv, op->root, op, op->connIndex, justInquire));
} break;
}
return ncclSuccess;
}
@@ -406,26 +435,24 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
op->chunkSteps = 1;
op->protocol = NCCL_PROTO_SIMPLE;
op->dtype = info->datatype;
op->connIndex = info->connIndex;
int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR;
int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
if (info->comm->nNodes > 1) stepSize /= SENDRECV_SLICEFACTOR;
info->chunkSize = stepSize;
op->root = info->root;
op->nbytes = info->count;
if (info->root == -1) return ncclSuccess;
struct ncclPeer* peer = channel->peers + op->root;
struct ncclChannelPeer* peer = channel->peers + op->root;
if (info->coll == ncclFuncSend) {
op->pattern = ncclPatternSend;
if (op->root != info->comm->rank && peer->send[info->connIndex].transportComm && peer->send[info->connIndex].transportComm->proxyProgress) {
if (op->root != info->comm->rank && peer->send[1].transportComm == &netTransport.send) {
// Tune chunk size for the network
if (info->count < stepSize) info->chunkSize /= 4;
else if (info->count < 8*stepSize) info->chunkSize /= 2;
}
} else if (info->coll == ncclFuncRecv) {
op->pattern = ncclPatternRecv;
if (op->root != info->comm->rank && peer->recv[info->connIndex].transportComm && peer->recv[info->connIndex].transportComm->proxyProgress) {
if (op->root != info->comm->rank && peer->recv[1].transportComm == &netTransport.recv) {
// Tune chunk size for the network
if (info->count < stepSize) info->chunkSize /= 4;
else if (info->count < 8*stepSize) info->chunkSize /= 2;
@@ -441,22 +468,6 @@ ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op)
return ncclSuccess;
}
ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) {
struct ncclChannel* channel = comm->channels+op->channelId;
op->opCount = channel->workFifoTail-1;
if (op->root == comm->rank) return ncclSuccess;
if (op->pattern == ncclPatternRecv) {
op->nsteps = DIVUP(op->nbytes, op->chunkSize);
if (op->nsteps == 0) op->nsteps = 1;
NCCLCHECK(SaveProxy(channel, proxyRecv, op->root, op, op->connIndex));
} else if (op->pattern == ncclPatternSend) {
op->nsteps = DIVUP(op->nbytes, op->chunkSize);
if (op->nsteps == 0) op->nsteps = 1;
NCCLCHECK(SaveProxy(channel, proxySend, op->root, op, op->connIndex));
}
return ncclSuccess;
}
static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
struct ncclProxyArgs* freeOp = *opPtr;
struct ncclProxyArgs* next = freeOp->next;
@@ -598,8 +609,48 @@ void ncclDumpProxyState(int signal) {
dumpProxyState(ncclLastProxyState);
}
NCCL_PARAM(CreateThreadContext, "CREATE_THREAD_CONTEXT", 0);

// Optionally give the calling thread its own driver context, stored in
// comm->proxyState.cudaCtx.
//
// Only compiled in when CUDART_VERSION >= 11030; otherwise this is a no-op that
// returns ncclSuccess. The feature is gated by ncclParamCreateThreadContext()
// (presumably generated by the NCCL_PARAM line above - TODO confirm).
//
// Returns ncclUnhandledCudaError only when an already-created context cannot be
// made current. A failed context creation is reported with WARN, disables the
// feature for later calls, and still returns ncclSuccess.
ncclResult_t ncclSetThreadContext(struct ncclComm* comm) {
#if CUDART_VERSION >= 11030
  // -1 = not evaluated yet, 0 = disabled, non-zero = enabled.
  static int createThreadContext = -1;

  if (createThreadContext == -1) {
    createThreadContext = ncclParamCreateThreadContext();
    // The feature needs all three driver entry points to have been resolved.
    if (createThreadContext &&
        (CUPFN(cuCtxCreate_v3020) == nullptr || CUPFN(cuCtxDestroy) == nullptr || CUPFN(cuCtxSetCurrent) == nullptr)) {
      WARN("Unable to create thread context due to old driver, disabling.");
      createThreadContext = 0;
    }
  }

  if (!createThreadContext) return ncclSuccess;

  if (comm->proxyState.cudaCtx != NULL) {
    // A context already exists for this communicator: just make it current.
    if (CUPFN(cuCtxSetCurrent(comm->proxyState.cudaCtx)) != CUDA_SUCCESS) {
      WARN("Failed to set CUDA context on device %d", comm->cudaDev);
      return ncclUnhandledCudaError;
    }
    return ncclSuccess;
  }

  // First call for this communicator: create the context.
  if (CUPFN(cuCtxCreate_v3020(&comm->proxyState.cudaCtx,
                              CU_CTX_SCHED_SPIN|CU_CTX_MAP_HOST, comm->cudaDev)) != CUDA_SUCCESS) {
    WARN("Failed to create CUDA context on device %d", comm->cudaDev);
    createThreadContext = 0;
  }
#endif
  return ncclSuccess;
}
void* ncclProxyProgress(void *comm_) {
struct ncclComm* comm = (struct ncclComm*)comm_;
if (ncclSetThreadContext(comm) != ncclSuccess) {
WARN("[Proxy Progress] Failed to set CUDA context on device %d", comm->cudaDev);
} else if (hipSetDevice(comm->cudaDev) != hipSuccess) {
WARN("[Proxy Progress] Failed to set CUDA device %d", comm->cudaDev);
}
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
struct ncclProxyProgressState* state = &comm->proxyState.progressState;
state->nextOps = -1;
signal(SIGUSR1, ncclDumpProxyState);
@@ -732,9 +783,9 @@ static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool,
// Free the transport-side proxy resources of one connection.
// Dispatches to the send or recv proxyFree hook of the transport indexed by
// connection->transport, chosen by connection->send.
// Returns ncclSuccess once the hook call(s) have gone through NCCLCHECK.
// NOTE(review): this text comes from a diff view whose +/- markers were lost, so each
// branch lists both the struct-array form (`.`) and the pointer-array form (`->`) of the
// same call; presumably only one of them exists in any real revision - confirm against the repository.
static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
if (connection->send) {
NCCLCHECK(ncclTransports[connection->transport].send.proxyFree(connection, comm));
NCCLCHECK(ncclTransports[connection->transport]->send.proxyFree(connection, comm));
} else {
NCCLCHECK(ncclTransports[connection->transport].recv.proxyFree(connection, comm));
NCCLCHECK(ncclTransports[connection->transport]->recv.proxyFree(connection, comm));
}
return ncclSuccess;
}
@@ -778,7 +829,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in
NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int)));
NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*)));
struct ncclTransportComm* tcomm = send ? &ncclTransports[transport].send : &ncclTransports[transport].recv;
struct ncclTransportComm* tcomm = send ? &ncclTransports[transport]->send : &ncclTransports[transport]->recv;
// If we need proxy progress, map progress ops
if (tcomm->proxyProgress) {
char poolPath[] = "/dev/shm/nccl-XXXXXX";
@@ -885,7 +936,7 @@ static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclPr
NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int)));
connection->localRank = peer->localRank;
NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*)));
connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv;
connection->tcomm = connection->send ? &ncclTransports[connection->transport]->send : &ncclTransports[connection->transport]->recv;
// If we need proxy progress, let's allocate ops and start the thread
if (connection->tcomm->proxyProgress) {
NCCLCHECK(proxyProgressInit(comm));
@@ -951,7 +1002,10 @@ static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* p
void* ncclProxyService(void* _args) {
struct ncclComm* comm = (struct ncclComm *) _args;
if (hipSetDevice(comm->cudaDev) != hipSuccess) {
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
if (ncclSetThreadContext(comm) != ncclSuccess) {
WARN("[Proxy Service] Failed to set CUDA context on device %d", comm->cudaDev);
} else if (hipSetDevice(comm->cudaDev) != hipSuccess) {
WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev);
}
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+28 -33
Ver fichero
@@ -11,16 +11,11 @@
#define ENABLE_TIMER 0
#include "timer.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
extern struct ncclTransport netTransport;
extern struct ncclTransport collNetTransport;
struct ncclTransport ncclTransports[NTRANSPORTS] = {
p2pTransport,
shmTransport,
netTransport,
collNetTransport
struct ncclTransport* ncclTransports[NTRANSPORTS] = {
&p2pTransport,
&shmTransport,
&netTransport,
&collNetTransport
};
template <int type>
@@ -37,10 +32,11 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
}
bool xgmi;
NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi));
for (int t=0; t<NTRANSPORTS; t++) {
if (graph == NULL && connIndex == NCCL_CONN_IDX_P2P_NET && (t == TRANSPORT_SHM || (!xgmi && t == TRANSPORT_P2P))) continue;
if (graph && n1 >= 0 && n2 >= 0 && t != TRANSPORT_NET) continue;
struct ncclTransport *transport = ncclTransports+t;
struct ncclTransport *transport = ncclTransports[t];
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
int ret = 0;
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
@@ -55,18 +51,19 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
return ncclSystemError;
}
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
uint32_t mask = 1 << channel->id;
struct ncclChannel* channel = &comm->channels[channelId];
uint32_t mask = 1 << channelId;
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
comm->connectRecv[peer+comm->nRanks*connIndex] |= mask;
comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
comm->connectSend[peer+comm->nRanks*connIndex] |= mask;
comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
}
return ncclSuccess;
}
@@ -82,17 +79,18 @@ void dumpData(struct ncclConnect* data, int ndata) {
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
int highestType = TRANSPORT_P2P; // track highest transport type
hipStream_t transportSetupStream;
CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
int highestType = TRANSPORT_P2P; // track highest transport type
struct ncclConnect data[2*MAXCHANNELS];
for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*connIndex];
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*connIndex];
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
struct ncclConnect* recvData = data;
int sendChannels = 0, recvChannels = 0;
@@ -137,7 +135,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
conn->connected = 1;
CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
}
}
TIME_STOP(3);
@@ -147,11 +146,11 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
conn->connected = 1;
CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
}
}
TIME_STOP(4);
comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0;
}
CUDACHECK(hipStreamSynchronize(transportSetupStream));
CUDACHECK(hipStreamDestroy(transportSetupStream));
@@ -179,10 +178,6 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
// check if we can connect to collnet, whose root is the nranks-th rank
struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
peerInfo->rank = nranks;
int support = 1;
if (isMaster) {
NCCLCHECK(collNetTransport.canConnect(&support, comm->topo, collNetGraph, myInfo, peerInfo));
}
// send master receives connect info from peer recv master
if (isMaster && type == collNetSend) {
@@ -192,14 +187,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
}
// select
struct ncclPeer* root = channel->peers+nranks;
struct ncclChannelPeer* root = channel->peers+nranks;
// connector index: 0 for recv, 1 for send
struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
conn->transportComm = transportComm;
// setup
struct ncclConnect myConnect;
if (isMaster && support) {
if (isMaster) {
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
}
// prepare connect handles
@@ -229,11 +224,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
}
// connect
if (isMaster && support) {
if (isMaster) {
NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
struct ncclPeer* devRoot = channel->devPeers+nranks;
struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
CUDACHECKGOTO(hipMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice), res, cleanup);
}
// recv side sends connect info to send side
if (isMaster && type == collNetRecv) {
@@ -242,7 +237,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
}
if (support) fail = 0;
fail = 0;
cleanup:
if (allConnects != NULL) free(allConnects);
if (masterConnects != NULL) free(masterConnects);
@@ -271,7 +266,7 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
// Free collNet resources
for (int r=0; r<comm->nChannels; r++) {
struct ncclChannel* channel = comm->channels+r;
struct ncclPeer* peer = channel->peers+comm->nRanks;
struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* send = peer->send + b;
if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
+52 -26
Ver fichero
@@ -108,7 +108,7 @@ struct sendResources {
uint64_t step;
struct reqSlot (*reqFifo)[NCCL_STEPS];
int collNetRank;
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
volatile uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
struct recvResources {
@@ -128,12 +128,12 @@ struct recvResources {
uint64_t step;
struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS];
int collNetRank;
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
volatile uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
/* Determine if we can communicate with the peer.
 * Writes the verdict to *ret and always returns ncclSuccess; topo, graph, info1 and
 * info2 are not read by this body.
 * NOTE(review): this text comes from a diff view whose +/- markers were lost; the
 * `*ret = 1;` line and the `*ret = 0;` line presumably belong to different revisions.
 * As written, the final value stored in *ret is 0. */
static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 1;
// This transport cannot be used for p2p
*ret = 0;
return ncclSuccess;
}
@@ -157,7 +157,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
return ncclSuccess;
}
@@ -175,7 +175,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s comm %p nRanks %02d", channelId, myInfo->rank, collNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", comm, comm->nRanks);
return ncclSuccess;
}
@@ -300,7 +300,7 @@ ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle
comm->proxyState.progressState.collNet.resources = resources;
}
if (resources->collNetComms[netDev] == NULL)
NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev));
NCCLCHECK(collNetListen(comm, netDev, collNetHandle, resources->collNetListenComms+netDev));
return ncclSuccess;
}
@@ -314,13 +314,13 @@ static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct nccl
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
handlePtrs[i] = &(info->collNetHandle);
}
ncclResult_t ret = collNetConnect((void**)handlePtrs, nranks, rank,
ncclResult_t ret = collNetConnect(comm, (void**)handlePtrs, nranks, rank,
resources->collNetListenComms[netDev],
resources->collNetComms+netDev);
free(handlePtrs);
if (ret == ncclSuccess) {
// Close listen comm
NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev]));
NCCLCHECK(collNetCloseListen(comm, resources->collNetListenComms[netDev]));
} else {
resources->collNetListenComms[netDev] = NULL;
}
@@ -334,7 +334,7 @@ static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) {
struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
resources->commRefCount[netDev]--;
if (resources->commRefCount[netDev] == 0) {
NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev]));
NCCLCHECK(collNetCloseColl(comm, resources->collNetComms[netDev]));
}
for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess;
comm->proxyState.progressState.collNet.resources = NULL;
@@ -450,9 +450,22 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
#if CUDA_VERSION >= 11070
/* DMA-BUF support */
if (resources->useGdr && comm->dmaBufSupport) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
}
*((struct connectMap**)respBuff) = &resources->map;
return ncclSuccess;
@@ -506,9 +519,22 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
#if CUDA_VERSION >= 11070
/* DMA-BUF support */
if (resources->useGdr && comm->dmaBufSupport) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(collNetRegMrDmaBuf(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
NCCL_PTR_CUDA, 0ULL, dmabuf_fd,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(collNetRegMr(comm, resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
}
// Pass info to send side
info->reqFifo = resources->reqFifo;
@@ -524,7 +550,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
struct sendResources* resources = (struct sendResources*)(connection->transportResources);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->sendMhandles[p]) {
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p]));
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->sendMhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
@@ -541,7 +567,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
struct recvResources* resources = (struct recvResources*)(connection->transportResources);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->mhandles[p]) {
NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p]));
NCCLCHECK(collNetDeregMr(comm, resources->collNetComm, resources->mhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
@@ -621,9 +647,9 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
args->idle = 0;
//continue;
// flush HDP if not done
if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
args->hdp_flushed = LOAD(recvTail);
STORE(resources->curr_hdp_reg, 1);
if (resources->curr_hdp_reg && args->hdp_flushed < *recvTail) {
args->hdp_flushed = *recvTail;
*resources->curr_hdp_reg = 1;
}
}
}
@@ -634,10 +660,10 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
if (reqFifo[group][buffSlot].recvBuff != NULL) {
int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot];
int count = totalSize / ncclTypeSize(args->dtype);
int count = totalSize / ncclTypeSize((ncclDataType_t)args->dtype);
reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot];
char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot];
NCCLCHECK(collNetIallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
NCCLCHECK(collNetIallreduce(comm, resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] == NULL) continue;
TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]);
@@ -653,7 +679,7 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int done, size;
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
NCCLCHECK(collNetTest((void*)(sub->requests[buffSlot]), &done, &size));
NCCLCHECK(collNetTest(comm, (void*)(sub->requests[buffSlot]), &done, &size));
if (done) {
TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size);
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
@@ -744,7 +770,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int startChannel = group*COLLNET_GROUP_NSUBS;
int offset;
NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
NCCLCHECK(collNetIflush(comm, resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
}
} else {
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
@@ -758,7 +784,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS;
int done = 1;
if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(sub->requests[buffSlot], &done, NULL));
if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(comm, sub->requests[buffSlot], &done, NULL));
if (done) {
TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] flushed", sub->flushed, group, buffSlot);
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
+83 -127
Ver fichero
@@ -8,15 +8,11 @@
#include "comm.h"
#include "net.h"
#include "graph.h"
#include <sys/time.h>
#include "proxy.h"
#include "collectives.h"
#include "gdrwrap.h"
#include "shm.h"
#include "profiler.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
#include "graph.h"
#include "graph/topo.h"
@@ -108,7 +104,7 @@ struct sendResources {
void* mhandles[NCCL_NUM_PROTOCOLS];
uint64_t step;
uint64_t llLastCleaning;
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
volatile uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
struct recvResources {
@@ -136,7 +132,7 @@ struct recvResources {
void* mhandles[NCCL_NUM_PROTOCOLS];
uint64_t step;
uint64_t llLastCleaning;
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
volatile uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", 0);
@@ -178,7 +174,6 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
req.connIndex = connIndex;
req.netDev = -1;
req.curr_hdp_reg = 0;
int proxyRank = myInfo->rank;
@@ -198,12 +193,10 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
if (proxyRank == myInfo->rank) {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
} else {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d",
channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s comm %p nRanks %02d", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
}
*((int*)connectInfo) = proxyRank;
@@ -222,7 +215,6 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
req.channelId = channelId;
req.connIndex = connIndex;
req.netDev = -1;
// Use myInfo->rank as the receiver uses its own NIC
int proxyRank = myInfo->rank;
@@ -238,8 +230,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph
req.remoteRank = peerInfo->rank;
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d",
channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s comm %p nRanks %02d", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "", comm, comm->nRanks);
return ncclSuccess;
}
@@ -448,7 +439,7 @@ static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, i
// Initialise the shared buffers for the peer behind a proxy connection.
// Maps connection->localRank to a rank through comm->localRankToRank, compares that
// rank's pidHash with this rank's to decide whether both live in the same process,
// then calls sharedBuffersInit with nChannels and NULL for all four trailing pointer arguments.
// NOTE(review): this text comes from a diff view whose +/- markers were lost; the two
// sharedBuffersInit calls differ only in their second argument (literal 1 versus
// comm->hasFineGrain) and presumably belong to different revisions - confirm which one is current.
static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) {
int rank = comm->localRankToRank[connection->localRank];
// 1 when the peer's pidHash matches ours, 0 otherwise.
int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
NCCLCHECK(sharedBuffersInit(comm, 1, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
NCCLCHECK(sharedBuffersInit(comm, comm->hasFineGrain, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
return ncclSuccess;
}
@@ -470,7 +461,7 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc
resources->connIndex = req->connIndex;
resources->curr_hdp_reg = req->curr_hdp_reg;
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
resources->maxRecvs = props.maxRecvs;
// We don't return any data
@@ -496,11 +487,11 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc
resources->channelId = req->channelId;
resources->connIndex = req->connIndex;
ncclNetProperties_t props;
NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
NCCLCHECK(ncclNetGetProperties(comm, req->netDev, &props));
resources->maxRecvs = props.maxRecvs;
if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
NCCLCHECK(ncclNetListen(req->netDev, respBuff, &resources->netListenComm));
NCCLCHECK(ncclNetListen(comm, req->netDev, respBuff, &resources->netListenComm));
*done = 1;
return ncclSuccess;
}
@@ -527,15 +518,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
}
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId));
if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, comms->sendComm+resources->channelId));
resources->netSendComm = comms->sendComm[resources->channelId];
if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
} else {
NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
}
} else {
// Connect to remote peer
NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
NCCLCHECK(ncclNetConnect(comm, resources->netDev, reqBuff, &resources->netSendComm));
connection->proxyAppendPtr = &connection->proxyAppend;
}
if (resources->netSendComm == NULL) {
@@ -609,7 +600,31 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
if (resources->buffers[p]) {
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
#if CUDA_VERSION >= 11070
/* DMA-BUF support */
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#else
/* DMA-BUF support */
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
int dmabuf_fd;
uint64_t offset;
CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
(void)close(dmabuf_fd);
INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
(const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(ncclNetRegMr(comm, resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
}
}
}
@@ -643,15 +658,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
}
struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId));
if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, comms->recvComm+resources->channelId));
resources->netRecvComm = comms->recvComm[resources->channelId];
if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
} else {
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
}
} else {
// Connect to remote peer
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
NCCLCHECK(ncclNetAccept(comm, resources->netListenComm, &resources->netRecvComm));
connection->proxyAppendPtr = &connection->proxyAppend;
}
if (resources->netRecvComm == NULL) {
@@ -659,7 +674,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
return ncclSuccess;
}
*done = 1;
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
NCCLCHECK(ncclNetCloseListen(comm, resources->netListenComm));
// Create structures
struct connectMap* map = &resources->map;
@@ -714,7 +729,31 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
if (resources->buffers[p]) {
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
#if CUDA_VERSION >= 11070
/* DMA-BUF support */
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport) {
int dmabuf_fd;
CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
(void)close(dmabuf_fd);
} else // FALL-THROUGH to nv_peermem GDR path
#else
/* DMA-BUF support */
int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
if (type == NCCL_PTR_CUDA && comm->dmaBufSupport && pfn_hsa_amd_portable_export_dmabuf) {
int dmabuf_fd;
uint64_t offset;
CUCHECK(hsa_amd_portable_export_dmabuf((const void*)resources->buffers[p], resources->buffSizes[p], &dmabuf_fd, &offset));
NCCLCHECK(ncclNetRegMrDmaBuf(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, offset, dmabuf_fd, &resources->mhandles[p]));
(void)close(dmabuf_fd);
INFO(NCCL_INIT|NCCL_NET, "hsa_amd_portable_export_dmabuf buffer %p size %d handle %x offset %ld",
(const void*)resources->buffers[p], resources->buffSizes[p], dmabuf_fd, offset);
} else // FALL-THROUGH to nv_peermem GDR path
#endif
{
NCCLCHECK(ncclNetRegMr(comm, resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
}
}
}
@@ -732,7 +771,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
}
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->buffers[p]) {
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p]));
NCCLCHECK(ncclNetDeregMr(comm, resources->netSendComm, resources->mhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
@@ -748,12 +787,12 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
comms->sendRefCount[resources->channelId]--;
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId]));
if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comm, comms->sendComm[resources->channelId]));
} else {
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
}
} else {
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
NCCLCHECK(ncclNetCloseSend(comm, resources->netSendComm));
}
free(resources);
return ncclSuccess;
@@ -767,7 +806,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
}
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (resources->buffers[p]) {
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p]));
NCCLCHECK(ncclNetDeregMr(comm, resources->netRecvComm, resources->mhandles[p]));
}
}
struct connectMapMem* mems = resources->map.mems;
@@ -779,12 +818,12 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
comms->recvRefCount[resources->channelId]--;
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId]));
if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comm, comms->recvComm[resources->channelId]));
} else {
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
}
} else {
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
NCCLCHECK(ncclNetCloseRecv(comm, resources->netRecvComm));
}
free(resources);
return ncclSuccess;
@@ -792,16 +831,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
static int g_npkit_net_poll_cnt = 0;
#endif
static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt++;
#endif
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
@@ -855,11 +885,6 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
// We have something to receive, let's check if it's completely ready.
int size = sizesFifo[buffSlot];
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
sub->npKitSizesFifo[buffSlot] = size;
#endif
char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
int ready = 1;
if (p == NCCL_PROTO_LL128) {
@@ -887,29 +912,13 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
}
if (ready) {
// flush HDP if not done
if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) {
args->hdp_flushed = LOAD(recvTail);
STORE(resources->curr_hdp_reg, 1);
if (resources->curr_hdp_reg && args->hdp_flushed < *recvTail) {
args->hdp_flushed = *recvTail;
*resources->curr_hdp_reg = 1;
}
// Data is ready, try to send.
NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
NCCLCHECK(ncclNetIsend(comm, resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] != NULL) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
NpKit::CollectCpuEvent(
NPKIT_EVENT_NET_SEND_ENTRY,
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt,
#else
size,
#endif
uint64_t(sub->requests+buffSlot)/sizeof(void*),
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt = 0;
#endif
#endif
TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
sizesFifo[buffSlot] = -1;
// Make sure size is reset to zero before we update the head.
@@ -926,24 +935,8 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
if (sub->done < sub->transmitted) {
int done;
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
NCCLCHECK(ncclNetTest(comm, sub->requests[buffSlot], &done, NULL));
if (done) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_SEND_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_SEND_EXIT)
NpKit::CollectCpuEvent(
NPKIT_EVENT_NET_SEND_EXIT,
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt,
#else
sub->npKitSizesFifo[buffSlot],
#endif
uint64_t(sub->requests+buffSlot)/sizeof(void*),
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt = 0;
#endif
#endif
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
sub->done += args->sliceSteps;
for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
@@ -969,11 +962,6 @@ static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArg
}
static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt++;
#endif
if (args->state == ncclProxyOpReady) {
// Initialize subs and group them by same recvComm.
void* recvComm;
@@ -1051,26 +1039,10 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
uint64_t step = subGroup->posted;
struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
NCCLCHECK(ncclNetIrecv(comm, resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
if (*requestPtr) {
for (int i=0; i<subGroup->groupSize; i++) {
struct ncclProxySubArgs* sub = subGroup+i;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT)
NpKit::CollectCpuEvent(
NPKIT_EVENT_NET_RECV_ENTRY,
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt,
#else
sizes[i],
#endif
uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt = 0;
#endif
#endif
sub->posted += args->sliceSteps;
for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
}
@@ -1089,29 +1061,13 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
int sizes[NCCL_PROXY_MAX_SUBS];
void* mhandles[NCCL_PROXY_MAX_SUBS];
for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
NCCLCHECK(ncclNetTest(subGroup->requests[step%NCCL_STEPS], &done, sizes));
NCCLCHECK(ncclNetTest(comm, subGroup->requests[step%NCCL_STEPS], &done, sizes));
if (done) {
int useGdr = 0;
int totalSize = 0;
for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
for (int i=0; i<subGroup->groupSize; i++) {
struct ncclProxySubArgs* sub = subGroup + i;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_NET_RECV_ENTRY) && defined(ENABLE_NPKIT_EVENT_NET_RECV_EXIT)
NpKit::CollectCpuEvent(
NPKIT_EVENT_NET_RECV_EXIT,
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt,
#else
sizes[i],
#endif
uint64_t(sub->requests+(step%NCCL_STEPS))/sizeof(void*),
*(volatile uint64_t*)NpKit::GetCpuTimestamp(), sub->channelId);
#if defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt = 0;
#endif
#endif
sub->received += args->sliceSteps;
for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
if (step < sub->nsteps) {
@@ -1146,7 +1102,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
}
}
struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
NCCLCHECK(ncclNetIflush(comm, resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
}
}
args->idle = 0;
@@ -1161,7 +1117,7 @@ static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArg
uint64_t step = subGroup->transmitted;
int done = 1;
void* request = subGroup->requests[step%NCCL_STEPS];
if (request) NCCLCHECK(ncclNetTest(request, &done, NULL));
if (request) NCCLCHECK(ncclNetTest(comm, request, &done, NULL));
if (done) {
for (int i=0; i<subGroup->groupSize; i++) {
struct ncclProxySubArgs* sub = subGroup + i;
+61 -21
Ver fichero
@@ -296,6 +296,31 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
return ncclSuccess;
}
// Detect whether DMA-BUF memory registration is supported by the kernel /
// verbs stack.
//
// The probe allocates a temporary protection domain on device `dev` and issues
// a deliberately invalid ibv_reg_dmabuf_mr() call (fd = -1, zero length). The
// call is expected to fail; only the errno it leaves behind is inspected.
//
// The outcome is cached in a function-local static, so the probe runs once and
// the answer obtained for the first device queried is reused for every later
// call, whatever `dev` is passed.
//
// Returns :
// ncclSuccess : DMA-BUF support is available
// ncclSystemError : DMA-BUF is not supported, or the protection domain could
//                   not be allocated / released while probing
ncclResult_t ncclIbDmaBufSupport(int dev) {
  static int dmaBufSupported = -1;
  if (dmaBufSupported == -1) {
    ncclResult_t res;
    struct ibv_pd* pd;
    struct ibv_context* ctx;
    ctx = ncclIbDevs[dev].context;
    NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure);
    // Test kernel DMA-BUF support with a dummy call (fd=-1)
    (void) wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL/*offset*/, 0ULL/*len*/, 0ULL/*iova*/, -1/*fd*/, 0/*flags*/);
    // ibv_reg_dmabuf_mr() fails with EOPNOTSUPP or EPROTONOSUPPORT when the
    // operation is not supported, and with EBADF (because of the bogus fd)
    // when it is. Both "not supported" codes must be treated as unsupported,
    // otherwise DMA-BUF would be advertised on a stack that cannot honour it.
    dmaBufSupported = (errno != EOPNOTSUPP && errno != EPROTONOSUPPORT) ? 1 : 0;
    NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure);
  }
  if (dmaBufSupported == 0) return ncclSystemError;
  return ncclSuccess;
failure:
  // Any failure while setting up or tearing down the probe is recorded as
  // "unsupported" so that later calls do not retry.
  dmaBufSupported = 0;
  return ncclSystemError;
}
static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
return ncclSuccess;
@@ -308,10 +333,11 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
props->pciPath = ncclIbDevs[dev].pciPath;
props->guid = ncclIbDevs[dev].guid;
props->ptrSupport = NCCL_PTR_HOST;
if (ncclIbGdrSupport(dev) != ncclSuccess) {
INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
} else {
props->ptrSupport |= NCCL_PTR_CUDA;
if (ncclIbGdrSupport(dev) == ncclSuccess) {
props->ptrSupport |= NCCL_PTR_CUDA; // GDR support via nv_peermem
}
if (ncclIbDmaBufSupport(dev) == ncclSuccess) {
props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF
}
props->speed = ncclIbDevs[dev].speed;
props->latency = 0; // Not set
@@ -568,6 +594,7 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
memset(handle, 0, sizeof(struct ncclIbHandle));
comm->dev = dev;
comm->sock.asyncFlag = 1; /* nonblocking socket is required by network communication. */
NCCLCHECK(GetSocketAddr(&comm->sock.addr));
if (ncclParamIbSockServerPortReuse()) {
// reuse the socket address and fd for listen system call
@@ -614,7 +641,7 @@ ib_connect_check:
/* expect user to call again */
return ncclSuccess;
} else if (conState == ncclSocketError) {
return ncclSystemError;
return ncclRemoteError;
}
// IB Setup
@@ -692,7 +719,6 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
stage->comm = rComm;
stage->state = ncclIbCommStateAccept;
lComm->sock.asyncFlag = 1;
rComm->sock.asyncFlag = 1;
ib_accept:
NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
@@ -846,7 +872,8 @@ ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
ncclResult_t ncclIbTest(void* request, int* done, int* size);
ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
/* DMA-BUF support */
ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) {
static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
assert(size > 0);
@@ -856,7 +883,7 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan
struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
uintptr_t addr = (uintptr_t)data & -pageSize;
int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
ncclResult_t res;
pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
for (int slot=0; /*true*/; slot++) {
@@ -868,14 +895,20 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhan
// Deregister / register
struct ibv_mr* mr;
unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ;
if (ncclIbRelaxedOrderingEnabled) {
// Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning);
if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING;
if (fd != -1) {
/* DMA-BUF support */
NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, verbs->pd, offset, pages*pageSize, addr, fd, flags), res, returning);
} else {
if (ncclIbRelaxedOrderingEnabled) {
// Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, addr, flags), res, returning);
}
else {
NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
}
}
else {
NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
}
TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey);
TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x fd %d", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey, fd);
cache->population += 1;
cache->slots[slot].addr = addr;
cache->slots[slot].pages = pages;
@@ -897,6 +930,10 @@ returning:
return res;
}
/* Register a memory region without a DMA-BUF descriptor.
 * Forwards to ncclIbRegMrDmaBuf() with a zero offset and fd = -1, which makes
 * that function take its non-DMA-BUF registration branch. The int size is
 * widened to size_t for the callee. */
ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
  const uint64_t noOffset = 0ULL;  // offset is only meaningful for a DMA-BUF
  const int noDmaBufFd = -1;       // -1 selects the plain registration path
  const size_t regSize = (size_t)size;
  return ncclIbRegMrDmaBuf(comm, data, regSize, type, noOffset, noDmaBufFd, mhandle);
}
ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
@@ -950,13 +987,16 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
// Write size as immediate data. In the case of multi-send, only write
// 0 or 1 as size to indicate whether there was data sent or received.
uint64_t immData = 0;
uint32_t immData = 0;
if (nreqs == 1) {
immData = reqs[0]->send.size;
} else {
uint8_t* multiImmData = (uint8_t*)&immData;
if (nreqs > 32) {
WARN("Cannot store sizes of %d requests in a 32-bits field", nreqs);
return ncclInternalError;
}
for (int r=0; r<nreqs; r++) {
multiImmData[r] = reqs[r]->send.size ? 1 : 0;
immData |= (reqs[r]->send.size ? 1 : 0) << r;
}
}
@@ -1231,7 +1271,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
return ncclSystemError;
return ncclRemoteError;
}
struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff);
@@ -1246,9 +1286,8 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError;
if (req->nreqs > 1) {
// In the case of a multi recv, we only set sizes to 0 or 1.
uint8_t* sizes = (uint8_t*)&wc->imm_data;
for (int i=0; i<req->nreqs; i++) {
req->recv.sizes[i] |= sizes[i];
req->recv.sizes[i] = (wc->imm_data >> i) & 0x1;
}
} else {
req->recv.sizes[0] += wc->imm_data;
@@ -1309,6 +1348,7 @@ ncclNet_t ncclNetIb = {
ncclIbConnect,
ncclIbAccept,
ncclIbRegMr,
ncclIbRegMrDmaBuf,
ncclIbDeregMr,
ncclIbIsend,
ncclIbIrecv,
+3 -1
Ver fichero
@@ -311,6 +311,7 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
struct ncclSocketListenComm* comm;
NCCLCHECK(ncclSocketNewListenComm(&comm));
NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr));
comm->sock.asyncFlag = 1;
NCCLCHECK(ncclSocketListen(&comm->sock));
memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
@@ -359,7 +360,7 @@ socket_connect_check:
/* expect user to call again */
return ncclSuccess;
} else if (conState == ncclSocketError) {
return ncclSystemError;
return ncclRemoteError;
}
stage->state = ncclSocketCommStateSend;
@@ -616,6 +617,7 @@ ncclNet_t ncclNetSocket = {
ncclSocketConnect,
ncclSocketAccept,
ncclSocketRegMr,
NULL, // No DMA-BUF support
ncclSocketDeregMr,
ncclSocketIsend,
ncclSocketIrecv,
+240 -24
Ver fichero
@@ -8,6 +8,7 @@
#include "comm.h"
#include "graph.h"
#include "utils.h"
#include "shm.h"
#include "graph.h"
#include "graph/topo.h"
@@ -20,6 +21,34 @@ struct p2pConnectInfo {
int rank;
int read;
struct ncclP2pBuff p2pBuff;
// Use by CE memcpy
char shmName[7];
int shmSize;
};
static_assert(sizeof(struct p2pConnectInfo) <= CONNECT_SIZE, "p2pConnectInfo is too large");
// Layout of the shared-memory segment used by the copy-engine (useMemcpy) P2P
// path. The segment is created with size
// sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem) in p2pSendProxySetup
// and opened again by the receiver in p2pRecvConnect.
struct p2pShm {
// The receiver's conn.head and the sender's conn.head both point at sendMem.head.
struct ncclSendMem sendMem;
// The receiver's conn.tail points at recvMem.tail.
struct ncclRecvMem recvMem;
};
// Per-connection state of the send-side proxy when the copy-engine (useMemcpy)
// path is active. It is allocated in p2pSendProxySetup and then memcpy'd
// verbatim into the proxy response buffer, so the caller receives a byte copy
// of this struct; the field layout must stay as is.
struct p2pProxyInfo {
// Shared memory between proxy and receiving GPU
// Both pointers are filled in by ncclShmOpen() — presumably the host mapping
// and the matching device-visible mapping; TODO confirm against ncclShmOpen.
struct p2pShm* shm;
struct p2pShm* devShm;
// Bytes of the shm path that follow the "/dev/shm/nccl-" prefix; the receiver
// rebuilds the full path from it with "%s", so it must hold a NUL terminator
// (presumably a 6-character suffix plus NUL — verify against ncclShmOpen).
char shmName[7];
// sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem)
int shmSize;
// Intermediate step for sender
// Allocated with ncclCudaHostCalloc; its tail and sizesFifo become the
// sender's conn.tail and conn.sizesFifo.
struct ncclRecvMem* ceRecvMem;
// Buffer of comm->buffSizes[NCCL_PROTO_SIMPLE] bytes allocated with
// ncclCudaCalloc; replaces the sender's SIMPLE protocol buffer.
char* ceDevBuff;
// Receiver buffer
// NOTE(review): presumably the SIMPLE buffer pointer that p2pSendConnect hands
// to the proxy via ncclProxyMsgConnect — the assignment is not visible here.
char* recvFifo;
// Used by progress only
uint64_t step;
hipStream_t stream;
hipEvent_t events[NCCL_STEPS];
};
static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large");
@@ -28,18 +57,22 @@ struct p2pSendResources {
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
void* sendMemIpc;
void* recvMemIpc;
struct p2pProxyInfo proxyInfo;
};
struct p2pRecvResources {
struct ncclRecvMem* devMem;
void* sendMemIpc;
void* recvMemIpc;
struct p2pShm* shm;
struct p2pShm* devShm;
int shmSize;
};
#include <sys/types.h>
/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
int busIdToCudaDev(int64_t busId) {
static int busIdToCudaDev(int64_t busId) {
int ndev;
if (hipGetDeviceCount(&ndev) != hipSuccess)
return -1;
@@ -55,8 +88,13 @@ int busIdToCudaDev(int64_t busId) {
return -1;
}
// Tunable declared with environment key "P2P_USE_CUDA_MEMCPY" and default 0.
NCCL_PARAM(P2pUseCudaMemcpy, "P2P_USE_CUDA_MEMCPY", 0);
// File-wide switch tested by the P2P setup/connect/free routines to select the
// copy-engine (shared memory + proxy) path instead of direct/IPC pointers.
// NOTE(review): presumably set from the parameter above by initCeOperation() —
// its body is not visible here; confirm.
static int useMemcpy = 0;
// Forward declaration; called at the top of p2pCanConnect().
static void initCeOperation();
/* Determine if two peers can communicate through p2p */
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
initCeOperation();
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (!info1->hasFineGrain || !info2->hasFineGrain) {
*ret = 0;
@@ -74,7 +112,10 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
int intermediateRank;
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
if (*ret == 0) return ncclSuccess;
if (intermediateRank != -1) return ncclSuccess;
if (intermediateRank != -1) {
if (useMemcpy) *ret = 0;
return ncclSuccess;
}
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
int cudaDev1 = busIdToCudaDev(info1->busId);
@@ -94,7 +135,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
int p2p;
if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
cudaDev1, info1->busId, cudaDev2, info2->busId);
cudaDev1, info1->busId, cudaDev2, info2->busId);
*ret = 0;
return ncclSuccess;
}
@@ -188,6 +229,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
send->transportResources = resources;
int useRead, intermediateRank;
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
if (useMemcpy) useRead = 0;
resources->next_hdp_reg = 0;
bool isXGMI;
@@ -214,14 +256,14 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s comm %p nRanks %02d",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
} else {
send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s comm %p nRanks %02d",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, comm, comm->nRanks);
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s%s comm %p nRanks %02d",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr, useMemcpy ? "/CE" : "", comm, comm->nRanks);
}
} else {
info->rank = intermediateRank;
@@ -231,9 +273,15 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
}
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
if (useMemcpy) {
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, NULL, 0, &resources->proxyInfo, sizeof(struct p2pProxyInfo)));
info->shmSize = resources->proxyInfo.shmSize;
memcpy(info->shmName, resources->proxyInfo.shmName, sizeof(info->shmName));
} else {
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
}
NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
return ncclSuccess;
}
@@ -259,7 +307,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (intermediateRank == -1) {
info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
if (myInfo->pidHash == peerInfo->pidHash && useMemcpy == 0) {
if (ncclParamP2pDirectDisable() == 0) recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {
recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
@@ -287,31 +335,61 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
/* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
if (resources->devMem == NULL) return ncclInternalError; // We should not use read + memcpy
send->conn.buffs[p] = (char*)(resources->devMem+1);
} else {
send->conn.buffs[p] = buff;
buff += send->comm->buffSizes[p];
}
}
send->conn.tail = &remDevMem->tail;
send->conn.head = &resources->devMem->head;
send->conn.ptrExchange = &resources->devMem->ptrExchange;
send->conn.next_hdp_reg = resources->next_hdp_reg;
send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
if (useMemcpy) {
send->conn.tail = &resources->proxyInfo.ceRecvMem->tail;
send->conn.sizesFifo = resources->proxyInfo.ceRecvMem->sizesFifo;
send->conn.head = &resources->proxyInfo.devShm->sendMem.head;
// Send SIMPLE buff to proxy, and replace it by local buffer
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &send->conn.buffs[NCCL_PROTO_SIMPLE], sizeof(void*), NULL, 0));
send->conn.buffs[NCCL_PROTO_SIMPLE] = resources->proxyInfo.ceDevBuff;
} else {
send->conn.tail = &remDevMem->tail;
send->conn.head = &resources->devMem->head;
send->conn.ptrExchange = &resources->devMem->ptrExchange;
send->conn.redOpArgExchange = resources->devMem->redOpArgExchange;
}
return ncclSuccess;
}
/* Connect/Recv from this peer */
ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
struct ncclSendMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
struct ncclSendMem* remDevMem = NULL;
if (useMemcpy) {
char shmPath[PATH_MAX];
sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
resources->shmSize = info->shmSize;
NCCLCHECK(ncclShmOpen(shmPath, info->shmSize, (void**)&resources->shm, (void**)&resources->devShm, 0));
// Remove the file to ensure proper clean-up
NCCLCHECK(ncclShmUnlink(shmPath));
recv->conn.tail = &resources->devShm->recvMem.tail;
recv->conn.head = &resources->devShm->sendMem.head;
} else {
NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
recv->conn.tail = &resources->devMem->tail;
recv->conn.head = &remDevMem->head;
recv->conn.ptrExchange = &remDevMem->ptrExchange;
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
}
char* buff = (char*)(resources->devMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
if (remDevMem == NULL) return ncclInternalError; // We should not use read + memcpy
/* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */
recv->conn.buffs[p] = (char*)(remDevMem+1);
} else {
@@ -319,10 +397,6 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
buff += recv->comm->buffSizes[p];
}
}
recv->conn.tail = &resources->devMem->tail;
recv->conn.head = &remDevMem->head;
recv->conn.ptrExchange = &remDevMem->ptrExchange;
recv->conn.redOpArgExchange = remDevMem->redOpArgExchange;
return ncclSuccess;
}
@@ -338,11 +412,52 @@ ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
if (resources->sendMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->sendMemIpc));
if (resources->recvMemIpc) CUDACHECK(hipIpcCloseMemHandle(resources->recvMemIpc));
if (useMemcpy) {
NCCLCHECK(ncclShmClose(resources->shm, resources->devShm, resources->shmSize));
}
free(resources);
return ncclSuccess;
}
static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
/* Proxy-side setup for the send half of a P2P connection.
 *
 * Memcpy mode (useMemcpy != 0):
 *   Allocates a p2pProxyInfo, attaches it to the connection, allocates a device
 *   staging buffer of buffSizes[NCCL_PROTO_SIMPLE] bytes, creates a shared-memory
 *   segment holding one ncclSendMem + one ncclRecvMem, allocates a host-side
 *   ncclRecvMem, and copies the whole p2pProxyInfo into respBuff.
 *   respSize must equal sizeof(struct p2pProxyInfo).
 *
 * Direct mode:
 *   reqBuff carries an int byte count; a device buffer of that size is allocated
 *   and an IPC handle for it is written into the ncclP2pBuff at respBuff.
 *   reqSize must be sizeof(int) and respSize sizeof(struct ncclP2pBuff).
 *
 * Returns ncclInternalError on a size mismatch, or the error propagated by
 * NCCLCHECK/CUDACHECK. Sets *done = 1 on success.
 */
static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  if (useMemcpy) {
    struct p2pProxyInfo* proxyInfo;
    NCCLCHECK(ncclCalloc(&proxyInfo, 1));
    // Attach immediately so whatever is allocated below stays reachable from the connection.
    connection->transportResources = proxyInfo;

    NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, comm->buffSizes[NCCL_PROTO_SIMPLE], true));

    // An empty path asks ncclShmOpen to pick the name; it is written back into shmPath.
    char shmPath[PATH_MAX];
    shmPath[0] = '\0';
    proxyInfo->shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem);
    NCCLCHECK(ncclShmOpen(shmPath, proxyInfo->shmSize, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm, 1));
    TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, proxyInfo->shmSize);
    // Only the suffix after "/dev/shm/nccl-" is exchanged; the peer rebuilds the full path.
    memcpy(proxyInfo->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(proxyInfo->shmName));

    NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));

    if (respSize != sizeof(struct p2pProxyInfo)) return ncclInternalError;
    memcpy(respBuff, proxyInfo, sizeof(struct p2pProxyInfo));
  } else {
    if (reqSize != sizeof(int)) return ncclInternalError;
    int size = *((int*)reqBuff);
    if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
    struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
    NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size, true));
    connection->transportResources = p2pBuff->directPtr;
    hipError_t res = hipIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
    if (res != hipSuccess) {
      WARN("hipIpcGetMemHandle failed : %s", hipGetErrorString(res));
      hipFree(p2pBuff->directPtr);
      // The device buffer is gone: drop the connection's reference so that
      // p2pSendProxyFree does not hipFree the same pointer a second time.
      connection->transportResources = NULL;
      // NOTE(review): p2pBuff aliases respBuff, which this function did not
      // allocate; kept as in the original -- confirm the caller expects it.
      free(p2pBuff);
      CUDACHECK(res);
    }
  }
  *done = 1;
  return ncclSuccess;
}
static ncclResult_t p2pRecvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
if (reqSize != sizeof(int)) return ncclInternalError;
int size = *((int*)reqBuff);
if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
@@ -360,15 +475,116 @@ static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct
return ncclSuccess;
}
static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
/* Proxy-side connect step for the memcpy P2P send path.
 *
 * reqBuff must hold exactly one pointer (reqSize == sizeof(void*)); it is stored
 * as the destination fifo (recvFifo) in the p2pProxyInfo already attached to the
 * connection. A non-blocking stream and NCCL_STEPS events are then created, and
 * the connection's proxyAppendPtr is pointed at its own proxyAppend slot.
 *
 * Returns ncclInternalError on a request size mismatch, otherwise the result of
 * the HIP calls (via CUDACHECK) or ncclSuccess.
 */
static ncclResult_t p2pSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  struct p2pProxyInfo* info = (struct p2pProxyInfo*)connection->transportResources;
  if (reqSize != sizeof(void*)) {
    return ncclInternalError;
  }

  // The request payload is the fifo pointer itself.
  char** fifoArg = (char**)reqBuff;
  info->recvFifo = *fifoArg;

  CUDACHECK(hipStreamCreateWithFlags(&info->stream, hipStreamNonBlocking));
  for (int slot = 0; slot < NCCL_STEPS; ++slot) {
    CUDACHECK(hipEventCreate(&info->events[slot]));
  }

  connection->proxyAppendPtr = &connection->proxyAppend;
  return ncclSuccess;
}
/* Releases the proxy-side resources of a P2P send connection.
 *
 * Memcpy mode: closes the shared-memory segment, frees the host ncclRecvMem and
 * the device staging buffer, destroys the stream and the NCCL_STEPS events, then
 * frees the p2pProxyInfo itself. If no p2pProxyInfo was ever attached (setup
 * failed before the attach), there is nothing to release and ncclSuccess is
 * returned.
 *
 * Direct mode: transportResources is the device buffer; it is freed without
 * checking the return code.
 *
 * In both modes the connection's transportResources pointer is cleared so a
 * repeated call cannot free the same memory twice.
 */
static ncclResult_t p2pSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  if (useMemcpy) {
    struct p2pProxyInfo* proxyInfo = (struct p2pProxyInfo*)connection->transportResources;
    // Guard against a connection whose setup never attached its resources.
    if (proxyInfo == NULL) return ncclSuccess;
    NCCLCHECK(ncclShmClose(proxyInfo->shm, proxyInfo->devShm, proxyInfo->shmSize));
    NCCLCHECK(ncclCudaHostFree(proxyInfo->ceRecvMem));
    CUDACHECK(hipFree(proxyInfo->ceDevBuff));
    CUDACHECK(hipStreamDestroy(proxyInfo->stream));
    for (int i=0; i<NCCL_STEPS; i++) {
      CUDACHECK(hipEventDestroy(proxyInfo->events[i]));
    }
    free(proxyInfo);
  } else {
    // Do not check return code as CUDA may have already shut down
    hipFree(connection->transportResources);
  }
  connection->transportResources = NULL;
  return ncclSuccess;
}
/* Releases the device buffer held by a P2P receive proxy connection.
 * Always reports ncclSuccess: the hipFree result is deliberately discarded
 * because the runtime may already have shut down when this runs.
 */
static ncclResult_t p2pRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  void* devBuff = connection->transportResources;
  (void)hipFree(devBuff);
  return ncclSuccess;
}
// Proxy progress function for the memcpy P2P send path; initCeOperation installs
// it as p2pTransport.send.proxyProgress when useMemcpy is set.
// Each call advances every sub-operation of `args` by at most one slice in each
// of two stages: (1) issue an async device-to-device copy from the staging
// buffer (ceDevBuff) to the destination fifo (recvFifo); (2) poll the slot's
// event and, once the copy has completed, publish the new tail through the
// shared-memory segment. Returns ncclSuccess, or the error raised by CUDACHECK.
static ncclResult_t p2pSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
// First call for this operation: reset the per-sub counters.
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
// Round to next multiple of chunkSteps (the value actually passed to ROUNDUP)
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->transmitted = sub->done = 0;
}
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
// Each protocol buffer is split into NCCL_STEPS equally sized slots.
int stepSize = comm->buffSizes[p] / NCCL_STEPS;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct p2pProxyInfo* resources = (struct p2pProxyInfo*) (sub->connection->transportResources);
// Other protocols need no copy: account for the steps and mark the sub done.
if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses hipMemcpy
resources->step = sub->base + sub->nsteps;
args->done++;
continue;
}
// Stage 1: keep fewer than NCCL_STEPS slices outstanding beyond `done`.
if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
// Read through volatile pointers: these host-memory fields are re-read on every poll.
volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
// Check GPU has sent everything
if ((*recvTail > sub->base+sub->transmitted)) {
int size = sizesFifo[buffSlot];
CUDACHECK(hipMemcpyAsync(resources->recvFifo+buffSlot*stepSize, resources->ceDevBuff+buffSlot*stepSize, size, hipMemcpyDeviceToDevice, resources->stream));
// The event recorded after the copy is what stage 2 polls for this slot.
CUDACHECK(hipEventRecord(resources->events[buffSlot], resources->stream));
sub->transmitted += args->sliceSteps;
}
}
// Stage 2: retire the oldest in-flight slice once its event has fired.
if (sub->done < sub->transmitted) {
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
hipError_t res = hipEventQuery(resources->events[buffSlot]);
// hipErrorNotReady only means "still running"; any other failure is propagated.
if (res != hipErrorNotReady) CUDACHECK(res);
if (res == hipSuccess) {
sub->done += args->sliceSteps;
// Notify SHM
resources->shm->recvMem.tail = sub->base + sub->done;
}
if (sub->done == sub->nsteps) {
resources->step = sub->base + sub->nsteps;
args->done++;
}
}
}
// Every sub-operation finished: retire the whole proxy operation.
if (args->done == args->nsubs) {
args->state = ncclProxyOpNone;
}
}
return ncclSuccess;
}
struct ncclTransport p2pTransport = {
"P2P",
p2pCanConnect,
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL },
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }
{ p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pSendProxySetup, NULL, p2pSendProxyFree, NULL },
{ p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pRecvProxySetup, NULL, p2pRecvProxyFree, NULL }
};
/* One-time initialisation of the P2P memcpy mode.
 * On the first call, reads the P2pUseCudaMemcpy parameter into useMemcpy and,
 * when it is set, installs the send-side proxyConnect / proxyProgress hooks on
 * p2pTransport. Later calls return immediately.
 */
static void initCeOperation() {
  static int init = 0;
  if (init) return;

  useMemcpy = ncclParamP2pUseCudaMemcpy();
  if (useMemcpy) {
    p2pTransport.send.proxyConnect = p2pSendProxyConnect;
    p2pTransport.send.proxyProgress = p2pSendProxyProgress;
  }
  init = 1;
}
+266 -20
Ver fichero
@@ -31,11 +31,21 @@ struct shmRecvResources {
struct ncclRecvMem* devHostMem;
};
// Side selectors used both as bit flags for the memcpy mode parameter and as
// values for the locality parameter below.
#define SHM_SEND_SIDE 1
#define SHM_RECV_SIDE 2
// When this parameter is 1, shmCanConnect returns early with *ret left at 0.
NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
// Master switch for the memcpy-based SHM path; combined with ShmMemcpyMode in initCeOperation.
NCCL_PARAM(ShmUseCudaMemcpy, "SHM_USE_CUDA_MEMCPY", 0);
NCCL_PARAM(ShmMemcpyMode, "SHM_MEMCPY_MODE", SHM_SEND_SIDE); // 1 is sender-side, 2 is receiver-side, 3 is both
// Cached results of the two parameters above; set once by initCeOperation.
static int useMemcpySend = 0;
static int useMemcpyRecv = 0;
NCCL_PARAM(ShmLocality, "SHM_LOCALITY", SHM_RECV_SIDE); // 1 is sender-side, 2 is receiver-side
// Cached, validated locality; initCeOperation falls back to SHM_RECV_SIDE on any other value.
static int shmLocality = 0;
// Defined at the end of the file, after the proxy functions it installs.
static void initCeOperation();
/* Determine two peers can communicate with SHM */
ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
static ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 0;
initCeOperation();
if (ncclParamShmDisable() == 1) return ncclSuccess;
@@ -55,7 +65,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
#define MAX_SHM_NAME_LEN 1024
/* Create and return connect structures for this peer to connect to me */
ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
struct shmSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
@@ -65,17 +75,20 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
char shmPath[PATH_MAX];
shmPath[0] = '\0';
info->shmSize = resources->shmSize = sizeof(struct ncclSendMem);
int shmSize = sizeof(struct ncclSendMem);
if (shmLocality == SHM_SEND_SIDE) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += send->comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory comm %p nRanks %02d",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, comm, comm->nRanks);
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via SHM/%s/%s comm %p nRanks %02d", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useMemcpySend?"CE":"direct", useMemcpyRecv?"CE":"direct", comm, comm->nRanks);
return ncclSuccess;
}
ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
struct shmRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
@@ -86,7 +99,9 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
char shmPath[PATH_MAX];
shmPath[0] = '\0';
int shmSize = sizeof(struct ncclRecvMem);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
if (shmLocality == SHM_RECV_SIDE) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
}
info->shmSize = resources->shmSize = shmSize;
NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
@@ -95,8 +110,21 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
return ncclSuccess;
}
// State shared between the SHM connect functions and the SHM proxy thread when
// the memcpy path is enabled. It is sent to the proxy as the connect request and
// returned, with the proxy-allocated pointers filled in, as the response.
// Field order matters: shmSendConnect and shmRecvConnect build this struct with
// positional aggregate initializers.
struct shmProxyInfo {
// Allocated by the proxy with ncclCudaHostCalloc; its tail/sizesFifo are handed to the connector.
struct ncclRecvMem* ceRecvMem;
// Device staging buffer, allocated by the proxy with ncclCudaCalloc (buffSizes[NCCL_PROTO_SIMPLE] bytes).
char* devFifo;
// Simple-protocol buffer of the connection as it was before being replaced by devFifo.
char* shmFifo;
// Host mappings supplied by the connect functions (hostMem / remHostMem, order depends on the side).
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
// used by progress only
uint64_t step;
hipStream_t stream;
hipEvent_t events[NCCL_STEPS];
};
/* Connect to this peer */
ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
@@ -109,19 +137,29 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn
// Remove the file to ensure proper clean-up
NCCLCHECK(ncclShmUnlink(shmPath));
send->transportResources = resources;
int offset = 0;
char* buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
send->conn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset;
offset += send->comm->buffSizes[p];
send->conn.buffs[p] = buff;
buff += send->comm->buffSizes[p];
}
send->conn.tail = &resources->devRemHostMem->tail;
send->conn.head = &resources->devHostMem->head;
if (useMemcpyRecv) {
send->conn.sizesFifo = resources->devRemHostMem->sizesFifo;
}
if (useMemcpySend) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, comm->rank, &send->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, send->conn.buffs[NCCL_PROTO_SIMPLE], resources->hostMem, resources->remHostMem };
NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
send->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
send->conn.tail = &proxyInfo.ceRecvMem->tail;
send->conn.sizesFifo = proxyInfo.ceRecvMem->sizesFifo;
}
return ncclSuccess;
}
ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
static ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
// Setup device pointers
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
@@ -132,18 +170,26 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
NCCLCHECK(ncclShmUnlink(shmPath));
recv->conn.head = &resources->devRemHostMem->head;
int offset = 0;
char* buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem+1) : (char*)(resources->devRemHostMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
recv->conn.buffs[p] = (char*)(resources->devHostMem+1) + offset;
offset += recv->comm->buffSizes[p];
recv->conn.buffs[p] = buff;
buff += recv->comm->buffSizes[p];
}
recv->conn.head = &resources->devRemHostMem->head;
recv->conn.tail = &resources->devHostMem->tail;
if (useMemcpyRecv) {
NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, comm->rank, &recv->proxyConn));
struct shmProxyInfo proxyInfo = { NULL, NULL, recv->conn.buffs[NCCL_PROTO_SIMPLE], resources->remHostMem, resources->hostMem };
NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &proxyInfo, sizeof(struct shmProxyInfo), &proxyInfo, sizeof(struct shmProxyInfo)));
recv->conn.buffs[NCCL_PROTO_SIMPLE] = proxyInfo.devFifo;
recv->conn.tail = &proxyInfo.ceRecvMem->tail;
}
return ncclSuccess;
}
ncclResult_t shmSendFree(struct ncclConnector* send) {
static ncclResult_t shmSendFree(struct ncclConnector* send) {
struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
@@ -151,7 +197,7 @@ ncclResult_t shmSendFree(struct ncclConnector* send) {
return ncclSuccess;
}
ncclResult_t shmRecvFree(struct ncclConnector* recv) {
static ncclResult_t shmRecvFree(struct ncclConnector* recv) {
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
@@ -159,9 +205,209 @@ ncclResult_t shmRecvFree(struct ncclConnector* recv) {
return ncclSuccess;
}
/* Proxy-side connect step for the SHM send path in memcpy mode.
 *
 * reqBuff must contain a struct shmProxyInfo prepared by shmSendConnect, and
 * respBuff must have room for one. The request is copied into a freshly
 * allocated shmProxyInfo, which is then completed with a device staging buffer
 * (devFifo, buffSizes[NCCL_PROTO_SIMPLE] bytes), a host ncclRecvMem (ceRecvMem),
 * a non-blocking stream and NCCL_STEPS events. The completed struct is attached
 * to the connection and copied back into respBuff.
 *
 * Returns ncclInternalError when either buffer size does not match
 * sizeof(struct shmProxyInfo); both sizes are checked before anything is
 * allocated so a mismatch cannot leak.
 */
static ncclResult_t shmSendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  // Validate first: the original allocated proxyInfo before checking reqSize and leaked it on mismatch.
  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;

  struct shmProxyInfo* proxyInfo;
  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  memcpy(proxyInfo, reqBuff, reqSize);
  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE]));
  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
  CUDACHECK(hipStreamCreateWithFlags(&proxyInfo->stream, hipStreamNonBlocking));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(hipEventCreate(proxyInfo->events+i));
  }
  connection->proxyAppendPtr = &connection->proxyAppend;
  connection->transportResources = proxyInfo;
  // Return the struct so the caller can pick up devFifo and ceRecvMem.
  memcpy(respBuff, proxyInfo, respSize);
  return ncclSuccess;
}
/* Proxy-side connect step for the SHM receive path in memcpy mode.
 *
 * reqBuff must contain a struct shmProxyInfo prepared by shmRecvConnect, and
 * respBuff must have room for one. The request is copied into a freshly
 * allocated shmProxyInfo, which is then completed with a device staging buffer
 * (devFifo, buffSizes[NCCL_PROTO_SIMPLE] bytes), a host ncclRecvMem (ceRecvMem),
 * a non-blocking stream and NCCL_STEPS events. The completed struct is attached
 * to the connection and copied back into respBuff.
 *
 * Returns ncclInternalError when either buffer size does not match
 * sizeof(struct shmProxyInfo); both sizes are checked before anything is
 * allocated so a mismatch cannot leak.
 */
static ncclResult_t shmRecvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
  // Validate first: the original allocated proxyInfo before checking reqSize and leaked it on mismatch.
  if (reqSize != sizeof(struct shmProxyInfo)) return ncclInternalError;
  if (respSize != sizeof(struct shmProxyInfo)) return ncclInternalError;

  struct shmProxyInfo* proxyInfo;
  NCCLCHECK(ncclCalloc(&proxyInfo, 1));
  memcpy(proxyInfo, reqBuff, reqSize);
  NCCLCHECK(ncclCudaCalloc(&proxyInfo->devFifo, comm->buffSizes[NCCL_PROTO_SIMPLE]));
  NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1));
  CUDACHECK(hipStreamCreateWithFlags(&proxyInfo->stream, hipStreamNonBlocking));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(hipEventCreate(proxyInfo->events+i));
  }
  connection->proxyAppendPtr = &connection->proxyAppend;
  connection->transportResources = proxyInfo;
  // Return the struct so the caller can pick up devFifo and ceRecvMem.
  memcpy(respBuff, proxyInfo, respSize);
  return ncclSuccess;
}
/* Releases what shmSendProxyConnect attached to the connection: the stream, the
 * device staging buffer, the host ncclRecvMem, the NCCL_STEPS events and the
 * shmProxyInfo itself. Does nothing when no resources were attached (connect
 * never ran or failed before the attach). The connection's pointer is cleared
 * afterwards so a repeated call is harmless.
 * Returns the first error reported by CUDACHECK/NCCLCHECK, else ncclSuccess.
 */
static ncclResult_t shmSendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
  // Nothing was attached: avoid dereferencing a NULL pointer.
  if (resources == NULL) return ncclSuccess;

  CUDACHECK(hipStreamDestroy(resources->stream));
  CUDACHECK(hipFree(resources->devFifo));
  NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(hipEventDestroy(resources->events[i]));
  }
  free(connection->transportResources);
  connection->transportResources = NULL;
  return ncclSuccess;
}
/* Releases what shmRecvProxyConnect attached to the connection: the stream, the
 * device staging buffer, the host ncclRecvMem, the NCCL_STEPS events and the
 * shmProxyInfo itself. Does nothing when no resources were attached (connect
 * never ran or failed before the attach). The connection's pointer is cleared
 * afterwards so a repeated call is harmless.
 * Returns the first error reported by CUDACHECK/NCCLCHECK, else ncclSuccess.
 */
static ncclResult_t shmRecvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
  struct shmProxyInfo* resources = (struct shmProxyInfo*)connection->transportResources;
  // Nothing was attached: avoid dereferencing a NULL pointer.
  if (resources == NULL) return ncclSuccess;

  CUDACHECK(hipStreamDestroy(resources->stream));
  CUDACHECK(hipFree(resources->devFifo));
  NCCLCHECK(ncclCudaHostFree(resources->ceRecvMem));
  for (int i=0; i<NCCL_STEPS; i++) {
    CUDACHECK(hipEventDestroy(resources->events[i]));
  }
  free(connection->transportResources);
  connection->transportResources = NULL;
  return ncclSuccess;
}
// Proxy progress function for the SHM send path in memcpy mode; initCeOperation
// installs it as shmTransport.send.proxyProgress when useMemcpySend is set.
// Each call advances every sub-operation by at most one slice in each of two
// stages: (1) issue an async device-to-host copy from the staging buffer
// (devFifo) into the shared-memory fifo (shmFifo) and publish the slice size;
// (2) poll the slot's event and, once the copy has completed, publish the new
// tail in recvMem. Returns ncclSuccess, or the error raised by CUDACHECK.
static ncclResult_t shmSendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
// First call for this operation: reset the per-sub counters.
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
// Round to next multiple of chunkSteps (the value actually passed to ROUNDUP)
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->transmitted = sub->done = 0;
}
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
// Each protocol buffer is split into NCCL_STEPS equally sized slots.
int stepSize = comm->buffSizes[p] / NCCL_STEPS;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
// Other protocols need no copy: account for the steps and mark the sub done.
if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
resources->step = sub->base + sub->nsteps;
args->done++;
continue;
}
// Stage 1: keep fewer than NCCL_STEPS slices outstanding beyond `done`.
if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
// Read through volatile pointers: these host-memory fields are re-read on every poll.
volatile int* sizesFifo = resources->ceRecvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->ceRecvMem->tail;
// Check GPU has sent everything
if ((*recvTail > sub->base+sub->transmitted)) {
int size = sizesFifo[buffSlot];
CUDACHECK(hipMemcpyAsync(resources->shmFifo+buffSlot*stepSize, resources->devFifo+buffSlot*stepSize, size, hipMemcpyDeviceToHost, resources->stream));
// The event recorded after the copy is what stage 2 polls for this slot.
CUDACHECK(hipEventRecord(resources->events[buffSlot], resources->stream));
// Forward the slice size to the shared recvMem; the tail is only advanced in stage 2.
resources->recvMem->sizesFifo[buffSlot] = size;
__sync_synchronize(); // make sure sizesFifo is visible
sub->transmitted += args->sliceSteps;
}
}
// Stage 2: retire the oldest in-flight slice once its event has fired.
if (sub->done < sub->transmitted) {
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
hipError_t res = hipEventQuery(resources->events[buffSlot]);
// hipErrorNotReady only means "still running"; any other failure is propagated.
if (res != hipErrorNotReady) CUDACHECK(res);
if (res == hipSuccess) {
sub->done += args->sliceSteps;
// Notify SHM
resources->recvMem->tail = sub->base + sub->done;
}
if (sub->done == sub->nsteps) {
resources->step = sub->base + sub->nsteps;
args->done++;
}
}
}
// Every sub-operation finished: retire the whole proxy operation.
if (args->done == args->nsubs) {
args->state = ncclProxyOpNone;
}
}
return ncclSuccess;
}
// Proxy progress function for the SHM receive path in memcpy mode;
// initCeOperation installs it as shmTransport.recv.proxyProgress when
// useMemcpyRecv is set.
// Each call advances every sub-operation by at most one slice in each of two
// stages: (1) once the shared recvMem tail shows data is available, issue an
// async host-to-device copy from the shared-memory fifo (shmFifo) into the
// staging buffer (devFifo); (2) poll the slot's event and, once the copy has
// completed, publish the new tail in ceRecvMem.
// Returns ncclSuccess, or the error raised by CUDACHECK.
static ncclResult_t shmRecvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
// First call for this operation: reset the per-sub counters.
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
// Round to next multiple of chunkSteps (the value actually passed to ROUNDUP)
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->transmitted = sub->done = 0;
}
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
// Each protocol buffer is split into NCCL_STEPS equally sized slots.
int stepSize = comm->buffSizes[p] / NCCL_STEPS;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
struct shmProxyInfo* resources = (struct shmProxyInfo*) (sub->connection->transportResources);
// Other protocols need no copy: account for the steps and mark the sub done.
if (p != NCCL_PROTO_SIMPLE) { // Only Simple uses cudaMemcpy
resources->step = sub->base + sub->nsteps;
args->done++;
continue;
}
// Stage 1: keep fewer than NCCL_STEPS slices outstanding beyond `done`.
if (sub->transmitted < sub->done + NCCL_STEPS && sub->transmitted < sub->nsteps) {
int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
// Here the producer-side counters live in the shared recvMem, not in ceRecvMem.
volatile int* sizesFifo = resources->recvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
// Check data is ready in SHM
if ((*recvTail > sub->base+sub->transmitted)) {
int size = sizesFifo[buffSlot];
CUDACHECK(hipMemcpyAsync(resources->devFifo+buffSlot*stepSize, resources->shmFifo+buffSlot*stepSize, size, hipMemcpyHostToDevice, resources->stream));
// The event recorded after the copy is what stage 2 polls for this slot.
CUDACHECK(hipEventRecord(resources->events[buffSlot], resources->stream));
sub->transmitted += args->sliceSteps;
}
}
// Stage 2: retire the oldest in-flight slice once its event has fired.
if (sub->done < sub->transmitted) {
int buffSlot = (sub->base+sub->done)%NCCL_STEPS;
hipError_t res = hipEventQuery(resources->events[buffSlot]);
// hipErrorNotReady only means "still running"; any other failure is propagated.
if (res != hipErrorNotReady) CUDACHECK(res);
if (res == hipSuccess) {
sub->done += args->sliceSteps;
// Notify GPU
resources->ceRecvMem->tail = sub->base + sub->done;
}
if (sub->done == sub->nsteps) {
resources->step = sub->base + sub->nsteps;
args->done++;
}
}
}
// Every sub-operation finished: retire the whole proxy operation.
if (args->done == args->nsubs) {
args->state = ncclProxyOpNone;
}
}
return ncclSuccess;
}
// Transport descriptor for shared-memory connections: name, connectivity test,
// then one hook table for the send side and one for the receive side.
// The trailing hooks start out NULL; initCeOperation fills in proxyConnect,
// proxyFree and proxyProgress for whichever side has memcpy mode enabled.
struct ncclTransport shmTransport = {
"SHM",
shmCanConnect,
{ shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL },
{ shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL }
};
/* One-time initialisation of the SHM memcpy mode and locality settings.
 * On the first call:
 *  - useMemcpySend / useMemcpyRecv are derived from ShmUseCudaMemcpy combined
 *    with bit 1 / bit 2 of ShmMemcpyMode, and the matching proxy hooks are
 *    installed on shmTransport for each enabled side;
 *  - shmLocality is read from ShmLocality and reset to SHM_RECV_SIDE (with a
 *    warning) when it is neither SHM_SEND_SIDE nor SHM_RECV_SIDE.
 * Later calls return immediately.
 */
static void initCeOperation() {
  static int init = 0;
  if (init) return;

  useMemcpySend = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 1);
  useMemcpyRecv = ncclParamShmUseCudaMemcpy() && (ncclParamShmMemcpyMode() & 2);

  if (useMemcpySend) {
    shmTransport.send.proxyConnect = shmSendProxyConnect;
    shmTransport.send.proxyFree = shmSendProxyFree;
    shmTransport.send.proxyProgress = shmSendProxyProgress;
  }
  if (useMemcpyRecv) {
    shmTransport.recv.proxyConnect = shmRecvProxyConnect;
    shmTransport.recv.proxyFree = shmRecvProxyFree;
    shmTransport.recv.proxyProgress = shmRecvProxyProgress;
  }

  shmLocality = ncclParamShmLocality();
  const bool validLocality = (shmLocality == SHM_SEND_SIDE) || (shmLocality == SHM_RECV_SIDE);
  if (!validLocality) {
    WARN("Ignoring SHM locality, must be 1 (sender side) or 2 (receiver side, default)");
    shmLocality = SHM_RECV_SIDE;
  }

  init = 1;
}
+1 -1
Ver fichero
@@ -6,7 +6,7 @@ endif
HIPCC = $(HIP_PATH)/bin/hipcc
EXE = topo_expl
CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -I/opt/rocm/rocm_smi/include/ -DTOPO_EXPL -DENABLE_TRACE
CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE
files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/misc/param.cc \
../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc ../../src/graph/rome_models.cc
+1
Ver fichero
@@ -27,6 +27,7 @@ struct allGather3Data_t{
struct ncclGraphInfo ring;
struct ncclGraphInfo collNet;
struct ncclTopoRanks topoRanks;
bool pivotA2AEnabled;
};
void initCollNet();
+8 -7
Ver fichero
@@ -179,10 +179,10 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
if (proxyRank == myInfo->rank) {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
} else {
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(comm), req.netDev,
proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
}
*((int*)connectInfo) = proxyRank;
@@ -205,7 +205,7 @@ ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
if (req.netDev < 0) NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(comm), req.netDev,
req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
return ncclSuccess;
}
@@ -250,8 +250,9 @@ struct ncclTransport collNetTransport = {
{ collNetRecvSetup, NULL, NULL, NULL }
};
struct ncclTransport ncclTransports[NTRANSPORTS] = {
p2pTransport,
shmTransport,
netTransport,
struct ncclTransport* ncclTransports[] = {
&p2pTransport,
&shmTransport,
&netTransport,
&collNetTransport,
};
+3 -5
Ver fichero
@@ -49,6 +49,8 @@ THE SOFTWARE.
#include "graph.h"
NodeModel *node_model;
extern ncclNet_t* ncclNet;
char* getCmdOption(char ** begin, char ** end, const std::string & option) {
char ** itr = std::find(begin, end, option);
@@ -216,14 +218,12 @@ int main(int argc,char* argv[])
comm[i].nRanks = nranks;
NCCLCHECK(ncclCalloc(&comm[i].connectSend, NCCL_MAX_CONNS*comm->nRanks));
NCCLCHECK(ncclCalloc(&comm[i].connectRecv, NCCL_MAX_CONNS*comm->nRanks));
comm[i].p2pSendCount = comm[i].p2pRecvCount = 0;
NCCLCHECK(ncclCalloc(&comm[i].p2pSends, comm->nRanks));
NCCLCHECK(ncclCalloc(&comm[i].p2pRecvs, comm->nRanks));
node_model = network.GetNode(i);
assert(node_model!=0);
comm[i].busId = node_model->getGpuBusId(i);
comm[i].topo = node_model->getSystem(i);
comm[i].peerInfo = peerInfo;
comm[i].ncclNet = ncclNet;
// Mark channels as non initialized.
for (int c=0; c<MAXCHANNELS; c++) comm[i].channels[c].id = -1;
NCCLCHECK(fillInfo(&comm[i], comm[i].peerInfo+comm[i].rank, 0));
@@ -272,8 +272,6 @@ int main(int argc,char* argv[])
for (int i = 0; i < nranks; i++) {
free(comm[i].connectSend);
free(comm[i].connectRecv);
free(comm[i].p2pSends);
free(comm[i].p2pRecvs);
}
free(treeGraph);
+154 -78
Ver fichero
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -216,20 +216,19 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclPeerInfo* peerInfo = comm->peerInfo+peer;
struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex :
comm->channels[channelId].peers[peer].recv + connIndex;
// handle intra-node network connections
int n1 = -1, n2 = -1;
if (connIndex == NCCL_CONN_IDX_P2P_NET) {
NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, comm->rank, graph, channelId, (type == 1) ? 1 : 0, &n1));
NCCLCHECK(ncclTopoGetIntraNetDev(comm->topo, peer, graph, channelId, (type == 1) ? 0 : 1, &n2));
}
bool xgmi;
NCCLCHECK(ncclTopoGetLinkType(comm->topo, myInfo->cudaDev, peerInfo->cudaDev, &xgmi));
for (int t=0; t<NTRANSPORTS; t++) {
if (graph == NULL && connIndex == NCCL_CONN_IDX_P2P_NET && (t == TRANSPORT_SHM || (!xgmi && t == TRANSPORT_P2P))) continue;
if (graph && n1 >= 0 && n2 >= 0 && t != TRANSPORT_NET) continue;
struct ncclTransport *transport = ncclTransports+t;
struct ncclTransport *transport = ncclTransports[t];
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
int ret = 0;
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
@@ -244,18 +243,19 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
return ncclSystemError;
}
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) {
TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
uint32_t mask = 1 << channel->id;
struct ncclChannel* channel = &comm->channels[channelId];
uint32_t mask = 1 << channelId;
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue;
comm->connectRecv[peer+comm->nRanks*connIndex] |= mask;
comm->connectRecv[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue;
comm->connectSend[peer+comm->nRanks*connIndex] |= mask;
comm->connectSend[peer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] |= mask;
}
return ncclSuccess;
}
@@ -271,17 +271,18 @@ void dumpData(struct ncclConnect* data, int ndata) {
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) {
// Stream used during transport setup; need for P2P pre-connect + CUDA Graph
int highestType = TRANSPORT_P2P; // track highest transport type
//hipStream_t transportSetupStream;
//CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking));
int highestType = TRANSPORT_P2P; // track highest transport type
struct ncclConnect data[2*MAXCHANNELS];
for (int i=1; i<comm->nRanks; i++) {
int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*connIndex];
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*connIndex];
uint32_t recvMask = comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
uint32_t sendMask = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)];
struct ncclConnect* recvData = data;
int sendChannels = 0, recvChannels = 0;
@@ -319,7 +320,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
//NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn));
conn->connected = 1;
//CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
//CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[sendPeer].send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
}
}
for (int c=0; c<MAXCHANNELS; c++) {
@@ -327,10 +328,10 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
//NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn));
conn->connected = 1;
//CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream));
//CUDACHECK(hipMemcpyAsync(&comm->channels[c].devPeers[recvPeer].recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice, transportSetupStream));
}
}
comm->connectRecv[recvPeer+comm->nRanks*connIndex] = comm->connectSend[sendPeer+comm->nRanks*connIndex] = 0;
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0;
}
//CUDACHECK(hipStreamSynchronize(transportSetupStream));
//CUDACHECK(hipStreamDestroy(transportSetupStream));
@@ -357,10 +358,6 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
// check if we can connect to collnet, whose root is the nranks-th rank
struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
peerInfo->rank = nranks;
int support = 1;
if (isMaster) {
NCCLCHECK(collNetTransport.canConnect(&support, comm->topo, collNetGraph, myInfo, peerInfo));
}
// send master receives connect info from peer recv master
if (isMaster && type == collNetSend) {
@@ -370,14 +367,14 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
}
// select
struct ncclPeer* root = channel->peers+nranks;
struct ncclChannelPeer* root = channel->peers+nranks;
// connector index: 0 for recv, 1 for send
struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type;
struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send);
conn->transportComm = transportComm;
// setup
struct ncclConnect myConnect;
if (isMaster && support) {
if (isMaster) {
NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type));
}
// prepare connect handles
@@ -407,11 +404,11 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
//if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
}
// connect
if (isMaster && support) {
if (isMaster) {
//NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
struct ncclPeer* devRoot = channel->devPeers+nranks;
struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
//CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup);
struct ncclDevChannelPeer* devRoot = channel->devPeers+nranks;
struct ncclConnInfo* devConnInfo = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type;
//CUDACHECKGOTO(hipMemcpy(devConnInfo, &conn->conn, sizeof(struct ncclConnInfo), hipMemcpyHostToDevice), res, cleanup);
}
// recv side sends connect info to send side
if (isMaster && type == collNetRecv) {
@@ -420,7 +417,7 @@ int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collN
//NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
}
if (support) fail = 0;
fail = 0;
cleanup:
if (allConnects != NULL) free(allConnects);
if (masterConnects != NULL) free(masterConnects);
@@ -449,21 +446,24 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
// Free collNet resources
for (int r=0; r<comm->nChannels; r++) {
struct ncclChannel* channel = comm->channels+r;
struct ncclPeer* peer = channel->peers+comm->nRanks;
struct ncclChannelPeer* peer = channel->peers+comm->nRanks;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* send = peer->send + b;
//if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
//if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
send->transportResources = NULL; // avoid double free
}
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* recv = peer->recv + b;
//if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
//if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
recv->transportResources = NULL; // avoid double free
}
}
return ncclSuccess;
}
// Runtime parameters, both with a default of 0. They are read later in this file
// through rcclParamP2pNetDisable() and rcclParamPivotAlltoallEnable().
// NOTE(review): presumably backed by environment variables derived from the quoted
// names — confirm against the RCCL_PARAM macro definition.
RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);
RCCL_PARAM(PivotAlltoallEnable, "PIVOT_ALLTOALL_ENABLE", 0);
ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
// We use 2 AllGathers
@@ -499,12 +499,15 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
comm->topo->nRanks = comm->nRanks;
// init netGdrLevel
comm->topo->netGdrLevel = -2;
// init Pivot A2A related fields
comm->topo->pivotA2AEnabled = false;
comm->topo->pivotA2ANumBiRings = 0;
// Compute paths between GPUs and NICs
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
// Remove inaccessible GPUs and unused NICs
NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
// Recompute paths after trimming
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm));
// Init search
NCCLCHECK(ncclTopoSearchInit(comm->topo));
// Print final topology
@@ -571,39 +574,31 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
}
}
#if 0
{ // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager
CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
if (comm->localRanks == comm->nRanks && comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)
{
if (hasPeerAccess)
{
if (intraProcRanks == nranks)
cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
else
cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
}
// For now, only enable clique-based kernels on nodes where all GPUs are XGMI connected
if (!allXgmi && !rcclParamCliqueIgnoreTopo())
{
INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (ignore with RCCL_CLIQUE_IGNORE_TOPO)");
cliqueMode = CliqueManager::CLIQUE_DISABLED;
}
}
comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
} // [/RCCL]
#endif
if (comm->rank == ncclParamGraphDumpFileRank()) {
struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
}
// Determine local CollNet support before all-gather
if (ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
if (collNetSupport(comm)) {
char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
if (collNetEnable != NULL) {
INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
if (strcmp(collNetEnable, "1") == 0) {
comm->collNetSupport = 1;
}
}
}
if (comm->collNetSupport == 1 && collNetGraph.nChannels <= 0) comm->collNetSupport = 0;
if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) {
if (rcclParamP2pNetDisable() == 0) {
if (!(comm->topo->type & RCCL_TOPO_FORCE_INTRA)) comm->p2pNet = 1;
INFO(NCCL_INIT, "RCCL enabled same node P2P over network");
}
else
INFO(NCCL_INIT, "RCCL force disabled same node P2P over network");
}
// AllGather3 - begin
#if 0
struct ncclGraphInfo {
@@ -624,6 +619,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
struct ncclGraphInfo ring;
struct ncclGraphInfo collNet;
struct ncclTopoRanks topoRanks;
bool pivotA2AEnabled;
} *allGather3Data;
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
@@ -666,6 +662,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t
allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter;
allGather3Data[rank].collNetSupport = comm->collNetSupport;
allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count)
? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels;
@@ -758,6 +755,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
comm->topo->pivotA2AEnabled = comm->topo->pivotA2AEnabled && allGather3Data[i].pivotA2AEnabled;
}
comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
@@ -818,16 +816,16 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
struct ncclChannel* channel = comm->channels+c;
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore);
if (ringGraph.nIntraChannels) {
if (ringGraph.nIntraChannels && rcclParamP2pNetDisable() == 0) {
comm->useIntraNet = 1;
// Connect NET for intranode use
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, affinity_restore);
}
@@ -838,8 +836,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore);
INFO(NCCL_INIT, "Connected all trees");
@@ -861,7 +859,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
for (int h=0; h<nHeads; h++) {
const int head = heads[h];
collNetSetupFail = ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetRecv);
if (!collNetSetupFail) collNetSetupFail = ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend);
collNetSetupFail += ncclTransportCollNetSetup(comm, &collNetGraph, channel, head, head, h, collNetSend);
}
// Verify CollNet setup across ranks after trying the first channel
if (c == 0) {
@@ -876,12 +874,12 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
int highestTransportType0, highestTransportType1;
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channelRecv = comm->channels+c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0), ret, collnet_cleanup);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 0, &highestTransportType0), ret, collnet_cleanup);
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channelSend = comm->channels+c;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1), ret, collnet_cleanup);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &collNetGraph, 1, &highestTransportType1), ret, collnet_cleanup);
@@ -919,6 +917,52 @@ collnet_cleanup:
// Compute nChannels per peer for p2p
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
#if 0
do { // Setup p2p structures in comm->tasks
struct ncclTasks* tasks = &comm->tasks;
int nRanks = comm->nRanks;
int node = comm->node;
int nNodes = comm->nNodes;
struct ncclNodeRanks *nodeRanks = comm->nodeRanks;
int localRank = comm->localRank;
tasks->peers = ncclMemoryStackAlloc<ncclTasks::Peer>(&comm->memPermanent, nRanks);
tasks->p2pSendOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
tasks->p2pRecvOrder = ncclMemoryStackAlloc<int>(&comm->memPermanent, nRanks);
int s=0, r=0;
// schedule delta 0, +1, -1, +2, -2, ...
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
for (int d=0; d <= nNodes/4; d++) {
int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
int index = 0;
int delta = deltas[index];
sched_delta:
int recvNode = (node+nNodes-delta)%nNodes;
int sendNode = (node+delta)%nNodes;
int steps = comm->maxLocalRanks;
for (int step=0; step < steps; step++) {
int recvIndex = (localRank-step+steps)%steps;
if (recvIndex < nodeRanks[recvNode].localRanks) {
tasks->p2pRecvOrder[r] = nodeRanks[recvNode].localRankToRank[recvIndex];
r++;
}
int sendIndex = (localRank+step)%steps;
if (sendIndex < nodeRanks[sendNode].localRanks) {
tasks->p2pSendOrder[s] = nodeRanks[sendNode].localRankToRank[sendIndex];
s++;
}
}
index++;
if (index == 1 && deltas[1] == deltas[0]) index++;
if (index == 2 && deltas[2] == deltas[0]) index++;
if (index == 3 && deltas[3] == deltas[2]) index++;
if (index == 3 && deltas[3] == deltas[1]) index++;
if (index < 4) {
delta = deltas[index];
goto sched_delta;
}
}
assert(s == nRanks && r == nRanks);
} while (0);
if (ncclParamNvbPreconnect()) {
// Connect p2p when using NVB path
int nvbNpeers;
@@ -926,18 +970,17 @@ collnet_cleanup:
NCCLCHECK(ncclTopoGetNvbGpus(comm->topo, comm->rank, &nvbNpeers, &nvbPeers));
for (int r=0; r<nvbNpeers; r++) {
int peer = nvbPeers[r];
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
int channelId;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer] |= (1<<channelId);
NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncSend, &channelId));
if (comm->channels[channelId].peers[peer].send[1].connected == 0) {
comm->connectSend[peer] |= (1<<channelId);
}
}
delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer] |= (1<<channelId);
NCCLCHECK(ncclChannelCompute(comm, peer, c, ncclFuncRecv, &channelId));
if (comm->channels[channelId].peers[peer].recv[1].connected == 0) {
comm->connectRecv[peer] |= (1<<channelId);
}
}
}
@@ -947,18 +990,17 @@ collnet_cleanup:
#endif
// Connect to local net proxy
struct ncclProxyConnector proxyConn;
NCCLCHECK(ncclTopoGetLocalRank(comm->topo, comm->rank, &proxyConn.localRank));
//NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn));
//NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
// Then to remote ones when using PXN
if (ncclPxnDisable() == 0) {
if (ncclPxnDisable(comm) == 0) {
int nranks;
int* pxnPeers;
NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks));
for (int r=0; r<nranks; r++) {
//NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn));
//NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
// NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
}
free(pxnPeers);
}
@@ -973,6 +1015,10 @@ collnet_cleanup:
if (intraProcRanks == 0) intraProcRank0 = i;
if (i == rank) intraProcRank = intraProcRanks;
intraProcRanks++;
if (intraProcRank0 == rank && rank != i) {
comm->peerInfo[i].comm->intraNext = comm->intraNext;
comm->intraNext = comm->peerInfo[i].comm;
}
}
}
TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
@@ -983,14 +1029,40 @@ collnet_cleanup:
intraProcRank, intraProcRanks, intraProcRank0);
return ncclInternalError;
}
//NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm));
struct ncclComm* comm0 = comm->peerInfo[intraProcRank0].comm;
assert(intraProcRank==0 ? comm==comm0 : true);
comm->intraComm0 = comm0;
comm->intraRefs = intraProcRank==0 ? intraProcRanks : 0;
comm->intraRank = intraProcRank;
comm->intraRanks = intraProcRanks;
comm->intraBarrierPhase = 0;
comm->intraBarrierCounter = 0;
comm->intraBarrierGate = 0;
} while(0);
#if 0
if (comm->intraRank == 0) { // Load ncclParamLaunchMode
char* str = getenv("NCCL_LAUNCH_MODE");
enum ncclLaunchMode mode, modeOld;
if (str && strcasecmp(str, "GROUP") == 0) {
mode = ncclLaunchModeGroup;
} else {
mode = ncclLaunchModeParallel;
}
// In theory we could be racing with other communicators not associated with
// this one if the user is connecting to multiple ncclUniqueId's concurrently.
modeOld = __atomic_exchange_n(&ncclParamLaunchMode, mode, __ATOMIC_RELAXED);
if (modeOld == ncclLaunchModeInvalid && str && str[0]!='\0') {
INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", mode == ncclLaunchModeParallel ? "PARALLEL" : "GROUP");
}
}
/* Local intra-node barrier */
//NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
// Unlink proxy shm to make sure it will be properly cleaned up.
//NCCLCHECK(ncclProxyShmUnlink(comm));
NCCLCHECK(ncclProxyShmUnlink(comm));
#endif
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
@@ -1013,3 +1085,7 @@ ncclResult_t rocm_smi_getDeviceIndexByPciBusId(const char* pciBusId, uint32_t* d
// Stub: reports success without reading srcDev/dstDev and without writing to
// rsmi_type, hops or bw.
ncclResult_t rocm_smi_getLinkInfo(int srcDev, int dstDev, RSMI_IO_LINK_TYPE* rsmi_type, int *hops, int *bw) {
  const ncclResult_t status = ncclSuccess;
  return status;
}
// Returns the constant 4 for any communicator; comm is never read.
int ncclNetVersion(struct ncclComm* comm) {
  (void)comm;  // intentionally unused
  const int netVersion = 4;
  return netVersion;
}